Пример #1
0
/**
 * RawWellsEquivalent entry point.
 *
 * Parses the command line, compares the query wells file against the trusted
 * ("gold") wells file via CompareWells(), prints summary statistics and
 * decides whether the two files are equivalent.
 *
 * Options (see the usage text below): -q/--query-wells, -g/--gold-wells,
 * -e/--epsilon, -m/--max-mismatch, -c/--min-cor, --max-val, -h/--help,
 * -v/--version.
 *
 * @return 0 when the files are equivalent for the given tolerances (the
 *         process also exits with 0 for --version); 1 when they are not
 *         equivalent, or when the usage text is printed.
 */
int main(int argc, const char *argv[]) {

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  string queryFile, goldFile;
  double epsilon = 0.0;
  bool help = false;
  bool version = false;
  int allowedWrong = 0;
  double maxAbsVal = 0;
  double minCorrelation = 1;
  opts.GetOption(queryFile, "", 'q', "query-wells");
  opts.GetOption(goldFile, "", 'g', "gold-wells");
  opts.GetOption(epsilon, "0.0", 'e', "epsilon");
  opts.GetOption(allowedWrong, "0", 'm', "max-mismatch");
  opts.GetOption(minCorrelation, "1", 'c', "min-cor");
  opts.GetOption(maxAbsVal, "1e3", '-', "max-val");
  opts.GetOption(help, "false", 'h', "help");
  opts.GetOption(version, "false", 'v', "version");
  opts.CheckNoLeftovers();
  
  if (version) {
  	fprintf (stdout, "%s", IonVersion::GetFullVersion("RawWellsEquivalent").c_str());
  	exit(0);
  }
  
  // Both input files are mandatory; print usage when either is missing or
  // when help was requested explicitly.
  if (queryFile.empty() || goldFile.empty() || help) {
    cout << "RawWellsEquivalent - Check to see how similar two wells files are to each other" << endl 
	 << "options: " << endl
	 << "   -g,--gold-wells    trusted wells to compare against." << endl
	 << "   -q,--query-wells   new wells to check." << endl
	 << "   -e,--epsilon       maximum allowed difference to be considered equivalent." << endl 
	 << "   -m,--max-mismatch  maximum number of non-equivalent entries to allow." << endl
	 << "   -c,--min-cor       minimum correlation allowed to be considered equivalent." << endl
	 << "      --max-val       maximum absolute value considered (avoid extreme values)." << endl
	 << "   -h,--help          this message." << endl
	 << "" << endl 
         << "usage: " << endl
	 << "   RawWellsEquivalent -e 10 --query-wells query.wells --gold-wells gold.wells " << endl;
    exit(1);
  }

  NumericalComparison<double> compare = CompareWells(queryFile, goldFile, epsilon, maxAbsVal);
  cout << compare.GetCount() << " total values. " << endl
       << compare.GetNumSame() << " (" << (100.0 * compare.GetNumSame())/compare.GetCount() <<  "%) are equivalent. " << endl
       << compare.GetNumDiff() << " (" << (100.0 * compare.GetNumDiff())/compare.GetCount() <<  "%) are not equivalent. " << endl 

       << "Correlation of: " << compare.GetCorrelation() << endl;

  // The files fail when more than `allowedWrong` values differ, i.e. when the
  // number of matching values is strictly below (count - allowedWrong).
  // A non-strict comparison here would reject even identical files with the
  // default allowedWrong of 0, because GetNumSame() can never exceed GetCount().
  // NOTE(review): the correlation test is applied even when every value
  // matched; confirm whether that is intended for the default min-cor of 1.
  if((compare.GetCount() - allowedWrong) > compare.GetNumSame() || 
     compare.GetCorrelation() < minCorrelation) {
    cout << "Wells files not equivalent for allowed mismatch: " << allowedWrong 
	 << " minimum correlation: " << minCorrelation << endl;
    return 1;
  }
  cout << "Wells files equivalent for allowed mismatch: " << allowedWrong 
       << " minimum correlation: " << minCorrelation << endl;
  return 0;
}
Пример #2
0
int main(int argc, const char *argv[]) {
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  string queryFile, goldFile;
  double epsilon;
  bool help = false;
  bool version = false;
  int allowedWrong = 0;
  double maxAbsVal = 0;
  double minCorrelation = 1;
  bool dumpMisMatch = false;
  opts.GetOption(queryFile, "", 'q', "query-wells");
  opts.GetOption(goldFile, "", 'g', "gold-wells");
  opts.GetOption(epsilon, "0.0", 'e', "epsilon");
  opts.GetOption(allowedWrong, "0", 'm', "max-mismatch");
  opts.GetOption(minCorrelation, "1", 'c', "min-cor");
  opts.GetOption(maxAbsVal, "1e3", '-', "max-val");
  opts.GetOption(help, "false", 'h', "help");
  opts.GetOption(version, "false", 'v', "version");
  opts.GetOption(dumpMisMatch, "false", 'o', "dump-mismatch");
  opts.CheckNoLeftovers();
  
  if (version) {
  	fprintf (stdout, "%s", IonVersion::GetFullVersion("RawWellsEquivalent").c_str());
  	exit(0);
  }
  
  if (queryFile.empty() || goldFile.empty() || help) {
    printUsage();
    exit(1);
  }

  DumpMismatches dump(dumpMisMatch);
  NumericalComparison<double> compare = CompareWells(queryFile, goldFile, epsilon, maxAbsVal, dump);
  cout << compare.GetCount() << " total values. " << endl
       << compare.GetNumSame() << " (" << (100.0 * compare.GetNumSame())/compare.GetCount() <<  "%) are equivalent. " << endl
       << compare.GetNumDiff() << " (" << (100.0 * compare.GetNumDiff())/compare.GetCount() <<  "%) are not equivalent. " << endl 
       << "Correlation of: " << compare.GetCorrelation() << endl;

  if((compare.GetCount() - allowedWrong) > compare.GetNumSame() || 
     (compare.GetCorrelation() < minCorrelation && compare.GetCount() != compare.GetNumSame())) {
     cout << "Wells files not equivalent for allowed mismatch: " << allowedWrong 
     << " minimum correlation: " << minCorrelation << endl;
     return 1;
  }
  cout << "Wells files equivalent for allowed mismatch: " << allowedWrong 
       << " minimum correlation: " << minCorrelation << endl;
  return 0;
}
Пример #3
0
int main (int argc, const char *argv[])
{
  time_t program_start_time;
  time(&program_start_time);
  Json::Value calibration_json(Json::objectValue);
  DumpStartingStateOfProgram (argc,argv,program_start_time, calibration_json["Calibration"]);

  //
  // Step 1. Process command line options
  //

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);

  CalibrationContext calib_context;
  if (not calib_context.InitializeFromOpts(opts)){
    PrintHelp_CalModules();
  }

  HistogramCalibration master_histogram(opts, calib_context);
  calib_context.hist_calibration_master = &master_histogram;

  LinearCalibrationModel master_linear_model(opts, calib_context);
  calib_context.linear_model_master = &master_linear_model;

  opts.CheckNoLeftovers();

  //
  // Step 2. Execute threaded calibration
  //

  time_t calibration_start_time;
  time(&calibration_start_time);

  pthread_mutex_init(&calib_context.read_mutex,  NULL);
  pthread_mutex_init(&calib_context.write_mutex, NULL);

  pthread_t worker_id[calib_context.num_threads];
  for (int worker = 0; worker < calib_context.num_threads; worker++)
  if (pthread_create(&worker_id[worker], NULL, CalibrationWorker, &calib_context)) {
    cerr << "Calibration ERROR: Problem starting thread" << endl;
    exit (EXIT_FAILURE);
  }

  for (int worker = 0; worker < calib_context.num_threads; worker++)
    pthread_join(worker_id[worker], NULL);

  pthread_mutex_destroy(&calib_context.read_mutex);
  pthread_mutex_destroy(&calib_context.write_mutex);

  time_t calibration_end_time;
  time(&calibration_end_time);


  //
  // Step 3. Create models, write output, and close modules
  //

  // HP histogram calibration
  if (master_histogram.CreateCalibrationModel())
    master_histogram.ExportModelToJson(calibration_json["HPHistogram"]);

  // Linear Model
  if (master_linear_model.CreateCalibrationModel())
    master_linear_model.ExportModelToJson(calibration_json["LinearModel"], "");


  // Transfer stuff from calibration context and close bam reader
  calib_context.Close(calibration_json["Calibration"]);

  time_t program_end_time;
  time(&program_end_time);

  calibration_json["Calibration"]["end_time"] = get_time_iso_string(program_end_time);
  calibration_json["Calibration"]["total_duration"] = (Json::Int)difftime(program_end_time,program_start_time);
  calibration_json["Calibration"]["calibration_duration"] = (Json::Int)difftime(calibration_end_time,calibration_start_time);

  SaveJson(calibration_json, calib_context.filename_json);
  return EXIT_SUCCESS;
}
Пример #4
0
/**
 * bamrealignment entry point.
 *
 * Reads every alignment from the input BAM file, tries to realign each mapped
 * read with a Realigner, and writes every read (updated or unchanged) to the
 * output BAM file. A per-read status line is written to an optional log file
 * and summary counters are printed to stdout at the end.
 *
 * @return 0 after all reads were processed; 1 if the log, input or output
 *         file cannot be opened or if the user quits from the verbose prompt;
 *         the value of PrintHelp() if the input or output name is missing.
 */
int main (int argc, const char *argv[])
{
  printf ("------------- bamrealignment --------------\n");

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  vector<int> score_vals(4);

  // Command line options with their defaults.
  string input_bam  = opts.GetFirstString  ('i', "input", "");
  string output_bam = opts.GetFirstString  ('o', "output", "");
  opts.GetOption(score_vals, "4,-6,-5,-2", 's', "scores");
  int    clipping   = opts.GetFirstInt     ('c', "clipping", 2);
  bool   anchors    = opts.GetFirstBoolean ('a', "anchors", true);
  int    bandwidth  = opts.GetFirstInt     ('b', "bandwidth", 10);
  bool   verbose    = opts.GetFirstBoolean ('v', "verbose", false);
  bool   debug      = opts.GetFirstBoolean ('d', "debug", false);
  int    format     = opts.GetFirstInt     ('f', "format", 1);
  int  num_threads  = opts.GetFirstInt     ('t', "threads", 8);
  string log_fname  = opts.GetFirstString  ('l', "log", "");
  

  // Input and output file names are both mandatory.
  if (input_bam.empty() or output_bam.empty())
    return PrintHelp();

  opts.CheckNoLeftovers();

  // The log file is optional; it is only opened when a name was given.
  std::ofstream logf;
  if (log_fname.size ())
  {
    logf.open (log_fname.c_str ());
    if (!logf.is_open ())
    {
      fprintf (stderr, "bamrealignment: Failed to open log file %s\n", log_fname.c_str());
      return 1;
    }
  }

  BamReader reader;
  if (!reader.Open(input_bam)) {
    fprintf(stderr, "bamrealignment: Failed to open input file %s\n", input_bam.c_str());
    return 1;
  }

  // The output file reuses the header and reference data of the input.
  SamHeader header = reader.GetHeader();
  RefVector refs   = reader.GetReferenceData();

  BamWriter writer;
  writer.SetNumThreads(num_threads);
  // format == 1 (the default) writes uncompressed output, any other value compressed.
  if (format == 1)
    writer.SetCompressionMode(BamWriter::Uncompressed);
  else
    writer.SetCompressionMode(BamWriter::Compressed);

  if (!writer.Open(output_bam, header, refs)) {
    fprintf(stderr, "bamrealignment: Failed to open output file %s\n", output_bam.c_str());
    return 1;
  }


  // The meat starts here ------------------------------------

  if (verbose)
    cout << "Verbose option is activated, each alignment will print to screen." << endl
         << "  After a read hit RETURN to continue to the next one," << endl
         << "  or press q RETURN to quit the program," << endl
         << "  or press s Return to silence verbose," << endl
         << "  or press c RETURN to continue printing without further prompt." << endl << endl;

  // Counters reported in the summary at the end of the run.
  unsigned int readcounter = 0;
  unsigned int mapped_readcounter = 0;
  unsigned int realigned_readcounter = 0;
  unsigned int modified_alignment_readcounter = 0;
  unsigned int pos_update_readcounter = 0;
  unsigned int failed_clip_realigned_readcount = 0;
  
  unsigned int already_perfect_readcount = 0;
  
  unsigned int bad_md_tag_readcount = 0;
  unsigned int error_recreate_ref_readcount = 0;
  unsigned int error_clip_anchor_readcount = 0;
  unsigned int error_sw_readcount = 0;
  unsigned int error_unclip_readcount = 0;
  
  // Per-read scratch values; start_position_shift is filled in by computeSWalignment().
  unsigned int start_position_shift;
  int orig_position;
  int new_position;

  // `input` holds the last line typed at the verbose prompt; it starts
  // non-empty and not 'c', so the first verbose read waits for user input.
  string  md_tag, new_md_tag, input = "x";
  vector<CigarOp>    new_cigar_data;
  vector<MDelement>  new_md_data;
  bool position_shift = false;
  time_t start_time = time(NULL);

  Realigner aligner;
  aligner.verbose_ = verbose;
  aligner.debug_   = debug;
  // A rejected score vector is only reported; processing continues regardless.
  if (!aligner.SetScores(score_vals))
    cout << "bamrealignment: Four scores need to be provided: match, mismatch, gap open, gap extend score!" << endl;

  aligner.SetAlignmentBandwidth(bandwidth);

  BamAlignment alignment;
  while(reader.GetNextAlignment(alignment)){
    readcounter ++;
    position_shift = false;
    
    // Progress report every 100000 reads.
    if ( (readcounter % 100000) == 0 )
       cout << "Processed " << readcounter << " reads. Elapsed time: " << (time(NULL) - start_time) << endl;

    // Unmapped reads skip this block and are written out unchanged below.
    if (alignment.IsMapped()) {
      
      
      
      orig_position = alignment.Position;
      mapped_readcounter++;
      aligner.SetClipping(clipping, !alignment.IsReverseStrand());
      if (aligner.verbose_) {
    	cout << endl;
        if (alignment.IsReverseStrand())
          cout << "The read is from the reverse strand." << endl;
        else
          cout << "The read is from the forward strand." << endl;
      }

      // Case 1: no MD tag -- the read is counted, logged as MISSMD and left unchanged.
      if (!alignment.GetTag("MD", md_tag)) {
    	if (aligner.verbose_)
          cout << "Warning: Skipping read " << alignment.Name << ". It is mapped but missing MD tag." << endl;
	if (logf.is_open ())
	  logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MISSMD" << '\n';
	bad_md_tag_readcount++;
      // Case 2: CreateRefFromQueryBases() returned true -- attempt the realignment.
      } else if (aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors)) {
	// A clip-anchor error reported together with a true return value does
	// not stop the realignment; it is counted and marked NOCLIP in the log.
	bool clipfail = false;
	if (Realigner::CR_ERR_CLIP_ANCHOR == aligner.GetCreateRefError ())
	{
	  clipfail = true;
	  failed_clip_realigned_readcount ++;
	}

        if (!aligner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) {
          if (aligner.verbose_)
            cout << "Error in the alignment! Not updating read information." << endl;
	  if (logf.is_open ())
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "SWERR" << '\n';
	  error_sw_readcount++;
          writer.SaveAlignment(alignment);  // Write alignment unchanged
          continue;
        }

        if (!aligner.addClippedBasesToTags(new_cigar_data, new_md_data, alignment.QueryBases.size())) {
          if (aligner.verbose_)
            cout << "Error when adding clipped anchors back to tags! Not updating read information." << endl;
	  if (logf.is_open ())
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNCLIPERR" << '\n';
          writer.SaveAlignment(alignment);  // Write alignment unchanged
	  error_unclip_readcount ++;
          continue;
        }
        new_md_tag = aligner.GetMDstring(new_md_data);
        realigned_readcounter++;

        // adjust start position of read
        if (!aligner.LeftAnchorClipped() and start_position_shift != 0) {
          new_position = aligner.updateReadPosition(alignment.CigarData, (int)start_position_shift, alignment.Position);
          if (new_position != alignment.Position) {
            pos_update_readcounter++;
            position_shift = true;
            alignment.Position = new_position;
          }
        }
        
        // A read counts as modified when its position shifted, the number of
        // cigar operations changed, or the MD string changed.
        if (position_shift || alignment.CigarData.size () != new_cigar_data.size () || md_tag != new_md_tag)
	{
	  if (logf.is_open ())
	  {
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MOD";
	    if (position_shift)
	      logf << "-SHIFT";
	    if (clipfail)
	      logf << " NOCLIP";
	    logf << '\n';
	  }
	  modified_alignment_readcounter++;
	}
	else
	{
            if (logf.is_open ())
	    {
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNMOD";
              if (clipfail)
	        logf << " NOCLIP";
	      logf << '\n';
	    }
	}

        if (aligner.verbose_){
          cout << alignment.Name << endl;
          cout << "------------------------------------------" << endl;
          // Wait for input to continue or quit program
          // NOTE(review): after an empty line (plain RETURN) the next read is
          // shown without prompting, because `input` is only reset to 'x' here.
          if (input.size() == 0)
            input = 'x';
          else if (input[0] != 'c' and input[0] != 'C')
            getline(cin, input);
          if (input.size()>0){
            if (input[0] == 'q' or input[0] == 'Q')
              return 1;
            else if (input[0] == 's' or input[0] == 'S')
              aligner.verbose_ = false;
          }
        }

        // Finally update alignment information
        alignment.CigarData = new_cigar_data;
        alignment.EditTag("MD", "Z" , new_md_tag);

      } // end of CreateRef else if
      // Case 3: CreateRefFromQueryBases() returned false -- classify the reason;
      // the read is written out unchanged.
      else {
	switch (aligner.GetCreateRefError ())
	{
	  case Realigner::CR_ERR_RECREATE_REF:
            if (logf.is_open ())
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "RECRERR" << '\n';
	    error_recreate_ref_readcount++;
	    break;
	  case Realigner::CR_ERR_CLIP_ANCHOR:
            if (logf.is_open ())
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "CLIPERR" << '\n';
	    error_clip_anchor_readcount++;
	    break;
	  // Any other error code is counted as "already perfect".
	  default:
		  //  On a good run this writes way too many reads to the log file - don't want to create a too large txt file
          //  if (logf.is_open ())
	      //logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "PERFECT" << '\n';
	    already_perfect_readcount++;
	    break;
	}
	
	// Same interactive prompt as in the realignment branch above.
	if (aligner.verbose_) {
	  cout << alignment.Name << endl;
	  cout << "------------------------------------------" << endl;
	  // Wait for input to continue or quit program
	  if (input.size() == 0)
	    input = 'x';
	  else if (input[0] != 'c' and input[0] != 'C')
	    getline(cin, input);
	  if (input.size()>0){
	    if (input[0] == 'q' or input[0] == 'Q')
	      return 1;
	    else if (input[0] == 's' or input[0] == 'S')
	      aligner.verbose_ = false;
	  }
	}
      }

      // --- Debug output for invalid cigar / MD tag pairs ---
      // NOTE(review): this restores aligner.verbose_ to the command line value
      // (undoing an 's' typed at the prompt) and clears invalid_cigar_in_input,
      // so in debug mode the warning after the loop can only come from the
      // last read -- confirm that both effects are intended.
      if (debug && aligner.invalid_cigar_in_input) {
        aligner.verbose_ = true;
        cout << "Invalid cigar string / md tag pair in read " << alignment.Name << endl;
        // Rerun reference generation to display error
        aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors);

        aligner.verbose_ = verbose;
        aligner.invalid_cigar_in_input = false;
      }
      // --- --- ---


    } // end of if isMapped

    // Every read that reaches this point is written, whether or not it was updated.
    writer.SaveAlignment(alignment);

  } // end while loop over reads

  if (aligner.invalid_cigar_in_input)
    cerr << "WARNING bamrealignment: There were invalid cigar string / md tag pairs in the input bam file." << endl;

  // ----------------------------------------------------------------
  // program end -- output summary information
  // Optional lines are printed only when their counter is non-zero.
  cout   << "                            File: " << input_bam    << endl
         << "                     Total reads: " << readcounter  << endl
         << "                    Mapped reads: " << mapped_readcounter << endl;
  if (bad_md_tag_readcount)
    cout << "            Skipped: bad MD tags: " << bad_md_tag_readcount << endl;
  if (error_recreate_ref_readcount)
    cout << " Skipped: unable to recreate ref: " << error_recreate_ref_readcount << endl;
  if (error_clip_anchor_readcount)
    cout << "  Skipped: error clipping anchor: " << error_clip_anchor_readcount << endl;
  // "Total reads realigned" is derived: mapped reads minus all skipped categories.
  cout  <<  "       Skipped:  already perfect: " << already_perfect_readcount << endl
        <<  "           Total reads realigned: " << mapped_readcounter - already_perfect_readcount - bad_md_tag_readcount - error_recreate_ref_readcount - error_clip_anchor_readcount << endl;
  if (failed_clip_realigned_readcount)
    cout << "                      (including  " << failed_clip_realigned_readcount << " that failed to clip)" << endl;
  if (error_sw_readcount)
    cout << " Failed to complete SW alignment: " << error_sw_readcount << endl;
  if (error_unclip_readcount)
    cout << "         Failed to unclip anchor: " << error_unclip_readcount << endl;
  cout   << "           Succesfully realigned: " << realigned_readcounter << endl
         << "             Modified alignments: " << modified_alignment_readcounter << endl
         << "                Shifted position: " << pos_update_readcounter << endl;
  
  cout << "Processing time: " << (time(NULL)-start_time) << " seconds." << endl;
  cout << "INFO: The output BAM file may be unsorted." << endl;
  cout << "------------------------------------------" << endl;
  return 0;
}
Пример #5
0
int PrepareHotspots(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  string input_bed_filename       = opts.GetFirstString ('b', "input-bed", "");
  string input_vcf_filename       = opts.GetFirstString ('v', "input-vcf", "");
  string input_real_vcf_filename  = opts.GetFirstString ('p', "input-real-vcf", "");
  string output_hot_vcf		  = opts.GetFirstString ('q', "output-fake-hot-vcf", "");
  string output_bed_filename      = opts.GetFirstString ('d', "output-bed", "");
  string output_vcf_filename      = opts.GetFirstString ('o', "output-vcf", "");
  string reference_filename       = opts.GetFirstString ('r', "reference", "");
  string unmerged_bed 		  = opts.GetFirstString ('u', "unmerged-bed", "");
  bool left_alignment             = opts.GetFirstBoolean('a', "left-alignment", false);
  bool filter_bypass              = opts.GetFirstBoolean('f', "filter-bypass", false);
  bool allow_block_substitutions  = opts.GetFirstBoolean('s', "allow-block-substitutions", true);
  bool strict_check               = opts.GetFirstBoolean('S', "strict-check", true);
  opts.CheckNoLeftovers();

  if((input_bed_filename.empty() == (input_vcf_filename.empty() and input_real_vcf_filename.empty())) or
      (output_bed_filename.empty() and output_vcf_filename.empty()) or reference_filename.empty()) {
    PrepareHotspotsHelp();
    return 1;
  }
  if ((not input_real_vcf_filename.empty()) and (output_vcf_filename.empty() or not input_vcf_filename.empty())) {
    PrepareHotspotsHelp();
    return 1;
  }


  // Populate chromosome list from reference.fai
  // Use mmap to fetch the entire reference

  int ref_handle = open(reference_filename.c_str(),O_RDONLY);

  struct stat ref_stat;
  fstat(ref_handle, &ref_stat);
  char *ref = (char *)mmap(0, ref_stat.st_size, PROT_READ, MAP_SHARED, ref_handle, 0);


  FILE *fai = fopen((reference_filename+".fai").c_str(), "r");
  if (!fai) {
    fprintf(stderr, "ERROR: Cannot open %s.fai\n", reference_filename.c_str());
    return 1;
  }

  vector<Reference>  ref_index;
  map<string,int> ref_map;
  char line[1024], chrom_name[1024];
  while (fgets(line, 1024, fai) != NULL) {
    Reference ref_entry;
    long chr_start;
    if (5 != sscanf(line, "%1020s\t%ld\t%ld\t%d\t%d", chrom_name, &ref_entry.size, &chr_start,
                    &ref_entry.bases_per_line, &ref_entry.bytes_per_line))
      continue;
    ref_entry.chr = chrom_name;
    ref_entry.start = ref + chr_start;
    ref_index.push_back(ref_entry);
    ref_map[ref_entry.chr] = (int) ref_index.size() - 1;
  }
  fclose(fai);
  junction junc;
  if (!unmerged_bed.empty()) {
    FILE *fp = fopen(unmerged_bed.c_str(), "r");
    if (!fp) {
	fprintf(stderr, "ERROR: Cannot open %s\n", unmerged_bed.c_str());
	return 1;
    }
    char line2[65536];

    junc.init(ref_index.size());
    bool line_overflow = false;
    while (fgets(line2, 65536, fp) != NULL) {
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
	continue;
      }
      if (line_overflow) {
        line_overflow = false;
        continue;
      }
     if (strstr(line2, "track")) continue;
      char chr[100];
      int b, e;
      sscanf(line2, "%s %d %d", chr,  &b, &e);
      junc.add(ref_map[chr], b, e);
    }
    fclose(fp);
  }

  // Load input BED or load input VCF, group by chromosome

  deque<LineStatus> line_status;
  vector<deque<Allele> > alleles(ref_index.size());

  if (!input_bed_filename.empty()) {

    FILE *input = fopen(input_bed_filename.c_str(),"r");
    if (!input) {
      fprintf(stderr,"ERROR: Cannot open %s\n", input_bed_filename.c_str());
      return 1;
    }

    char line2[65536];

    int line_number = 0;
    bool line_overflow = false;
    while (fgets(line2, 65536, input) != NULL) {
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
        continue;
      }
      line_number++;
      if (line_overflow) {
        line_overflow = false;
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: line length exceeds 64K";
        continue;
      }

      if (strncmp(line2, "browser", 7) == 0)
        continue;

      if (strncmp(line2, "track", 5) == 0) {
        if (string::npos != string(line2).find("allowBlockSubstitutions=true"))
          allow_block_substitutions = true;
        continue;
      }

      // OID= table has special meaning
      if (string::npos != string(line2).find("OID=")) {
	line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Bed line contains OID=";
        continue;
      }

      char *current_chr = strtok(line2, "\t\r\n");
      char *current_start = strtok(NULL, "\t\r\n");
      char *current_end = strtok(NULL, "\t\r\n");
      char *current_id = strtok(NULL, "\t\r\n");
      char *penultimate = strtok(NULL, "\t\r\n");
      char *ultimate = strtok(NULL, "\t\r\n");
      for (char *next = strtok(NULL, "\t\r\n"); next; next = strtok(NULL, "\t\r\n")) {
        penultimate = ultimate;
        ultimate = next;
      }

      if (!current_chr or !current_start or !current_end or !current_id or !penultimate or !ultimate) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: expected at least 6 fields";
        continue;
      }

      Allele allele;

      string string_chr(current_chr);
      if (ref_map.find(string_chr) != ref_map.end())
        allele.chr_idx = ref_map[string_chr];
      else if (ref_map.find("chr"+string_chr) != ref_map.end())
        allele.chr_idx = ref_map["chr"+string_chr];
      else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end())
        allele.chr_idx = ref_map["chrM"];
      else {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Unknown chromosome name: ";
        line_status.back().filter_message = string_chr;
        continue;
      }

      allele.pos = strtol(current_start,NULL,10);
      allele.id = current_id;

      char *current_ref = NULL;
      char *current_alt = NULL;
      for (char *next = strtok(penultimate, ";"); next; next = strtok(NULL, ";")) {
        if (strncmp(next,"REF=",4) == 0)
          current_ref = next;
        else if (strncmp(next,"OBS=",4) == 0)
          current_alt = next;
        else if (strncmp(next,"ANCHOR=",7) == 0) {
          // ignore ANCHOR
        } else {
          char *value = next;
          while (*value and *value != '=')
            ++value;
          if (*value == '=')
            *value++ = 0;
          allele.custom_tags[next] = value;
        }
      }
      if (!current_ref or !current_alt) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: REF and OBS fields required in penultimate column";
        continue;
      }
      for (char *pos = current_ref+4; *pos; ++pos)
        allele.ref += toupper(*pos);
      for (char *pos = current_alt+4; *pos; ++pos)
        allele.alt += toupper(*pos);
      // here is the place to check the length of the hotspot cover the amplicon junction. ZZ
      /*
      if (junc.contain(allele.chr_idx, allele.pos, (unsigned int) allele.ref.size())) {
	line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "hotspot BED line contain the complete overlapping region of two amplicon, the variant cannot be detected by tvc";
        continue;
      }
      if (not junc.contained_in_ampl(allele.chr_idx, allele.pos, (unsigned int) allele.ref.size())) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "hotspot BED line is not contained in any amplicon, the variant cannot be detected by tvc";
        continue;
      }
      */

      allele.filtered = false;
      line_status.push_back(LineStatus(line_number));
      allele.line_status = &line_status.back();
      allele.opos = allele.pos;
      allele.oref = allele.ref;
      allele.oalt = allele.alt;
      alleles[allele.chr_idx].push_back(allele);
      //line_status.back().allele = &alleles[allele.chr_idx].back();
      line_status.back().chr_idx = allele.chr_idx;
      line_status.back().opos = allele.opos;
      line_status.back().id = allele.id;
    }

    fclose(input);
  }



  if (!input_vcf_filename.empty() or !input_real_vcf_filename.empty()) {

    bool real_vcf = false;
    FILE *input;
    FILE *out_real = NULL;
    FILE *out_hot = NULL;
    int fake_ = 0;
    int hn = 1;
    if (!input_real_vcf_filename.empty()) {
	real_vcf = true;
	input = fopen(input_real_vcf_filename.c_str(),"r");
	if (!input) {
	    fprintf(stderr,"ERROR: Cannot open %s\n", input_real_vcf_filename.c_str());
            return 1;
	}
	out_real = fopen(output_vcf_filename.c_str(), "w");
	if (!out_real) {
            fprintf(stderr,"ERROR: Cannot open %s\n", output_vcf_filename.c_str());
            return 1;
        }
	if (!output_hot_vcf.empty()) {
	    out_hot = fopen(output_hot_vcf.c_str(), "w");
	    if (!out_hot) {
		fprintf(stderr,"ERROR: Cannot open %s\n", output_hot_vcf.c_str());
		return 1;
	    } 
   	} else out_hot = stdout;
	fprintf(out_hot, "##fileformat=VCFv4.1\n##allowBlockSubstitutions=true\n#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO\n");
    } else {
        input = fopen(input_vcf_filename.c_str(),"r");
        if (!input) {
            fprintf(stderr,"ERROR: Cannot open %s\n", input_vcf_filename.c_str());
            return 1;
    	}
    }

    char line2[65536];
    char line3[65536];
    int line_number = 0;
    bool line_overflow = false;
    list<one_vcfline> vcflist;

    char last_chr[1024] = "";
    while (fgets(line2, 65536, input) != NULL) {
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
        continue;
      }
      line_number++;
      if (line_overflow) {
        line_overflow = false;
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot VCF line: line length exceeds 64K";
        continue;
      }

      if (strncmp(line2, "##allowBlockSubstitutions=true", 30) == 0) {
        allow_block_substitutions = true;
        continue;
      }
      if (line2[0] == '#') {
	if (out_real) { fprintf(out_real, "%s", line2);}
        continue;
      }

      if (real_vcf) strcpy(line3, line2);
      char *current_chr = strtok(line2, "\t\r\n");
      char *current_start = strtok(NULL, "\t\r\n");
      char *current_id = strtok(NULL, "\t\r\n");
      char *current_ref = strtok(NULL, "\t\r\n");
      char *current_alt = strtok(NULL, "\t\r\n");
      strtok(NULL, "\t\r\n"); // Ignore QUAL
      strtok(NULL, "\t\r\n"); // Ignore FILTER
      char *current_info = strtok(NULL, "\t\r\n");
      strtok(NULL, "\t\r\n");
      char *gt = strtok(NULL, "\t\r\n");

      if (!current_chr or !current_start or !current_id or !current_ref or !current_alt) {
        line_status.push_back(LineStatus(line_number));
        if (real_vcf) line_status.back().filter_message_prefix = "Malformed real VCF line: expected at least 5 fields";
	else line_status.back().filter_message_prefix = "Malformed hotspot VCF line: expected at least 5 fields";
        continue;
      }


      string string_chr(current_chr);
      int chr_idx = 0;
      if (ref_map.find(string_chr) != ref_map.end())
        chr_idx = ref_map[string_chr];
      else if (ref_map.find("chr"+string_chr) != ref_map.end())
        chr_idx = ref_map["chr"+string_chr];
      else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end())
        chr_idx = ref_map["chrM"];
      else {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Unknown chromosome name: ";
        line_status.back().filter_message = string_chr;
        continue;
      }

      for (char *pos = current_ref; *pos; ++pos)
        *pos = toupper(*pos);
      for (char *pos = current_alt; *pos; ++pos)
        *pos = toupper(*pos);


      // Process custom tags
      vector<string>  bstrand;
      vector<string>  hp_max_length;
      string raw_oid;
      string raw_omapalt;
      string raw_oalt;
      string raw_oref;
      string raw_opos;

      if (current_info) {
        string raw_bstrand;
        string raw_hp_max_length;
        for (char *next = strtok(current_info, ";"); next; next = strtok(NULL, ";")) {

          char *value = next;
          while (*value and *value != '=')
            ++value;
          if (*value == '=')
            *value++ = 0;

          if (strcmp(next, "TYPE") == 0)
            continue;
          if (strcmp(next, "HRUN") == 0)
            continue;
          if (strcmp(next, "HBASE") == 0)
            continue;
          if (strcmp(next, "FR") == 0)
            continue;
          if (strcmp(next, "OPOS") == 0) {
	    raw_opos = value;
            continue;
	  }
          if (strcmp(next, "OREF") == 0) {
	    raw_oref = value;
            continue;
	  }
          if (strcmp(next, "OALT") == 0) {
	    raw_oalt = value;
            continue;
	  }
          if (strcmp(next, "OID") == 0) {
            raw_oid = value;
            continue;
          }
          if (strcmp(next, "OMAPALT") == 0) {
            raw_omapalt = value;
            continue;
          }
          if (strcmp(next, "BSTRAND") == 0) {
            raw_bstrand = value;
            continue;
          }
          if (strcmp(next, "hp_max_length") == 0) {
            raw_hp_max_length = value;
            continue;
          }
        }

        if (not raw_bstrand.empty())
          split(raw_bstrand, ',', bstrand);
        if (not raw_hp_max_length.empty())
          split(raw_hp_max_length, ',', hp_max_length);

      }

      if (real_vcf) {
	//fprintf(stderr, "%s\n", gt);
        if (gt == NULL) continue;
	// get gt
	int g1 = atoi(gt), g2;
	gt = strchr(gt, '/');
	if (gt) g2 = atoi(gt+1);
	else {fprintf(stderr, "GT not formatted right\n"); exit(1);}
	//if (g1 == 0 and g2 == 0) continue;
	unsigned int cur_pos = atoi(current_start);
	one_vcfline newline(current_ref, current_alt, cur_pos, g1, g2, line3);
	bool new_chr = false;
	if (strcmp(current_chr, last_chr) != 0) {
	    new_chr = true;
	}
	while (not vcflist.empty()) {
	    if ((not new_chr) and vcflist.front().pos+strlen(vcflist.front().ref) > cur_pos) break;
	    if (vcflist.front().produce_hot_vcf(last_chr, out_real, hn, out_hot)) fake_++;
	    vcflist.pop_front();
	}
	if (new_chr) strcpy(last_chr, current_chr);
	for (list<one_vcfline>::iterator it = vcflist.begin(); it != vcflist.end(); it++) {
	    it->check_subset(newline);
	}
	if (not newline.alts.empty()) vcflist.push_back(newline);
	continue;
      } 
      unsigned int allele_idx = 0;
      for (char *sub_alt = strtok(current_alt,","); sub_alt; sub_alt = strtok(NULL,",")) {

        Allele allele;
        allele.chr_idx = chr_idx;
        allele.ref = current_ref;
        allele.alt = sub_alt;
        allele.pos = strtol(current_start,NULL,10)-1;
        allele.id = current_id;
        if (allele.id == ".")
          allele.id = "hotspot";

        allele.filtered = false;
        line_status.push_back(LineStatus(line_number));
        allele.line_status = &line_status.back();
        allele.opos = allele.pos;
        allele.oref = allele.ref;
        allele.oalt = allele.alt;

        if (allele_idx < bstrand.size()) {
          if (bstrand[allele_idx] != ".")
            allele.custom_tags["BSTRAND"] = bstrand[allele_idx];
        }

        if (allele_idx < hp_max_length.size()) {
          if (hp_max_length[allele_idx] != ".")
            allele.custom_tags["hp_max_length"] = hp_max_length[allele_idx];
        }

        alleles[allele.chr_idx].push_back(allele);
        //line_status.back().allele = &alleles[allele.chr_idx].back();
        line_status.back().chr_idx = allele.chr_idx;
        line_status.back().opos = allele.opos;
        line_status.back().id = allele.id;
        allele_idx++;
      }
    }

    fclose(input);
    if (real_vcf) {
        while (not vcflist.empty()) {
            if (vcflist.front().produce_hot_vcf(last_chr, out_real, hn, out_hot)) fake_++;
            vcflist.pop_front();
        }
	fclose(out_real);
	fclose(out_hot);
	if (fake_ > 0) 
            return 0;
	else return 1;
    }
  }


  // Process by chromosome:
  //   - Verify reference allele
  //   - Left align
  //   - Sort
  //   - Filter for block substitutions, write

  FILE *output_vcf = NULL;
  if (!output_vcf_filename.empty()) {
    output_vcf = fopen(output_vcf_filename.c_str(), "w");
    if (!output_vcf) {
      fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_vcf_filename.c_str());
      return 1;
    }
    fprintf(output_vcf, "##fileformat=VCFv4.1\n");
    if (allow_block_substitutions)
      fprintf(output_vcf, "##allowBlockSubstitutions=true\n");
    fprintf(output_vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n");
  }
  FILE *output_bed = NULL;
  if (!output_bed_filename.empty()) {
    output_bed = fopen(output_bed_filename.c_str(), "w");
    if (!output_bed) {
      fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_bed_filename.c_str());
      if (output_vcf)
        fclose(output_vcf);
      return 1;
    }
    if (allow_block_substitutions)
      fprintf(output_bed, "track name=\"hotspot\" type=bedDetail allowBlockSubstitutions=true\n");
    else
      fprintf(output_bed, "track name=\"hotspot\" type=bedDetail\n");
  }


  for (int chr_idx = 0; chr_idx < (int)ref_index.size(); ++chr_idx) {

    for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ++A) {

      // check bed file
      if (junc.contain(A->chr_idx, A->pos, (unsigned int) A->ref.size())) {
	A->filtered = true;
        A->line_status->filter_message_prefix = "hotspot BED line contain the complete overlapping region of two amplicon, the variant cannot be detected by tvc";
        continue;
      }
      if (not junc.contained_in_ampl(A->chr_idx, A->pos, (unsigned int) A->ref.size())) {
	A->filtered = true;
        A->line_status->filter_message_prefix = "hotspot BED line is not contained in any amplicon, the variant cannot be detected by tvc";
        continue;
      }


      // Invalid characters

      bool valid = true;
      for (const char *c = A->ref.c_str(); *c ; ++c)
        if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T')
          valid = false;
      for (const char *c = A->alt.c_str(); *c ; ++c)
        if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T')
          valid = false;
      if (not valid) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "REF and/or ALT contain characters other than ACGT: ";
        A->line_status->filter_message = "REF = " + A->ref + " ALT = " + A->alt;
        continue;
      }

      // Filter REF == ALT

      if (A->ref == A->alt) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "REF and ALT alleles equal";
        continue;
      }

      // Confirm reference allele.

      string ref_expected;
      for (int idx = 0; idx < (int) A->ref.size(); ++idx)
        ref_expected += ref_index[chr_idx].base(A->pos + idx);
      if (A->ref != ref_expected) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "Provided REF allele does not match reference: ";
        A->line_status->filter_message = "Expected " + ref_expected + ", found " + A->ref;
        continue;
      }

      // Trim

      int ref_start = 0;
      int ref_end = A->ref.size();
      int alt_end = A->alt.size();

      // Option 1: trim all trailing bases;

      //while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) {
      //  --ref_end;
      //  --alt_end;
      //}

      // Option 2: trim all leading basees;

      //while (ref_start < ref_end and ref_start < alt_end and A->ref[ref_start] == A->alt[ref_start])
      //  ++ref_start;

      // Option 3: trim anchor base if vcf

      if (!input_vcf_filename.empty()) {
        if (ref_end and alt_end and (ref_end == 1 or alt_end == 1) and A->ref[0] == A->alt[0])
          ref_start = 1;
      }

      A->pos += ref_start;
      A->ref = A->ref.substr(ref_start, ref_end-ref_start);
      A->alt = A->alt.substr(ref_start, alt_end-ref_start);
      ref_end -= ref_start;
      alt_end -= ref_start;
      // Left align
      if (left_alignment && A->custom_tags.find("BSTRAND") == A->custom_tags.end()) { // black list variant not to be left aligned.
	string trailing;
	int can_do = 0, need_do = 0;
	int ref_end_orig= ref_end, alt_end_orig = alt_end;
	while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) {
	    ref_end--; alt_end--;
	} 
	if (ref_end == 0 || alt_end == 0) {
	    can_do = need_do = 1; // indel type, ZZ
	} else {
	    int tmp_start = ref_start;
	    int ref_end_0 = ref_end, alt_end_0 = alt_end; // end after remove trailing match ZZ
	    while (tmp_start < ref_end and tmp_start < alt_end and A->ref[tmp_start] == A->alt[tmp_start])
     		++tmp_start;
	    if (tmp_start == ref_end || tmp_start == alt_end) {
		can_do = 1; need_do = 0; // indel but indel is not at the left. ZZ
	    } else {
		ref_end--; alt_end--;
		while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) {
            	    ref_end--; alt_end--;
        	}
		if (ref_end == 0 || alt_end == 0) {
		   // complex with 1 bp MM at right end
		    can_do = need_do = 1;
		    if (ref_end + alt_end == 0) need_do = 0; // SNP
		} else {
		  int tmp_start0 = tmp_start; // start after removing leading matches
		  tmp_start++;
		  while (tmp_start < ref_end_orig and tmp_start < alt_end_orig and A->ref[tmp_start] == A->alt[tmp_start])
			tmp_start++;
		  if (tmp_start >= ref_end_0 || tmp_start >= alt_end_0 || ref_end <= tmp_start0 || alt_end <= tmp_start0) {
			// 1MM plus indel in middle, by definition cannot move the indel left enough to change A->pos
		    	can_do = 1; need_do = 0;
		  } // else real complex 
		}
	    }
	}
	if (!can_do or !need_do) {
	    // do nothing
	    // if !can_do need add some more DP
	    ref_end = ref_end_orig;
	    alt_end = alt_end_orig;
	} else {
	 // left align the indel part, here either ref_end = 0 or alt_end = 0
	  int opos = A->pos;
          while (A->pos > 0) {
            char nuc = ref_index[chr_idx].base(A->pos-1);
            if (ref_end > 0 and A->ref[ref_end-1] != nuc)
              break;
            if (alt_end > 0 and A->alt[alt_end-1] != nuc)
              break;
            A->ref = string(1,nuc) + A->ref;
            A->alt = string(1,nuc) + A->alt;
            A->pos--;
          }
	  if (ref_end != ref_end_orig) {
	    // trailing part is aligned, the whole ref and alt need to be kept. ZZ
	    ref_end = A->ref.size();
	    alt_end = A->alt.size();
	  } 
	  if (junc.contain(chr_idx, A->pos, ref_end) or not junc.contained_in_ampl(chr_idx, A->pos, ref_end)) {
		// after left align the hotspot contain an overlap region, revert to the original ZZ
		if (opos != A->pos) {
		    A->ref.erase(0, opos-A->pos);
		    A->alt.erase(0, opos-A->pos);
		    A->pos = opos;
		    ref_end = ref_end_orig;
		    alt_end = alt_end_orig;
		}
	  }
       }
      }
      A->ref.resize(ref_end);
      A->alt.resize(alt_end);


      // Filter block substitutions: take 1

      if (ref_end > 0 and alt_end > 0 and ref_end != alt_end and not allow_block_substitutions and not filter_bypass) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "Block substitutions not supported";
        continue;
      }

    }



    if (output_bed) {
      // Sort - without anchor base
      stable_sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles);

      // Write
      for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) {
        if (I->filtered)
          continue;

        fprintf(output_bed, "%s\t%ld\t%ld\t%s\tREF=%s;OBS=%s",
            ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
            I->ref.c_str(), I->alt.c_str());

        for (map<string,string>::iterator C = I->custom_tags.begin(); C != I->custom_tags.end(); ++C)
          fprintf(output_bed, ";%s=%s", C->first.c_str(), C->second.c_str());

        fprintf(output_bed, "\tNONE\n");

        /*
        if (I->pos)
          fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=%c\tNONE\n",
              ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
              I->ref.c_str(), I->alt.c_str(), ref_index[chr_idx].base(I->pos-1));
        else
          fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=\tNONE\n",
              ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
              I->ref.c_str(), I->alt.c_str());
        */
      }
    }


    if (output_vcf) {

      // Add anchor base to indels
      for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) {
        if (I->filtered)
          continue;
        if (not I->ref.empty() and not I->alt.empty())
          continue;
        if (I->pos == 0) {
          I->filtered = true;
          I->line_status->filter_message_prefix = "INDELs at chromosome start not supported";
          continue;
        }
        I->pos--;
        I->ref = string(1,ref_index[chr_idx].base(I->pos)) + I->ref;
        I->alt = string(1,ref_index[chr_idx].base(I->pos)) + I->alt;
      }

      // Sort - with anchor base
      stable_sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles);


      // Merge alleles, remove block substitutions, write
      for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ) {

        string max_ref;
        deque<Allele>::iterator B = A;
        for (; B != alleles[chr_idx].end() and B->pos == A->pos; ++B)
          if (!B->filtered and max_ref.size() < B->ref.size())
            max_ref = B->ref;

        bool filtered = true;
        map<string,set<string> > unique_alts_and_ids;
        for (deque<Allele>::iterator I = A; I != B; ++I) {
          if (I->filtered)
            continue;

          string new_alt = I->alt + max_ref.substr(I->ref.size());

          if (new_alt.size() > 1 and max_ref.size() > 1 and new_alt.size() != max_ref.size() and not allow_block_substitutions and not filter_bypass) {
            I->filtered = true;
            I->line_status->filter_message_prefix = "Block substitutions not supported (post-merge)";
            continue;
          }

          I->ref = max_ref;
          I->alt = new_alt;

          // Filter alleles with duplicate ALT + ID pairs
          map<string,set<string> >::iterator alt_iter = unique_alts_and_ids.find(new_alt);
          if (alt_iter != unique_alts_and_ids.end()) {
            if (alt_iter->second.count(I->id) > 0) {
              I->filtered = true;
              I->line_status->filter_message_prefix = "Duplicate allele and ID";
              continue;
            }
          }
          unique_alts_and_ids[new_alt].insert(I->id);

          filtered = false;
        }

        if (not filtered) {



          fprintf(output_vcf, "%s\t%ld\t.\t%s\t",
              ref_index[chr_idx].chr.c_str(), A->pos+1, max_ref.c_str());

          bool comma = false;

          map<string,map<string,string> > unique_alts_and_tags;
          set<string> unique_tags;
	  set<string> unique_alt_alleles;

          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            unique_alts_and_tags[I->alt].insert(I->custom_tags.begin(), I->custom_tags.end());
            for (map<string,string>::iterator S = I->custom_tags.begin(); S != I->custom_tags.end(); ++S)
              unique_tags.insert(S->first);
            if (unique_alt_alleles.count(I->alt) > 0)
              continue;
            unique_alt_alleles.insert(I->alt);
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->alt.c_str());
          }
	  /*
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            map<string,map<string,string> >::iterator Q = unique_alts_and_tags.find(I->alt);
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            if (Q == unique_alts_and_tags.end()) {fprintf(output_vcf, "."); continue;}
            fprintf(output_vcf, "%s", Q->first.c_str());
          }
          */

          fprintf(output_vcf, "\t.\t.\tOID=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->id.c_str());
          }

          fprintf(output_vcf, ";OPOS=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%ld", I->opos+1);
          }

          fprintf(output_vcf, ";OREF=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->oref.c_str());
          }

          fprintf(output_vcf, ";OALT=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->oalt.c_str());
          }

          fprintf(output_vcf, ";OMAPALT=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->alt.c_str());
          }

          for (set<string>::iterator S = unique_tags.begin(); S != unique_tags.end(); ++S) {
            fprintf(output_vcf, ";%s=", S->c_str());
            comma=false;
            for (deque<Allele>::iterator I = A; I != B; ++I) {
              if (I->filtered)
                continue;
              map<string,map<string,string> >::iterator Q = unique_alts_and_tags.find(I->alt);
              if (comma)
                fprintf(output_vcf, ",");
              comma = true;
              if (Q == unique_alts_and_tags.end()) {fprintf(output_vcf, "."); continue;}
              map<string,string>::iterator W = Q->second.find(*S);
              if (W == Q->second.end())
                fprintf(output_vcf, ".");
              else
                fprintf(output_vcf, "%s", W->second.c_str());
            }
          }
//            fprintf(output_vcf, ";%s=%s", S->first.c_str(), S->second.c_str());

          fprintf(output_vcf, "\n");
        }

        A = B;
      }
    }
  }



  if (output_bed) {
    fflush(output_bed);
    fclose(output_bed);
  }
  if (output_vcf) {
    fflush(output_vcf);
    fclose(output_vcf);
  }


  int lines_ignored = 0;
  for (deque<LineStatus>::iterator L = line_status.begin(); L != line_status.end(); ++L) {
    if (L->filter_message_prefix) {
      if (L->chr_idx >= 0)
        printf("Line %d ignored: [%s:%ld %s] %s%s\n", L->line_number, ref_index[L->chr_idx].chr.c_str(), L->opos+1, L->id.c_str(),
            L->filter_message_prefix, L->filter_message.c_str());
      else
        printf("Line %d ignored: %s%s\n", L->line_number, L->filter_message_prefix, L->filter_message.c_str());
      lines_ignored++;
    }
  }
  printf("Ignored %d out of %d lines\n", lines_ignored, (int)line_status.size());


  munmap(ref, ref_stat.st_size);
  close(ref_handle);
  if (lines_ignored > 0 and strict_check) return 1;

  return 0;
}
Пример #6
0
int main (int argc, const char *argv[])
{
  time_t program_start_time;
  time(&program_start_time);
  Json::Value calibration_json(Json::objectValue);
  DumpStartingStateOfProgram (argc,argv,program_start_time, calibration_json["Calibration"]);

  //
  // Step 1. Process command line options
  //

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);

  // enable floating point exceptions during program execution
  if (opts.GetFirstBoolean('-', "float-exceptions", true)) {
    cout << "Calibration: Floating point exceptions enabled." << endl;
    feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
  } //*/

  CalibrationContext calib_context;
  if (not calib_context.InitializeFromOpts(opts)){
    PrintHelp_CalModules();
  }

  HistogramCalibration master_histogram(opts, calib_context);
  calib_context.hist_calibration_master = &master_histogram;

  LinearCalibrationModel master_linear_model(opts, calib_context);
  calib_context.linear_model_master = &master_linear_model;

  opts.CheckNoLeftovers();

  //
  // Step 2. Execute threaded calibration
  //
  int calibration_thread_time = 0;

  if (calib_context.successive_fit) {

    // first train linear model
    if (master_linear_model.DoTraining()) {
      int l_thread_time = 0;
      for (int i_iteration=0; i_iteration<calib_context.num_train_iterations; i_iteration++) {
        cout << " -Training Iteration " << i_iteration+1;
        l_thread_time = ExecuteThreadedCalibrationTraining(calib_context);

        // Activate master linear model after every round of training
        master_linear_model.CreateCalibrationModel(false); // make linear model
        master_linear_model.SetModelGainsAndOffsets(); // expand for use in basecalling

        calibration_thread_time += l_thread_time;
        calib_context.bam_reader.Rewind(); // reset all files for another pass
        cout << " Duration = " << l_thread_time << endl;
      }
    }

    // Then apply it during polish model training
    if (master_histogram.DoTraining()) {
      calib_context.local_fit_linear_model = false;
      calib_context.local_fit_polish_model = true;
      calibration_thread_time += ExecuteThreadedCalibrationTraining(calib_context);
    }
  }
  else {
    // Single pass in which both models are fit jointly
    calibration_thread_time=ExecuteThreadedCalibrationTraining(calib_context);
  }


  //
  // Step 3. Create models, write output, and close modules
  //

  // Linear Model
  if (master_linear_model.CreateCalibrationModel())
    master_linear_model.ExportModelToJson(calibration_json["LinearModel"], "");

  // HP histogram calibration
  if (master_histogram.CreateCalibrationModel())
    master_histogram.ExportModelToJson(calibration_json["HPHistogram"]);


  // Transfer stuff from calibration context and close bam reader
  calib_context.Close(calibration_json["Calibration"]);

  time_t program_end_time;
  time(&program_end_time);

  calibration_json["Calibration"]["end_time"] = get_time_iso_string(program_end_time);
  calibration_json["Calibration"]["total_duration"] = (Json::Int)difftime(program_end_time,program_start_time);
  calibration_json["Calibration"]["calibration_duration"] = (Json::Int)calibration_thread_time;

  SaveJson(calibration_json, calib_context.filename_json);
  return EXIT_SUCCESS;
}
Пример #7
0
// Entry point of AnalyzeHPErrs.
//
// Parses the command line, builds either an SffDiffStats (when --sff1 is
// given) or a GenomeDiffStats object, configures its optional outputs and
// row/column restrictions, then runs FilterAndCompare over the parsed SAM
// file and writes the summary to --summary-out.
//
// Required options: --sam-parsed, --stats-out, --summary-out (usage() is
// invoked when any is missing or when --help is given).
// A bfmask file is needed whenever wells are addressed by index or by
// row/column range (--use-wells, --row-size/--col-size, --wells-file).
//
// Returns 0 on success; argument and I/O errors terminate via exit(1).
int main(int argc, const char *argv[]) {
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  int hpLength;
  string statsOut;
  string alignmentOut;
  string pairedOut;
  string flowsOut;
  string summaryOut;
  string samFile;
  string qScoreCol;
  string wellsFile;
  string bfmaskFile;
  string snrFile;
  string binnedHpSigFile;
  string flowErrFile;
  string gcErrFile;
  int gcWin;
  string flowOrder;
  string keySeq;
  int numFlows;
  bool help;
  int qLength;
  double colCenter;
  double rowCenter;
  int colSize;
  int rowSize;
  int sampleSize;
  string wellsToUse;
  string run1, run2;
  opts.GetOption(run1, "", '-', "sff1");
  opts.GetOption(run2, "", '-', "sff2");
  opts.GetOption(wellsToUse, "", '-', "use-wells");
  opts.GetOption(samFile, "", '-', "sam-parsed");
  opts.GetOption(statsOut, "", '-', "stats-out");
  opts.GetOption(flowsOut, "", '-', "flows-out");
  opts.GetOption(alignmentOut, "", '-', "align-out");
  opts.GetOption(summaryOut, "", '-', "summary-out");
  opts.GetOption(pairedOut, "", '-', "paired-out");
  opts.GetOption(numFlows, "40", '-', "num-flows");
  opts.GetOption(hpLength, "6", '-', "max-hp");
  opts.GetOption(qScoreCol, "q7Len", '-', "qscore-col");
  opts.GetOption(qLength, "25", '-', "min-qlength");
  opts.GetOption(help,   "false", 'h', "help");
  opts.GetOption(wellsFile,   "", '-', "wells-file");
  opts.GetOption(bfmaskFile,   "", '-', "bfmask-file");
  opts.GetOption(snrFile,   "", '-', "snr-file");
  opts.GetOption(binnedHpSigFile,   "", '-', "binned-hp-sig-file");
  opts.GetOption(flowErrFile, "", '-', "flow-err-file");
  // "gc-err-file" is queried exactly once here (it used to be read twice).
  opts.GetOption(gcErrFile, "", '-', "gc-err-file");
  opts.GetOption(flowOrder, "", '-', "flow-order");
  opts.GetOption(keySeq, "", '-', "key-seq");
  opts.GetOption(colCenter, "0.5", '-', "col-center");
  opts.GetOption(rowCenter, "0.5", '-', "row-center");
  opts.GetOption(colSize, "0", '-', "col-size");
  opts.GetOption(rowSize, "0", '-', "row-size");
  opts.GetOption(gcWin, "40", '-', "gc-win");
  opts.GetOption(sampleSize, "100000", '-', "sample-size");
  if (help || samFile.empty() || statsOut.empty() || summaryOut.empty()) {
    // NOTE(review): presumably usage() prints help and exits - confirm.
    usage();
  }
  opts.CheckNoLeftovers();

  // Some checks to make sure sensible bounds have been set
  if(colCenter < 0 || colCenter > 1) {
    cerr << "AnalyzeHPErrs - col-center must be in the range [0,1]" << endl;
    exit(1);
  }
  if(rowCenter < 0 || rowCenter > 1) {
    cerr << "AnalyzeHPErrs - row-center must be in the range [0,1]" << endl;
    exit(1);
  }
  if(colSize < 0) {
    cerr << "AnalyzeHPErrs - col-size cannot be negative." << endl;
    exit(1);
  }
  if(rowSize < 0) {
    cerr << "AnalyzeHPErrs - row-size cannot be negative." << endl;
    exit(1);
  }

  // Determine rows & cols if a bfmask file was supplied; both stay 0 otherwise.
  int nRow=0;
  int nCol=0;
  if(!bfmaskFile.empty()) {
    // A non-zero return from GetRowColFromBfmask is treated as failure.
    if(GetRowColFromBfmask(bfmaskFile, &nRow, &nCol)) {
      cerr << "AnalyzeHPErrs - problem determining rows & columns from bfmask file " << bfmaskFile << endl;
      exit(1);
    }
  }

  // Set up fds object: SFF-vs-SFF comparison when --sff1 is given,
  // genome-based comparison otherwise. Owned by this function and deleted
  // before the normal return.
  FlowDiffStats* fds;
  if (!run1.empty()) {
    SffDiffStats* sds = new SffDiffStats(hpLength, nCol, nRow, qScoreCol, run1, run2);
    if (!pairedOut.empty())
      sds->SetPairedOut(pairedOut);
    fds = dynamic_cast<FlowDiffStats*>(sds);
  }
  else {
    GenomeDiffStats* gds = new GenomeDiffStats(hpLength, nCol, nRow, qScoreCol);
    if(alignmentOut != "") {
      gds->SetAlignmentsOut(alignmentOut);
    }
    if (!flowsOut.empty()) {
      gds->SetFlowsOut(flowsOut);
    }
    fds = dynamic_cast<FlowDiffStats*>(gds);
  }

  if (gcErrFile != "") {
    fds->SetFlowGCOut(gcErrFile);
    fds->SetGCWindowSize(gcWin);
  }

  if(keySeq != "") {
    fds->SetKeySeq(keySeq);
  }
  if(flowOrder != "") {
    fds->SetFlowOrder(flowOrder);
  }
  fds->SetStatsOut(statsOut);

  // Optional restriction to a (sub-sampled) set of wells listed in a file.
  if (!wellsToUse.empty()) {
    std::vector<int> wells;
    std::vector<bool> use;
    ReadSetFromFile(wellsToUse, 0, wells);
    // One flag per well; sized from the bfmask dimensions, so it is empty
    // when no bfmask file was supplied.
    use.resize(nRow * nCol, false);
    int count = 0;
    // Keep at most sampleSize of the listed wells.
    ReservoirSample<int> wellSample(sampleSize);
    for (size_t i = 0; i < wells.size(); i++) {
      wellSample.Add(wells[i]);
    }
    wells = wellSample.GetData();
    for (size_t i = 0; i < wells.size(); i++) {
      // std::vector<bool>::operator[] is unchecked: reject indices that do
      // not fit the chip instead of writing out of bounds.
      if (wells[i] < 0 || static_cast<size_t>(wells[i]) >= use.size()) {
        cerr << "AnalyzeHPErrs - well index " << wells[i] << " from " << wellsToUse
             << " is outside the " << nRow << " x " << nCol
             << " wells known from the bfmask file (was a bfmask file specified?)" << endl;
        exit(1);
      }
      use[wells[i]] = true;
      count++;
    }
    cout << "Read: " << count << " reads." << endl;
    fds->SetWellToAnalyze(use);
  }


  // Set integer-value row & column bounds; -1 means "no restriction" for the
  // corresponding bound. Ranges are half-open [min,max) and clipped to the chip.
  int minRow=-1;
  int maxRow=-1;
  int minCol=-1;
  int maxCol=-1;
  if(colSize > 0 || rowSize > 0) {
    if(bfmaskFile.empty()) {
      cerr << "AnalyzeHPErrs - must specify bfmask file when restricting row or column ranges" << endl;
      exit(1);
    }
    if(rowSize > 0) {
      // Window of rowSize rows centred at the fractional position rowCenter.
      minRow = floor(nRow * rowCenter - rowSize / 2.0);
      maxRow = minRow + rowSize;
      minRow = std::max(0,minRow);
      maxRow = std::min(nRow,maxRow);
    }
    if(colSize > 0) {
      // Window of colSize columns centred at the fractional position colCenter.
      minCol = floor(nCol * colCenter - colSize / 2.0);
      maxCol = minCol + colSize;
      minCol = std::max(0,minCol);
      maxCol = std::min(nCol,maxCol);
    }
  }

  if (wellsFile != "") {
    // Validate the prerequisite before doing any work on the SAM file.
    if(bfmaskFile.empty()) {
      cerr << "AnalyzeHPErrs - must specify bfmask file when specifying wells file" << endl;
      exit(1);
    }
    std::vector<int32_t> xSubset, ySubset;
    fds->FillInSubset(samFile, qLength, minRow, maxRow, minCol, maxCol, xSubset, ySubset);
    fds->SetWellsFile(wellsFile, nRow, nCol, numFlows, xSubset, ySubset);
  }
  if (snrFile != "") {
    cout << "Opening snr file: " << snrFile << endl;
    fds->SetSNROut(snrFile);
  }
  if (binnedHpSigFile != "") {
    cout << "Opening binned HP signal file: " << binnedHpSigFile << endl;
    fds->SetBinnedHpSigOut(binnedHpSigFile);
  }
  if (flowErrFile != "") {
    cout << "Opening flow err file: " << flowErrFile << endl;
    fds->SetFlowErrOut(flowErrFile);
  }
  ofstream summary;
  summary.open(summaryOut.c_str());
  // The summary is a required output; fail early rather than run the whole
  // analysis and silently write nothing.
  if (!summary.is_open()) {
    cerr << "AnalyzeHPErrs - unable to open summary file " << summaryOut << " for writing" << endl;
    exit(1);
  }
  cout << "Reading and analyzing alignments from: " << samFile << endl;
  if(minCol > -1 || maxCol > -1)
    cout << "  Restricting to " << (maxCol-minCol) << " cols in the range [" << minCol << "," << maxCol << ")" << endl;
  if(minRow > -1 || maxRow > -1)
    cout << "  Restricting to " << (maxRow-minRow) << " rows in the range [" << minRow << "," << maxRow << ")" << endl;

  fds->SetAlignmentInFile(samFile);
  fds->FilterAndCompare(numFlows, summary, qLength, minRow, maxRow, minCol, maxCol);

  summary.close();
  delete fds;
  cout << "Done." << endl;
  return 0;
}
Пример #8
0
/**
 * Normalize a hotspot list against a reference genome and write it back out.
 *
 * Exactly one of --input-bed / --input-vcf must be given, at least one of
 * --output-bed / --output-vcf, and --reference (a FASTA with a matching
 * ".fai" index next to it). Each input allele is validated (ACGT only,
 * REF != ALT, REF matches the reference), optionally left-aligned, sorted
 * per chromosome and written to the requested outputs. Every input line that
 * could not be used is reported on stdout together with the reason.
 *
 * @param argc  argument count, forwarded to OptArgs
 * @param argv  argument vector, forwarded to OptArgs
 * @return 0 on success, 1 on usage errors or when a file cannot be opened/mapped
 */
int PrepareHotspots(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  string input_bed_filename       = opts.GetFirstString ('b', "input-bed", "");
  string input_vcf_filename       = opts.GetFirstString ('v', "input-vcf", "");
  string output_bed_filename      = opts.GetFirstString ('d', "output-bed", "");
  string output_vcf_filename      = opts.GetFirstString ('o', "output-vcf", "");
  string reference_filename       = opts.GetFirstString ('r', "reference", "");
  bool left_alignment             = opts.GetFirstBoolean('a', "left-alignment", false);
  bool filter_bypass              = opts.GetFirstBoolean('f', "filter-bypass", false);
  bool allow_block_substitutions  = opts.GetFirstBoolean('s', "allow-block-substitutions", false);
  opts.CheckNoLeftovers();

  // Exactly one input (BED xor VCF), at least one output, and a reference are mandatory.
  if((input_bed_filename.empty() == input_vcf_filename.empty()) or
      (output_bed_filename.empty() and output_vcf_filename.empty()) or reference_filename.empty()) {
    PrepareHotspotsHelp();
    return 1;
  }


  // Populate chromosome list from reference.fai
  // Use mmap to fetch the entire reference

  int ref_handle = open(reference_filename.c_str(),O_RDONLY);
  if (ref_handle < 0) {
    fprintf(stderr, "ERROR: Cannot open %s\n", reference_filename.c_str());
    return 1;
  }

  struct stat ref_stat;
  if (fstat(ref_handle, &ref_stat) != 0) {
    fprintf(stderr, "ERROR: Cannot stat %s\n", reference_filename.c_str());
    close(ref_handle);
    return 1;
  }

  // mmap reports failure with MAP_FAILED (not NULL); using that value as a base
  // pointer would crash much later, far away from the actual cause.
  void *ref_mapping = mmap(0, ref_stat.st_size, PROT_READ, MAP_SHARED, ref_handle, 0);
  if (ref_mapping == MAP_FAILED) {
    fprintf(stderr, "ERROR: Cannot mmap %s\n", reference_filename.c_str());
    close(ref_handle);
    return 1;
  }
  char *ref = (char *)ref_mapping;


  FILE *fai = fopen((reference_filename+".fai").c_str(), "r");
  if (!fai) {
    fprintf(stderr, "ERROR: Cannot open %s.fai\n", reference_filename.c_str());
    munmap(ref, ref_stat.st_size);
    close(ref_handle);
    return 1;
  }

  // One Reference entry per parsable .fai line; ref_map maps chromosome name -> index.
  vector<Reference>  ref_index;
  map<string,int> ref_map;
  char line[1024], chrom_name[1024];
  while (fgets(line, 1024, fai) != NULL) {
    Reference ref_entry;
    long chr_start;
    // Lines that do not carry all five .fai columns are silently skipped.
    if (5 != sscanf(line, "%s\t%ld\t%ld\t%d\t%d", chrom_name, &ref_entry.size, &chr_start,
                    &ref_entry.bases_per_line, &ref_entry.bytes_per_line))
      continue;
    ref_entry.chr = chrom_name;
    ref_entry.start = ref + chr_start;
    ref_index.push_back(ref_entry);
    ref_map[ref_entry.chr] = (int) ref_index.size() - 1;
  }
  fclose(fai);


  // Load input BED or load input VCF, group by chromosome

  // line_status is a deque so that pointers to its elements stay valid while it grows.
  deque<LineStatus> line_status;
  vector<deque<Allele> > alleles(ref_index.size());

  if (!input_bed_filename.empty()) {

    FILE *input = fopen(input_bed_filename.c_str(),"r");
    if (!input) {
      fprintf(stderr,"ERROR: Cannot open %s\n", input_bed_filename.c_str());
      munmap(ref, ref_stat.st_size);
      close(ref_handle);
      return 1;
    }

    char line2[65536];

    int line_number = 0;
    bool line_overflow = false;
    while (fgets(line2, 65536, input) != NULL) {
      // A completely filled buffer without a trailing newline is a fragment of an
      // over-long line: remember that and keep reading until the line really ends.
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
        continue;
      }
      line_number++;
      if (line_overflow) {
        line_overflow = false;
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: line length exceeds 64K";
        continue;
      }

      if (strncmp(line2, "browser", 7) == 0)
        continue;

      if (strncmp(line2, "track", 5) == 0) {
        if (string::npos != string(line2).find("allowBlockSubstitutions=true"))
          allow_block_substitutions = true;
        continue;
      }

      // First four columns are fixed; of the remaining columns only the last two matter.
      char *current_chr = strtok(line2, "\t\r\n");
      char *current_start = strtok(NULL, "\t\r\n");
      char *current_end = strtok(NULL, "\t\r\n");
      char *current_id = strtok(NULL, "\t\r\n");
      char *penultimate = strtok(NULL, "\t\r\n");
      char *ultimate = strtok(NULL, "\t\r\n");
      for (char *next = strtok(NULL, "\t\r\n"); next; next = strtok(NULL, "\t\r\n")) {
        penultimate = ultimate;
        ultimate = next;
      }

      if (!current_chr or !current_start or !current_end or !current_id or !penultimate or !ultimate) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: expected at least 6 fields";
        continue;
      }

      Allele allele;

      // Accept the name as-is, with a "chr" prefix added, or "MT" as an alias of "chrM".
      string string_chr(current_chr);
      if (ref_map.find(string_chr) != ref_map.end())
        allele.chr_idx = ref_map[string_chr];
      else if (ref_map.find("chr"+string_chr) != ref_map.end())
        allele.chr_idx = ref_map["chr"+string_chr];
      else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end())
        allele.chr_idx = ref_map["chrM"];
      else {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Unknown chromosome name: ";
        line_status.back().filter_message = string_chr;
        continue;
      }

      allele.pos = strtol(current_start,NULL,10);
      allele.id = current_id;

      // The penultimate column is a ';'-separated list that must contain REF= and OBS=.
      char *current_ref = NULL;
      char *current_alt = NULL;
      for (char *next = strtok(penultimate, ";"); next; next = strtok(NULL, ";")) {
        if (strncmp(next,"REF=",4) == 0)
          current_ref = next;
        else if (strncmp(next,"OBS=",4) == 0)
          current_alt = next;
      }
      if (!current_ref or !current_alt) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: REF and OBS fields required in penultimate column";
        continue;
      }
      // Skip the 4-character "REF=" / "OBS=" tag and upper-case the sequence.
      for (char *pos = current_ref+4; *pos; ++pos)
        allele.ref += toupper(*pos);
      for (char *pos = current_alt+4; *pos; ++pos)
        allele.alt += toupper(*pos);
      allele.filtered = false;
      line_status.push_back(LineStatus(line_number));
      allele.line_status = &line_status.back();
      // Keep the untouched input values for the O* annotations and the final report.
      allele.opos = allele.pos;
      allele.oref = allele.ref;
      allele.oalt = allele.alt;
      alleles[allele.chr_idx].push_back(allele);
      line_status.back().allele = &alleles[allele.chr_idx].back();
    }

    fclose(input);
  }


  if (!input_vcf_filename.empty()) {

    FILE *input = fopen(input_vcf_filename.c_str(),"r");
    if (!input) {
      fprintf(stderr,"ERROR: Cannot open %s\n", input_vcf_filename.c_str());
      munmap(ref, ref_stat.st_size);
      close(ref_handle);
      return 1;
    }

    char line2[65536];
    int line_number = 0;
    bool line_overflow = false;
    while (fgets(line2, 65536, input) != NULL) {
      // Same over-long line handling as for BED input.
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
        continue;
      }
      line_number++;
      if (line_overflow) {
        line_overflow = false;
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot VCF line: line length exceeds 64K";
        continue;
      }

      if (strncmp(line2, "##allowBlockSubstitutions=true", 30) == 0) {
        allow_block_substitutions = true;
        continue;
      }
      if (line2[0] == '#')
        continue;

      char *current_chr = strtok(line2, "\t\r\n");
      char *current_start = strtok(NULL, "\t\r\n");
      char *current_id = strtok(NULL, "\t\r\n");
      char *current_ref = strtok(NULL, "\t\r\n");
      char *current_alt = strtok(NULL, "\t\r\n");

      if (!current_chr or !current_start or !current_id or !current_ref or !current_alt) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot VCF line: expected at least 5 fields";
        continue;
      }


      string string_chr(current_chr);
      int chr_idx = 0;
      if (ref_map.find(string_chr) != ref_map.end())
        chr_idx = ref_map[string_chr];
      else if (ref_map.find("chr"+string_chr) != ref_map.end())
        chr_idx = ref_map["chr"+string_chr];
      else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end())
        chr_idx = ref_map["chrM"];
      else {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Unknown chromosome name: ";
        line_status.back().filter_message = string_chr;
        continue;
      }

      for (char *pos = current_ref; *pos; ++pos)
        *pos = toupper(*pos);
      for (char *pos = current_alt; *pos; ++pos)
        *pos = toupper(*pos);


      // A VCF line may list several comma-separated ALT alleles; each becomes its own
      // Allele (and its own LineStatus entry carrying the same line number).
      for (char *sub_alt = strtok(current_alt,","); sub_alt; sub_alt = strtok(NULL,",")) {

        Allele allele;
        allele.chr_idx = chr_idx;
        allele.ref = current_ref;
        allele.alt = sub_alt;
        // VCF positions are 1-based; positions are kept 0-based internally.
        allele.pos = strtol(current_start,NULL,10)-1;
        allele.id = current_id;
        if (allele.id == ".")
          allele.id = "hotspot";

        allele.filtered = false;
        line_status.push_back(LineStatus(line_number));
        allele.line_status = &line_status.back();
        allele.opos = allele.pos;
        allele.oref = allele.ref;
        allele.oalt = allele.alt;
        alleles[allele.chr_idx].push_back(allele);
        line_status.back().allele = &alleles[allele.chr_idx].back();
      }
    }

    fclose(input);
  }

  // Process by chromosome:
  //   - Verify reference allele
  //   - Left align
  //   - Sort
  //   - Filter for block substitutions, write

  FILE *output_vcf = NULL;
  if (!output_vcf_filename.empty()) {
    output_vcf = fopen(output_vcf_filename.c_str(), "w");
    if (!output_vcf) {
      fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_vcf_filename.c_str());
      munmap(ref, ref_stat.st_size);
      close(ref_handle);
      return 1;
    }
    fprintf(output_vcf, "##fileformat=VCFv4.1\n");
    if (allow_block_substitutions)
      fprintf(output_vcf, "##allowBlockSubstitutions=true\n");
    fprintf(output_vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n");
  }
  FILE *output_bed = NULL;
  if (!output_bed_filename.empty()) {
    output_bed = fopen(output_bed_filename.c_str(), "w");
    if (!output_bed) {
      fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_bed_filename.c_str());
      if (output_vcf)
        fclose(output_vcf);
      munmap(ref, ref_stat.st_size);
      close(ref_handle);
      return 1;
    }
    if (allow_block_substitutions)
      fprintf(output_bed, "track name=\"hotspot\" type=bedDetail allowBlockSubstitutions=true\n");
    else
      fprintf(output_bed, "track name=\"hotspot\" type=bedDetail\n");
  }


  for (int chr_idx = 0; chr_idx < (int)ref_index.size(); ++chr_idx) {

    for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ++A) {

      // Invalid characters

      bool valid = true;
      for (const char *c = A->ref.c_str(); *c ; ++c)
        if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T')
          valid = false;
      for (const char *c = A->alt.c_str(); *c ; ++c)
        if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T')
          valid = false;
      if (not valid) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "REF and/or ALT contain characters other than ACGT: ";
        A->line_status->filter_message = "REF = " + A->ref + " ALT = " + A->alt;
        continue;
      }

      // Filter REF == ALT

      if (A->ref == A->alt) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "REF and ALT alleles equal";
        continue;
      }

      // Confirm reference allele.

      string ref_expected;
      for (int idx = 0; idx < (int) A->ref.size(); ++idx)
        ref_expected += ref_index[chr_idx].base(A->pos + idx);
      if (A->ref != ref_expected) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "Provided REF allele does not match reference: ";
        A->line_status->filter_message = "Expected " + ref_expected + ", found " + A->ref;
        continue;
      }

      // Trim

      int ref_start = 0;
      int ref_end = A->ref.size();
      int alt_end = A->alt.size();

      // Option 1: trim all trailing bases

      //while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) {
      //  --ref_end;
      //  --alt_end;
      //}

      // Option 2: trim all leading basees

      //while (ref_start < ref_end and ref_start < alt_end and A->ref[ref_start] == A->alt[ref_start])
      //  ++ref_start;


      // Option 3: trim anchor base if vcf

      if (!input_vcf_filename.empty()) {
        if (ref_end and alt_end and (ref_end == 1 or alt_end == 1) and A->ref[0] == A->alt[0])
          ref_start = 1;
      }

      A->pos += ref_start;
      A->ref = A->ref.substr(ref_start, ref_end-ref_start);
      A->alt = A->alt.substr(ref_start, alt_end-ref_start);
      ref_end -= ref_start;
      alt_end -= ref_start;

      // Left align
      // While the base preceding the allele equals the last base of both REF and ALT
      // (an empty allele matches anything), prepend it and step one position left;
      // the resize below then drops the now-redundant trailing bases again.
      if (left_alignment) {
        while (A->pos > 0) {
          char nuc = ref_index[chr_idx].base(A->pos-1);
          if (ref_end > 0 and A->ref[ref_end-1] != nuc)
            break;
          if (alt_end > 0 and A->alt[alt_end-1] != nuc)
            break;
          A->ref = string(1,nuc) + A->ref;
          A->alt = string(1,nuc) + A->alt;
          A->pos--;
        }
      }
      A->ref.resize(ref_end);
      A->alt.resize(alt_end);


      // Filter block substitutions: take 1

      if (ref_end > 0 and alt_end > 0 and ref_end != alt_end and not allow_block_substitutions and not filter_bypass) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "Block substitutions not supported";
        continue;
      }

    }



    if (output_bed) {
      // Sort - without anchor base
      sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles);

      // Sorting moves Allele values between deque slots. Each LineStatus keeps a raw
      // pointer to "its" slot, so re-link them; otherwise the final report would print
      // the position and id of whichever allele ended up in that slot.
      for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I)
        I->line_status->allele = &(*I);

      // Write
      for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) {
        if (I->filtered)
          continue;
        // ANCHOR is the reference base just before the allele; empty at chromosome start.
        if (I->pos)
          fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=%c\tNONE\n",
              ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
              I->ref.c_str(), I->alt.c_str(), ref_index[chr_idx].base(I->pos-1));
        else
          fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=\tNONE\n",
              ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
              I->ref.c_str(), I->alt.c_str());
      }
    }


    if (output_vcf) {

      // Add anchor base to indels
      for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) {
        if (I->filtered)
          continue;
        if (not I->ref.empty() and not I->alt.empty())
          continue;
        if (I->pos == 0) {
          I->filtered = true;
          I->line_status->filter_message_prefix = "INDELs at chromosome start not supported";
          continue;
        }
        I->pos--;
        I->ref = string(1,ref_index[chr_idx].base(I->pos)) + I->ref;
        I->alt = string(1,ref_index[chr_idx].base(I->pos)) + I->alt;
      }

      // Sort - with anchor base
      sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles);

      // Same re-linking as after the BED sort: keep LineStatus back-pointers in step
      // with the new element order.
      for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I)
        I->line_status->allele = &(*I);


      // Merge alleles, remove block substitutions, write
      for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ) {

        // [A, B) is the run of alleles sharing one position; max_ref is the longest
        // REF among the unfiltered ones and becomes the REF of the merged record.
        string max_ref;
        deque<Allele>::iterator B = A;
        for (; B != alleles[chr_idx].end() and B->pos == A->pos; ++B)
          if (!B->filtered and max_ref.size() < B->ref.size())
            max_ref = B->ref;

        bool filtered = true;
        for (deque<Allele>::iterator I = A; I != B; ++I) {
          if (I->filtered)
            continue;

          // Pad ALT with the reference bases this allele's REF is missing relative to max_ref.
          string new_alt = I->alt + max_ref.substr(I->ref.size());

          if (new_alt.size() > 1 and max_ref.size() > 1 and new_alt.size() != max_ref.size() and not allow_block_substitutions and not filter_bypass) {
            I->filtered = true;
            I->line_status->filter_message_prefix = "Block substitutions not supported (post-merge)";
            continue;
          }

          I->ref = max_ref;
          I->alt = new_alt;
          filtered = false;
        }

        // Emit one VCF record for the run if at least one allele survived.
        if (not filtered) {

          fprintf(output_vcf, "%s\t%ld\t.\t%s\t",
              ref_index[chr_idx].chr.c_str(), A->pos+1, max_ref.c_str());

          // ALT column: each distinct ALT once, in order of first appearance.
          bool comma = false;
          set<string> unique_alt_alleles;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (unique_alt_alleles.count(I->alt) > 0)
              continue;
            unique_alt_alleles.insert(I->alt);
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->alt.c_str());
          }

          // INFO column: parallel comma-separated lists, one entry per surviving allele.
          fprintf(output_vcf, "\t.\t.\tOID=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->id.c_str());
          }

          fprintf(output_vcf, ";OPOS=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%ld", I->opos+1);
          }

          fprintf(output_vcf, ";OREF=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->oref.c_str());
          }

          fprintf(output_vcf, ";OALT=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->oalt.c_str());
          }

          fprintf(output_vcf, ";OMAPALT=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->alt.c_str());
          }

          fprintf(output_vcf, "\n");
        }

        A = B;
      }
    }
  }



  if (output_bed) {
    fflush(output_bed);
    fclose(output_bed);
  }
  if (output_vcf) {
    fflush(output_vcf);
    fclose(output_vcf);
  }


  // Report every line that carries a filter message; lines that were linked to an
  // allele are annotated with chromosome, original 1-based position and id.
  int lines_ignored = 0;
  for (deque<LineStatus>::iterator L = line_status.begin(); L != line_status.end(); ++L) {
    if (L->filter_message_prefix) {
      if (L->allele)
        printf("Line %d ignored: [%s:%ld %s] %s%s\n", L->line_number, ref_index[L->allele->chr_idx].chr.c_str(), L->allele->opos+1, L->allele->id.c_str(),
            L->filter_message_prefix, L->filter_message.c_str());
      else
        printf("Line %d ignored: %s%s\n", L->line_number, L->filter_message_prefix, L->filter_message.c_str());
      lines_ignored++;
    }
  }
  printf("Ignored %d out of %d lines\n", lines_ignored, (int)line_status.size());


  munmap(ref, ref_stat.st_size);
  close(ref_handle);

  return 0;
}
Пример #9
0
int main(int argc, const char* argv[])
{
  printf ("tvcvalidator %s-%s (%s) - Prototype tvc validation tool\n\n",
      IonVersion::GetVersion().c_str(), IonVersion::GetRelease().c_str(), IonVersion::GetSvnRev().c_str());

  if (argc == 1) {
    VariantValidatorHelp();
    return 1;
  }

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);

  if (opts.GetFirstBoolean('v', "version", false)) {
    return 0;
  }
  if (opts.GetFirstBoolean('h', "help", false)) {
    VariantValidatorHelp();
    return 0;
  }

  string input_vcf_filename = opts.GetFirstString ('i', "input-vcf", "");
  string truth_filename = opts.GetFirstString ('t', "truth-file", "");
  string truth_dir = opts.GetFirstString ('d', "truth-dir", "/results/plugins/validateVariantCaller/files");

  // TODO: reference optional, only used to verify reference allele in input-vcf and truth files
  //string reference_filename = opts.GetFirstString ('r', "reference", "");

  opts.CheckNoLeftovers();


  //
  // Step 1. Load input VCF file into memory
  //

  if (input_vcf_filename.empty()) {
    VariantValidatorHelp();
    cerr << "ERROR: Input VCF file not specified " << endl;
    return 1;
  }

  VariantCallerResults results_vcf;
  results_vcf.load_vcf(input_vcf_filename);
  printf("Loaded VCF %s with %d variant calls\n", input_vcf_filename.c_str(), (int)results_vcf.variants.size());



  //
  // Step 2. Parse truth files, compare them to the input vcf, and compute match scores
  //

  if (not truth_filename.empty()) {
    ValidatorTruth truth;
    truth.ReadTruthFile(truth_filename);
    truth.CompareToCalls(results_vcf);
    return 0;
  }

  truth_dir += "/*.bed";
  glob_t glob_result;
  glob(truth_dir.c_str(), GLOB_TILDE, NULL, &glob_result);
  for(unsigned int i = 0; i < glob_result.gl_pathc; ++i) {

    ValidatorTruth truth;
    truth.ReadTruthFile(string(glob_result.gl_pathv[i]));
    truth.CompareToCalls(results_vcf);

  }
  globfree(&glob_result);


  return 0;
}
Пример #10
0
int main(int argc, const char *argv[]) 
{
	OptArgs opts;
	opts.ParseCmdLine(argc, argv);
	string inFile, outFile;
	bool help = false;
	bool version = false;
	double lower = -5.0;
	double upper = 28.0;
	opts.GetOption(inFile, "", 'i', "input-file");
	opts.GetOption(outFile, "", 'o', "output-file");
	opts.GetOption(lower, "-5.0", '-', "wells-convert-low");
	opts.GetOption(upper, "28.0", '-', "wells-convert-high");
	opts.GetOption(help, "false", 'h', "help");
	opts.GetOption(version, "false", 'v', "version");
	opts.CheckNoLeftovers();
  
	if (version) 
	{
		fprintf (stdout, "%s", IonVersion::GetFullVersion("RawWellsConverter").c_str());
		exit(0);
	}

	if (inFile.empty() || help)
	{
		cout << "RawWellsConverter - Convert unsigned short type wells file to float type wells file, or vice versa." << endl 
			 << "options: " << endl
			 << "   -i,--input-file    input wells file." << endl
			 << "   -o,--output-file   output wells file." << endl
			 << "     ,--wells-convert-low   lower bound for converting to unsigned short." << endl
			 << "     ,--wells-convert-high  upper bound for converting to unsigned short." << endl
			 << "   -h,--help          this message." << endl
			 << "" << endl 
			 << "usage: " << endl
			 << "   RawWellsConverter -i input_path/1.wells -o output_path/1.wells " << endl;
		exit(1);
	}

	struct stat sb;
	if(stat(inFile.c_str(), &sb) != 0)
	{
		cerr << "RawWellsConverter ERROR: " << inFile << " does not exist." << endl; 
		exit (1);
	}

	if (outFile.empty())
	{
		outFile = inFile;
		outFile += ".converted";
	}

	string cmd("cp ");
	cmd += inFile;
	cmd += " ";
	cmd += outFile;
	int ret0 = system(cmd.c_str());

	hid_t root = H5Fopen(outFile.c_str(), H5F_ACC_RDWR, H5P_DEFAULT);
	if(root < 0)				
	{				
		cerr << "RawWellsConverter ERROR: Fail to open " << outFile << endl;
		exit(1);
	}	

	H5G_info_t group_info;
	group_info.nlinks = 0;
	if(H5Gget_info(root, &group_info) < 0)
	{
		H5Fclose(root);
		cerr << "RawWellsConverter ERROR: Fail H5Gget_info." << endl;
		exit(1);
	}

	char name[10];
	string sName;
	bool bWells = false;
	bool bCopies = false;
	for(unsigned int i = 0; i < group_info.nlinks; ++i)
	{
		int size = H5Gget_objname_by_idx(root, i, NULL, 0);
		if(H5Gget_objname_by_idx(root, i, name, size + 1) < 0)
		{
			H5Fclose(root);
			cerr << "RawWellsConverter ERROR: Fail H5Gget_objname_by_idx." << endl;
			exit(1);
		}
		else
		{
			sName = name;
			if(sName == "wells")
			{
				bWells = true;
			}
			if(sName == "wells_copies")
			{
				bCopies = true;
			}
		}
	}

	if(!bWells)
	{
		H5Fclose(root);
		cerr << "RawWellsConverter ERROR: There is no dataset wells." << endl;
		exit(1);
	}

	hid_t dsWells = H5Dopen2(root, "wells", H5P_DEFAULT);
	if(dsWells < 0)
	{
		H5Fclose(root);
		cerr << "RawWellsConverter ERROR: Fail H5Dopen2 wells." << endl;
		exit(1);
	}
	  
	bool saveAsUShort = false;
	if(H5Aexists(dsWells, "convert_low") > 0)
	{
		hid_t attrLower = H5Aopen(dsWells, "convert_low", H5T_NATIVE_FLOAT );
		H5Aread(attrLower, H5T_NATIVE_FLOAT, &lower); 
		saveAsUShort = true;
		H5Aclose(attrLower);
	}
	if(H5Aexists(dsWells, "convert_high") > 0)
	{
		hid_t attrUpper = H5Aopen(dsWells, "convert_high", H5T_NATIVE_FLOAT);
		H5Aread(attrUpper, H5T_NATIVE_FLOAT, &upper); 
		saveAsUShort = true;
		H5Aclose(attrUpper);
	}

	hid_t dataSpace = H5Dget_space(dsWells);
	if(dataSpace < 0)
	{
		H5Dclose(dsWells);
		H5Fclose(root);
		cerr << "RawWellsConverter ERROR: Fail H5Dget_space wells." << endl;
		exit(1);
	}

	hssize_t dsSize = H5Sget_simple_extent_npoints(dataSpace);		
	if(dsSize < 1)
	{
		H5Sclose(dataSpace);
		H5Dclose(dsWells);
		H5Fclose(root);
		cerr << "RawWellsConverter ERROR: Wrong size of dataset wells - " << dsSize << endl;
		exit(1);
	}

	int nRows = 0;
	int nCols = 0;
	int nFlows = 0;

	int rank = H5Sget_simple_extent_ndims(dataSpace);
    if(rank != 3)
	{
		bCopies = false;
	}
	else
	{
		hsize_t dims_out[3];
		int status_n = H5Sget_simple_extent_dims(dataSpace, dims_out, NULL);
		if(status_n < 0)
		{
			bCopies = false;
		}
		else
		{
			nRows = dims_out[0];
			nCols = dims_out[1];
			nFlows = dims_out[2];
		}
	}

	float* fPtr = new float[dsSize];
	unsigned short* usPtr = new unsigned short[dsSize];
	if(fPtr == NULL || usPtr == NULL)
	{
		H5Sclose(dataSpace);
		H5Dclose(dsWells);
		H5Fclose(root);
		cerr << "RawWellsConverter ERROR: Fail to allocate fPtr or usPtr." << endl;
		exit(1);
	}

	hid_t dcpl = H5Dget_create_plist(dsWells);
	if(dcpl < 0)
	{
		H5Sclose(dataSpace);
		H5Dclose(dsWells);
		H5Fclose(root);
		cerr << "RawWellsConverter ERROR: Fail H5Dget_create_plist." << endl;
		exit(1);
	}
	hid_t dapl = H5Dget_access_plist(dsWells);
	if(dapl < 0)
	{
		H5Pclose(dcpl);
		H5Sclose(dataSpace);
		H5Dclose(dsWells);
		H5Fclose(root);
		cerr << "RawWellsConverter ERROR: Fail H5Dget_access_plist." << endl;
		exit(1);
	}

	if(saveAsUShort)
	{
		cout << "RawWellsConverter: converting unsigned short wells file - " << inFile << " to float wells file - " << outFile << " with boundary (" << lower << "," << upper << ")" << endl;
	
		herr_t ret = H5Dread(dsWells, H5T_NATIVE_USHORT, H5S_ALL, H5S_ALL, H5P_DEFAULT, usPtr);
		H5Dclose(dsWells);
		if(ret < 0)
		{
			delete [] fPtr;
			delete [] usPtr;
			H5Sclose(dataSpace);
			H5Pclose(dcpl);
			H5Pclose(dapl);
			H5Fclose(root);
			cerr << "RawWellsConverter ERROR: Fail to read dataset wells." << endl;
			exit(1);
		}

		float factor = 65535.0 / (upper - lower);
		float* fPtr2 = fPtr;
		unsigned short* usPtr2 = usPtr;

		for(unsigned int i = 0; i < dsSize; ++i, ++fPtr2, ++usPtr2)
		{
			(*fPtr2) = (float)(*usPtr2) / factor + lower;
		}

		delete [] usPtr;

		if(bCopies)
		{
			vector<float> copies(nRows * nCols, 1.0);
			hid_t dsCopies = H5Dopen2(root, "wells_copies", H5P_DEFAULT);
			if(dsCopies < 0)
			{
				cerr << "RawWellsConverter WARNING: 1.wells files does not have wells_copies." << endl;
			}
			else
			{
				hid_t dataSpace2 = H5Dget_space(dsCopies);
				if(dataSpace2 < 0)
				{
					H5Dclose(dsCopies);
					cerr << "RawWellsConverter WARNING: fail to H5Dget_space for dataset wells_copies." << endl;          
				}
				else
				{
					hssize_t dsSize2 = H5Sget_simple_extent_npoints(dataSpace2);
					H5Sclose(dataSpace2);
					if(dsSize2 != (hssize_t)(nRows * nCols))
					{
						H5Dclose(dsCopies);
						cerr << "RawWellsConverter WARNING: dataset wells_copies size is " << dsSize2 << ", it is different from nRows * nCols = " << nRows * nCols << endl;          
					}
					else
					{
						herr_t ret = H5Dread(dsCopies, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, &copies[0]);
						H5Dclose(dsCopies);
						if(ret < 0)
						{
							copies.resize(nRows * nCols, 1.0);
							cerr << "RawWellsConverter WARNING: failto load dataset wells_copies." << endl;          
						}
					}
				}
			}
			
			uint64_t fptrCount = 0;
			uint64_t copyCount = 0;
			for(int row = 0; row < nRows; ++row)
			{
				for(int col = 0; col < nCols; ++col)
				{
					for(int flow = 0; flow < nFlows; ++flow)
					{
						if(copies[copyCount] > 0)
						{
							fPtr[fptrCount] *= copies[copyCount];
						}
						else
						{
							fPtr[fptrCount] = -1.0;
						}
						
						++fptrCount;
					}

					++copyCount;
				}
			}
		}

	    H5Ldelete(root, "wells", H5P_DEFAULT);

		hid_t dsWells2 = H5Dcreate2 (root, "wells", H5T_NATIVE_FLOAT, dataSpace, H5P_DEFAULT, dcpl, dapl);
		if(dsWells2 < 0)
		{
			delete [] fPtr;
			H5Sclose(dataSpace);
			H5Pclose(dcpl);
			H5Pclose(dapl);
			H5Fclose(root);
			cerr << "RawWellsConverter ERROR: Fail to create dataset wells." << endl;
			exit(1);
		}

		ret = H5Dwrite(dsWells2, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, fPtr);
		delete [] fPtr;
		H5Dclose(dsWells2);
		H5Sclose(dataSpace);
		H5Pclose(dcpl);
		H5Pclose(dapl);
		H5Fclose(root);
		if(ret < 0)
		{
			cerr << "RawWellsConverter ERROR: Fail to write dataset wells." << endl;
			exit(1);
		}
	}
	else
	{
		cout << "RawWellsConverter: converting float wells file - " << inFile << " to unsigned short wells file - " << outFile << " with boundary (" << lower << "," << upper << ")" << endl;
	
		herr_t ret = H5Dread(dsWells, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, fPtr);
		H5Dclose(dsWells);
		if(ret < 0)
		{
			delete [] fPtr;
			delete [] usPtr;
			H5Sclose(dataSpace);
			H5Pclose(dcpl);
			H5Pclose(dapl);
			H5Fclose(root);
			cerr << "RawWellsConverter ERROR: Fail to read dataset wells." << endl;
			exit(1);
		}

		float factor = 65535.0 / (upper - lower);
		float* fPtr2 = fPtr;
		unsigned short* usPtr2 = usPtr;

		for(unsigned int i = 0; i < dsSize; ++i, ++fPtr2, ++usPtr2)
		{
			if(*fPtr2 < lower)
			{
				(*usPtr2) = 0;
			}
			else if(*fPtr2 > upper)
			{
				(*usPtr2) = 65535;
			}
			else
			{
				(*usPtr2) = (unsigned short)((*fPtr2 - lower) * factor);
			}
		}

		delete [] fPtr;

	    H5Ldelete(root, "wells", H5P_DEFAULT);

		hid_t dsWells2 = H5Dcreate2 (root, "wells", H5T_NATIVE_USHORT, dataSpace, H5P_DEFAULT, dcpl, dapl);
		if(dsWells2 < 0)
		{
			delete [] usPtr;
			H5Sclose(dataSpace);
			H5Pclose(dcpl);
			H5Pclose(dapl);
			H5Fclose(root);
			cerr << "RawWellsConverter ERROR: Fail to create dataset wells." << endl;
			exit(1);
		}

		ret = H5Dwrite(dsWells2, H5T_NATIVE_USHORT, H5S_ALL, H5S_ALL, H5P_DEFAULT, usPtr);
		delete [] usPtr;
		if(dsWells2 < 0)
		{
			H5Dclose(dsWells2);
			H5Sclose(dataSpace);
			H5Pclose(dcpl);
			H5Pclose(dapl);
			H5Fclose(root);
			cerr << "RawWellsConverter ERROR: Fail to write dataset wells." << endl;
			exit(1);
		}

		float lower2 = (float)lower;
		float upper2 = (float)upper;
		hsize_t dimsa[1];
		dimsa[0] = 1;
		hid_t dataspacea = H5Screate_simple(1, dimsa, NULL);
		hid_t attrLower = H5Acreate(dsWells2, "convert_low", H5T_NATIVE_FLOAT, dataspacea, H5P_DEFAULT, H5P_DEFAULT );
		H5Awrite(attrLower, H5T_NATIVE_FLOAT, &lower2);
		H5Aclose(attrLower);
		hid_t attrUpper = H5Acreate(dsWells2, "convert_high", H5T_NATIVE_FLOAT, dataspacea, H5P_DEFAULT, H5P_DEFAULT );
		H5Awrite(attrUpper, H5T_NATIVE_FLOAT, &upper2);
		H5Aclose(attrUpper);
		H5Sclose(dataspacea);

		H5Dclose(dsWells2);
		H5Sclose(dataSpace);
		H5Pclose(dcpl);
		H5Pclose(dapl);
		H5Fclose(root);
	}

	return 0;
}
Пример #11
0
/**
 * TFMapper entry point.
 *
 * Reads a BAM file of reads aligned against test-fragment (TF) references,
 * accumulates per-TF metrics (SNR, average ionogram, quality histograms and
 * homopolymer accuracy) and writes them to a JSON file.
 *
 * Command-line options:
 *   --bam          input BAM file (required)
 *   --ref          FASTA file with the TF reference sequences (required)
 *   --output-json  output JSON file (default "TFStats.json")
 *   -h,--help      show usage
 *
 * @return the value of showHelp() when help is requested or a required
 *         option is missing, otherwise 0. Fatal input problems terminate
 *         the process via exit(1).
 */
int main(int argc, const char *argv[])
{
#ifdef _DEBUG
  atexit(memstatus);
  dbgmemInit();
#endif /* _DEBUG */

  printf ("%s - %s-%s (%s)\n", argv[0], IonVersion::GetVersion().c_str(), IonVersion::GetRelease().c_str(), IonVersion::GetSvnRev().c_str());

  string bamInputFilename;
  string fastaInputFilename;
  string jsonOutputFilename;
  bool help = false;

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  opts.GetOption(bamInputFilename,    "",             '-',  "bam");
  opts.GetOption(fastaInputFilename,  "",             '-',  "ref");
  opts.GetOption(jsonOutputFilename,  "TFStats.json", '-',  "output-json");
  opts.GetOption(help,                "false",        'h',  "help");
  opts.CheckNoLeftovers();

  if (help || bamInputFilename.empty() || fastaInputFilename.empty())
    return showHelp();


  // Parse BAM header

  BAMReader bamReader(bamInputFilename);
  bamReader.open();
  bam_header_t *header = (bam_header_t *)bamReader.get_header_ptr();
  if (header == NULL) {
    // Everything below dereferences the header; fail cleanly instead of crashing.
    fprintf(stderr, "[TFMapper] Could not read BAM header from %s\n", bamInputFilename.c_str());
    exit(1);
  }

  int numFlows = 0;
  string flowOrder;
  string key;

  if (header->l_text >= 3) {
    if (header->dict == 0)
      header->dict = sam_header_parse2(header->text);

    // Flow order: first FO value found among the RG header lines.
    int nEntries = 0;
    char **tmp = sam_header2list(header->dict, "RG", "FO", &nEntries);
    if (tmp && nEntries > 0) {
      flowOrder = tmp[0];
      numFlows = flowOrder.length();
    }
    if (tmp)
      free(tmp);

    // Key sequence: first KS value found among the RG header lines.
    nEntries = 0;
    tmp = sam_header2list(header->dict, "RG", "KS", &nEntries);
    if (tmp && nEntries > 0) {
      key = tmp[0];
    }
    if (tmp)
      free(tmp);
  }

  if (numFlows <= 0) {
    fprintf(stderr, "[TFMapper] Could not retrieve flow order from FO BAM tag. SFF-specific tags absent?\n");
    exit(1);
  }
  if (key.empty()) {
    fprintf(stderr, "[TFMapper] Could not retrieve key sequence from KS BAM tag. SFF-specific tags absent?\n");
    exit(1);
  }
  //printf("Retrieved flow order from bam: %s (%d)\n", flowOrder.c_str(), numFlows);
  //printf("Retrieved key from bam: %s\n", key.c_str());


  // Retrieve test fragment sequences

  vector<string>  referenceSequences;
  PopulateReferenceSequences(referenceSequences, fastaInputFilename, header->n_targets, header->target_name, string(""));


  //  Process the BAM reads and generate metrics

  int numTFs = header->n_targets;
  vector<int>     TFCount(numTFs,0);
  // NOTE(review): these arrays are sized by a runtime value; variable-length
  // arrays of class type are a compiler extension rather than standard C++.
  MetricGeneratorQualityHistograms  metricGeneratorQualityHistograms[numTFs];
  MetricGeneratorHPAccuracy         metricGeneratorHPAccuracy[numTFs];
  MetricGeneratorSNR                metricGeneratorSNR[numTFs];
  MetricGeneratorAvgIonogram        metricGeneratorAvgIonogram[numTFs];

  for (BAMReader::iterator i = bamReader.get_iterator(); i.good(); i.next()) {

    BAMRead bamRead = i.get();
    int bestTF = bamRead.get_tid();
    // Skip reads without a target, and reads whose target id does not fit the
    // per-TF tables (indexing with it would run past the arrays).
    if (bestTF < 0 || bestTF >= numTFs)
      continue;
    BAMUtils bamUtil(bamRead);
    TFCount[bestTF]++;

    // Extract flowspace signal from FZ BAM tag.
    // Accepted only when the tag is a 'B' array with subtype 'S' whose 32-bit
    // element count (at offset 2) equals numFlows; the values start at offset 6.
    // NOTE(review): the count and the values are read through pointer casts at
    // offsets that are not guaranteed to be aligned - confirm target platforms
    // tolerate this.

    uint16_t *bam_flowgram = NULL;
    uint8_t *fz = bam_aux_get(bamRead.get_bam_ptr(), "FZ");
    if (fz != NULL) {
      if (fz[0] == (uint8_t)'B' && fz[1] == (uint8_t)'S' && *((uint32_t *)(fz+2)) == (uint32_t)numFlows)
        bam_flowgram = (uint16_t *)(fz+6);
    }
    if (bam_flowgram == NULL) {
      fprintf(stderr, "[TFMapper] Could not retrieve flow signal from FZ BAM tag. SFF-specific tags absent?\n");
      exit(1);
    }


    // Use alignments to generate "synchronized" flowspace reference and read ionograms
    // TODO: Do proper flowspace alignment

    string genome = key + bamUtil.get_tdna();
    string calls = key + bamUtil.get_qdna();

    int numBases = min(genome.length(),calls.length());
    vector<int> refIonogram(numFlows, 0);
    vector<int> readIonogram(numFlows, 0);

    int numFlowsRead = 0;
    int numFlowsRef = 0;
    char gC = flowOrder[0];   // base of the reference homopolymer being accumulated
    int gBC = 0;              // length of that homopolymer so far

    for (int iBase = 0; (iBase < numBases) && (numFlowsRead < numFlows) && (numFlowsRef < numFlows); iBase++) {

      // Conversion for reads (independent of reference)
      if (calls[iBase] != '-') {
        // Bounds test comes first so flowOrder is never indexed at numFlows.
        while ((numFlowsRead < numFlows) && (calls[iBase] != flowOrder[numFlowsRead]))
          numFlowsRead++;
        if (numFlowsRead < numFlows)
          readIonogram[numFlowsRead]++;
      }

      if (genome[iBase] != '-') {

        if (genome[iBase] != gC) {
          // Since a new homopolymer begins, need to drop off the old one
          // (bounds test first, for the same reason as above).
          while ((numFlowsRef < numFlows) && (gC != flowOrder[numFlowsRef])) {
            numFlowsRef++;
            if (numFlowsRef < numFlows)
              refIonogram[numFlowsRef] = 0;
          }
          if (numFlowsRef < numFlows)
            refIonogram[numFlowsRef] = gBC;

          gC = genome[iBase];
          gBC = 0;
        }
        gBC++;

        // On a matching base, re-synchronize the reference flow position to the read's.
        if (genome[iBase] == calls[iBase])
          numFlowsRef = numFlowsRead;
      }
    }

    int validFlows = min(numFlowsRef, numFlowsRead);


    metricGeneratorSNR[bestTF].AddElement(bam_flowgram ,key.c_str(), flowOrder);
    metricGeneratorAvgIonogram[bestTF].AddElement(bam_flowgram, numFlows);
    metricGeneratorQualityHistograms[bestTF].AddElement(bamUtil.get_phred_len(10),bamUtil.get_phred_len(17));
    // The last 20 synchronized flows are left out of the homopolymer accuracy stats.
    for (int iFlow = 0; iFlow < validFlows-20; iFlow++)
      metricGeneratorHPAccuracy[bestTF].AddElement(refIonogram[iFlow],readIonogram[iFlow]);
  }


  // Save stats to a json file

  Json::Value outputJson(Json::objectValue);

  for(int i = 0; i < numTFs; i++) {
    // TFs with fewer than minTFCount reads are not reported.
    if (TFCount[i] < minTFCount)
      continue;

    Json::Value currentTFJson(Json::objectValue);
    currentTFJson["TF Name"] = header->target_name[i];
    currentTFJson["TF Seq"] = referenceSequences[i];
    currentTFJson["Num"] = TFCount[i];
    currentTFJson["Top Reads"] = Json::Value(Json::arrayValue); // Obsolete

    metricGeneratorSNR[i].PrintSNR(currentTFJson);
    metricGeneratorHPAccuracy[i].PrintHPAccuracy(currentTFJson);
    metricGeneratorQualityHistograms[i].PrintMetrics(currentTFJson);
    metricGeneratorAvgIonogram[i].PrintIonograms(currentTFJson);

    outputJson[header->target_name[i]] = currentTFJson;
  }

  bamReader.close();  // Closing invalidates the header pointers

  if (!jsonOutputFilename.empty()) {
    ofstream out(jsonOutputFilename.c_str(), ios::out);
    if (out.good())
      out << outputJson.toStyledString();
    else  // Report the failure; the exit status is deliberately left unchanged.
      fprintf(stderr, "[TFMapper] Could not open %s for writing\n", jsonOutputFilename.c_str());
  }

  return 0;
}