Пример #1
0
/**
 * Populates context_vars and the thread counts from the command line options.
 *
 * Asserts that bc_files.options_set is already set, because the default run id
 * is derived from bc_files.output_directory. When diagonal state progression is
 * requested the dephaser is forced to "treephaser-swan" and a notice is printed.
 *
 * @param opts  parsed command line options
 * @return always true
 */
bool BaseCallerParameters::InitContextVarsFromOptArgs(OptArgs& opts)
{
    assert(bc_files.options_set);

    // Default run identifier: computed from the full output directory string.
    char fallback_run_id[6];
    ion_run_to_readname(fallback_run_id,
                        (char*)bc_files.output_directory.c_str(),
                        bc_files.output_directory.length());

    // General options
    context_vars.run_id = opts.GetFirstString('-', "run-id", fallback_run_id);
    num_threads_ = opts.GetFirstInt('n', "num-threads", max(2*numCores(), 4));
    num_bamwriter_threads_ = opts.GetFirstInt('-', "num-threads-bamwriter", 6);

    context_vars.flow_signals_type = opts.GetFirstString('-', "flow-signals-type", "none");
    context_vars.extra_trim_left = opts.GetFirstInt('-', "extra-trim-left", 0);
    context_vars.only_process_unfiltered_set = opts.GetFirstBoolean('-', "only-process-unfiltered-set", false);

    // Treephaser options
    context_vars.dephaser = opts.GetFirstString('-', "dephaser", "treephaser-sse");
    context_vars.keynormalizer = opts.GetFirstString('-', "keynormalizer", "gain");
    context_vars.windowSize = opts.GetFirstInt('-', "window-size", DPTreephaser::kWindowSizeDefault_);
    context_vars.skip_droop = opts.GetFirstBoolean('-', "skip-droop", true);
    context_vars.skip_recal_during_norm = opts.GetFirstBoolean('-', "skip-recal-during-normalization", false);
    context_vars.diagonal_state_prog = opts.GetFirstBoolean('-', "diagonal-state-prog", false);

    // Diagonal state progression is only combined with treephaser-swan:
    // override any other dephaser choice and tell the user about it.
    if (context_vars.diagonal_state_prog) {
      if (context_vars.dephaser != "treephaser-swan") {
        cout << " === BaseCaller Option Incompatibility: Using dephaser treephaser-swan with diagonal state progression instead of "
             << context_vars.dephaser << endl;
        context_vars.dephaser = "treephaser-swan";
      }
    }

    context_vars.process_tfs = true;
    context_vars.options_set = true;
    return true;
}
Пример #2
0
/**
 * Sets up the recalibration model from the command line options.
 *
 * Options read: --model-file, --recal-model-hp-thres (default 4),
 * --save-hpmodel (default true) and --diagonal-state-prog (default false).
 * With diagonal state progression the model file name is cleared before
 * InitializeModel() is called. If InitializeModel() succeeds and saving is
 * enabled, the model is stored in the BAM comments.
 *
 * @param opts          parsed command line options
 * @param bam_comments  BAM comment lines handed to SaveModelFileToBamComments()
 * @param run_id        run identifier handed to SaveModelFileToBamComments()
 * @param chip_subset   supplies the column and row offsets
 */
void RecalibrationModel::Initialize(OptArgs& opts, vector<string> &bam_comments, const string & run_id, const ion::ChipSubset & chip_subset)
{
  string hp_model_file = opts.GetFirstString('-', "model-file", "");
  int hp_threshold = opts.GetFirstInt('-', "recal-model-hp-thres", 4);
  const bool store_in_bam = opts.GetFirstBoolean('-', "save-hpmodel", true);
  const bool diagonal_prog = opts.GetFirstBoolean('-', "diagonal-state-prog", false);

  // The model file is ignored when diagonal state progression is active.
  if (diagonal_prog)
    hp_model_file.clear();

  const bool model_ready = InitializeModel(hp_model_file, hp_threshold);
  if (not model_ready or not store_in_bam)
    return;

  SaveModelFileToBamComments(hp_model_file, bam_comments, run_id, chip_subset.GetColOffset(), chip_subset.GetRowOffset());
}
Пример #3
0
/**
 * Resolves a boolean parameter and reports where its value came from.
 *
 * Precedence, lowest to highest: built-in default, parameters JSON object,
 * command line option. The JSON key is the long option name with every '-'
 * replaced by '_'. The resolved value and its source are printed to stdout.
 *
 * JSON values are interpreted as follows:
 *   - string "true" / "True" / "TRUE"  -> true
 *   - any other string                 -> atoi(string) != 0
 *   - non-string                       -> asInt() != 0
 *
 * @param opts               parsed command line options
 * @param json               parameters JSON object
 * @param short_name         short option character
 * @param long_name_hyphens  long option name, hyphen separated
 * @param default_value      value used when neither source provides one
 * @return the resolved value
 */
bool RetrieveParameterBool(OptArgs &opts, Json::Value& json, char short_name, const string& long_name_hyphens, bool default_value)
{
  // JSON keys use underscores where the command line uses hyphens.
  string long_name_underscores = long_name_hyphens;
  for (unsigned int i = 0; i < long_name_underscores.size(); ++i)
    if (long_name_underscores[i] == '-')
      long_name_underscores[i] = '_';

  bool value = default_value;
  string source = "builtin default";

  if (json.isMember(long_name_underscores)) {
    const Json::Value& entry = json[long_name_underscores];
    if (entry.isString()) {
      const string text = entry.asCString();
      // atoi() alone maps the string "true" to 0, silently disabling the
      // parameter, so the textual spellings of true are recognised first.
      if (text == "true" or text == "True" or text == "TRUE")
        value = true;
      else
        value = (atoi(text.c_str()) != 0);
    } else {
      value = (entry.asInt() != 0);
    }
    source = "parameters json file";
  }

  // The command line overrides the JSON file; the value found so far is the fallback.
  if (opts.HasOption(short_name, long_name_hyphens)) {
    value = opts.GetFirstBoolean(short_name, long_name_hyphens, value);
    source = "command line option";
  }

  cout << setw(35) << long_name_hyphens << " = " << setw(10) << (value ? "true" : "false") << " (boolean, " << source << ")" << endl;
  return value;
}
Пример #4
0
/**
 * Builds the tag trimmer parameters from the command line options.
 *
 * Options read: --min-tag-fam-size (default 3), --suppress-mol-tags (default false),
 * --prefix-mol-tag, --suffix-mol-tag, --tag-trim-method (default "sloppy-trim")
 * and --tag-filter-method (default "need-all").
 *
 * A minimum family size of 0 switches molecular tagging off. A negative family
 * size, an unknown trim method or an unknown filter method prints an error to
 * stderr and terminates the program with EXIT_FAILURE.
 *
 * @param opts  parsed command line options
 * @return the populated parameter structure
 */
TagTrimmerParameters MolecularTagTrimmer::ReadOpts(OptArgs& opts)
{
  TagTrimmerParameters params;

  params.min_family_size = opts.GetFirstInt('-', "min-tag-fam-size", 3);
  params.suppress_mol_tags = opts.GetFirstBoolean('-', "suppress-mol-tags", false);
  // Currently disabled options:
  //params.cl_a_handle   = opts.GetFirstString  ('-', "tag-handle", "");
  //params.handle_cutoff = opts.GetFirstInt     ('-', "handle-cutoff", 2);

  // Tag structures given on the command line
  params.master_tags.prefix_mol_tag = opts.GetFirstString('-', "prefix-mol-tag", "");
  params.master_tags.suffix_mol_tag = opts.GetFirstString('-', "suffix-mol-tag", "");

  ValidateTagString(params.master_tags.prefix_mol_tag);
  ValidateTagString(params.master_tags.suffix_mol_tag);

  // A family size of zero doubles as the switch that disables molecular tagging;
  // anything below that is rejected.
  if (params.min_family_size < 1) {
    if (params.min_family_size != 0) {
      cerr << "MolecularTagTrimmer Error: min-tag-fam-size must be at least 1. " << endl;
      exit(EXIT_FAILURE);
    }
    params.suppress_mol_tags = true;
  }

  params.command_line_tags = params.master_tags.HasTags();

  // Trimming method selection
  const string trim_choice = opts.GetFirstString('-', "tag-trim-method", "sloppy-trim");
  if (trim_choice != "sloppy-trim" and trim_choice != "strict-trim") {
    cerr << "MolecularTagTrimmer Error: Unknown tag trimming option " << trim_choice << endl;
    exit(EXIT_FAILURE);
  }
  if (trim_choice == "strict-trim")
    params.tag_trim_method = kStrictTrim;
  else
    params.tag_trim_method = kSloppyTrim;

  // Read filtering method selection
  const string filter_choice = opts.GetFirstString('-', "tag-filter-method", "need-all");
  if (filter_choice == "need-suffix") {
    params.tag_filter_method = kneed_only_suffix_tag;
  } else if (filter_choice == "need-prefix") {
    params.tag_filter_method = kneed_only_prefix_tag;
  } else if (filter_choice == "need-all") {
    params.tag_filter_method = kneed_all_tags;
  } else {
    cerr << "MolecularTagTrimmer Error: Unknown tag filtering option " << filter_choice << endl;
    exit(EXIT_FAILURE);
  }

  return params;
}
Пример #5
0
/**
 * Reads the FreeBayes parameters, which are primarily used in candidate generation.
 *
 * --target-file and --trim-ampliseq-primers come from the command line only.
 * All other values go through the RetrieveParameter* helpers, which take the
 * options object and the JSON parameters.
 *
 * Terminates the program with exit(1) when primer trimming is requested
 * without a target file, or when input-positions-only / input-allele-only
 * processing is requested while variantPriorsFile is empty.
 *
 * @param opts       parsed command line options
 * @param fb_params  JSON object holding the FreeBayes parameters
 */
void ExtendParameters::SetFreeBayesParameters(OptArgs &opts, Json::Value& fb_params) {
  targets = opts.GetFirstString('t', "target-file", "");
  trim_ampliseq_primers = opts.GetFirstBoolean('-', "trim-ampliseq-primers", false);

  // Primer trimming cannot work without a target file.
  if (trim_ampliseq_primers and targets.empty()) {
    cerr << "ERROR: --trim-ampliseq-primers enabled but no --target-file provided" << endl;
    exit(1);
  }

  // Allowed allele classes
  allowIndels = RetrieveParameterBool(opts, fb_params, '-', "allow-indels", true);
  allowSNPs = RetrieveParameterBool(opts, fb_params, '-', "allow-snps", true);
  allowMNPs = RetrieveParameterBool(opts, fb_params, '-', "allow-mnps", true);
  allowComplex = RetrieveParameterBool(opts, fb_params, '-', "allow-complex", false);

  // Deprecated: left-align-indels is still retrieved, but its value is discarded.
  RetrieveParameterBool(opts, fb_params, '-', "left-align-indels", false);

  // Read level and allele level limits
  useBestNAlleles = RetrieveParameterInt(opts, fb_params, 'm', "use-best-n-alleles", 2);
  onlyUseInputAlleles = RetrieveParameterBool(opts, fb_params, '-', "use-input-allele-only", false);
  min_mapping_qv = RetrieveParameterInt(opts, fb_params, 'M', "min-mapping-qv", 4);
  read_snp_limit = RetrieveParameterInt(opts, fb_params, 'U', "read-snp-limit", 10);
  readMaxMismatchFraction = RetrieveParameterDouble(opts, fb_params, 'z', "read-max-mismatch-fraction", 1.0);
  maxComplexGap = RetrieveParameterInt(opts, fb_params, '!', "max-complex-gap", 1);

  // Candidate generation thresholds: the defaults are the corresponding
  // SNP / HP-indel filter settings stored in my_controls.
  minAltFraction = RetrieveParameterDouble(opts, fb_params, '-', "gen-min-alt-allele-freq", my_controls.filter_snps.min_allele_freq);
  minCoverage = RetrieveParameterInt(opts, fb_params, '-', "gen-min-coverage", my_controls.filter_snps.min_cov);
  minIndelAltFraction = RetrieveParameterDouble(opts, fb_params, '-', "gen-min-indel-alt-allele-freq", my_controls.filter_hp_indel.min_allele_freq);

  // Flags taken over from the program flow settings
  if (program_flow.DEBUG > 0) {
    debug = true;
  }
  if (program_flow.inputPositionsOnly)
    processInputPositionsOnly = true;

  // Both input-restricted modes depend on an input VCF file.
  const bool input_vcf_required = (processInputPositionsOnly || onlyUseInputAlleles);
  if (input_vcf_required && variantPriorsFile.empty()) {
    cerr << "ERROR: Parameter error - Process-input-positions-only: " << processInputPositionsOnly << " use-input-allele-only: " << onlyUseInputAlleles << " :  Specified without Input VCF File " << endl;
    exit(1);
  }
}
Пример #6
0
/**
 * Command line front end for merging ionstats HDF5 files.
 *
 * Options: -o/--output names the merged file, -b/--merge-proton-blocks
 * (default true) is forwarded to IonstatsAlignmentReduceH5(). All leftover
 * arguments are taken as input file names.
 *
 * @param argc  argument count; argv[0] and argv[1] are skipped when parsing options
 * @param argv  argument vector
 * @return 1 after printing the help text when no input or no output file is
 *         given, otherwise the return value of IonstatsAlignmentReduceH5()
 */
int IonstatsReduceH5(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc-1, argv+1);

  string output_h5_filename = opts.GetFirstString  ('o', "output", "");
  // The default is a real boolean. The previous string literal "true" only
  // produced the intended default by accident (a non-null pointer converts to
  // true, and "false" would have done so as well).
  bool merge_proton_blocks  = opts.GetFirstBoolean ('b', "merge-proton-blocks", true);
  vector<string>  input_h5_filename;
  opts.GetLeftoverArguments(input_h5_filename);

  if(input_h5_filename.empty() or output_h5_filename.empty()) {
    IonstatsReduceH5Help();
    return 1;
  }

  // argv[1] is safe to print here: at least one leftover argument exists.
  if(merge_proton_blocks)
    cout << "NOTE:" << argv[0] << " " << argv[1] << ": --merge-proton-blocks=true so any Proton block-specific read group suffixes will be merged" << endl;

  return IonstatsAlignmentReduceH5(output_h5_filename, input_h5_filename, merge_proton_blocks);
}
Пример #7
0
/**
 * Loads the phred (quality value) table and, when requested, opens the
 * predictor dump file.
 *
 * Options read from opts:
 *   --phred-table-file  table to load; when empty, a default file name is
 *                       chosen from chip_type (and recalib) and located with
 *                       GetIonConfigFile()
 *   --save-predictors   when true, <output_directory>/Predictors.txt is opened
 *
 * Three on-disk formats are handled. A file whose name passes
 * hasBinaryExtension() is loaded as HDF5 (group /QvTable) if H5Fis_hdf5()
 * accepts it and as a raw binary block otherwise; any other file is parsed as
 * a text table. Every failure is reported through ION_ABORT.
 *
 * @param opts              parsed command line options
 * @param chip_type         chip type used to select the default table
 * @param output_directory  directory that receives Predictors.txt
 * @param recalib           selects the ".Recal.binary" variant of the default table
 */
void PerBaseQual::Init(OptArgs& opts, const string& chip_type, const string &output_directory, bool recalib)
{
  // Init may run on an object that already holds a table. Release it and reset
  // the containers the binary loaders append to; without the reset a second
  // call would keep the old entries in front and corrupt the offsets below.
  if (phred_table_) {
    delete [] phred_table_;
    phred_table_ = 0;
  }
  offsets_.clear();
  phred_cuts_.clear();

  string phred_table_file       = opts.GetFirstString ('-', "phred-table-file", "");
  save_predictors_              = opts.GetFirstBoolean('-', "save-predictors", false);

  // Determine the correct phred table filename to use
  if (phred_table_file.empty()) {
    ChipIdDecoder::SetGlobalChipId(chip_type.c_str());
    ChipIdEnum chip_id = ChipIdDecoder::GetGlobalChipId();
    switch (chip_id) {
    case ChipId314:
      phred_table_file = "phredTable.txt_314.binary";
      break;
    case ChipId316:
      phred_table_file = "phredTable.txt_316.binary";
      break;
    case ChipId316v2: // uses the 318 table
      phred_table_file = "phredTable.txt_318.binary";
      break;
    case ChipId318:
      phred_table_file = "phredTable.txt_318.binary";
      break;
    case ChipId900: // Proton chip
      phred_table_file = "phredTable.txt_900.binary";
      break;
    default:
      phred_table_file = "phredTable.txt_314.binary";
      fprintf(stderr, "PerBaseQual: No default phred table for chip_type=%s, trying %s instead\n",
          chip_type.c_str(), phred_table_file.c_str());
      break;
    }

    if (recalib) {
      // Swap the trailing ".binary" (7 characters) for ".Recal.binary"
      phred_table_file = phred_table_file.substr(0, phred_table_file.length() - 7);
      phred_table_file += ".Recal.binary";
    }

    char* full_filename = GetIonConfigFile(phred_table_file.c_str());
    if (!full_filename) {
      printf("WARNING: cannot find binary phred table file %s, try to use non-binary phred table\n", phred_table_file.c_str());
      phred_table_file = phred_table_file.substr(0, phred_table_file.length() - 7); // get rid of .binary
      full_filename = GetIonConfigFile(phred_table_file.c_str());
      if (!full_filename)
        ION_ABORT("ERROR: Can't find phred table file " + phred_table_file);
    }

    // The returned path is released with free() once it has been copied.
    phred_table_file = full_filename;
    free(full_filename);
  }

  cout << endl << "PerBaseQual::Init... phred_table_file=" << phred_table_file << endl;
  const bool binTable = hasBinaryExtension(phred_table_file);

  // Load the phred table
  if (binTable) {
    cout << endl << "PerBaseQual::Init... load binary phred_table_file=" << phred_table_file << endl;

    if (H5Fis_hdf5(phred_table_file.c_str()) > 0) {
      // ---- HDF5 table: attributes and dataset below group /QvTable ----
      hid_t root = H5Fopen(phred_table_file.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
      if (root < 0)
        ION_ABORT("ERROR: cannot open HDF5 file " + phred_table_file);

      hid_t grpQvTable = H5Gopen(root, "/QvTable", H5P_DEFAULT);
      if (grpQvTable < 0) {
        H5Fclose(root);
        ION_ABORT("ERROR: fail to open HDF5 group QvTable");
      }

      if (H5Aexists(grpQvTable, "NumPredictors") <= 0) {
        H5Gclose(grpQvTable);
        H5Fclose(root);
        ION_ABORT("ERROR: HDF5 attribute NumPredictors does not exist");
      }

      hid_t attrNumPreds = H5Aopen(grpQvTable, "NumPredictors", H5P_DEFAULT);
      if (attrNumPreds < 0) {
        H5Gclose(grpQvTable);
        H5Fclose(root);
        ION_ABORT("ERROR: fail to open HDF5 attribute NumPredictors");
      }

      unsigned int numPredictors = 0;
      herr_t ret = H5Aread(attrNumPreds, H5T_NATIVE_UINT, &numPredictors);
      H5Aclose(attrNumPreds);
      if (ret < 0 || numPredictors != (unsigned int)kNumPredictors) {
        H5Gclose(grpQvTable);
        H5Fclose(root);
        ION_ABORT("ERROR: HDF5 attribute NumPredictors is wrong");
      }

      // One attribute "ThresholdsOfPredictor<i>" per predictor holds its cut values.
      char buf[100];
      for (size_t i = 0; i < (size_t)kNumPredictors; ++i) {
        offsets_.push_back(1);

        snprintf(buf, sizeof(buf), "ThresholdsOfPredictor%d", (int)i);

        if (H5Aexists(grpQvTable, buf) <= 0) {
          H5Gclose(grpQvTable);
          H5Fclose(root);
          ION_ABORT("ERROR: HDF5 attribute ThresholdsOfPredictor does not exist");
        }

        hid_t attrCuts = H5Aopen(grpQvTable, buf, H5P_DEFAULT);
        if (attrCuts < 0) {
          H5Gclose(grpQvTable);
          H5Fclose(root);
          ION_ABORT("ERROR: fail to open HDF5 attribute ThresholdsOfPredictor");
        }

        hsize_t size = H5Aget_storage_size(attrCuts);
        size /= sizeof(float);

        // Read straight into the vector: no temporary heap buffer that could
        // be leaked on the error path. The dummy target covers an empty attribute.
        vector<float> vCuts(size);
        float unused = 0;
        ret = H5Aread(attrCuts, H5T_NATIVE_FLOAT, vCuts.empty() ? &unused : &vCuts[0]);
        H5Aclose(attrCuts);
        if (ret < 0) {
          H5Gclose(grpQvTable);
          H5Fclose(root);
          ION_ABORT("ERROR: fail to read HDF5 attribute ThresholdsOfPredictor");
        }

        phred_cuts_.push_back(vCuts);
      }

      hid_t dsQvs = H5Dopen(grpQvTable, "Qvs", H5P_DEFAULT);
      if (dsQvs < 0) {
        H5Gclose(grpQvTable);
        H5Fclose(root);
        ION_ABORT("ERROR: fail to open HDF5 dataset Qvs");
      }

      // NOTE(review): the buffer is sized from the dataset's storage size, which
      // presumably equals the element count only for uncompressed one-byte
      // data - confirm against how the tables are written.
      hsize_t tbSize = H5Dget_storage_size(dsQvs);

      phred_table_ = new unsigned char[tbSize];

      ret = H5Dread(dsQvs, H5T_NATIVE_UCHAR, H5S_ALL, H5S_ALL, H5P_DEFAULT, phred_table_);
      H5Dclose(dsQvs);
      H5Gclose(grpQvTable);
      H5Fclose(root);
      if (ret < 0) {
        delete [] phred_table_;
        phred_table_ = 0;

        ION_ABORT("ERROR: fail to read HDF5 dataset Qvs");
      }
    }
    else {
      // ---- Raw binary table ----
      // Layout as consumed here: a 4-byte predictor count, one 4-byte cut count
      // per predictor (only the first byte of each field is used), the cut
      // values as 4-byte floats, then the table itself with one byte per entry.
      printf("WARNING: binary phred table file %s is not a HDF5 file, try binary file mode.\n", phred_table_file.c_str());
      ifstream source;
      source.open(phred_table_file.c_str(), ios::in|ios::binary|ios::ate);
      if (!source.is_open())
        ION_ABORT("ERROR: Cannot open file: " + phred_table_file);

      // Opened at the end, so tellg() is the file size (or -1 on failure).
      const long totalSize = source.tellg();
      const long fixedHeaderSize = 4 * (1 + static_cast<long>(kNumPredictors));
      if (totalSize < fixedHeaderSize)
        ION_ABORT("ERROR: Truncated phred table file " + phred_table_file);

      vector<char> tbBlock(totalSize);
      source.seekg (0, ios::beg);
      source.read (&tbBlock[0], totalSize);
      const bool readFailed = source.fail();
      source.close();
      if (readFailed)
        ION_ABORT("ERROR: Cannot read file: " + phred_table_file);

      long headerSize = 0;
      const char* ptr = &tbBlock[0];

      // Counts are read as unsigned bytes so that values above 127 do not
      // turn negative (and then huge once widened to size_t).
      const int numPredictors = static_cast<unsigned char>(ptr[0]); //kNumPredictors
      if (numPredictors != kNumPredictors)
        ION_ABORT("ERROR: Wrong number of predictors load from " + phred_table_file);

      ptr += 4;
      headerSize += 4;

      vector<size_t> vNumCuts(kNumPredictors, 0);
      for (int i = 0; i < kNumPredictors; ++i) {
        vNumCuts[i] = static_cast<unsigned char>(ptr[0]);
        ptr += 4;
        headerSize += 4;

        offsets_.push_back(1);
      }

      long tbSize = 1;
      for (int i = 0; i < kNumPredictors; ++i) {
        // Never read cut values beyond the end of the loaded block.
        const long cutBytes = 4 * static_cast<long>(vNumCuts[i]);
        if (cutBytes > totalSize - headerSize)
          ION_ABORT("ERROR: Wrong QV table size");

        vector<float> vCuts;
        vCuts.reserve(vNumCuts[i]);
        tbSize *= vNumCuts[i];
        for (size_t j = 0; j < vNumCuts[i]; ++j) {
          float tmp;
          memcpy(&tmp, ptr, 4);
          vCuts.push_back(tmp);
          ptr += 4;
          headerSize += 4;
        }

        phred_cuts_.push_back(vCuts);
      }

      // The remaining bytes must be exactly the product of all cut counts.
      if (tbSize != (totalSize - headerSize))
        ION_ABORT("ERROR: Wrong QV table size");

      phred_table_ = new unsigned char[tbSize];
      memcpy(phred_table_, ptr, tbSize * sizeof(unsigned char));
    }

    // Turn the per-predictor cut counts into offsets: offsets_[i] becomes the
    // product of the cut counts of all predictors after i; the last one stays 1.
    for (size_t i = kNumPredictors - 2; i > 0; --i) {
      offsets_[i] *= phred_cuts_[i + 1].size();
      offsets_[i - 1] = offsets_[i];
    }
    offsets_[0] *= phred_cuts_[1].size();
  }
  else {
    // ---- Text table: per line kNumPredictors thresholds, one skipped value, the quality ----
    ifstream source;
    source.open(phred_table_file.c_str());
    if (!source.is_open())
      ION_ABORT("ERROR: Cannot open file: " + phred_table_file);

    // Start from empty containers so that a repeated Init does not append
    // a second copy of the table.
    for (int k = 0; k < kNumPredictors; ++k)
      phred_thresholds_[k].clear();
    phred_quality_.clear();

    string line;
    while (getline(source, line)) {
      // An empty line ends the table; lines starting with '#' are comments.
      if (line.empty())
        break;

      if (line[0] == '#')
        continue;

      stringstream strs(line);
      float temp = 0;
      for (int k = 0; k < kNumPredictors; ++k) {
        strs >> temp;
        phred_thresholds_[k].push_back(temp);
      }
      strs >> temp; //skip n-th entry
      strs >> temp;
      phred_quality_.push_back(temp);
    }

    source.close();

    // max_element() on an empty range returns end(), which must not be dereferenced.
    if (phred_quality_.empty())
      ION_ABORT("ERROR: No entries found in phred table file " + phred_table_file);

    for (int k = 0; k < kNumPredictors; ++k)
      phred_thresholds_max_[k] = *max_element(phred_thresholds_[k].begin(), phred_thresholds_[k].end());
  }

  // Prepare for predictor dump here
  if (save_predictors_) {
    string predictors_filename = output_directory + "/Predictors.txt";
    cout << endl << "Saving PerBaseQual predictors to file " << predictors_filename << endl << endl;
    predictor_dump_.open(predictors_filename.c_str());
    if (!predictor_dump_.is_open())
      ION_ABORT("ERROR: Cannot open file: " + predictors_filename);
  }
}
Пример #8
0
/**
 * bamrealignment entry point.
 *
 * Reads every alignment from the input BAM, tries to realign each mapped read
 * with a Realigner, and writes all reads (realigned or unchanged) to the
 * output BAM. A summary of the counters is printed at the end.
 *
 * Options (short / long, default):
 *   -i / --input      input BAM file (required)
 *   -o / --output     output BAM file (required)
 *   -s / --scores     "4,-6,-5,-2": match, mismatch, gap open, gap extend score
 *   -c / --clipping   2, passed to Realigner::SetClipping
 *   -a / --anchors    true, passed to Realigner::CreateRefFromQueryBases
 *   -b / --bandwidth  10, passed to Realigner::SetAlignmentBandwidth
 *   -v / --verbose    false; prints per read and prompts on stdin
 *   -d / --debug      false
 *   -f / --format     1 selects uncompressed output, any other value compressed
 *   -t / --threads    8, passed to BamWriter::SetNumThreads
 *   -l / --log        optional file receiving one tab separated record per logged read
 *
 * Returns the value of PrintHelp() when input or output is missing, 1 when a
 * file cannot be opened or the user quits from the verbose prompt, 0 otherwise.
 */
int main (int argc, const char *argv[])
{
  printf ("------------- bamrealignment --------------\n");

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  vector<int> score_vals(4);

  string input_bam  = opts.GetFirstString  ('i', "input", "");
  string output_bam = opts.GetFirstString  ('o', "output", "");
  opts.GetOption(score_vals, "4,-6,-5,-2", 's', "scores");
  int    clipping   = opts.GetFirstInt     ('c', "clipping", 2);
  bool   anchors    = opts.GetFirstBoolean ('a', "anchors", true);
  int    bandwidth  = opts.GetFirstInt     ('b', "bandwidth", 10);
  bool   verbose    = opts.GetFirstBoolean ('v', "verbose", false);
  bool   debug      = opts.GetFirstBoolean ('d', "debug", false);
  int    format     = opts.GetFirstInt     ('f', "format", 1);
  int  num_threads  = opts.GetFirstInt     ('t', "threads", 8);
  string log_fname  = opts.GetFirstString  ('l', "log", "");
  

  // Both file names are mandatory.
  if (input_bam.empty() or output_bam.empty())
    return PrintHelp();

  opts.CheckNoLeftovers();

  // The log file is optional; it is only opened when a name was given.
  std::ofstream logf;
  if (log_fname.size ())
  {
    logf.open (log_fname.c_str ());
    if (!logf.is_open ())
    {
      fprintf (stderr, "bamrealignment: Failed to open log file %s\n", log_fname.c_str());
      return 1;
    }
  }

  BamReader reader;
  if (!reader.Open(input_bam)) {
    fprintf(stderr, "bamrealignment: Failed to open input file %s\n", input_bam.c_str());
    return 1;
  }

  // Header and reference data of the input are reused for the output file.
  SamHeader header = reader.GetHeader();
  RefVector refs   = reader.GetReferenceData();

  BamWriter writer;
  writer.SetNumThreads(num_threads);
  if (format == 1)
    writer.SetCompressionMode(BamWriter::Uncompressed);
  else
    writer.SetCompressionMode(BamWriter::Compressed);

  if (!writer.Open(output_bam, header, refs)) {
    fprintf(stderr, "bamrealignment: Failed to open output file %s\n", output_bam.c_str());
    return 1;
  }


  // The meat starts here ------------------------------------

  if (verbose)
    cout << "Verbose option is activated, each alignment will print to screen." << endl
         << "  After a read hit RETURN to continue to the next one," << endl
         << "  or press q RETURN to quit the program," << endl
         << "  or press s Return to silence verbose," << endl
         << "  or press c RETURN to continue printing without further prompt." << endl << endl;

  // Counters reported in the summary at the end of the run.
  unsigned int readcounter = 0;
  unsigned int mapped_readcounter = 0;
  unsigned int realigned_readcounter = 0;
  unsigned int modified_alignment_readcounter = 0;
  unsigned int pos_update_readcounter = 0;
  unsigned int failed_clip_realigned_readcount = 0;
  
  unsigned int already_perfect_readcount = 0;
  
  unsigned int bad_md_tag_readcount = 0;
  unsigned int error_recreate_ref_readcount = 0;
  unsigned int error_clip_anchor_readcount = 0;
  unsigned int error_sw_readcount = 0;
  unsigned int error_unclip_readcount = 0;
  
  // Filled in by computeSWalignment() for each realigned read.
  unsigned int start_position_shift;
  int orig_position;
  int new_position;

  // 'input' holds the last answer typed at the verbose prompt; it starts as "x"
  // so that the first prompt is shown.
  string  md_tag, new_md_tag, input = "x";
  vector<CigarOp>    new_cigar_data;
  vector<MDelement>  new_md_data;
  bool position_shift = false;
  time_t start_time = time(NULL);

  Realigner aligner;
  aligner.verbose_ = verbose;
  aligner.debug_   = debug;
  // NOTE(review): a rejected score vector only prints this message; processing continues.
  if (!aligner.SetScores(score_vals))
    cout << "bamrealignment: Four scores need to be provided: match, mismatch, gap open, gap extend score!" << endl;

  aligner.SetAlignmentBandwidth(bandwidth);

  BamAlignment alignment;
  while(reader.GetNextAlignment(alignment)){
    readcounter ++;
    position_shift = false;
    
    // Progress report every 100000 reads.
    if ( (readcounter % 100000) == 0 )
       cout << "Processed " << readcounter << " reads. Elapsed time: " << (time(NULL) - start_time) << endl;

    // Unmapped reads skip this whole block and are written unchanged below.
    if (alignment.IsMapped()) {
      
      
      
      orig_position = alignment.Position;
      mapped_readcounter++;
      aligner.SetClipping(clipping, !alignment.IsReverseStrand());
      if (aligner.verbose_) {
    	cout << endl;
        if (alignment.IsReverseStrand())
          cout << "The read is from the reverse strand." << endl;
        else
          cout << "The read is from the forward strand." << endl;
      }

      // Case 1: no MD tag - the read is counted and logged as MISSMD, nothing is changed.
      if (!alignment.GetTag("MD", md_tag)) {
    	if (aligner.verbose_)
          cout << "Warning: Skipping read " << alignment.Name << ". It is mapped but missing MD tag." << endl;
	if (logf.is_open ())
	  logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MISSMD" << '\n';
	bad_md_tag_readcount++;
      // Case 2: CreateRefFromQueryBases() succeeded - realign the read.
      } else if (aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors)) {
	// A clip anchor error reported together with success is counted and
	// flagged as NOCLIP in the log, but the read is still realigned.
	bool clipfail = false;
	if (Realigner::CR_ERR_CLIP_ANCHOR == aligner.GetCreateRefError ())
	{
	  clipfail = true;
	  failed_clip_realigned_readcount ++;
	}

        if (!aligner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) {
          if (aligner.verbose_)
            cout << "Error in the alignment! Not updating read information." << endl;
	  if (logf.is_open ())
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "SWERR" << '\n';
	  error_sw_readcount++;
          writer.SaveAlignment(alignment);  // Write alignment unchanged
          // 'continue' also skips the debug block and the common SaveAlignment() below.
          continue;
        }

        if (!aligner.addClippedBasesToTags(new_cigar_data, new_md_data, alignment.QueryBases.size())) {
          if (aligner.verbose_)
            cout << "Error when adding clipped anchors back to tags! Not updating read information." << endl;
	  if (logf.is_open ())
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNCLIPERR" << '\n';
          writer.SaveAlignment(alignment);  // Write alignment unchanged
	  error_unclip_readcount ++;
          continue;
        }
        new_md_tag = aligner.GetMDstring(new_md_data);
        realigned_readcounter++;

        // adjust start position of read
        if (!aligner.LeftAnchorClipped() and start_position_shift != 0) {
          new_position = aligner.updateReadPosition(alignment.CigarData, (int)start_position_shift, alignment.Position);
          if (new_position != alignment.Position) {
            pos_update_readcounter++;
            position_shift = true;
            alignment.Position = new_position;
          }
        }
        
        // A read counts as modified when its position moved, the number of
        // cigar operations changed, or the MD string changed.
        if (position_shift || alignment.CigarData.size () != new_cigar_data.size () || md_tag != new_md_tag)
	{
	  if (logf.is_open ())
	  {
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MOD";
	    if (position_shift)
	      logf << "-SHIFT";
	    if (clipfail)
	      logf << " NOCLIP";
	    logf << '\n';
	  }
	  modified_alignment_readcounter++;
	}
	else
	{
            if (logf.is_open ())
	    {
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNMOD";
              if (clipfail)
	        logf << " NOCLIP";
	      logf << '\n';
	    }
	}

        if (aligner.verbose_){
          cout << alignment.Name << endl;
          cout << "------------------------------------------" << endl;
          // Wait for input to continue or quit program
          // An empty previous answer (plain RETURN) is replaced by "x" without
          // prompting again; after 'c' no further prompts are shown.
          if (input.size() == 0)
            input = 'x';
          else if (input[0] != 'c' and input[0] != 'C')
            getline(cin, input);
          if (input.size()>0){
            if (input[0] == 'q' or input[0] == 'Q')
              return 1;
            else if (input[0] == 's' or input[0] == 'S')
              aligner.verbose_ = false;
          }
        }

        // Finally update alignment information
        alignment.CigarData = new_cigar_data;
        alignment.EditTag("MD", "Z" , new_md_tag);

      } // end of CreateRef else if
      // Case 3: CreateRefFromQueryBases() returned false - classify by its error code.
      else {
	switch (aligner.GetCreateRefError ())
	{
	  case Realigner::CR_ERR_RECREATE_REF:
            if (logf.is_open ())
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "RECRERR" << '\n';
	    error_recreate_ref_readcount++;
	    break;
	  case Realigner::CR_ERR_CLIP_ANCHOR:
            if (logf.is_open ())
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "CLIPERR" << '\n';
	    error_clip_anchor_readcount++;
	    break;
	  default:
		  //  On a good run this writes way too many reads to the log file - don't want to create a too large txt file
          //  if (logf.is_open ())
	      //logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "PERFECT" << '\n';
	    // Every other code is counted as an already perfect alignment.
	    already_perfect_readcount++;
	    break;
	}
	
	// Same verbose prompt handling as in the realignment branch above.
	if (aligner.verbose_) {
	  cout << alignment.Name << endl;
	  cout << "------------------------------------------" << endl;
	  // Wait for input to continue or quit program
	  if (input.size() == 0)
	    input = 'x';
	  else if (input[0] != 'c' and input[0] != 'C')
	    getline(cin, input);
	  if (input.size()>0){
	    if (input[0] == 'q' or input[0] == 'Q')
	      return 1;
	    else if (input[0] == 's' or input[0] == 'S')
	      aligner.verbose_ = false;
	  }
	}
      }

      // --- Debug output for Rajesh ---
      // In debug mode the reference generation is rerun with verbose output
      // forced on, then the verbose setting is restored and the flag is cleared.
      if (debug && aligner.invalid_cigar_in_input) {
        aligner.verbose_ = true;
        cout << "Invalid cigar string / md tag pair in read " << alignment.Name << endl;
        // Rerun reference generation to display error
        aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors);

        aligner.verbose_ = verbose;
        aligner.invalid_cigar_in_input = false;
      }
      // --- --- ---


    } // end of if isMapped

    // Common write for unmapped reads and for every mapped read that did not 'continue' above.
    writer.SaveAlignment(alignment);

  } // end while loop over reads

  // NOTE(review): the debug block above resets this flag per read, so in debug
  // mode this warning is presumably never printed - confirm that is intended.
  if (aligner.invalid_cigar_in_input)
    cerr << "WARNING bamrealignment: There were invalid cigar string / md tag pairs in the input bam file." << endl;

  // ----------------------------------------------------------------
  // program end -- output summary information
  cout   << "                            File: " << input_bam    << endl
         << "                     Total reads: " << readcounter  << endl
         << "                    Mapped reads: " << mapped_readcounter << endl;
  if (bad_md_tag_readcount)
    cout << "            Skipped: bad MD tags: " << bad_md_tag_readcount << endl;
  if (error_recreate_ref_readcount)
    cout << " Skipped: unable to recreate ref: " << error_recreate_ref_readcount << endl;
  if (error_clip_anchor_readcount)
    cout << "  Skipped: error clipping anchor: " << error_clip_anchor_readcount << endl;
  // "Total reads realigned" is the mapped count minus all skipped categories.
  cout  <<  "       Skipped:  already perfect: " << already_perfect_readcount << endl
        <<  "           Total reads realigned: " << mapped_readcounter - already_perfect_readcount - bad_md_tag_readcount - error_recreate_ref_readcount - error_clip_anchor_readcount << endl;
  if (failed_clip_realigned_readcount)
    cout << "                      (including  " << failed_clip_realigned_readcount << " that failed to clip)" << endl;
  if (error_sw_readcount)
    cout << " Failed to complete SW alignment: " << error_sw_readcount << endl;
  if (error_unclip_readcount)
    cout << "         Failed to unclip anchor: " << error_unclip_readcount << endl;
  cout   << "           Succesfully realigned: " << realigned_readcounter << endl
         << "             Modified alignments: " << modified_alignment_readcounter << endl
         << "                Shifted position: " << pos_update_readcounter << endl;
  
  cout << "Processing time: " << (time(NULL)-start_time) << " seconds." << endl;
  cout << "INFO: The output BAM file may be unsorted." << endl;
  cout << "------------------------------------------" << endl;
  return 0;
}
Пример #9
0
int PrepareHotspots(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  string input_bed_filename       = opts.GetFirstString ('b', "input-bed", "");
  string input_vcf_filename       = opts.GetFirstString ('v', "input-vcf", "");
  string input_real_vcf_filename  = opts.GetFirstString ('p', "input-real-vcf", "");
  string output_hot_vcf		  = opts.GetFirstString ('q', "output-fake-hot-vcf", "");
  string output_bed_filename      = opts.GetFirstString ('d', "output-bed", "");
  string output_vcf_filename      = opts.GetFirstString ('o', "output-vcf", "");
  string reference_filename       = opts.GetFirstString ('r', "reference", "");
  string unmerged_bed 		  = opts.GetFirstString ('u', "unmerged-bed", "");
  bool left_alignment             = opts.GetFirstBoolean('a', "left-alignment", false);
  bool filter_bypass              = opts.GetFirstBoolean('f', "filter-bypass", false);
  bool allow_block_substitutions  = opts.GetFirstBoolean('s', "allow-block-substitutions", true);
  bool strict_check               = opts.GetFirstBoolean('S', "strict-check", true);
  opts.CheckNoLeftovers();

  if((input_bed_filename.empty() == (input_vcf_filename.empty() and input_real_vcf_filename.empty())) or
      (output_bed_filename.empty() and output_vcf_filename.empty()) or reference_filename.empty()) {
    PrepareHotspotsHelp();
    return 1;
  }
  if ((not input_real_vcf_filename.empty()) and (output_vcf_filename.empty() or not input_vcf_filename.empty())) {
    PrepareHotspotsHelp();
    return 1;
  }


  // Populate chromosome list from reference.fai
  // Use mmap to fetch the entire reference

  int ref_handle = open(reference_filename.c_str(),O_RDONLY);

  struct stat ref_stat;
  fstat(ref_handle, &ref_stat);
  char *ref = (char *)mmap(0, ref_stat.st_size, PROT_READ, MAP_SHARED, ref_handle, 0);


  FILE *fai = fopen((reference_filename+".fai").c_str(), "r");
  if (!fai) {
    fprintf(stderr, "ERROR: Cannot open %s.fai\n", reference_filename.c_str());
    return 1;
  }

  vector<Reference>  ref_index;
  map<string,int> ref_map;
  char line[1024], chrom_name[1024];
  while (fgets(line, 1024, fai) != NULL) {
    Reference ref_entry;
    long chr_start;
    if (5 != sscanf(line, "%1020s\t%ld\t%ld\t%d\t%d", chrom_name, &ref_entry.size, &chr_start,
                    &ref_entry.bases_per_line, &ref_entry.bytes_per_line))
      continue;
    ref_entry.chr = chrom_name;
    ref_entry.start = ref + chr_start;
    ref_index.push_back(ref_entry);
    ref_map[ref_entry.chr] = (int) ref_index.size() - 1;
  }
  fclose(fai);
  junction junc;
  if (!unmerged_bed.empty()) {
    FILE *fp = fopen(unmerged_bed.c_str(), "r");
    if (!fp) {
	fprintf(stderr, "ERROR: Cannot open %s\n", unmerged_bed.c_str());
	return 1;
    }
    char line2[65536];

    junc.init(ref_index.size());
    bool line_overflow = false;
    while (fgets(line2, 65536, fp) != NULL) {
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
	continue;
      }
      if (line_overflow) {
        line_overflow = false;
        continue;
      }
     if (strstr(line2, "track")) continue;
      char chr[100];
      int b, e;
      sscanf(line2, "%s %d %d", chr,  &b, &e);
      junc.add(ref_map[chr], b, e);
    }
    fclose(fp);
  }

  // Load input BED or load input VCF, group by chromosome

  deque<LineStatus> line_status;
  vector<deque<Allele> > alleles(ref_index.size());

  if (!input_bed_filename.empty()) {

    FILE *input = fopen(input_bed_filename.c_str(),"r");
    if (!input) {
      fprintf(stderr,"ERROR: Cannot open %s\n", input_bed_filename.c_str());
      return 1;
    }

    char line2[65536];

    int line_number = 0;
    bool line_overflow = false;
    while (fgets(line2, 65536, input) != NULL) {
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
        continue;
      }
      line_number++;
      if (line_overflow) {
        line_overflow = false;
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: line length exceeds 64K";
        continue;
      }

      if (strncmp(line2, "browser", 7) == 0)
        continue;

      if (strncmp(line2, "track", 5) == 0) {
        if (string::npos != string(line2).find("allowBlockSubstitutions=true"))
          allow_block_substitutions = true;
        continue;
      }

      // OID= table has special meaning
      if (string::npos != string(line2).find("OID=")) {
	line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Bed line contains OID=";
        continue;
      }

      char *current_chr = strtok(line2, "\t\r\n");
      char *current_start = strtok(NULL, "\t\r\n");
      char *current_end = strtok(NULL, "\t\r\n");
      char *current_id = strtok(NULL, "\t\r\n");
      char *penultimate = strtok(NULL, "\t\r\n");
      char *ultimate = strtok(NULL, "\t\r\n");
      for (char *next = strtok(NULL, "\t\r\n"); next; next = strtok(NULL, "\t\r\n")) {
        penultimate = ultimate;
        ultimate = next;
      }

      if (!current_chr or !current_start or !current_end or !current_id or !penultimate or !ultimate) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: expected at least 6 fields";
        continue;
      }

      Allele allele;

      string string_chr(current_chr);
      if (ref_map.find(string_chr) != ref_map.end())
        allele.chr_idx = ref_map[string_chr];
      else if (ref_map.find("chr"+string_chr) != ref_map.end())
        allele.chr_idx = ref_map["chr"+string_chr];
      else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end())
        allele.chr_idx = ref_map["chrM"];
      else {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Unknown chromosome name: ";
        line_status.back().filter_message = string_chr;
        continue;
      }

      allele.pos = strtol(current_start,NULL,10);
      allele.id = current_id;

      char *current_ref = NULL;
      char *current_alt = NULL;
      for (char *next = strtok(penultimate, ";"); next; next = strtok(NULL, ";")) {
        if (strncmp(next,"REF=",4) == 0)
          current_ref = next;
        else if (strncmp(next,"OBS=",4) == 0)
          current_alt = next;
        else if (strncmp(next,"ANCHOR=",7) == 0) {
          // ignore ANCHOR
        } else {
          char *value = next;
          while (*value and *value != '=')
            ++value;
          if (*value == '=')
            *value++ = 0;
          allele.custom_tags[next] = value;
        }
      }
      if (!current_ref or !current_alt) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: REF and OBS fields required in penultimate column";
        continue;
      }
      for (char *pos = current_ref+4; *pos; ++pos)
        allele.ref += toupper(*pos);
      for (char *pos = current_alt+4; *pos; ++pos)
        allele.alt += toupper(*pos);
      // here is the place to check the length of the hotspot cover the amplicon junction. ZZ
      /*
      if (junc.contain(allele.chr_idx, allele.pos, (unsigned int) allele.ref.size())) {
	line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "hotspot BED line contain the complete overlapping region of two amplicon, the variant cannot be detected by tvc";
        continue;
      }
      if (not junc.contained_in_ampl(allele.chr_idx, allele.pos, (unsigned int) allele.ref.size())) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "hotspot BED line is not contained in any amplicon, the variant cannot be detected by tvc";
        continue;
      }
      */

      allele.filtered = false;
      line_status.push_back(LineStatus(line_number));
      allele.line_status = &line_status.back();
      allele.opos = allele.pos;
      allele.oref = allele.ref;
      allele.oalt = allele.alt;
      alleles[allele.chr_idx].push_back(allele);
      //line_status.back().allele = &alleles[allele.chr_idx].back();
      line_status.back().chr_idx = allele.chr_idx;
      line_status.back().opos = allele.opos;
      line_status.back().id = allele.id;
    }

    fclose(input);
  }



  if (!input_vcf_filename.empty() or !input_real_vcf_filename.empty()) {

    bool real_vcf = false;
    FILE *input;
    FILE *out_real = NULL;
    FILE *out_hot = NULL;
    int fake_ = 0;
    int hn = 1;
    if (!input_real_vcf_filename.empty()) {
	real_vcf = true;
	input = fopen(input_real_vcf_filename.c_str(),"r");
	if (!input) {
	    fprintf(stderr,"ERROR: Cannot open %s\n", input_real_vcf_filename.c_str());
            return 1;
	}
	out_real = fopen(output_vcf_filename.c_str(), "w");
	if (!out_real) {
            fprintf(stderr,"ERROR: Cannot open %s\n", output_vcf_filename.c_str());
            return 1;
        }
	if (!output_hot_vcf.empty()) {
	    out_hot = fopen(output_hot_vcf.c_str(), "w");
	    if (!out_hot) {
		fprintf(stderr,"ERROR: Cannot open %s\n", output_hot_vcf.c_str());
		return 1;
	    } 
   	} else out_hot = stdout;
	fprintf(out_hot, "##fileformat=VCFv4.1\n##allowBlockSubstitutions=true\n#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO\n");
    } else {
        input = fopen(input_vcf_filename.c_str(),"r");
        if (!input) {
            fprintf(stderr,"ERROR: Cannot open %s\n", input_vcf_filename.c_str());
            return 1;
    	}
    }

    char line2[65536];
    char line3[65536];
    int line_number = 0;
    bool line_overflow = false;
    list<one_vcfline> vcflist;

    char last_chr[1024] = "";
    while (fgets(line2, 65536, input) != NULL) {
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
        continue;
      }
      line_number++;
      if (line_overflow) {
        line_overflow = false;
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot VCF line: line length exceeds 64K";
        continue;
      }

      if (strncmp(line2, "##allowBlockSubstitutions=true", 30) == 0) {
        allow_block_substitutions = true;
        continue;
      }
      if (line2[0] == '#') {
	if (out_real) { fprintf(out_real, "%s", line2);}
        continue;
      }

      if (real_vcf) strcpy(line3, line2);
      char *current_chr = strtok(line2, "\t\r\n");
      char *current_start = strtok(NULL, "\t\r\n");
      char *current_id = strtok(NULL, "\t\r\n");
      char *current_ref = strtok(NULL, "\t\r\n");
      char *current_alt = strtok(NULL, "\t\r\n");
      strtok(NULL, "\t\r\n"); // Ignore QUAL
      strtok(NULL, "\t\r\n"); // Ignore FILTER
      char *current_info = strtok(NULL, "\t\r\n");
      strtok(NULL, "\t\r\n");
      char *gt = strtok(NULL, "\t\r\n");

      if (!current_chr or !current_start or !current_id or !current_ref or !current_alt) {
        line_status.push_back(LineStatus(line_number));
        if (real_vcf) line_status.back().filter_message_prefix = "Malformed real VCF line: expected at least 5 fields";
	else line_status.back().filter_message_prefix = "Malformed hotspot VCF line: expected at least 5 fields";
        continue;
      }


      string string_chr(current_chr);
      int chr_idx = 0;
      if (ref_map.find(string_chr) != ref_map.end())
        chr_idx = ref_map[string_chr];
      else if (ref_map.find("chr"+string_chr) != ref_map.end())
        chr_idx = ref_map["chr"+string_chr];
      else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end())
        chr_idx = ref_map["chrM"];
      else {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Unknown chromosome name: ";
        line_status.back().filter_message = string_chr;
        continue;
      }

      for (char *pos = current_ref; *pos; ++pos)
        *pos = toupper(*pos);
      for (char *pos = current_alt; *pos; ++pos)
        *pos = toupper(*pos);


      // Process custom tags
      vector<string>  bstrand;
      vector<string>  hp_max_length;
      string raw_oid;
      string raw_omapalt;
      string raw_oalt;
      string raw_oref;
      string raw_opos;

      if (current_info) {
        string raw_bstrand;
        string raw_hp_max_length;
        for (char *next = strtok(current_info, ";"); next; next = strtok(NULL, ";")) {

          char *value = next;
          while (*value and *value != '=')
            ++value;
          if (*value == '=')
            *value++ = 0;

          if (strcmp(next, "TYPE") == 0)
            continue;
          if (strcmp(next, "HRUN") == 0)
            continue;
          if (strcmp(next, "HBASE") == 0)
            continue;
          if (strcmp(next, "FR") == 0)
            continue;
          if (strcmp(next, "OPOS") == 0) {
	    raw_opos = value;
            continue;
	  }
          if (strcmp(next, "OREF") == 0) {
	    raw_oref = value;
            continue;
	  }
          if (strcmp(next, "OALT") == 0) {
	    raw_oalt = value;
            continue;
	  }
          if (strcmp(next, "OID") == 0) {
            raw_oid = value;
            continue;
          }
          if (strcmp(next, "OMAPALT") == 0) {
            raw_omapalt = value;
            continue;
          }
          if (strcmp(next, "BSTRAND") == 0) {
            raw_bstrand = value;
            continue;
          }
          if (strcmp(next, "hp_max_length") == 0) {
            raw_hp_max_length = value;
            continue;
          }
        }

        if (not raw_bstrand.empty())
          split(raw_bstrand, ',', bstrand);
        if (not raw_hp_max_length.empty())
          split(raw_hp_max_length, ',', hp_max_length);

      }

      if (real_vcf) {
	//fprintf(stderr, "%s\n", gt);
        if (gt == NULL) continue;
	// get gt
	int g1 = atoi(gt), g2;
	gt = strchr(gt, '/');
	if (gt) g2 = atoi(gt+1);
	else {fprintf(stderr, "GT not formatted right\n"); exit(1);}
	//if (g1 == 0 and g2 == 0) continue;
	unsigned int cur_pos = atoi(current_start);
	one_vcfline newline(current_ref, current_alt, cur_pos, g1, g2, line3);
	bool new_chr = false;
	if (strcmp(current_chr, last_chr) != 0) {
	    new_chr = true;
	}
	while (not vcflist.empty()) {
	    if ((not new_chr) and vcflist.front().pos+strlen(vcflist.front().ref) > cur_pos) break;
	    if (vcflist.front().produce_hot_vcf(last_chr, out_real, hn, out_hot)) fake_++;
	    vcflist.pop_front();
	}
	if (new_chr) strcpy(last_chr, current_chr);
	for (list<one_vcfline>::iterator it = vcflist.begin(); it != vcflist.end(); it++) {
	    it->check_subset(newline);
	}
	if (not newline.alts.empty()) vcflist.push_back(newline);
	continue;
      } 
      unsigned int allele_idx = 0;
      for (char *sub_alt = strtok(current_alt,","); sub_alt; sub_alt = strtok(NULL,",")) {

        Allele allele;
        allele.chr_idx = chr_idx;
        allele.ref = current_ref;
        allele.alt = sub_alt;
        allele.pos = strtol(current_start,NULL,10)-1;
        allele.id = current_id;
        if (allele.id == ".")
          allele.id = "hotspot";

        allele.filtered = false;
        line_status.push_back(LineStatus(line_number));
        allele.line_status = &line_status.back();
        allele.opos = allele.pos;
        allele.oref = allele.ref;
        allele.oalt = allele.alt;

        if (allele_idx < bstrand.size()) {
          if (bstrand[allele_idx] != ".")
            allele.custom_tags["BSTRAND"] = bstrand[allele_idx];
        }

        if (allele_idx < hp_max_length.size()) {
          if (hp_max_length[allele_idx] != ".")
            allele.custom_tags["hp_max_length"] = hp_max_length[allele_idx];
        }

        alleles[allele.chr_idx].push_back(allele);
        //line_status.back().allele = &alleles[allele.chr_idx].back();
        line_status.back().chr_idx = allele.chr_idx;
        line_status.back().opos = allele.opos;
        line_status.back().id = allele.id;
        allele_idx++;
      }
    }

    fclose(input);
    if (real_vcf) {
        while (not vcflist.empty()) {
            if (vcflist.front().produce_hot_vcf(last_chr, out_real, hn, out_hot)) fake_++;
            vcflist.pop_front();
        }
	fclose(out_real);
	fclose(out_hot);
	if (fake_ > 0) 
            return 0;
	else return 1;
    }
  }


  // Process by chromosome:
  //   - Verify reference allele
  //   - Left align
  //   - Sort
  //   - Filter for block substitutions, write

  FILE *output_vcf = NULL;
  if (!output_vcf_filename.empty()) {
    output_vcf = fopen(output_vcf_filename.c_str(), "w");
    if (!output_vcf) {
      fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_vcf_filename.c_str());
      return 1;
    }
    fprintf(output_vcf, "##fileformat=VCFv4.1\n");
    if (allow_block_substitutions)
      fprintf(output_vcf, "##allowBlockSubstitutions=true\n");
    fprintf(output_vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n");
  }
  FILE *output_bed = NULL;
  if (!output_bed_filename.empty()) {
    output_bed = fopen(output_bed_filename.c_str(), "w");
    if (!output_bed) {
      fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_bed_filename.c_str());
      if (output_vcf)
        fclose(output_vcf);
      return 1;
    }
    if (allow_block_substitutions)
      fprintf(output_bed, "track name=\"hotspot\" type=bedDetail allowBlockSubstitutions=true\n");
    else
      fprintf(output_bed, "track name=\"hotspot\" type=bedDetail\n");
  }


  for (int chr_idx = 0; chr_idx < (int)ref_index.size(); ++chr_idx) {

    for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ++A) {

      // check bed file
      if (junc.contain(A->chr_idx, A->pos, (unsigned int) A->ref.size())) {
	A->filtered = true;
        A->line_status->filter_message_prefix = "hotspot BED line contain the complete overlapping region of two amplicon, the variant cannot be detected by tvc";
        continue;
      }
      if (not junc.contained_in_ampl(A->chr_idx, A->pos, (unsigned int) A->ref.size())) {
	A->filtered = true;
        A->line_status->filter_message_prefix = "hotspot BED line is not contained in any amplicon, the variant cannot be detected by tvc";
        continue;
      }


      // Invalid characters

      bool valid = true;
      for (const char *c = A->ref.c_str(); *c ; ++c)
        if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T')
          valid = false;
      for (const char *c = A->alt.c_str(); *c ; ++c)
        if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T')
          valid = false;
      if (not valid) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "REF and/or ALT contain characters other than ACGT: ";
        A->line_status->filter_message = "REF = " + A->ref + " ALT = " + A->alt;
        continue;
      }

      // Filter REF == ALT

      if (A->ref == A->alt) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "REF and ALT alleles equal";
        continue;
      }

      // Confirm reference allele.

      string ref_expected;
      for (int idx = 0; idx < (int) A->ref.size(); ++idx)
        ref_expected += ref_index[chr_idx].base(A->pos + idx);
      if (A->ref != ref_expected) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "Provided REF allele does not match reference: ";
        A->line_status->filter_message = "Expected " + ref_expected + ", found " + A->ref;
        continue;
      }

      // Trim

      int ref_start = 0;
      int ref_end = A->ref.size();
      int alt_end = A->alt.size();

      // Option 1: trim all trailing bases;

      //while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) {
      //  --ref_end;
      //  --alt_end;
      //}

      // Option 2: trim all leading basees;

      //while (ref_start < ref_end and ref_start < alt_end and A->ref[ref_start] == A->alt[ref_start])
      //  ++ref_start;

      // Option 3: trim anchor base if vcf

      if (!input_vcf_filename.empty()) {
        if (ref_end and alt_end and (ref_end == 1 or alt_end == 1) and A->ref[0] == A->alt[0])
          ref_start = 1;
      }

      A->pos += ref_start;
      A->ref = A->ref.substr(ref_start, ref_end-ref_start);
      A->alt = A->alt.substr(ref_start, alt_end-ref_start);
      ref_end -= ref_start;
      alt_end -= ref_start;
      // Left align
      if (left_alignment && A->custom_tags.find("BSTRAND") == A->custom_tags.end()) { // black list variant not to be left aligned.
	string trailing;
	int can_do = 0, need_do = 0;
	int ref_end_orig= ref_end, alt_end_orig = alt_end;
	while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) {
	    ref_end--; alt_end--;
	} 
	if (ref_end == 0 || alt_end == 0) {
	    can_do = need_do = 1; // indel type, ZZ
	} else {
	    int tmp_start = ref_start;
	    int ref_end_0 = ref_end, alt_end_0 = alt_end; // end after remove trailing match ZZ
	    while (tmp_start < ref_end and tmp_start < alt_end and A->ref[tmp_start] == A->alt[tmp_start])
     		++tmp_start;
	    if (tmp_start == ref_end || tmp_start == alt_end) {
		can_do = 1; need_do = 0; // indel but indel is not at the left. ZZ
	    } else {
		ref_end--; alt_end--;
		while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) {
            	    ref_end--; alt_end--;
        	}
		if (ref_end == 0 || alt_end == 0) {
		   // complex with 1 bp MM at right end
		    can_do = need_do = 1;
		    if (ref_end + alt_end == 0) need_do = 0; // SNP
		} else {
		  int tmp_start0 = tmp_start; // start after removing leading matches
		  tmp_start++;
		  while (tmp_start < ref_end_orig and tmp_start < alt_end_orig and A->ref[tmp_start] == A->alt[tmp_start])
			tmp_start++;
		  if (tmp_start >= ref_end_0 || tmp_start >= alt_end_0 || ref_end <= tmp_start0 || alt_end <= tmp_start0) {
			// 1MM plus indel in middle, by definition cannot move the indel left enough to change A->pos
		    	can_do = 1; need_do = 0;
		  } // else real complex 
		}
	    }
	}
	if (!can_do or !need_do) {
	    // do nothing
	    // if !can_do need add some more DP
	    ref_end = ref_end_orig;
	    alt_end = alt_end_orig;
	} else {
	 // left align the indel part, here either ref_end = 0 or alt_end = 0
	  int opos = A->pos;
          while (A->pos > 0) {
            char nuc = ref_index[chr_idx].base(A->pos-1);
            if (ref_end > 0 and A->ref[ref_end-1] != nuc)
              break;
            if (alt_end > 0 and A->alt[alt_end-1] != nuc)
              break;
            A->ref = string(1,nuc) + A->ref;
            A->alt = string(1,nuc) + A->alt;
            A->pos--;
          }
	  if (ref_end != ref_end_orig) {
	    // trailing part is aligned, the whole ref and alt need to be kept. ZZ
	    ref_end = A->ref.size();
	    alt_end = A->alt.size();
	  } 
	  if (junc.contain(chr_idx, A->pos, ref_end) or not junc.contained_in_ampl(chr_idx, A->pos, ref_end)) {
		// after left align the hotspot contain an overlap region, revert to the original ZZ
		if (opos != A->pos) {
		    A->ref.erase(0, opos-A->pos);
		    A->alt.erase(0, opos-A->pos);
		    A->pos = opos;
		    ref_end = ref_end_orig;
		    alt_end = alt_end_orig;
		}
	  }
       }
      }
      A->ref.resize(ref_end);
      A->alt.resize(alt_end);


      // Filter block substitutions: take 1

      if (ref_end > 0 and alt_end > 0 and ref_end != alt_end and not allow_block_substitutions and not filter_bypass) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "Block substitutions not supported";
        continue;
      }

    }



    if (output_bed) {
      // Sort - without anchor base
      stable_sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles);

      // Write
      for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) {
        if (I->filtered)
          continue;

        fprintf(output_bed, "%s\t%ld\t%ld\t%s\tREF=%s;OBS=%s",
            ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
            I->ref.c_str(), I->alt.c_str());

        for (map<string,string>::iterator C = I->custom_tags.begin(); C != I->custom_tags.end(); ++C)
          fprintf(output_bed, ";%s=%s", C->first.c_str(), C->second.c_str());

        fprintf(output_bed, "\tNONE\n");

        /*
        if (I->pos)
          fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=%c\tNONE\n",
              ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
              I->ref.c_str(), I->alt.c_str(), ref_index[chr_idx].base(I->pos-1));
        else
          fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=\tNONE\n",
              ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
              I->ref.c_str(), I->alt.c_str());
        */
      }
    }


    if (output_vcf) {

      // Add anchor base to indels
      for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) {
        if (I->filtered)
          continue;
        if (not I->ref.empty() and not I->alt.empty())
          continue;
        if (I->pos == 0) {
          I->filtered = true;
          I->line_status->filter_message_prefix = "INDELs at chromosome start not supported";
          continue;
        }
        I->pos--;
        I->ref = string(1,ref_index[chr_idx].base(I->pos)) + I->ref;
        I->alt = string(1,ref_index[chr_idx].base(I->pos)) + I->alt;
      }

      // Sort - with anchor base
      stable_sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles);


      // Merge alleles, remove block substitutions, write
      for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ) {

        string max_ref;
        deque<Allele>::iterator B = A;
        for (; B != alleles[chr_idx].end() and B->pos == A->pos; ++B)
          if (!B->filtered and max_ref.size() < B->ref.size())
            max_ref = B->ref;

        bool filtered = true;
        map<string,set<string> > unique_alts_and_ids;
        for (deque<Allele>::iterator I = A; I != B; ++I) {
          if (I->filtered)
            continue;

          string new_alt = I->alt + max_ref.substr(I->ref.size());

          if (new_alt.size() > 1 and max_ref.size() > 1 and new_alt.size() != max_ref.size() and not allow_block_substitutions and not filter_bypass) {
            I->filtered = true;
            I->line_status->filter_message_prefix = "Block substitutions not supported (post-merge)";
            continue;
          }

          I->ref = max_ref;
          I->alt = new_alt;

          // Filter alleles with duplicate ALT + ID pairs
          map<string,set<string> >::iterator alt_iter = unique_alts_and_ids.find(new_alt);
          if (alt_iter != unique_alts_and_ids.end()) {
            if (alt_iter->second.count(I->id) > 0) {
              I->filtered = true;
              I->line_status->filter_message_prefix = "Duplicate allele and ID";
              continue;
            }
          }
          unique_alts_and_ids[new_alt].insert(I->id);

          filtered = false;
        }

        if (not filtered) {



          fprintf(output_vcf, "%s\t%ld\t.\t%s\t",
              ref_index[chr_idx].chr.c_str(), A->pos+1, max_ref.c_str());

          bool comma = false;

          map<string,map<string,string> > unique_alts_and_tags;
          set<string> unique_tags;
	  set<string> unique_alt_alleles;

          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            unique_alts_and_tags[I->alt].insert(I->custom_tags.begin(), I->custom_tags.end());
            for (map<string,string>::iterator S = I->custom_tags.begin(); S != I->custom_tags.end(); ++S)
              unique_tags.insert(S->first);
            if (unique_alt_alleles.count(I->alt) > 0)
              continue;
            unique_alt_alleles.insert(I->alt);
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->alt.c_str());
          }
	  /*
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            map<string,map<string,string> >::iterator Q = unique_alts_and_tags.find(I->alt);
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            if (Q == unique_alts_and_tags.end()) {fprintf(output_vcf, "."); continue;}
            fprintf(output_vcf, "%s", Q->first.c_str());
          }
          */

          fprintf(output_vcf, "\t.\t.\tOID=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->id.c_str());
          }

          fprintf(output_vcf, ";OPOS=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%ld", I->opos+1);
          }

          fprintf(output_vcf, ";OREF=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->oref.c_str());
          }

          fprintf(output_vcf, ";OALT=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->oalt.c_str());
          }

          fprintf(output_vcf, ";OMAPALT=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->alt.c_str());
          }

          for (set<string>::iterator S = unique_tags.begin(); S != unique_tags.end(); ++S) {
            fprintf(output_vcf, ";%s=", S->c_str());
            comma=false;
            for (deque<Allele>::iterator I = A; I != B; ++I) {
              if (I->filtered)
                continue;
              map<string,map<string,string> >::iterator Q = unique_alts_and_tags.find(I->alt);
              if (comma)
                fprintf(output_vcf, ",");
              comma = true;
              if (Q == unique_alts_and_tags.end()) {fprintf(output_vcf, "."); continue;}
              map<string,string>::iterator W = Q->second.find(*S);
              if (W == Q->second.end())
                fprintf(output_vcf, ".");
              else
                fprintf(output_vcf, "%s", W->second.c_str());
            }
          }
//            fprintf(output_vcf, ";%s=%s", S->first.c_str(), S->second.c_str());

          fprintf(output_vcf, "\n");
        }

        A = B;
      }
    }
  }



  if (output_bed) {
    fflush(output_bed);
    fclose(output_bed);
  }
  if (output_vcf) {
    fflush(output_vcf);
    fclose(output_vcf);
  }


  int lines_ignored = 0;
  for (deque<LineStatus>::iterator L = line_status.begin(); L != line_status.end(); ++L) {
    if (L->filter_message_prefix) {
      if (L->chr_idx >= 0)
        printf("Line %d ignored: [%s:%ld %s] %s%s\n", L->line_number, ref_index[L->chr_idx].chr.c_str(), L->opos+1, L->id.c_str(),
            L->filter_message_prefix, L->filter_message.c_str());
      else
        printf("Line %d ignored: %s%s\n", L->line_number, L->filter_message_prefix, L->filter_message.c_str());
      lines_ignored++;
    }
  }
  printf("Ignored %d out of %d lines\n", lines_ignored, (int)line_status.size());


  munmap(ref, ref_stat.st_size);
  close(ref_handle);
  if (lines_ignored > 0 and strict_check) return 1;

  return 0;
}
Пример #10
0
int main (int argc, const char *argv[])
{
  time_t program_start_time;
  time(&program_start_time);
  Json::Value calibration_json(Json::objectValue);
  DumpStartingStateOfProgram (argc,argv,program_start_time, calibration_json["Calibration"]);

  //
  // Step 1. Process command line options
  //

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);

  // enable floating point exceptions during program execution
  if (opts.GetFirstBoolean('-', "float-exceptions", true)) {
    cout << "Calibration: Floating point exceptions enabled." << endl;
    feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
  } //*/

  CalibrationContext calib_context;
  if (not calib_context.InitializeFromOpts(opts)){
    PrintHelp_CalModules();
  }

  HistogramCalibration master_histogram(opts, calib_context);
  calib_context.hist_calibration_master = &master_histogram;

  LinearCalibrationModel master_linear_model(opts, calib_context);
  calib_context.linear_model_master = &master_linear_model;

  opts.CheckNoLeftovers();

  //
  // Step 2. Execute threaded calibration
  //
  int calibration_thread_time = 0;

  if (calib_context.successive_fit) {

    // first train linear model
    if (master_linear_model.DoTraining()) {
      int l_thread_time = 0;
      for (int i_iteration=0; i_iteration<calib_context.num_train_iterations; i_iteration++) {
        cout << " -Training Iteration " << i_iteration+1;
        l_thread_time = ExecuteThreadedCalibrationTraining(calib_context);

        // Activate master linear model after every round of training
        master_linear_model.CreateCalibrationModel(false); // make linear model
        master_linear_model.SetModelGainsAndOffsets(); // expand for use in basecalling

        calibration_thread_time += l_thread_time;
        calib_context.bam_reader.Rewind(); // reset all files for another pass
        cout << " Duration = " << l_thread_time << endl;
      }
    }

    // Then apply it during polish model training
    if (master_histogram.DoTraining()) {
      calib_context.local_fit_linear_model = false;
      calib_context.local_fit_polish_model = true;
      calibration_thread_time += ExecuteThreadedCalibrationTraining(calib_context);
    }
  }
  else {
    // Single pass in which both models are fit jointly
    calibration_thread_time=ExecuteThreadedCalibrationTraining(calib_context);
  }


  //
  // Step 3. Create models, write output, and close modules
  //

  // Linear Model
  if (master_linear_model.CreateCalibrationModel())
    master_linear_model.ExportModelToJson(calibration_json["LinearModel"], "");

  // HP histogram calibration
  if (master_histogram.CreateCalibrationModel())
    master_histogram.ExportModelToJson(calibration_json["HPHistogram"]);


  // Transfer stuff from calibration context and close bam reader
  calib_context.Close(calibration_json["Calibration"]);

  time_t program_end_time;
  time(&program_end_time);

  calibration_json["Calibration"]["end_time"] = get_time_iso_string(program_end_time);
  calibration_json["Calibration"]["total_duration"] = (Json::Int)difftime(program_end_time,program_start_time);
  calibration_json["Calibration"]["calibration_duration"] = (Json::Int)calibration_thread_time;

  SaveJson(calibration_json, calib_context.filename_json);
  return EXIT_SUCCESS;
}
Пример #11
0
// Convert a hotspot list given as BED or VCF into a cleaned-up BED and/or VCF.
//
// Command line options (parsed through OptArgs):
//   -b/--input-bed, -v/--input-vcf       exactly one of the two must be given
//   -d/--output-bed, -o/--output-vcf     at least one of the two must be given
//   -r/--reference                       reference FASTA; "<reference>.fai" must exist
//   -a/--left-alignment                  left-align alleles against the reference
//   -f/--filter-bypass                   do not filter block substitutions
//   -s/--allow-block-substitutions       keep block substitutions (also switched on by
//                                        a marker inside the input file)
//
// Processing, per chromosome: validate the alleles (ACGT only, REF != ALT, REF
// matches the reference), optionally trim the VCF anchor base and left-align,
// sort, filter block substitutions and write the requested outputs. A summary of
// all ignored input lines is printed to stdout at the end.
//
// Returns 1 if the options are incomplete or a file cannot be opened/mapped,
// 0 otherwise.
int PrepareHotspots(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  string input_bed_filename       = opts.GetFirstString ('b', "input-bed", "");
  string input_vcf_filename       = opts.GetFirstString ('v', "input-vcf", "");
  string output_bed_filename      = opts.GetFirstString ('d', "output-bed", "");
  string output_vcf_filename      = opts.GetFirstString ('o', "output-vcf", "");
  string reference_filename       = opts.GetFirstString ('r', "reference", "");
  bool left_alignment             = opts.GetFirstBoolean('a', "left-alignment", false);
  bool filter_bypass              = opts.GetFirstBoolean('f', "filter-bypass", false);
  bool allow_block_substitutions  = opts.GetFirstBoolean('s', "allow-block-substitutions", false);
  opts.CheckNoLeftovers();

  // Exactly one input, at least one output, and a reference are required
  if((input_bed_filename.empty() == input_vcf_filename.empty()) or
      (output_bed_filename.empty() and output_vcf_filename.empty()) or reference_filename.empty()) {
    PrepareHotspotsHelp();
    return 1;
  }


  // Populate chromosome list from reference.fai
  // Use mmap to fetch the entire reference

  int ref_handle = open(reference_filename.c_str(),O_RDONLY);
  if (ref_handle < 0) {
    fprintf(stderr, "ERROR: Cannot open %s\n", reference_filename.c_str());
    return 1;
  }

  struct stat ref_stat;
  if (fstat(ref_handle, &ref_stat) != 0) {
    fprintf(stderr, "ERROR: Cannot determine size of %s\n", reference_filename.c_str());
    close(ref_handle);
    return 1;
  }

  // mmap reports failure (including a zero-length file) with MAP_FAILED, not NULL
  char *ref = (char *)mmap(0, ref_stat.st_size, PROT_READ, MAP_SHARED, ref_handle, 0);
  if ((void *)ref == MAP_FAILED) {
    fprintf(stderr, "ERROR: Cannot map %s into memory\n", reference_filename.c_str());
    close(ref_handle);
    return 1;
  }


  FILE *fai = fopen((reference_filename+".fai").c_str(), "r");
  if (!fai) {
    fprintf(stderr, "ERROR: Cannot open %s.fai\n", reference_filename.c_str());
    munmap(ref, ref_stat.st_size);
    close(ref_handle);
    return 1;
  }

  // One Reference entry per parsable .fai line; ref_map gives name -> index in ref_index
  vector<Reference>  ref_index;
  map<string,int> ref_map;
  char line[1024], chrom_name[1024];
  while (fgets(line, 1024, fai) != NULL) {
    Reference ref_entry;
    long chr_start;
    // Lines that do not carry all five .fai columns are silently skipped
    if (5 != sscanf(line, "%s\t%ld\t%ld\t%d\t%d", chrom_name, &ref_entry.size, &chr_start,
                    &ref_entry.bases_per_line, &ref_entry.bytes_per_line))
      continue;
    ref_entry.chr = chrom_name;
    ref_entry.start = ref + chr_start;
    ref_index.push_back(ref_entry);
    ref_map[ref_entry.chr] = (int) ref_index.size() - 1;
  }
  fclose(fai);


  // Load input BED or load input VCF, group by chromosome
  //
  // line_status is a deque, so the addresses of its elements stay valid while
  // more entries are appended with push_back; alleles keep such addresses.

  deque<LineStatus> line_status;
  vector<deque<Allele> > alleles(ref_index.size());

  if (!input_bed_filename.empty()) {

    FILE *input = fopen(input_bed_filename.c_str(),"r");
    if (!input) {
      fprintf(stderr,"ERROR: Cannot open %s\n", input_bed_filename.c_str());
      munmap(ref, ref_stat.st_size);
      close(ref_handle);
      return 1;
    }

    char line2[65536];

    int line_number = 0;
    bool line_overflow = false;
    while (fgets(line2, 65536, input) != NULL) {
      // A completely filled buffer without trailing newline is a fragment of an
      // over-long line. Fragments are skipped; the last fragment of that line
      // falls through and is reported once as malformed.
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
        continue;
      }
      line_number++;
      if (line_overflow) {
        line_overflow = false;
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: line length exceeds 64K";
        continue;
      }

      if (strncmp(line2, "browser", 7) == 0)
        continue;

      // The track line may switch on block substitutions
      if (strncmp(line2, "track", 5) == 0) {
        if (string::npos != string(line2).find("allowBlockSubstitutions=true"))
          allow_block_substitutions = true;
        continue;
      }

      // First four fields are positional; penultimate/ultimate end up pointing
      // at the last two tab-separated fields of the line, however many there are.
      char *current_chr = strtok(line2, "\t\r\n");
      char *current_start = strtok(NULL, "\t\r\n");
      char *current_end = strtok(NULL, "\t\r\n");
      char *current_id = strtok(NULL, "\t\r\n");
      char *penultimate = strtok(NULL, "\t\r\n");
      char *ultimate = strtok(NULL, "\t\r\n");
      for (char *next = strtok(NULL, "\t\r\n"); next; next = strtok(NULL, "\t\r\n")) {
        penultimate = ultimate;
        ultimate = next;
      }

      if (!current_chr or !current_start or !current_end or !current_id or !penultimate or !ultimate) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: expected at least 6 fields";
        continue;
      }

      Allele allele;

      // Accept the name as given, with a "chr" prefix added, or "MT" for "chrM"
      string string_chr(current_chr);
      if (ref_map.find(string_chr) != ref_map.end())
        allele.chr_idx = ref_map[string_chr];
      else if (ref_map.find("chr"+string_chr) != ref_map.end())
        allele.chr_idx = ref_map["chr"+string_chr];
      else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end())
        allele.chr_idx = ref_map["chrM"];
      else {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Unknown chromosome name: ";
        line_status.back().filter_message = string_chr;
        continue;
      }

      allele.pos = strtol(current_start,NULL,10);
      allele.id = current_id;

      // REF= and OBS= are looked up among the ';'-separated entries of the
      // penultimate column
      char *current_ref = NULL;
      char *current_alt = NULL;
      for (char *next = strtok(penultimate, ";"); next; next = strtok(NULL, ";")) {
        if (strncmp(next,"REF=",4) == 0)
          current_ref = next;
        else if (strncmp(next,"OBS=",4) == 0)
          current_alt = next;
      }
      if (!current_ref or !current_alt) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: REF and OBS fields required in penultimate column";
        continue;
      }
      // toupper requires a value representable as unsigned char, hence the casts
      for (char *pos = current_ref+4; *pos; ++pos)
        allele.ref += (char)toupper((unsigned char)*pos);
      for (char *pos = current_alt+4; *pos; ++pos)
        allele.alt += (char)toupper((unsigned char)*pos);
      allele.filtered = false;
      line_status.push_back(LineStatus(line_number));
      allele.line_status = &line_status.back();
      // Keep the values as provided for reporting in the output and in messages
      allele.opos = allele.pos;
      allele.oref = allele.ref;
      allele.oalt = allele.alt;
      alleles[allele.chr_idx].push_back(allele);
      line_status.back().allele = &alleles[allele.chr_idx].back();
    }

    fclose(input);
  }


  if (!input_vcf_filename.empty()) {

    FILE *input = fopen(input_vcf_filename.c_str(),"r");
    if (!input) {
      fprintf(stderr,"ERROR: Cannot open %s\n", input_vcf_filename.c_str());
      munmap(ref, ref_stat.st_size);
      close(ref_handle);
      return 1;
    }

    char line2[65536];
    int line_number = 0;
    bool line_overflow = false;
    while (fgets(line2, 65536, input) != NULL) {
      // Same over-long line handling as for BED input
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
        continue;
      }
      line_number++;
      if (line_overflow) {
        line_overflow = false;
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot VCF line: line length exceeds 64K";
        continue;
      }

      // This header line switches on block substitutions; all other header
      // lines are skipped
      if (strncmp(line2, "##allowBlockSubstitutions=true", 30) == 0) {
        allow_block_substitutions = true;
        continue;
      }
      if (line2[0] == '#')
        continue;

      char *current_chr = strtok(line2, "\t\r\n");
      char *current_start = strtok(NULL, "\t\r\n");
      char *current_id = strtok(NULL, "\t\r\n");
      char *current_ref = strtok(NULL, "\t\r\n");
      char *current_alt = strtok(NULL, "\t\r\n");

      if (!current_chr or !current_start or !current_id or !current_ref or !current_alt) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot VCF line: expected at least 5 fields";
        continue;
      }


      // Accept the name as given, with a "chr" prefix added, or "MT" for "chrM"
      string string_chr(current_chr);
      int chr_idx = 0;
      if (ref_map.find(string_chr) != ref_map.end())
        chr_idx = ref_map[string_chr];
      else if (ref_map.find("chr"+string_chr) != ref_map.end())
        chr_idx = ref_map["chr"+string_chr];
      else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end())
        chr_idx = ref_map["chrM"];
      else {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Unknown chromosome name: ";
        line_status.back().filter_message = string_chr;
        continue;
      }

      // toupper requires a value representable as unsigned char, hence the casts
      for (char *pos = current_ref; *pos; ++pos)
        *pos = (char)toupper((unsigned char)*pos);
      for (char *pos = current_alt; *pos; ++pos)
        *pos = (char)toupper((unsigned char)*pos);


      // One allele (with its own LineStatus entry) per comma-separated ALT
      for (char *sub_alt = strtok(current_alt,","); sub_alt; sub_alt = strtok(NULL,",")) {

        Allele allele;
        allele.chr_idx = chr_idx;
        allele.ref = current_ref;
        allele.alt = sub_alt;
        // Stored one less than the VCF POS column; written back as pos+1
        allele.pos = strtol(current_start,NULL,10)-1;
        allele.id = current_id;
        if (allele.id == ".")
          allele.id = "hotspot";

        allele.filtered = false;
        line_status.push_back(LineStatus(line_number));
        allele.line_status = &line_status.back();
        allele.opos = allele.pos;
        allele.oref = allele.ref;
        allele.oalt = allele.alt;
        alleles[allele.chr_idx].push_back(allele);
        line_status.back().allele = &alleles[allele.chr_idx].back();
      }
    }

    fclose(input);
  }

  // Process by chromosome:
  //   - Verify reference allele
  //   - Left align
  //   - Sort
  //   - Filter for block substitutions, write

  FILE *output_vcf = NULL;
  if (!output_vcf_filename.empty()) {
    output_vcf = fopen(output_vcf_filename.c_str(), "w");
    if (!output_vcf) {
      fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_vcf_filename.c_str());
      munmap(ref, ref_stat.st_size);
      close(ref_handle);
      return 1;
    }
    fprintf(output_vcf, "##fileformat=VCFv4.1\n");
    if (allow_block_substitutions)
      fprintf(output_vcf, "##allowBlockSubstitutions=true\n");
    fprintf(output_vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n");
  }
  FILE *output_bed = NULL;
  if (!output_bed_filename.empty()) {
    output_bed = fopen(output_bed_filename.c_str(), "w");
    if (!output_bed) {
      fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_bed_filename.c_str());
      if (output_vcf)
        fclose(output_vcf);
      munmap(ref, ref_stat.st_size);
      close(ref_handle);
      return 1;
    }
    if (allow_block_substitutions)
      fprintf(output_bed, "track name=\"hotspot\" type=bedDetail allowBlockSubstitutions=true\n");
    else
      fprintf(output_bed, "track name=\"hotspot\" type=bedDetail\n");
  }


  for (int chr_idx = 0; chr_idx < (int)ref_index.size(); ++chr_idx) {

    for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ++A) {

      // Invalid characters

      bool valid = true;
      for (const char *c = A->ref.c_str(); *c ; ++c)
        if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T')
          valid = false;
      for (const char *c = A->alt.c_str(); *c ; ++c)
        if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T')
          valid = false;
      if (not valid) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "REF and/or ALT contain characters other than ACGT: ";
        A->line_status->filter_message = "REF = " + A->ref + " ALT = " + A->alt;
        continue;
      }

      // Filter REF == ALT

      if (A->ref == A->alt) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "REF and ALT alleles equal";
        continue;
      }

      // Confirm reference allele.

      string ref_expected;
      for (int idx = 0; idx < (int) A->ref.size(); ++idx)
        ref_expected += ref_index[chr_idx].base(A->pos + idx);
      if (A->ref != ref_expected) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "Provided REF allele does not match reference: ";
        A->line_status->filter_message = "Expected " + ref_expected + ", found " + A->ref;
        continue;
      }

      // Trim

      int ref_start = 0;
      int ref_end = A->ref.size();
      int alt_end = A->alt.size();

      // Option 1: trim all trailing bases

      //while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) {
      //  --ref_end;
      //  --alt_end;
      //}

      // Option 2: trim all leading bases

      //while (ref_start < ref_end and ref_start < alt_end and A->ref[ref_start] == A->alt[ref_start])
      //  ++ref_start;


      // Option 3: trim anchor base if vcf
      // (only when REF or ALT is a single base and both start with the same base)

      if (!input_vcf_filename.empty()) {
        if (ref_end and alt_end and (ref_end == 1 or alt_end == 1) and A->ref[0] == A->alt[0])
          ref_start = 1;
      }

      A->pos += ref_start;
      A->ref = A->ref.substr(ref_start, ref_end-ref_start);
      A->alt = A->alt.substr(ref_start, alt_end-ref_start);
      ref_end -= ref_start;
      alt_end -= ref_start;

      // Left align: while the reference base just before the allele equals the
      // last base of REF and of ALT (whichever are non-empty), prepend it and
      // move one position to the left. ref_end/alt_end stay fixed during the
      // loop, so the resize below drops as many trailing bases as were prepended.
      if (left_alignment) {
        while (A->pos > 0) {
          char nuc = ref_index[chr_idx].base(A->pos-1);
          if (ref_end > 0 and A->ref[ref_end-1] != nuc)
            break;
          if (alt_end > 0 and A->alt[alt_end-1] != nuc)
            break;
          A->ref = string(1,nuc) + A->ref;
          A->alt = string(1,nuc) + A->alt;
          A->pos--;
        }
      }
      A->ref.resize(ref_end);
      A->alt.resize(alt_end);


      // Filter block substitutions: take 1

      if (ref_end > 0 and alt_end > 0 and ref_end != alt_end and not allow_block_substitutions and not filter_bypass) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "Block substitutions not supported";
        continue;
      }

    }



    if (output_bed) {
      // Sort - without anchor base
      sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles);

      // Write
      for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) {
        if (I->filtered)
          continue;
        // ANCHOR is the reference base before the allele; empty at chromosome start
        if (I->pos)
          fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=%c\tNONE\n",
              ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
              I->ref.c_str(), I->alt.c_str(), ref_index[chr_idx].base(I->pos-1));
        else
          fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=\tNONE\n",
              ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
              I->ref.c_str(), I->alt.c_str());
      }
    }


    if (output_vcf) {

      // Add anchor base to indels
      for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) {
        if (I->filtered)
          continue;
        if (not I->ref.empty() and not I->alt.empty())
          continue;
        if (I->pos == 0) {
          I->filtered = true;
          I->line_status->filter_message_prefix = "INDELs at chromosome start not supported";
          continue;
        }
        I->pos--;
        I->ref = string(1,ref_index[chr_idx].base(I->pos)) + I->ref;
        I->alt = string(1,ref_index[chr_idx].base(I->pos)) + I->alt;
      }

      // Sort - with anchor base
      sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles);


      // Merge alleles, remove block substitutions, write
      // [A,B) is the run of alleles sharing one position; it becomes one VCF record.
      for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ) {

        // Longest REF among the unfiltered alleles of the run
        string max_ref;
        deque<Allele>::iterator B = A;
        for (; B != alleles[chr_idx].end() and B->pos == A->pos; ++B)
          if (!B->filtered and max_ref.size() < B->ref.size())
            max_ref = B->ref;

        // Stays true if no allele of the run survives; then nothing is written
        bool filtered = true;
        for (deque<Allele>::iterator I = A; I != B; ++I) {
          if (I->filtered)
            continue;

          // Pad ALT with the part of max_ref that extends beyond this allele's REF
          string new_alt = I->alt + max_ref.substr(I->ref.size());

          if (new_alt.size() > 1 and max_ref.size() > 1 and new_alt.size() != max_ref.size() and not allow_block_substitutions and not filter_bypass) {
            I->filtered = true;
            I->line_status->filter_message_prefix = "Block substitutions not supported (post-merge)";
            continue;
          }

          I->ref = max_ref;
          I->alt = new_alt;
          filtered = false;
        }

        if (not filtered) {

          fprintf(output_vcf, "%s\t%ld\t.\t%s\t",
              ref_index[chr_idx].chr.c_str(), A->pos+1, max_ref.c_str());

          // ALT column: each distinct ALT once
          bool comma = false;
          set<string> unique_alt_alleles;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (unique_alt_alleles.count(I->alt) > 0)
              continue;
            unique_alt_alleles.insert(I->alt);
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->alt.c_str());
          }

          // INFO column: one comma-separated entry per unfiltered allele in each tag
          fprintf(output_vcf, "\t.\t.\tOID=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->id.c_str());
          }

          fprintf(output_vcf, ";OPOS=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%ld", I->opos+1);
          }

          fprintf(output_vcf, ";OREF=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->oref.c_str());
          }

          fprintf(output_vcf, ";OALT=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->oalt.c_str());
          }

          fprintf(output_vcf, ";OMAPALT=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->alt.c_str());
          }

          fprintf(output_vcf, "\n");
        }

        A = B;
      }
    }

    // Sorting moved the alleles to different slots of the deque, so the slot
    // addresses stored in LineStatus::allele at load time may now hold another
    // allele. Each allele still carries the pointer to its own LineStatus, so
    // point every LineStatus back at the slot its allele ended up in. Without
    // this the final report could print position/id of the wrong allele.
    for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I)
      I->line_status->allele = &*I;
  }



  if (output_bed) {
    fflush(output_bed);
    fclose(output_bed);
  }
  if (output_vcf) {
    fflush(output_vcf);
    fclose(output_vcf);
  }


  // Report every line status entry that carries a filter message
  int lines_ignored = 0;
  for (deque<LineStatus>::iterator L = line_status.begin(); L != line_status.end(); ++L) {
    if (L->filter_message_prefix) {
      if (L->allele)
        printf("Line %d ignored: [%s:%ld %s] %s%s\n", L->line_number, ref_index[L->allele->chr_idx].chr.c_str(), L->allele->opos+1, L->allele->id.c_str(),
            L->filter_message_prefix, L->filter_message.c_str());
      else
        printf("Line %d ignored: %s%s\n", L->line_number, L->filter_message_prefix, L->filter_message.c_str());
      lines_ignored++;
    }
  }
  printf("Ignored %d out of %d lines\n", lines_ignored, (int)line_status.size());


  munmap(ref, ref_stat.st_size);
  close(ref_handle);

  return 0;
}
Пример #12
0
int main(int argc, const char* argv[])
{
  printf ("tvcvalidator %s-%s (%s) - Prototype tvc validation tool\n\n",
      IonVersion::GetVersion().c_str(), IonVersion::GetRelease().c_str(), IonVersion::GetSvnRev().c_str());

  if (argc == 1) {
    VariantValidatorHelp();
    return 1;
  }

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);

  if (opts.GetFirstBoolean('v', "version", false)) {
    return 0;
  }
  if (opts.GetFirstBoolean('h', "help", false)) {
    VariantValidatorHelp();
    return 0;
  }

  string input_vcf_filename = opts.GetFirstString ('i', "input-vcf", "");
  string truth_filename = opts.GetFirstString ('t', "truth-file", "");
  string truth_dir = opts.GetFirstString ('d', "truth-dir", "/results/plugins/validateVariantCaller/files");

  // TODO: reference optional, only used to verify reference allele in input-vcf and truth files
  //string reference_filename = opts.GetFirstString ('r', "reference", "");

  opts.CheckNoLeftovers();


  //
  // Step 1. Load input VCF file into memory
  //

  if (input_vcf_filename.empty()) {
    VariantValidatorHelp();
    cerr << "ERROR: Input VCF file not specified " << endl;
    return 1;
  }

  VariantCallerResults results_vcf;
  results_vcf.load_vcf(input_vcf_filename);
  printf("Loaded VCF %s with %d variant calls\n", input_vcf_filename.c_str(), (int)results_vcf.variants.size());



  //
  // Step 2. Parse truth files, compare them to the input vcf, and compute match scores
  //

  if (not truth_filename.empty()) {
    ValidatorTruth truth;
    truth.ReadTruthFile(truth_filename);
    truth.CompareToCalls(results_vcf);
    return 0;
  }

  truth_dir += "/*.bed";
  glob_t glob_result;
  glob(truth_dir.c_str(), GLOB_TILDE, NULL, &glob_result);
  for(unsigned int i = 0; i < glob_result.gl_pathc; ++i) {

    ValidatorTruth truth;
    truth.ReadTruthFile(string(glob_result.gl_pathv[i]));
    truth.CompareToCalls(results_vcf);

  }
  globfree(&glob_result);


  return 0;
}
Пример #13
0
// Set up the read filters from command line options.
//
//   opts        option source; filter and trimming settings are read from it
//   _flowOrder  flow order string, stored as-is
//   _numFlows   number of flows, stored as-is
//   _keys       key sequences; exactly two entries are expected (asserted)
//   _maskPtr    mask whose H() and W() determine the size of filterMask
//
// Terminates the program with EXIT_FAILURE if min-read-length,
// cr-filter-max-value or beverly-filter carry invalid values.
BaseCallerFilters::BaseCallerFilters(OptArgs& opts,
    const string& _flowOrder, int _numFlows, const vector<KeySequence>& _keys, Mask *_maskPtr)
{
  // Values handed in directly by the caller
  flowOrder = _flowOrder;
  numFlows  = _numFlows;
  maskPtr   = _maskPtr;

  // Filter switches
  keypassFilter                   = opts.GetFirstBoolean('k', "keypass-filter", true);
  percentPositiveFlowsFilterTFs   = opts.GetFirstBoolean('-', "clonal-filter-tf", false);
  clonalFilterTraining            = opts.GetFirstBoolean('-', "clonal-filter-train", false);
  clonalFilterSolving             = opts.GetFirstBoolean('-', "clonal-filter-solve", false);
  minReadLength                   = opts.GetFirstInt    ('-', "min-read-length", 8);
  cafieResFilterCalling           = opts.GetFirstBoolean('-', "cr-filter", false);
  cafieResFilterTFs               = opts.GetFirstBoolean('-', "cr-filter-tf", false);
  generate_bead_summary_          = opts.GetFirstBoolean('-', "bead-summary", false);

  // TODO: get this to work right. May require "unwound" flow order, so incompatible with current wells.FlowOrder()
  //flt_control.cafieResMaxValueByFlowOrder[std::string ("TACG") ] = 0.06;  // regular flow order
  //flt_control.cafieResMaxValueByFlowOrder[std::string ("TACGTACGTCTGAGCATCGATCGATGTACAGC") ] = 0.08;  // xdb flow order

  cafieResMaxValue                = opts.GetFirstDouble ('-', "cr-filter-max-value", 0.08);

  // SFFTrim options
  trim_adapter                    = opts.GetFirstString ('-', "trim-adapter", "ATCACCGACTGCCCATAGAGAGGCTGAGAC");
  trim_adapter_cutoff             = opts.GetFirstDouble ('-', "trim-adapter-cutoff", 0.0);
  trim_adapter_closest            = opts.GetFirstBoolean('-', "trim-adapter-pick-closest", false);
  trim_qual_wsize                 = opts.GetFirstInt    ('-', "trim-qual-window-size", 30);
  trim_qual_cutoff                = opts.GetFirstDouble ('-', "trim-qual-cutoff", 100.0);
  trim_min_read_len               = opts.GetFirstInt    ('-', "trim-min-read-len", 8);


  // Reject option values that make no sense

  if (minReadLength < 1) {
    fprintf (stderr, "Option Error: min-read-length must specify a positive value (%d invalid).\n", minReadLength);
    exit (EXIT_FAILURE);
  }
  if (cafieResMaxValue <= 0) {
    fprintf (stderr, "Option Error: cr-filter-max-value must specify a positive value (%lf invalid).\n", cafieResMaxValue);
    exit (EXIT_FAILURE);
  }

  // Per-class switches: index 0 follows the main switch alone, index 1
  // additionally needs the corresponding "-tf" switch.
  keys = _keys;
  numClasses = keys.size();
  assert(numClasses == 2);

  classFilterPolyclonal.resize(numClasses);
  classFilterHighResidual.resize(numClasses);

  classFilterPolyclonal[0]   = clonalFilterSolving;
  classFilterPolyclonal[1]   = clonalFilterSolving && percentPositiveFlowsFilterTFs;
  classFilterHighResidual[0] = cafieResFilterCalling;
  classFilterHighResidual[1] = cafieResFilterCalling && cafieResFilterTFs;


  // Beverly filter: either the word "off" or three comma-separated numbers
  const string beverly_spec = opts.GetFirstString('-', "beverly-filter", "0.03,0.03,8");
  if (beverly_spec != "off") {
    const int fields_parsed = sscanf (beverly_spec.c_str(), "%f,%f,%d",
        &filter_beverly_filter_ratio_,
        &filter_beverly_trim_ratio_,
        &filter_beverly_min_read_length_);
    if (fields_parsed != 3) {
      fprintf (stderr, "Option Error: beverly-filter %s\n", beverly_spec.c_str());
      fprintf (stderr, "Usage: --beverly-filter=filter_ratio,trim_ratio,min_length\n");
      exit (EXIT_FAILURE);
    }
    filter_beverly_enabled_ = true;
    printf("Beverly filter: enabled, use --beverly-filter=off to disable\n");
    printf("Beverly filter: filter_ratio = %1.5f\n", filter_beverly_filter_ratio_);
    printf("Beverly filter: trim_ratio = %1.5f\n", filter_beverly_trim_ratio_);
    printf("Beverly filter: min_length = %d\n", filter_beverly_min_read_length_);

  } else {
    filter_beverly_enabled_ = false; // Nothing, really
    printf("Beverly filter: disabled, use --beverly-filter=filter_ratio,trim_ratio,min_length\n");
  }

  // One entry per mask cell (H x W), all starting out as kUninitialized
  filterMask.assign(maskPtr->H()*maskPtr->W(), kUninitialized);
}
Пример #14
0
// Configures the phase estimator from command line options and chip geometry.
//
// Parameters:
//   opts            - parsed command line options; all phasing-related options are read from it.
//   chip_subset     - supplies chip size, region size and region counts.
//   key_norm_method - stored verbatim in key_norm_method_.
//
// Order of precedence after the options have been read:
//   1. --phase-estimation-file : if the file loads successfully, the estimator mode
//                                becomes "override" and the function returns.
//   2. --libcf-ie-dr           : the three given values are passed to SetPhaseParameters()
//                                and the function returns.
//   3. otherwise               : optional starting values (--initcf-ie-dr), the flow window
//                                and the normalization method are validated and stored.
//
// Malformed option values terminate the process with exit(EXIT_FAILURE).
void PhaseEstimator::InitializeFromOptArgs(OptArgs& opts, const ion::ChipSubset & chip_subset, const string & key_norm_method)
{
  // Parse command line options
  phasing_estimator_      = opts.GetFirstString ('-', "phasing-estimator", "spatial-refiner-2");
  vector<double> cf_ie_dr = opts.GetFirstDoubleVector('-', "libcf-ie-dr", "");
  vector<double> init_cf_ie_dr = opts.GetFirstDoubleVector('-', "initcf-ie-dr", "");
  residual_threshold_     = opts.GetFirstDouble ('-', "phasing-residual-filter", 1.0);
  max_phasing_levels_     = opts.GetFirstInt    ('-', "max-phasing-levels", max_phasing_levels_default_);
  num_fullchip_iterations_= opts.GetFirstInt    ('-', "phasing-fullchip-iterations", 3);
  num_region_iterations_  = opts.GetFirstInt    ('-', "phasing-region-iterations", 1);
  num_reads_per_region_   = opts.GetFirstInt    ('-', "phasing-num-reads", 5000);
  min_reads_per_region_   = opts.GetFirstInt    ('-', "phasing-min-reads", 1000);
  phase_file_name_        = opts.GetFirstString ('-', "phase-estimation-file", "");
  normalization_string_   = opts.GetFirstString ('-', "phase-normalization", "adaptive");
  key_norm_method_        = key_norm_method;

  // Static member variables
  norm_during_param_eval_ = opts.GetFirstBoolean('-', "phase-norm-during-eval", false);
  windowSize_             = opts.GetFirstInt    ('-', "window-size", DPTreephaser::kWindowSizeDefault_);
  phasing_start_flow_     = opts.GetFirstInt    ('-', "phasing-start-flow", 70);
  phasing_end_flow_       = opts.GetFirstInt    ('-', "phasing-end-flow", 150);
  inclusion_threshold_    = opts.GetFirstDouble ('-', "phasing-signal-cutoff", 1.4);
  maxfrac_negative_flows_ = opts.GetFirstDouble ('-', "phasing-norm-threshold", 0.2);

  // Initialize chip size - needed for loading phase parameters
  chip_size_x_   = chip_subset.GetChipSizeX();
  chip_size_y_   = chip_subset.GetChipSizeY();
  region_size_x_ = chip_subset.GetRegionSizeX();
  region_size_y_ = chip_subset.GetRegionSizeY();
  num_regions_x_ = chip_subset.GetNumRegionsX();
  num_regions_y_ = chip_subset.GetNumRegionsY();
  num_regions_   = chip_subset.NumRegions();

  // Loading existing phase estimates from a file takes precedence over all other options
  if (not phase_file_name_.empty()) {
    have_phase_estimates_ = LoadPhaseEstimationTrainSubset(phase_file_name_);
    if (have_phase_estimates_) {
      phasing_estimator_ = "override";
      printf("Phase estimator settings:\n");
      printf("  phase file name        : %s\n", phase_file_name_.c_str());
      printf("  phase estimation mode  : %s\n\n", phasing_estimator_.c_str());
      return;
    } else {
      // A failed load is reported but not fatal: fall through to the remaining options.
      cout << "PhaseEstimator Error loading TrainSubset from file " << phase_file_name_ << endl;
    }
  }

  // Set phase parameters if provided by command line
  if (!cf_ie_dr.empty()) {
    if (cf_ie_dr.size() != 3){
      cerr << "BaseCaller Option Error: libcf-ie-dr needs to be a comma separated vector of 3 values." << endl;
      exit (EXIT_FAILURE);
    }
    SetPhaseParameters(cf_ie_dr.at(0), cf_ie_dr.at(1), cf_ie_dr.at(2));
    return; // --libcf-ie-dr overrides other phasing-related options
  }

  // Set starting values for estimation
  if (!init_cf_ie_dr.empty()) {
    if (init_cf_ie_dr.size() != 3){
      cerr << "BaseCaller Option Error: initcf-ie-dr needs to be a comma separated vector of 3 values." << endl;
      exit (EXIT_FAILURE);
    }
    init_cf_ = init_cf_ie_dr.at(0);
    init_ie_ = init_cf_ie_dr.at(1);
    init_dr_ = init_cf_ie_dr.at(2);
  }

  // The flow window must be non-empty and must not start at a negative flow index.
  if (phasing_start_flow_ >= phasing_end_flow_ or phasing_start_flow_ < 0) {
    cerr << "BaseCaller Option Error: phasing-start-flow " << phasing_start_flow_
         << " needs to be non-negative and smaller than phasing-end-flow " << phasing_end_flow_ << endl;
    exit (EXIT_FAILURE);
  }

  // Map the normalization option string onto its numeric method code.
  if (normalization_string_ == "adaptive")
    norm_method_ = 1;
  else if (normalization_string_ == "pid")
    norm_method_ = 2;
  else if (normalization_string_ == "variable")
    norm_method_ = 3;
  else if (normalization_string_ == "off")
    norm_method_ = 4;
  else
    norm_method_ = 0; // "gain" and anything else is default

  printf("Phase estimator settings:\n");
  printf("  phase file name        : %s\n", phase_file_name_.c_str());
  printf("  phase estimation mode  : %s\n", phasing_estimator_.c_str());
  printf("  initial cf,ie,dr values: %f,%f,%f\n", init_cf_,init_ie_,init_dr_);
  printf("  reads per region target: %d-%d\n", min_reads_per_region_, num_reads_per_region_);
  printf("  normalization method   : %s\n", normalization_string_.c_str());
  printf("  variable norm threshold: %f\n", maxfrac_negative_flows_);
  printf("\n");
}