Пример #1
0
// Post-processes the parsed options: locates the experiment log, registers
// the chip ID from it and, on Proton chips only, overrides each setting below
// when its command-line option was not supplied.
void CommandLineOpts::PostProcessArgs(OptArgs &opts)
{
	sys_context.FindExpLogPath();
	SetGlobalChipID(sys_context.explog_path);

	// The remaining overrides apply to Proton chips only.
	if (!ChipIdDecoder::IsProtonChip())
		return;

	// Forced off when the option is absent from the command line.
	const bool clonal_filter_given = opts.HasOption('-', "clonal-filter-bkgmodel");
	if (!clonal_filter_given)
		bkg_control.polyclonal_filter.enable = false;

	const bool xtalk_given = opts.HasOption('-', "xtalk-correction");
	if (!xtalk_given)
		bkg_control.enable_trace_xtalk_correction = false;

	// Forced on when the option is absent from the command line.
	const bool flicker_given = opts.HasOption('-', "col-flicker-correct");
	if (!flicker_given)
		img_control.col_flicker_correct = true;

	const bool aggressive_flicker_given = opts.HasOption('-', "col-flicker-correct-aggressive");
	if (!aggressive_flicker_given)
		img_control.aggressive_cnc = true;

	const bool gain_correct_given = opts.HasOption('-', "img-gain-correct");
	if (!gain_correct_given)
		img_control.gain_correct_images = true;
}
Пример #2
0
int main(int argc, const char *argv[]) {
    OptArgs opts;
    opts.ParseCmdLine(argc, argv);
    bool help;
    string topFile, bottomFile, outFile;
    opts.GetOption(topFile, "", '-', "top");
    opts.GetOption(bottomFile, "", '-', "bottom");
    opts.GetOption(outFile, "", '-', "merged");
    opts.GetOption(help, "false", 'h', "help");
    if (help || argc == 1) {
        usage();
    }
    ION_ASSERT(!topFile.empty() && !bottomFile.empty() && !outFile.empty(),
               "Need top, bottom and merged files. use --help for details.");
    MergeAcq merger;
    Image top;
    Image bottom;
    Image combo;
    cout << "Loading images." << endl;
    ION_ASSERT(top.LoadRaw(topFile.c_str()), "Couldn't load file.");
    ION_ASSERT(bottom.LoadRaw(bottomFile.c_str()), "Couldn't load file.");
    merger.SetFirstImage(&bottom);
    merger.SetSecondImage(&top, bottom.GetRows(), 0); // starting vertically raised but columns the same.
    cout << "Merging." << endl;
    merger.Merge(combo);
    Acq acq;
    cout << "Saving. " << endl;
    acq.SetData(&combo);
    acq.WriteVFC(outFile.c_str(), 0, 0, combo.GetCols(), combo.GetRows());
    cout << "Done." << endl;
    return 0;
}
Пример #3
0
// Resolves a boolean parameter from three sources, in increasing priority:
//   1. default_value ("builtin default"),
//   2. the parameters JSON, looked up under the long name with every '-'
//      replaced by '_' ("parameters json file"),
//   3. the command line, via short_name / long_name_hyphens
//      ("command line option").
// The resolved value and its source are echoed to stdout; the value is returned.
//
// JSON string values "true" / "false" are recognized case-insensitively.
// Any other string is parsed with atoi, so numeric strings such as "1" / "0"
// behave exactly as before. Non-string JSON values are read with asInt().
bool RetrieveParameterBool(OptArgs &opts, Json::Value& json, char short_name, const string& long_name_hyphens, bool default_value)
{
  string long_name_underscores = long_name_hyphens;
  for (unsigned int i = 0; i < long_name_underscores.size(); ++i)
    if (long_name_underscores[i] == '-')
      long_name_underscores[i] = '_';

  bool value = default_value;
  string source = "builtin default";

  if (json.isMember(long_name_underscores)) {
    const Json::Value& json_value = json[long_name_underscores];
    if (json_value.isString()) {
      // Lower-case a copy so that "True", "TRUE", ... are all accepted.
      string text = json_value.asCString();
      for (unsigned int i = 0; i < text.size(); ++i)
        if (text[i] >= 'A' and text[i] <= 'Z')
          text[i] = text[i] - 'A' + 'a';

      if (text == "true")
        value = true;
      else if (text == "false")
        value = false;
      else
        value = atoi(text.c_str());  // numeric strings: non-zero means true
    } else {
      value = json_value.asInt();
    }
    source = "parameters json file";
  }

  // The command line overrides whatever was found so far.
  if (opts.HasOption(short_name, long_name_hyphens)) {
    value = opts.GetFirstBoolean(short_name, long_name_hyphens, value);
    source = "command line option";
  }

  cout << setw(35) << long_name_hyphens << " = " << setw(10) << (value ? "true" : "false") << " (boolean, " << source << ")" << endl;
  return value;
}
Пример #4
0
// Reads the read-sampling options (unfiltered count, downsampling and
// calibration training) from the command line into sampling_opts and
// reconciles them against each other and against num_wells.
// Asserts that context_vars has been set up first. Always returns true; the
// process exits if calibration training is requested together with diagonal
// state progression.
bool BaseCallerParameters::InitializeSamplingFromOptArgs(OptArgs& opts, const int num_wells)
{
  assert(context_vars.options_set);

  // When only phase estimation runs, none of these options matter: skip
  // parsing and keep the output quiet.
  if (context_vars.just_phase_estimation) {
    sampling_opts.options_set = true;
    return true;
  }

  sampling_opts.num_unfiltered       = opts.GetFirstInt   ('-', "num-unfiltered", 100000);
  sampling_opts.downsample_size      = opts.GetFirstInt   ('-', "downsample-size", 0);
  sampling_opts.downsample_fraction  = opts.GetFirstDouble('-', "downsample-fraction", 1.0);
  sampling_opts.calibration_training = opts.GetFirstInt   ('-', "calibration-training", -1);
  sampling_opts.have_calib_panel     = !bc_files.calibration_panel_file.empty();
  sampling_opts.MaskNotWanted        = MaskNone;

  // Reconcile downsample_size with downsample_fraction: a fraction below 1.0
  // either defines the size (when none was given) or caps it.
  const bool fraction_active = sampling_opts.downsample_fraction < 1.0;
  const bool downsample      = fraction_active or sampling_opts.downsample_size > 0;
  if (fraction_active) {
    const int size_from_fraction = (int)((float)num_wells*sampling_opts.downsample_fraction);
    if (sampling_opts.downsample_size == 0)
      sampling_opts.downsample_size = size_from_fraction;
    else
      sampling_opts.downsample_size = min(sampling_opts.downsample_size, size_from_fraction);
  }
  if (downsample)
    cout << "Downsampling activated: Randomly choosing " << sampling_opts.downsample_size << " reads on this chip." << endl;

  // Without calibration training there is nothing left to adjust.
  if (sampling_opts.calibration_training < 0) {
    sampling_opts.options_set = true;
    return true;
  }

  // Calibration training overrides several command line options.
  if (context_vars.diagonal_state_prog) {
    cerr << " === BaseCaller Option Incompatibility: Calibration training not supported for diagonal state progression. Aborting!" << endl;
    exit(EXIT_FAILURE);
  }

  // The training set can never be larger than an active downsampling size.
  if (sampling_opts.downsample_size > 0)
    sampling_opts.calibration_training = min(sampling_opts.calibration_training, sampling_opts.downsample_size);

  sampling_opts.downsample_size  = max(sampling_opts.calibration_training, 0);
  sampling_opts.MaskNotWanted    = (MaskType)(MaskFilteredBadResidual|MaskFilteredBadPPF|MaskFilteredBadKey);
  sampling_opts.num_unfiltered   = 0;
  context_vars.process_tfs       = false;
  context_vars.flow_signals_type = "scaled-residual";

  cout << "=== BaseCaller Calibration Training ===" << endl;
  cout << " - Generating a training set up to " << sampling_opts.downsample_size << " randomly selected reads." << endl;
  if (sampling_opts.have_calib_panel)
    cout << " - Adding calibration panel reads specified in " << bc_files.calibration_panel_file << endl;
  cout << endl;

  sampling_opts.options_set = true;
  return true;
}
Пример #5
0
// Reads the recalibration options from the command line, loads the model
// through InitializeModel and, when that succeeds and --save-hpmodel is on,
// stores the model file in the BAM comments together with the run id and the
// chip subset's column/row offsets.
void RecalibrationModel::Initialize(OptArgs& opts, vector<string> &bam_comments, const string & run_id, const ion::ChipSubset & chip_subset)
{
  string model_path      = opts.GetFirstString ('-', "model-file", "");
  int    hp_threshold    = opts.GetFirstInt    ('-', "recal-model-hp-thres", 4);
  bool   store_in_bam    = opts.GetFirstBoolean('-', "save-hpmodel", true);
  bool   diagonal_states = opts.GetFirstBoolean('-', "diagonal-state-prog", false);

  // With diagonal state progression the model file name is blanked before loading.
  if (diagonal_states)
    model_path.clear();

  // The model is always initialized; saving is the only conditional step.
  const bool model_loaded = InitializeModel(model_path, hp_threshold);
  if (not model_loaded or not store_in_bam)
    return;

  SaveModelFileToBamComments(model_path, bam_comments, run_id, chip_subset.GetColOffset(), chip_subset.GetRowOffset());
}
Пример #6
0
// Builds a TagTrimmerParameters object from the command line: family size,
// tag suppression, prefix/suffix tag structures, and the trimming and
// filtering methods. Exits the process on a negative family size or an
// unknown method name.
TagTrimmerParameters MolecularTagTrimmer::ReadOpts(OptArgs& opts)
{
  TagTrimmerParameters params;

  params.min_family_size   = opts.GetFirstInt    ('-', "min-tag-fam-size", 3);
  params.suppress_mol_tags = opts.GetFirstBoolean('-', "suppress-mol-tags", false);
  //params.cl_a_handle     = opts.GetFirstString ('-', "tag-handle", "");
  //params.handle_cutoff   = opts.GetFirstInt    ('-', "handle-cutoff", 2);

  // Tag structures given on the command line.
  params.master_tags.prefix_mol_tag = opts.GetFirstString('-', "prefix-mol-tag", "");
  params.master_tags.suffix_mol_tag = opts.GetFirstString('-', "suffix-mol-tag", "");
  ValidateTagString(params.master_tags.prefix_mol_tag);
  ValidateTagString(params.master_tags.suffix_mol_tag);

  // A family size of exactly 0 is an alternative way to disable molecular
  // tagging; negative sizes are rejected.
  if (params.min_family_size < 0) {
    cerr << "MolecularTagTrimmer Error: min-tag-fam-size must be at least 1. " << endl;
    exit(EXIT_FAILURE);
  }
  if (params.min_family_size == 0)
    params.suppress_mol_tags = true;

  params.command_line_tags = params.master_tags.HasTags();

  // Trimming method selection.
  const string trim_choice = opts.GetFirstString('-', "tag-trim-method", "sloppy-trim");
  if (trim_choice == "strict-trim") {
    params.tag_trim_method = kStrictTrim;
  } else if (trim_choice == "sloppy-trim") {
    params.tag_trim_method = kSloppyTrim;
  } else {
    cerr << "MolecularTagTrimmer Error: Unknown tag trimming option " << trim_choice << endl;
    exit(EXIT_FAILURE);
  }

  // Read filtering method selection.
  const string filter_choice = opts.GetFirstString('-', "tag-filter-method", "need-all");
  if (filter_choice == "need-suffix") {
    params.tag_filter_method = kneed_only_suffix_tag;
  } else if (filter_choice == "need-prefix") {
    params.tag_filter_method = kneed_only_prefix_tag;
  } else if (filter_choice == "need-all") {
    params.tag_filter_method = kneed_all_tags;
  } else {
    cerr << "MolecularTagTrimmer Error: Unknown tag filtering option " << filter_choice << endl;
    exit(EXIT_FAILURE);
  }

  return params;
}
Пример #7
0
// Loads the recalibration model named by --model-file and enables
// recalibration if it could be read.
//
// File layout as consumed here:
//   line 1 : comment line (skipped)
//   header : flowStart flowEnd flowSpan xMin xMax xSpan yMin yMax ySpan max_hp_calibrated
//   records: flowBase flowStart flowEnd xMin xMax yMin yMax refHP paramA paramB
//
// Recalibration stays disabled when no file is given (or it is "off"), when
// the file cannot be opened, when the header cannot be parsed, or when
// --recal-model-hp-thres exceeds MAX_HPXLEN.
void RecalibrationModel::Initialize(OptArgs& opts)
{
    is_enabled_ = false;

    string model_file_name = opts.GetFirstString ('-', "model-file", "");
    if (model_file_name.empty() or model_file_name == "off") {
        printf("RecalibrationModel: disabled\n\n");
        return;
    }

    ifstream model_file;
    model_file.open(model_file_name.c_str());
    if (model_file.fail()) {
        printf("RecalibrationModel: disabled (cannot open %s)\n\n", model_file_name.c_str());
        model_file.close();
        return;
    }

    recalModelHPThres = opts.GetFirstInt('-', "recal-model-hp-thres", 4);

    string comment_line;
    getline(model_file, comment_line); // skip the leading comment line

    // Header: refuse to continue on a malformed header rather than set up
    // the stratification from uninitialized values.
    int flowStart, flowEnd, flowSpan, xMin, xMax, xSpan, yMin, yMax, ySpan, max_hp_calibrated;
    if (!(model_file >> flowStart >> flowEnd >> flowSpan >> xMin >> xMax >> xSpan >> yMin >> yMax >> ySpan >> max_hp_calibrated)) {
        printf("RecalibrationModel: disabled (cannot parse header of %s)\n\n", model_file_name.c_str());
        model_file.close();
        return;
    }
    stratification.SetupRegion(xMin, xMax, xSpan, yMin, yMax, ySpan);
    // Calculate the number of partitions and initialize stratifiedAs / stratifiedBs.
    SetupStratification(flowStart,flowEnd, flowSpan,xMin,xMax,xSpan,yMin,yMax,ySpan,max_hp_calibrated);

    // Records: the extraction itself is the loop condition, so a failed or
    // partial read at end of file is never processed as a record.
    float paramA, paramB;
    int refHP;
    char flowBase;
    while (model_file >> flowBase >> flowStart >> flowEnd >> xMin >> xMax >> yMin >> yMax >> refHP >> paramA >> paramB) {
        // Populate stratifiedAs and stratifiedBs for this record.
        int nucInd = NuctoInt(flowBase);
        int offsetRegion = stratification.OffsetRegion(xMin,yMin);
        FillIndexes(offsetRegion,nucInd, refHP, flowStart, flowEnd, paramA, paramB);
    }

    model_file.close();

    printf("Recalibration: enabled (using calibration file %s)\n\n", model_file_name.c_str());
    is_enabled_ = true;
    if (recalModelHPThres > MAX_HPXLEN) is_enabled_ = false;
}
Пример #8
0
// Reads all input/output file options from the command line, validates and
// canonicalizes the paths, and exits when a mandatory one (reference, BAM,
// output VCF) is missing.
void ExtendParameters::SetupFileIO(OptArgs &opts) {
  // Reference FASTA: mandatory (freeBayes slot).
  fasta = opts.GetFirstString('r', "reference", "");
  if (fasta.empty()) {
    cerr << "Fatal ERROR: Reference file not specified via -r" << endl;
    exit(1);
  }
  ValidateAndCanonicalizePath(fasta);

  // Input (hotspot) VCF: optional (freeBayes slot).
  variantPriorsFile = opts.GetFirstString('c', "input-vcf", "");
  if (!variantPriorsFile.empty())
    ValidateAndCanonicalizePath(variantPriorsFile);
  else
    cerr << "INFO: No input VCF (Hotspot) file specified via -c,--input-vcf" << endl;

  // Systematic error motifs: optional; remember whether a file was provided.
  sseMotifsFileName = opts.GetFirstString('e', "error-motifs", "");
  sseMotifsProvided = !sseMotifsFileName.empty();
  if (sseMotifsProvided)
    ValidateAndCanonicalizePath(sseMotifsFileName);
  else
    cerr << "INFO: Systematic error motif file not specified via -e" << endl;

  // Input BAM files: at least one is mandatory.
  opts.GetOption(bams, "", 'b', "input-bam");
  if (bams.empty()) {
    cerr << "FATAL ERROR: BAM file not specified via -b" << endl;
    exit(-1);
  }
  for (size_t bam_idx = 0; bam_idx != bams.size(); ++bam_idx)
    ValidateAndCanonicalizePath(bams[bam_idx]);

  // Output directory defaults to the current directory.
  outputDir = opts.GetFirstString('O', "output-dir", ".");
  ValidateAndCanonicalizePath(outputDir);

  // Output VCF file name: mandatory, not canonicalized here.
  outputFile = opts.GetFirstString('o', "output-vcf", "");
  if (outputFile.empty()) {
    cerr << "Fatal ERROR: Output VCF filename not specified via -o" << endl;
    exit(1);
  }

  // Remaining string options are stored as given (are those file names?).
  postprocessed_bam = opts.GetFirstString('-', "postprocessed-bam", "");
  sampleName        = opts.GetFirstString('g', "sample-name", "");
  force_sample_name = opts.GetFirstString('-', "force-sample-name", "");
}
Пример #9
0
// Reads the FreeBayes (candidate generation) parameters. Most values go
// through the RetrieveParameter* helpers with fb_params as JSON source; the
// target file and primer trimming flag come from the command line only.
// Exits on inconsistent combinations.
void ExtendParameters::SetFreeBayesParameters(OptArgs &opts, Json::Value& fb_params) {
  targets               = opts.GetFirstString ('t', "target-file", "");
  trim_ampliseq_primers = opts.GetFirstBoolean('-', "trim-ampliseq-primers", false);
  // Primer trimming needs a target file to work from.
  if (trim_ampliseq_primers and targets.empty()) {
    cerr << "ERROR: --trim-ampliseq-primers enabled but no --target-file provided" << endl;
    exit(1);
  }

  // Allele classes to consider.
  allowIndels  = RetrieveParameterBool(opts, fb_params, '-', "allow-indels", true);
  allowSNPs    = RetrieveParameterBool(opts, fb_params, '-', "allow-snps", true);
  allowMNPs    = RetrieveParameterBool(opts, fb_params, '-', "allow-mnps", true);
  allowComplex = RetrieveParameterBool(opts, fb_params, '-', "allow-complex", false);

  // Deprecated: "left-align-indels" is still retrieved, but its value is discarded.
  RetrieveParameterBool(opts, fb_params, '-', "left-align-indels", false);

  useBestNAlleles         = RetrieveParameterInt   (opts, fb_params, 'm', "use-best-n-alleles", 2);
  onlyUseInputAlleles     = RetrieveParameterBool  (opts, fb_params, '-', "use-input-allele-only", false);
  min_mapping_qv          = RetrieveParameterInt   (opts, fb_params, 'M', "min-mapping-qv", 4);
  read_snp_limit          = RetrieveParameterInt   (opts, fb_params, 'U', "read-snp-limit", 10);
  readMaxMismatchFraction = RetrieveParameterDouble(opts, fb_params, 'z', "read-max-mismatch-fraction", 1.0);
  maxComplexGap           = RetrieveParameterInt   (opts, fb_params, '!', "max-complex-gap", 1);

  // Candidate generation thresholds: JSON or command line, otherwise the
  // corresponding filter settings from my_controls serve as defaults.
  minAltFraction      = RetrieveParameterDouble(opts, fb_params, '-', "gen-min-alt-allele-freq", my_controls.filter_snps.min_allele_freq);
  minCoverage         = RetrieveParameterInt   (opts, fb_params, '-', "gen-min-coverage", my_controls.filter_snps.min_cov);
  minIndelAltFraction = RetrieveParameterDouble(opts, fb_params, '-', "gen-min-indel-alt-allele-freq", my_controls.filter_hp_indel.min_allele_freq);

  // Flags derived from the program flow settings; they are only ever switched on here.
  if (program_flow.DEBUG > 0) {
    debug = true;
  }
  if (program_flow.inputPositionsOnly) {
    processInputPositionsOnly = true;
  }

  // Both input-restricted modes require an input VCF.
  const bool needs_input_vcf = processInputPositionsOnly || onlyUseInputAlleles;
  if (needs_input_vcf && variantPriorsFile.empty()) {
    cerr << "ERROR: Parameter error - Process-input-positions-only: " << processInputPositionsOnly << " use-input-allele-only: " << onlyUseInputAlleles << " :  Specified without Input VCF File " << endl;
    exit(1);
  }
}
Пример #10
0
// Resolves a string parameter from three sources, in increasing priority:
//   1. default_value ("builtin default"),
//   2. the parameters JSON, looked up under the key produced by
//      GetRidOfDomainAndHyphens(long_name_hyphens) ("parameters json file"),
//   3. the command line, via short_name / long_name_hyphens
//      ("command line option").
// The resolved value and its source are echoed to stdout; the value is returned.
string RetrieveParameterString(OptArgs &opts, Json::Value& json, char short_name, const string& long_name_hyphens, const string& default_value)
{
  string long_name_underscores = GetRidOfDomainAndHyphens(long_name_hyphens);
  string value = default_value;
  string source = "builtin default";

  if (json.isMember(long_name_underscores)) {
    // asString() rather than asCString(): the latter is only valid for JSON
    // string values, and building a std::string from its result for any
    // other value type is not safe.
    value = json[long_name_underscores].asString();
    source = "parameters json file";
  }

  // The command line overrides whatever was found so far.
  if (opts.HasOption(short_name, long_name_hyphens)) {
    value = opts.GetFirstString(short_name, long_name_hyphens, value);
    source = "command line option";
  }

  cout << setw(35) << long_name_hyphens << " = " << setw(10) << value << " (string, " << source << ")" << endl;
  return value;
}
Пример #11
0
// Entry point of the H5 reduce sub-command: merges the input H5 files given
// as leftover arguments into the file named by -o/--output.
// The command line is parsed starting at argv[1]. Returns 1 after printing
// the help text when the output name or the inputs are missing; otherwise
// returns the result of IonstatsAlignmentReduceH5.
int IonstatsReduceH5(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc-1, argv+1);

  string output_h5_filename = opts.GetFirstString  ('o', "output", "");
  // The default is given as a bool literal. A string literal in this position
  // only yields the intended default through whichever overload or
  // pointer-to-bool conversion gets selected, and "false" would not be safe.
  bool merge_proton_blocks  = opts.GetFirstBoolean ('b', "merge-proton-blocks", true);
  vector<string>  input_h5_filename;
  opts.GetLeftoverArguments(input_h5_filename);

  if(input_h5_filename.empty() or output_h5_filename.empty()) {
    IonstatsReduceH5Help();
    return 1;
  }

  if(merge_proton_blocks)
    cout << "NOTE:" << argv[0] << " " << argv[1] << ": --merge-proton-blocks=true so any Proton block-specific read group suffixes will be merged" << endl;

  return IonstatsAlignmentReduceH5(output_h5_filename, input_h5_filename, merge_proton_blocks);
}
Пример #12
0
int IonstatsReduce(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);

  string output_json_filename = opts.GetFirstString('o', "output", "");
  vector<string>  input_jsons;
  opts.GetLeftoverArguments(input_jsons);

  if(input_jsons.empty() or output_json_filename.empty()) {
    IonstatsReduceHelp();
    return 1;
  }

  ifstream in(input_jsons[0].c_str(), ifstream::in);
  if (!in.good()) {
    fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_jsons[0].c_str());
    return 1;
  }
  Json::Value first_input_json;
  in >> first_input_json;
  in.close();

  if (!first_input_json.isMember("meta")) {
    fprintf(stderr, "[ionstats] ERROR: %s is not a valid input file for ionstats reduce\n", input_jsons[0].c_str());
    return 1;
  }
  string format_name = first_input_json["meta"].get("format_name","").asString();

  if (format_name == "ionstats_basecaller")
    return IonstatsBasecallerReduce(output_json_filename, input_jsons);
  if (format_name == "ionstats_tf")
    return IonstatsTestFragmentsReduce(output_json_filename, input_jsons);
  if (format_name == "ionstats_alignment")
    return IonstatsAlignmentReduce(output_json_filename, input_jsons);

  fprintf(stderr, "[ionstats] ERROR: %s is not a valid input file for ionstats reduce\n", input_jsons[0].c_str());
  return 1;
}
Пример #13
0
// Reads the program flow settings. The debug level comes from the command
// line only; every other value is resolved through the RetrieveParameter*
// helpers with tvc_params as JSON source.
void ProgramControlSettings::SetOpts(OptArgs &opts, Json::Value &tvc_params) {
  DEBUG = opts.GetFirstInt('d', "debug", 0);

  // Threading and implementation selection.
  nThreads           = RetrieveParameterInt (opts, tvc_params, 'n', "num-threads", 12);
  nVariantsPerThread = RetrieveParameterInt (opts, tvc_params, 'N', "num-variants-per-thread", 250);
  use_SSE_basecaller = RetrieveParameterBool(opts, tvc_params, '-', "use-sse-basecaller", true);

  // Diagnostic output selection.
  rich_json_diagnostic = RetrieveParameterBool(opts, tvc_params, '-', "do-json-diagnostic", false);
  minimal_diagnostic   = RetrieveParameterBool(opts, tvc_params, '-', "do-minimal-diagnostic", false);

  // Processing switches.
  inputPositionsOnly     = RetrieveParameterBool(opts, tvc_params, '-', "process-input-positions-only", false);
  suppress_recalibration = RetrieveParameterBool(opts, tvc_params, '-', "suppress-recalibration", true);
  resolve_clipped_bases  = RetrieveParameterBool(opts, tvc_params, '-', "resolve-clipped-bases", false);
}
Пример #14
0
// Reads the JSON file named by --parameters-file, if one is given, and hands
// three of its sections back through the output arguments:
//   tvc_params       <- member "torrent_variant_caller"
//   freebayes_params <- member "freebayes"
//   params_meta      <- member "meta"
// A member that is absent falls back to Json::objectValue. When no parameters
// file is given, the code shown here leaves the three outputs untouched.
// Exits with -1 if the file cannot be opened.
void ExtendParameters::ParametersFromJSON(OptArgs &opts, Json::Value &tvc_params, Json::Value &freebayes_params, Json::Value &params_meta) {
  string parameters_file                = opts.GetFirstString('-', "parameters-file", "");
  Json::Value parameters_json(Json::objectValue);
  if (not parameters_file.empty()) {
    ifstream in(parameters_file.c_str(), ifstream::in);

    if (!in.good()) {
      fprintf(stderr, "[tvc] FATAL ERROR: cannot open %s\n", parameters_file.c_str());
      exit(-1);
    }
    
    in >> parameters_json;
    in.close();
    // If the document has a "pluginconfig" member, that member becomes the
    // root from which the sections below are read.
    if (parameters_json.isMember("pluginconfig"))
      parameters_json = parameters_json["pluginconfig"];

    tvc_params = parameters_json.get("torrent_variant_caller", Json::objectValue);
    freebayes_params = parameters_json.get("freebayes", Json::objectValue);
    params_meta = parameters_json.get("meta", Json::objectValue);
  }
Пример #15
0
// Resolves all BaseCaller input/output locations from the command line into
// bc_files: creates the output folders, canonicalizes the paths, derives the
// fixed output file names and prints a summary. Always returns true.
bool BaseCallerParameters::InitializeFilesFromOptArgs(OptArgs& opts)
{
    // Directories: the two "unfiltered" folders live below the output directory.
    bc_files.input_directory  = opts.GetFirstString('i', "input-dir", ".");
    bc_files.output_directory = opts.GetFirstString('o', "output-dir", ".");
    bc_files.unfiltered_untrimmed_directory = bc_files.output_directory + "/unfiltered.untrimmed";
    bc_files.unfiltered_trimmed_directory   = bc_files.output_directory + "/unfiltered.trimmed";

    // Output folders are created first, then all four paths are validated.
    CreateResultsFolder(const_cast<char*>(bc_files.output_directory.c_str()));
    CreateResultsFolder(const_cast<char*>(bc_files.unfiltered_untrimmed_directory.c_str()));
    CreateResultsFolder(const_cast<char*>(bc_files.unfiltered_trimmed_directory.c_str()));

    ValidateAndCanonicalizePath(bc_files.input_directory);
    ValidateAndCanonicalizePath(bc_files.output_directory);
    ValidateAndCanonicalizePath(bc_files.unfiltered_untrimmed_directory);
    ValidateAndCanonicalizePath(bc_files.unfiltered_trimmed_directory);

    // Input files default to well-known names inside the input directory.
    const string default_wells = bc_files.input_directory + "/1.wells";
    const string default_mask  = bc_files.input_directory + "/analysis.bfmask.bin";
    bc_files.filename_wells = opts.GetFirstString('-', "wells", default_wells);
    bc_files.filename_mask  = opts.GetFirstString('-', "mask", default_mask);

    ValidateAndCanonicalizePath(bc_files.filename_wells);
    // The mask is validated with "<input-dir>/bfmask.bin" as second argument.
    ValidateAndCanonicalizePath(bc_files.filename_mask, bc_files.input_directory + "/bfmask.bin");

    // Fixed output file names inside the output directory.
    bc_files.filename_filter_mask = bc_files.output_directory + "/bfmask.bin";
    bc_files.filename_json        = bc_files.output_directory + "/BaseCaller.json";
    bc_files.filename_phase       = bc_files.output_directory + "/PhaseEstimates.json";

    // Summary of what was resolved.
    printf("\n");
    printf("Input files summary:\n");
    printf("     --input-dir %s\n", bc_files.input_directory.c_str());
    printf("         --wells %s\n", bc_files.filename_wells.c_str());
    printf("          --mask %s\n", bc_files.filename_mask.c_str());
    printf("\n");
    printf("Output directories summary:\n");
    printf("    --output-dir %s\n", bc_files.output_directory.c_str());
    printf("        unf.untr %s\n", bc_files.unfiltered_untrimmed_directory.c_str());
    printf("          unf.tr %s\n", bc_files.unfiltered_trimmed_directory.c_str());
    printf("\n");

    // Optional files: validated only when given.
    bc_files.lib_datasets_file      = opts.GetFirstString('-', "datasets", "");
    bc_files.calibration_panel_file = opts.GetFirstString('-', "calibration-panel", "");
    if (!bc_files.lib_datasets_file.empty())
      ValidateAndCanonicalizePath(bc_files.lib_datasets_file);
    if (!bc_files.calibration_panel_file.empty())
      ValidateAndCanonicalizePath(bc_files.calibration_panel_file);

    bc_files.options_set = true;
    return true;
}
Пример #16
0
// Configures the flow order and the library / test-fragment keys from the
// command line, using FlowOrder and NumFlows as defaults. The number of flows
// is capped at NumFlows. Always returns true.
bool BaseCallerContext::SetKeyAndFlowOrder(OptArgs& opts, const char * FlowOrder, const int NumFlows)
{
    const string requested_order = opts.GetFirstString ('-', "flow-order", FlowOrder);
    const int    requested_flows = opts.GetFirstInt    ('f', "flowlimit", NumFlows);
    flow_order.SetFlowOrder(requested_order, requested_flows);

    // Never use more flows than NumFlows.
    if (flow_order.num_flows() > NumFlows)
      flow_order.SetNumFlows(NumFlows);
    assert(flow_order.is_ok());

    // Current option names first ...
    string lib_key = opts.GetFirstString ('-', "lib-key", "TCAG"); //! @todo Get default key from wells
    string tf_key  = opts.GetFirstString ('-', "tf-key", "ATCG");
    // ... then the backward compatible spellings, which take precedence when given.
    lib_key = opts.GetFirstString ('-', "librarykey", lib_key);
    tf_key  = opts.GetFirstString ('-', "tfkey", tf_key);

    keys.resize(2);
    keys[0].Set(flow_order, lib_key, "lib");
    keys[1].Set(flow_order, tf_key, "tf");
    return true;
}
Пример #17
0
// Reads the phasing options from the command line. When --libcf-ie-dr is
// given as "cf,ie,dr", the estimator is switched to "override" with a single
// region holding those three values; a malformed value terminates the process.
void PhaseEstimator::InitializeFromOptArgs(OptArgs& opts)
{
  phasing_estimator_  = opts.GetFirstString ('-', "phasing-estimator", "spatial-refiner-2");
  string arg_cf_ie_dr = opts.GetFirstString ('-', "libcf-ie-dr", "");
  residual_threshold_ = opts.GetFirstDouble ('-', "phasing-residual-filter", 1.0);
  max_phasing_levels_ = opts.GetFirstInt    ('-', "max-phasing-levels", max_phasing_levels_default_);

  // Only the exact value "keynorm-new" enables the PID normalizer.
  const string normalizer = opts.GetFirstString ('-', "keynormalizer", "keynorm-old");
  use_pid_norm_ = (normalizer == "keynorm-new");

  windowSize_ = opts.GetFirstInt ('-', "window-size", DPTreephaser::kWindowSizeDefault_);

  // Without an explicit cf/ie/dr triple there is nothing to override.
  if (arg_cf_ie_dr.empty())
    return;

  // --libcf-ie-dr overrides other phasing-related options.
  phasing_estimator_ = "override";
  result_regions_x_ = 1;
  result_regions_y_ = 1;
  result_cf_.assign(1, 0.0);
  result_ie_.assign(1, 0.0);
  result_dr_.assign(1, 0.0);

  const int parsed_fields = sscanf (arg_cf_ie_dr.c_str(), "%f,%f,%f", &result_cf_[0], &result_ie_[0], &result_dr_[0]);
  if (parsed_fields != 3) {
    fprintf (stderr, "Option Error: libcf-ie-dr %s\n", arg_cf_ie_dr.c_str());
    exit (EXIT_FAILURE);
  }
}
Пример #18
0
// Entry point of the test-fragment (TF) statistics sub-command.
//
// Options: -i/--input (BAM, required), -r/--ref (FASTA, required),
// -o/--output (JSON, default "ionstats_tf.json"), -h/--histogram-length
// (default 400).
//
// For every mapped forward-strand alignment that starts at position <= 5 and
// carries an MD tag, the CIGAR and MD strings are walked in lockstep to count
// bases and errors. Per reference (TF) the function accumulates read length
// histograms (called, aligned, AQ10, AQ17), errors by position, SNR and
// homopolymer accuracy, and finally writes them to the output JSON.
//
// Returns 0 on success and 1 on missing arguments, an unreadable BAM, a BAM
// without read groups, or an output stream that is not good after opening.
int IonstatsTestFragments(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  string input_bam_filename   = opts.GetFirstString('i', "input", "");
  string fasta_filename       = opts.GetFirstString('r', "ref", "");
  string output_json_filename = opts.GetFirstString('o', "output", "ionstats_tf.json");
  int histogram_length        = opts.GetFirstInt   ('h', "histogram-length", 400);

  // Both the input BAM and the reference FASTA are mandatory.
  if(argc < 2 or input_bam_filename.empty() or fasta_filename.empty()) {
    IonstatsTestFragmentsHelp();
    return 1;
  }

  //
  // Prepare for metric calculation
  //

  // Filled by PopulateReferenceSequences; looked up by reference name below.
  map<string,string> tf_sequences;
  PopulateReferenceSequences(tf_sequences, fasta_filename);


  BamReader input_bam;
  if (!input_bam.Open(input_bam_filename)) {
    fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str());
    return 1;
  }

  // One set of metrics is kept per reference in the BAM.
  int num_tfs = input_bam.GetReferenceCount();


  SamHeader sam_header = input_bam.GetHeader();
  if(!sam_header.HasReadGroups()) {
    fprintf(stderr, "[ionstats] ERROR: no read groups in %s\n", input_bam_filename.c_str());
    return 1;
  }

  // Flow order and key are taken from the read groups; when several read
  // groups define them, the last one seen wins.
  string flow_order;
  string key;
  for (SamReadGroupIterator rg = sam_header.ReadGroups.Begin(); rg != sam_header.ReadGroups.End(); ++rg) {
    if(rg->HasFlowOrder())
      flow_order = rg->FlowOrder;
    if(rg->HasKeySequence())
      key = rg->KeySequence;
  }


  // Need these metrics stratified by TF.

  vector<ReadLengthHistogram> called_histogram(num_tfs);
  vector<ReadLengthHistogram> aligned_histogram(num_tfs);
  vector<ReadLengthHistogram> AQ10_histogram(num_tfs);
  vector<ReadLengthHistogram> AQ17_histogram(num_tfs);
  vector<SimpleHistogram> error_by_position(num_tfs);
  vector<MetricGeneratorSNR> system_snr(num_tfs);
  vector<MetricGeneratorHPAccuracy> hp_accuracy(num_tfs);

  for (int tf = 0; tf < num_tfs; ++tf) {
    called_histogram[tf].Initialize(histogram_length);
    aligned_histogram[tf].Initialize(histogram_length);
    AQ10_histogram[tf].Initialize(histogram_length);
    AQ17_histogram[tf].Initialize(histogram_length);
    error_by_position[tf].Initialize(histogram_length);
  }

  // Buffers for the per-read flow signal tags ("FZ" and "ZM"), sized to the flow order.
  vector<uint16_t> flow_signal_fz(flow_order.length());
  vector<int16_t> flow_signal_zm(flow_order.length());

  const RefVector& refs = input_bam.GetReferenceData();

  // Missing:
  //  - hp accuracy - tough, copy verbatim from TFMapper?


  // Scratch storage reused for every alignment: the MD tag split into
  // operations (MD_op) and their lengths (MD_len).
  BamAlignment alignment;
  vector<char>  MD_op;
  vector<int>   MD_len;
  MD_op.reserve(1024);
  MD_len.reserve(1024);
  string MD_tag;

  //
  // Main loop over mapped reads in the input BAM
  //

  while(input_bam.GetNextAlignment(alignment)) {


    // Skip unmapped reads and reads without an MD tag.
    if (!alignment.IsMapped() or !alignment.GetTag("MD",MD_tag))
      continue;

    // The check below eliminates unexpected alignments
    // (reverse strand, or starting beyond position 5).
    if (alignment.IsReverseStrand() or alignment.Position > 5)
      continue;

    int current_tf = alignment.RefID;

    //
    // Step 1. Parse MD tag
    //
    // The tag is split into runs: 'M' (digits: number of matching bases),
    // 'D' ('^' followed by letters: deleted reference bases) and
    // 'X' (letters: substituted reference bases).

    MD_op.clear();
    MD_len.clear();

    for (const char *MD_ptr = MD_tag.c_str(); *MD_ptr;) {

      int item_length = 0;
      if (*MD_ptr >= '0' and *MD_ptr <= '9') {    // It's a match
        MD_op.push_back('M');
        // Accumulate the decimal run length.
        for (; *MD_ptr and *MD_ptr >= '0' and *MD_ptr <= '9'; ++MD_ptr)
          item_length = 10*item_length + *MD_ptr - '0';
      } else {
        if (*MD_ptr == '^') {                     // It's a deletion
          MD_ptr++;
          MD_op.push_back('D');
        } else                                    // It's a substitution
          MD_op.push_back('X');
        // The run length is the number of upper-case letters that follow.
        for (; *MD_ptr and *MD_ptr >= 'A' and *MD_ptr <= 'Z'; ++MD_ptr)
          item_length++;
      }
      MD_len.push_back(item_length);
    }

    //
    // Step 2. Synchronously scan through Cigar and MD, doing error accounting
    //
    // Start indexes and direction depend on the strand. Reverse-strand
    // alignments were already skipped above, so presumably only the forward
    // direction (start at 0, increment +1) is exercised here.

    int MD_idx = alignment.IsReverseStrand() ? MD_op.size()-1 : 0;
    int cigar_idx = alignment.IsReverseStrand() ? alignment.CigarData.size()-1 : 0;
    int increment = alignment.IsReverseStrand() ? -1 : 1;

    int AQ10_bases = 0;
    int AQ17_bases = 0;
    int num_bases = 0;
    int num_errors = 0;

    // Both the cigar lengths and MD_len are consumed (decremented) in place;
    // an entry whose remaining length is 0 is skipped.
    while (cigar_idx < (int)alignment.CigarData.size() and MD_idx < (int) MD_op.size() and cigar_idx >= 0 and MD_idx >= 0) {

      if (alignment.CigarData[cigar_idx].Length == 0) { // Try advancing cigar
        cigar_idx += increment;
        continue;
      }
      if (MD_len[MD_idx] == 0) { // Try advancing MD
        MD_idx += increment;
        continue;
      }

      // Match: consume the overlap of the two runs, no errors.
      if (alignment.CigarData[cigar_idx].Type == 'M' and MD_op[MD_idx] == 'M') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        num_bases += advance;
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      // Insertion (read has a base, reference doesn't): only the cigar run is
      // consumed; each inserted base counts as a base and as an error.
      } else if (alignment.CigarData[cigar_idx].Type == 'I') {
        int advance = alignment.CigarData[cigar_idx].Length;
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position[current_tf].Add(num_bases);
          num_bases++;
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;

      // Deletion (reference has a base, read doesn't): counts as errors at
      // the current read position, without advancing the base count.
      } else if (alignment.CigarData[cigar_idx].Type == 'D' and MD_op[MD_idx] == 'D') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position[current_tf].Add(num_bases);
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      // Substitution: each mismatching base counts as a base and as an error.
      } else if (MD_op[MD_idx] == 'X') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position[current_tf].Add(num_bases);
          num_bases++;
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      // Any other cigar/MD pairing stops the scan for this read; the counts
      // gathered so far are still used below.
      } else {
        printf("ionstats tf: Unexpected OP combination: %s Cigar=%c, MD=%c !\n",
            alignment.Name.c_str(), alignment.CigarData[cigar_idx].Type, MD_op[MD_idx]);
        break;
      }

      // Remember the longest prefix whose error rate is at most 10% (AQ10)
      // or at most 2% (AQ17).
      if (num_errors*10 <= num_bases)   AQ10_bases = num_bases;
      if (num_errors*50 <= num_bases)   AQ17_bases = num_bases;
    }

    //
    // Step 3. Record the per-read results in the histograms of this TF
    //

    called_histogram[current_tf].Add(alignment.Length);
    aligned_histogram[current_tf].Add(num_bases);
    AQ10_histogram[current_tf].Add(AQ10_bases);
    AQ17_histogram[current_tf].Add(AQ17_bases);

    // SNR uses the "ZM" flow signal tag when present, otherwise "FZ".
    if(alignment.GetTag("ZM", flow_signal_zm))
      system_snr[current_tf].Add(flow_signal_zm, key.c_str(), flow_order);
    else if(alignment.GetTag("FZ", flow_signal_fz))
      system_snr[current_tf].Add(flow_signal_fz, key.c_str(), flow_order);


    // HP accuracy - keeping it simple
    // Walk key+reference and key+read flow by flow and compare the
    // homopolymer length each flow consumes from either sequence.

    if (!alignment.IsReverseStrand()) {

      string genome = key + tf_sequences[refs[current_tf].RefName];
      string calls = key + alignment.QueryBases;
      const char *genome_ptr = genome.c_str();
      const char *calls_ptr = calls.c_str();

      // Stops at the end of the flow order or as soon as either sequence is exhausted.
      for (int flow = 0; flow < (int)flow_order.length() and *genome_ptr and *calls_ptr; ++flow) {
        int genome_hp = 0;
        int calls_hp = 0;
        while (*genome_ptr == flow_order[flow]) {
          genome_hp++;
          genome_ptr++;
        }
        while (*calls_ptr == flow_order[flow]) {
          calls_hp++;
          calls_ptr++;
        }
        hp_accuracy[current_tf].Add(genome_hp, calls_hp);
      }
    }
  }



  //
  // Processing complete, generate ionstats_tf.json
  //

  Json::Value output_json(Json::objectValue);
  output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL));
  output_json["meta"]["format_name"] = "ionstats_tf";
  output_json["meta"]["format_version"] = "1.0";

  output_json["results_by_tf"] = Json::objectValue;

  for (int tf = 0; tf < num_tfs; ++tf) {

    // TFs whose aligned histogram holds fewer than 1000 reads are left out of the report.
    if (aligned_histogram[tf].num_reads() < 1000)
      continue;

    called_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["full"]);
    aligned_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["aligned"]);
    AQ10_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ10"]);
    AQ17_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ17"]);
    error_by_position[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["error_by_position"]);
    // SNR and HP accuracy write directly into the TF's own JSON object.
    system_snr[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]);
    hp_accuracy[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]);

    output_json["results_by_tf"][refs[tf].RefName]["sequence"] = tf_sequences[refs[tf].RefName];
  }

  input_bam.Close();

  // Write the report; only the state of the stream right after opening is checked.
  ofstream out(output_json_filename.c_str(), ios::out);
  if (out.good()) {
    out << output_json.toStyledString();
    return 0;
  } else {
    fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str());
    return 1;
  }
}
Пример #19
0
int main(int argc, const char *argv[]) {
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  int hpLength;
  string statsOut;
  string alignmentOut;
  string pairedOut;
  string flowsOut;
  string summaryOut;
  string samFile;
  string qScoreCol;
  string wellsFile;
  string bfmaskFile;
  string snrFile;
  string binnedHpSigFile;
  string flowErrFile;
  string gcErrFile;
  int gcWin;
  string flowOrder;
  string keySeq;
  int numFlows;
  bool help;
  int qLength;
  double colCenter;
  double rowCenter;
  int colSize;
  int rowSize;
  int sampleSize;
  string wellsToUse;
  string run1, run2;
  opts.GetOption(run1, "", '-', "sff1");
  opts.GetOption(run2, "", '-', "sff2");
  opts.GetOption(wellsToUse, "", '-', "use-wells");
  opts.GetOption(samFile, "", '-', "sam-parsed");
  opts.GetOption(statsOut, "", '-', "stats-out");
  opts.GetOption(flowsOut, "", '-', "flows-out");
  opts.GetOption(alignmentOut, "", '-', "align-out");
  opts.GetOption(summaryOut, "", '-', "summary-out");
  opts.GetOption(pairedOut, "", '-', "paired-out");
  opts.GetOption(numFlows, "40", '-', "num-flows");
  opts.GetOption(hpLength, "6", '-', "max-hp");
  opts.GetOption(qScoreCol, "q7Len", '-', "qscore-col");
  opts.GetOption(qLength, "25", '-', "min-qlength");
  opts.GetOption(help,   "false", 'h', "help");
  opts.GetOption(wellsFile,   "", '-', "wells-file");
  opts.GetOption(bfmaskFile,   "", '-', "bfmask-file");
  opts.GetOption(snrFile,   "", '-', "snr-file");
  opts.GetOption(binnedHpSigFile,   "", '-', "binned-hp-sig-file");
  opts.GetOption(flowErrFile, "", '-', "flow-err-file");
  opts.GetOption(gcErrFile, "", '-', "gc-err-file");
  opts.GetOption(flowOrder, "", '-', "flow-order");
  opts.GetOption(keySeq, "", '-', "key-seq");
  opts.GetOption(colCenter, "0.5", '-', "col-center");
  opts.GetOption(rowCenter, "0.5", '-', "row-center");
  opts.GetOption(colSize, "0", '-', "col-size");
  opts.GetOption(rowSize, "0", '-', "row-size");
  opts.GetOption(gcErrFile, "", '-', "gc-err-file");
  opts.GetOption(gcWin, "40", '-', "gc-win");
  opts.GetOption(sampleSize, "100000", '-', "sample-size");
  if (help || samFile.empty() || statsOut.empty() || summaryOut.empty()) {
    usage();
  }
  opts.CheckNoLeftovers();

  // Some checks to make sure sensible bounds have been set
  if(colCenter < 0 || colCenter > 1) {
    cerr << "AnalyzeHPErrs - col-center must be in the range [0,1]" << endl;
    exit(1);
  }
  if(rowCenter < 0 || rowCenter > 1) {
    cerr << "AnalyzeHPErrs - row-center must be in the range [0,1]" << endl;
    exit(1);
  }
  if(colSize < 0) {
    cerr << "AnalyzeHPErrs - col-size cannot be negative." << endl;
    exit(1);
  }
  if(rowSize < 0) {
    cerr << "AnalyzeHPErrs - row-size cannot be negative." << endl;
    exit(1);
  }

  // Determine rows & cols if a bfmask file was supplied
  int nRow=0;
  int nCol=0;
  if(!bfmaskFile.empty()) {
    if(GetRowColFromBfmask(bfmaskFile, &nRow, &nCol)) {
      cerr << "AnalyzeHPErrs - problem determining rows & columns from bfmask file " << bfmaskFile << endl;
      exit(1);
    }
  }
	
  // Set up fds object
  FlowDiffStats* fds;
  if (!run1.empty()) {
    SffDiffStats* sds = new SffDiffStats(hpLength, nCol, nRow, qScoreCol, run1, run2);
    if (!pairedOut.empty())
      sds->SetPairedOut(pairedOut);
    fds = dynamic_cast<FlowDiffStats*>(sds);
  }
  else {
    GenomeDiffStats* gds = new GenomeDiffStats(hpLength, nCol, nRow, qScoreCol);
    if(alignmentOut != "") {
      gds->SetAlignmentsOut(alignmentOut);
    }
    if (!flowsOut.empty()) {
      gds->SetFlowsOut(flowsOut);
    }
    fds = dynamic_cast<FlowDiffStats*>(gds);
  }

  if (gcErrFile != "") {
    fds->SetFlowGCOut(gcErrFile);
    fds->SetGCWindowSize(gcWin);
  }

  if(keySeq != "") {
    fds->SetKeySeq(keySeq);
  }
  if(flowOrder != "") {
    fds->SetFlowOrder(flowOrder);
  }
  fds->SetStatsOut(statsOut);

  if (!wellsToUse.empty()) {
    std::vector<int> wells;
    std::vector<bool> use;
    ReadSetFromFile(wellsToUse, 0, wells);
    use.resize(nRow * nCol, false);
    int count = 0;
    ReservoirSample<int> wellSample(sampleSize);
    for (size_t i = 0; i < wells.size(); i++) {
      wellSample.Add(wells[i]);
    }
    wells = wellSample.GetData();
    for (size_t i = 0; i < wells.size(); i++) {
      use[wells[i]] = true;
      count++;
    }
    cout << "Read: " << count << " reads." << endl;
    fds->SetWellToAnalyze(use);
  }


  // Set integer-value row & column bounds
  int minRow=-1;
  int maxRow=-1;
  int minCol=-1;
  int maxCol=-1;
  if(colSize > 0 || rowSize > 0) {
    if(bfmaskFile.empty()) {
      cerr << "AnalyzeHPErrs - must specify bfmask file when restricting row or column ranges" << endl;
      exit(1);
    }
    if(rowSize > 0) {
      minRow = floor(nRow * rowCenter - rowSize / 2.0);
      maxRow = minRow + rowSize;
      minRow = std::max(0,minRow);
      maxRow = std::min(nRow,maxRow);
    }
    if(colSize > 0) {
      minCol = floor(nCol * colCenter - colSize / 2.0);
      maxCol = minCol + colSize;
      minCol = std::max(0,minCol);
      maxCol = std::min(nCol,maxCol);
    }
  }

  if (wellsFile != "") {
    std::vector<int32_t> xSubset, ySubset;
    fds->FillInSubset(samFile, qLength, minRow, maxRow, minCol, maxCol, xSubset, ySubset);
    if(bfmaskFile.empty()) {
      cerr << "AnalyzeHPErrs - must specify bfmask file when specifying wells file" << endl;
      exit(1);
    }
    fds->SetWellsFile(wellsFile, nRow, nCol, numFlows, xSubset, ySubset);
  }
  if (snrFile != "") {
    cout << "Opening snr file: " << snrFile << endl;
    fds->SetSNROut(snrFile);
  }
  if (binnedHpSigFile != "") {
    cout << "Opening binned HP signal file: " << binnedHpSigFile << endl;
    fds->SetBinnedHpSigOut(binnedHpSigFile);
  }
  if (flowErrFile != "") {
    cout << "Opening flow err file: " << flowErrFile << endl;
    fds->SetFlowErrOut(flowErrFile);
  }
  ofstream summary;
  summary.open(summaryOut.c_str());
  cout << "Reading and analyzing alignments from: " << samFile << endl;
  if(minCol > -1 || maxCol > -1)
    cout << "  Restricting to " << (maxCol-minCol) << " cols in the range [" << minCol << "," << maxCol << ")" << endl;
  if(minRow > -1 || maxRow > -1)
    cout << "  Restricting to " << (maxRow-minRow) << " rows in the range [" << minRow << "," << maxRow << ")" << endl;

  fds->SetAlignmentInFile(samFile);
  fds->FilterAndCompare(numFlows, summary, qLength, minRow, maxRow, minCol, maxCol);

  summary.close();
  delete fds;
  cout << "Done." << endl;
  return 0;
}
Пример #20
0
/**
 * Loads the phred (quality value) lookup table and prepares the optional
 * predictor dump.
 *
 * The table file comes from --phred-table-file; when that option is absent a
 * default name is chosen from chip_type (with a ".Recal" variant when recalib
 * is set) and resolved through GetIonConfigFile(), falling back from the
 * ".binary" file to the plain-text file of the same base name.
 *
 * Three on-disk formats are handled:
 *  - HDF5 (group /QvTable with NumPredictors, ThresholdsOfPredictor<i>, Qvs),
 *  - a raw binary layout (4-byte slots: predictor count, per-predictor cut
 *    counts, the float cuts, then one byte per table cell),
 *  - a text table (one row per line, '#' lines skipped).
 *
 * @param opts              parsed command line (phred-table-file, save-predictors)
 * @param chip_type         chip name used to select the default table
 * @param output_directory  directory receiving Predictors.txt when save-predictors is set
 * @param recalib           select the ".Recal" variant of the default table
 *
 * Unrecoverable problems are reported through ION_ABORT.
 */
void PerBaseQual::Init(OptArgs& opts, const string& chip_type, const string &output_directory, bool recalib)
{
  // Drop any table left over from a previous Init() call.
  if (phred_table_) {
    delete [] phred_table_;
    phred_table_ = 0;
  }

  string phred_table_file = opts.GetFirstString ('-', "phred-table-file", "");
  save_predictors_        = opts.GetFirstBoolean('-', "save-predictors", false);

  // Determine the correct phred table filename to use

  if (phred_table_file.empty()) {
    ChipIdDecoder::SetGlobalChipId(chip_type.c_str());
    ChipIdEnum chip_id = ChipIdDecoder::GetGlobalChipId();
    switch (chip_id) {
    case ChipId314:
      phred_table_file = "phredTable.txt_314.binary";
      break;
    case ChipId316:
      phred_table_file = "phredTable.txt_316.binary";
      break;
    case ChipId316v2:
      phred_table_file = "phredTable.txt_318.binary";
      break;
    case ChipId318:
      phred_table_file = "phredTable.txt_318.binary";
      break;
    case ChipId900: // Proton chip
      phred_table_file = "phredTable.txt_900.binary";
      break;
    default:
      phred_table_file = "phredTable.txt_314.binary";
      fprintf(stderr, "PerBaseQual: No default phred table for chip_type=%s, trying %s instead\n",
          chip_type.c_str(), phred_table_file.c_str());
      break;
    }

    if (recalib) {
      // Swap the trailing ".binary" (7 characters) for ".Recal.binary".
      phred_table_file = phred_table_file.substr(0, phred_table_file.length() - 7);
      phred_table_file += ".Recal.binary";
    }

    char* full_filename = GetIonConfigFile(phred_table_file.c_str());
    if (!full_filename) {
      printf("WARNING: cannot find binary phred table file %s, try to use non-binary phred table\n", phred_table_file.c_str());
      phred_table_file = phred_table_file.substr(0, phred_table_file.length() - 7); // get rid of .binary
      char* full_filename2 = GetIonConfigFile(phred_table_file.c_str());
      if (!full_filename2) {
        ION_ABORT("ERROR: Can't find phred table file " + phred_table_file);
      }

      phred_table_file = full_filename2;
      free(full_filename2);
    }
    else {
      phred_table_file = full_filename;
      free(full_filename);
    }
  }

  cout << endl << "PerBaseQual::Init... phred_table_file=" << phred_table_file << endl;

  // The file extension alone decides between the binary and the text loader.
  const bool binTable = hasBinaryExtension(phred_table_file);

  // Load the phred table
  if (binTable) {
    cout << endl << "PerBaseQual::Init... load binary phred_table_file=" << phred_table_file << endl;

    // Both loaders below append to these, so start clean in case Init() is called again.
    offsets_.clear();
    phred_cuts_.clear();

    if (H5Fis_hdf5(phred_table_file.c_str()) > 0) {
      hid_t root = H5Fopen(phred_table_file.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
      if (root < 0) {
        ION_ABORT("ERROR: cannot open HDF5 file " + phred_table_file);
      }

      hid_t grpQvTable = H5Gopen(root, "/QvTable", H5P_DEFAULT);
      if (grpQvTable < 0) {
        H5Fclose(root);
        ION_ABORT("ERROR: fail to open HDF5 group QvTable");
      }

      if (H5Aexists(grpQvTable, "NumPredictors") <= 0) {
        H5Gclose(grpQvTable);
        H5Fclose(root);
        ION_ABORT("ERROR: HDF5 attribute NumPredictors does not exist");
      }

      hid_t attrNumPreds = H5Aopen(grpQvTable, "NumPredictors", H5P_DEFAULT);
      if (attrNumPreds < 0) {
        H5Gclose(grpQvTable);
        H5Fclose(root);
        ION_ABORT("ERROR: fail to open HDF5 attribute NumPredictors");
      }

      unsigned int numPredictors = 0;
      herr_t ret = H5Aread(attrNumPreds, H5T_NATIVE_UINT, &numPredictors);
      H5Aclose(attrNumPreds);
      if (ret < 0 || numPredictors != (unsigned int)kNumPredictors) {
        H5Gclose(grpQvTable);
        H5Fclose(root);
        ION_ABORT("ERROR: HDF5 attribute NumPredictors is wrong");
      }

      char buf[100];
      for (size_t i = 0; i < (size_t)kNumPredictors; ++i) {
        offsets_.push_back(1);

        snprintf(buf, sizeof(buf), "ThresholdsOfPredictor%d", (int)i);

        if (H5Aexists(grpQvTable, buf) <= 0) {
          H5Gclose(grpQvTable);
          H5Fclose(root);
          ION_ABORT("ERROR: HDF5 attribute ThresholdsOfPredictor does not exist");
        }

        hid_t attrCuts = H5Aopen(grpQvTable, buf, H5P_DEFAULT);
        if (attrCuts < 0) {
          H5Gclose(grpQvTable);
          H5Fclose(root);
          ION_ABORT("ERROR: fail to open HDF5 attribute ThresholdsOfPredictor");
        }

        hsize_t size = H5Aget_storage_size(attrCuts);
        size /= sizeof(float);

        // Read straight into a vector (no manual new/delete, nothing to leak on the
        // error path). Keep at least one element so &vCuts[0] is always valid.
        vector<float> vCuts(size > 0 ? (size_t)size : 1);

        ret = H5Aread(attrCuts, H5T_NATIVE_FLOAT, &vCuts[0]);
        H5Aclose(attrCuts);
        if (ret < 0) {
          H5Gclose(grpQvTable);
          H5Fclose(root);
          ION_ABORT("ERROR: fail to read HDF5 attribute ThresholdsOfPredictor");
        }

        vCuts.resize((size_t)size);
        phred_cuts_.push_back(vCuts);
      }

      hid_t dsQvs = H5Dopen(grpQvTable, "Qvs", H5P_DEFAULT);
      if (dsQvs < 0) {
        H5Gclose(grpQvTable);
        H5Fclose(root);
        ION_ABORT("ERROR: fail to open HDF5 dataset Qvs");
      }

      // NOTE(review): the buffer is sized from the dataset's storage size; this
      // presumably matches the element count only for an uncompressed uchar
      // dataset - confirm against the table writer.
      hsize_t tbSize = H5Dget_storage_size(dsQvs);

      phred_table_ = new unsigned char[tbSize];

      ret = H5Dread(dsQvs, H5T_NATIVE_UCHAR, H5S_ALL, H5S_ALL, H5P_DEFAULT, phred_table_);
      H5Dclose(dsQvs);
      H5Gclose(grpQvTable);
      H5Fclose(root);
      if (ret < 0) {
        delete [] phred_table_;
        phred_table_ = 0;

        ION_ABORT("ERROR: fail to read HDF5 dataset Qvs");
      }
    }
    else {
      printf("WARNING: binary phred table file %s is not a HDF5 file, try binary file mode.\n", phred_table_file.c_str());
      ifstream source;
      source.open(phred_table_file.c_str(), ios::in|ios::binary|ios::ate);
      if (!source.is_open()) {
        ION_ABORT("ERROR: Cannot open file: " + phred_table_file);
      }

      // Opened at the end (ios::ate), so tellg() is the file size (or -1 on error).
      const long totalSize = source.tellg();

      // Fixed part of the header: predictor count + one cut count per predictor,
      // each stored in a 4-byte slot.
      const long fixedHeaderSize = 4L * (1 + kNumPredictors);
      if (totalSize < fixedHeaderSize) {
        ION_ABORT("ERROR: Wrong QV table size");
      }

      vector<char> tbBlock(totalSize);
      source.seekg (0, ios::beg);
      source.read (&tbBlock[0], totalSize);
      const bool readOk = !source.fail();
      source.close();
      if (!readOk) {
        ION_ABORT("ERROR: Cannot read file: " + phred_table_file);
      }

      long headerSize = 0;
      const char* ptr = &tbBlock[0];

      // Only the first byte of each 4-byte slot carries the value. Read it as
      // unsigned so counts of 128..255 do not turn negative.
      const int numPredictors = (unsigned char)ptr[0]; //kNumPredictors
      if (numPredictors != kNumPredictors) {
        ION_ABORT("ERROR: Wrong number of predictors load from " + phred_table_file);
      }

      ptr += 4;
      headerSize += 4;

      vector<size_t> vNumCuts(kNumPredictors, 0);
      long totalCuts = 0;
      for (int i = 0; i < kNumPredictors; ++i) {
        vNumCuts[i] = (unsigned char)ptr[0];
        totalCuts += (long)vNumCuts[i];
        ptr += 4;
        headerSize += 4;

        offsets_.push_back(1);
      }

      // Make sure all the float cuts really are inside the buffer before touching them.
      if (headerSize + 4 * totalCuts > totalSize) {
        ION_ABORT("ERROR: Wrong QV table size");
      }

      long tbSize = 1;
      for (int i = 0; i < kNumPredictors; ++i) {
        vector<float> vCuts;
        tbSize *= vNumCuts[i];
        for (size_t j = 0; j < vNumCuts[i]; ++j) {
          float tmp;
          memcpy(&tmp, ptr, 4);
          vCuts.push_back(tmp);
          ptr += 4;
          headerSize += 4;
        }

        phred_cuts_.push_back(vCuts);
      }

      // Whatever follows the header must be exactly one byte per table cell.
      if (tbSize != (totalSize - headerSize)) {
        ION_ABORT("ERROR: Wrong QV table size");
      }

      phred_table_ = new unsigned char[tbSize];
      memcpy(phred_table_, ptr, tbSize * sizeof(unsigned char));
    }

    // Turn the per-predictor cut counts into strides: offsets_[i] becomes the
    // product of the cut counts of predictors i+1 .. kNumPredictors-1.
    for (size_t i = kNumPredictors - 2; i > 0; --i) {
      offsets_[i] *= phred_cuts_[i + 1].size();
      offsets_[i - 1] = offsets_[i];
    }
    offsets_[0] *= phred_cuts_[1].size();
  }
  else {
    ifstream source;
    source.open(phred_table_file.c_str());
    if (!source.is_open()) {
      ION_ABORT("ERROR: Cannot open file: " + phred_table_file);
    }

    // Rows are appended below, so start clean in case Init() is called again.
    for (int k = 0; k < kNumPredictors; ++k)
      phred_thresholds_[k].clear();
    phred_quality_.clear();

    string line;
    while (getline(source, line)) {
      // The first blank line ends the table.
      if (line.empty())
        break;

      if (line[0] == '#')
        continue;

      stringstream strs(line);
      float temp = 0;
      for (int k = 0; k < kNumPredictors; ++k) {
        strs >> temp;
        phred_thresholds_[k].push_back(temp);
      }
      strs >> temp; //skip n-th entry
      strs >> temp;
      phred_quality_.push_back(temp);
    }

    source.close();

    // max_element() on an empty range returns end(); dereferencing it is undefined.
    if (phred_quality_.empty()) {
      ION_ABORT("ERROR: No phred table entries found in " + phred_table_file);
    }

    for (int k = 0; k < kNumPredictors; ++k)
      phred_thresholds_max_[k] = *max_element(phred_thresholds_[k].begin(), phred_thresholds_[k].end());
  }

  // Prepare for predictor dump here

  if (save_predictors_) {
    string predictors_filename = output_directory + "/Predictors.txt";
    cout << endl << "Saving PerBaseQual predictors to file " << predictors_filename << endl << endl;
    predictor_dump_.open(predictors_filename.c_str());
    if (!predictor_dump_.is_open()) {
      ION_ABORT("ERROR: Cannot open file: " + predictors_filename);
    }
  }
}
Пример #21
0
int main(int argc, const char* argv[])
{
  printf ("tvcvalidator %s-%s (%s) - Prototype tvc validation tool\n\n",
      IonVersion::GetVersion().c_str(), IonVersion::GetRelease().c_str(), IonVersion::GetSvnRev().c_str());

  if (argc == 1) {
    VariantValidatorHelp();
    return 1;
  }

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);

  if (opts.GetFirstBoolean('v', "version", false)) {
    return 0;
  }
  if (opts.GetFirstBoolean('h', "help", false)) {
    VariantValidatorHelp();
    return 0;
  }

  string input_vcf_filename = opts.GetFirstString ('i', "input-vcf", "");
  string truth_filename = opts.GetFirstString ('t', "truth-file", "");
  string truth_dir = opts.GetFirstString ('d', "truth-dir", "/results/plugins/validateVariantCaller/files");

  // TODO: reference optional, only used to verify reference allele in input-vcf and truth files
  //string reference_filename = opts.GetFirstString ('r', "reference", "");

  opts.CheckNoLeftovers();


  //
  // Step 1. Load input VCF file into memory
  //

  if (input_vcf_filename.empty()) {
    VariantValidatorHelp();
    cerr << "ERROR: Input VCF file not specified " << endl;
    return 1;
  }

  VariantCallerResults results_vcf;
  results_vcf.load_vcf(input_vcf_filename);
  printf("Loaded VCF %s with %d variant calls\n", input_vcf_filename.c_str(), (int)results_vcf.variants.size());



  //
  // Step 2. Parse truth files, compare them to the input vcf, and compute match scores
  //

  if (not truth_filename.empty()) {
    ValidatorTruth truth;
    truth.ReadTruthFile(truth_filename);
    truth.CompareToCalls(results_vcf);
    return 0;
  }

  truth_dir += "/*.bed";
  glob_t glob_result;
  glob(truth_dir.c_str(), GLOB_TILDE, NULL, &glob_result);
  for(unsigned int i = 0; i < glob_result.gl_pathc; ++i) {

    ValidatorTruth truth;
    truth.ReadTruthFile(string(glob_result.gl_pathv[i]));
    truth.CompareToCalls(results_vcf);

  }
  globfree(&glob_result);


  return 0;
}
Пример #22
0
int main (int argc, const char *argv[])
{
  printf ("------------- bamrealignment --------------\n");

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  vector<int> score_vals(4);

  string input_bam  = opts.GetFirstString  ('i', "input", "");
  string output_bam = opts.GetFirstString  ('o', "output", "");
  opts.GetOption(score_vals, "4,-6,-5,-2", 's', "scores");
  int    clipping   = opts.GetFirstInt     ('c', "clipping", 2);
  bool   anchors    = opts.GetFirstBoolean ('a', "anchors", true);
  int    bandwidth  = opts.GetFirstInt     ('b', "bandwidth", 10);
  bool   verbose    = opts.GetFirstBoolean ('v', "verbose", false);
  bool   debug      = opts.GetFirstBoolean ('d', "debug", false);
  int    format     = opts.GetFirstInt     ('f', "format", 1);
  int  num_threads  = opts.GetFirstInt     ('t', "threads", 8);
  string log_fname  = opts.GetFirstString  ('l', "log", "");
  

  if (input_bam.empty() or output_bam.empty())
    return PrintHelp();

  opts.CheckNoLeftovers();

  std::ofstream logf;
  if (log_fname.size ())
  {
    logf.open (log_fname.c_str ());
    if (!logf.is_open ())
    {
      fprintf (stderr, "bamrealignment: Failed to open log file %s\n", log_fname.c_str());
      return 1;
    }
  }

  BamReader reader;
  if (!reader.Open(input_bam)) {
    fprintf(stderr, "bamrealignment: Failed to open input file %s\n", input_bam.c_str());
    return 1;
  }

  SamHeader header = reader.GetHeader();
  RefVector refs   = reader.GetReferenceData();

  BamWriter writer;
  writer.SetNumThreads(num_threads);
  if (format == 1)
    writer.SetCompressionMode(BamWriter::Uncompressed);
  else
    writer.SetCompressionMode(BamWriter::Compressed);

  if (!writer.Open(output_bam, header, refs)) {
    fprintf(stderr, "bamrealignment: Failed to open output file %s\n", output_bam.c_str());
    return 1;
  }


  // The meat starts here ------------------------------------

  if (verbose)
    cout << "Verbose option is activated, each alignment will print to screen." << endl
         << "  After a read hit RETURN to continue to the next one," << endl
         << "  or press q RETURN to quit the program," << endl
         << "  or press s Return to silence verbose," << endl
         << "  or press c RETURN to continue printing without further prompt." << endl << endl;

  unsigned int readcounter = 0;
  unsigned int mapped_readcounter = 0;
  unsigned int realigned_readcounter = 0;
  unsigned int modified_alignment_readcounter = 0;
  unsigned int pos_update_readcounter = 0;
  unsigned int failed_clip_realigned_readcount = 0;
  
  unsigned int already_perfect_readcount = 0;
  
  unsigned int bad_md_tag_readcount = 0;
  unsigned int error_recreate_ref_readcount = 0;
  unsigned int error_clip_anchor_readcount = 0;
  unsigned int error_sw_readcount = 0;
  unsigned int error_unclip_readcount = 0;
  
  unsigned int start_position_shift;
  int orig_position;
  int new_position;

  string  md_tag, new_md_tag, input = "x";
  vector<CigarOp>    new_cigar_data;
  vector<MDelement>  new_md_data;
  bool position_shift = false;
  time_t start_time = time(NULL);

  Realigner aligner;
  aligner.verbose_ = verbose;
  aligner.debug_   = debug;
  if (!aligner.SetScores(score_vals))
    cout << "bamrealignment: Four scores need to be provided: match, mismatch, gap open, gap extend score!" << endl;

  aligner.SetAlignmentBandwidth(bandwidth);

  BamAlignment alignment;
  while(reader.GetNextAlignment(alignment)){
    readcounter ++;
    position_shift = false;
    
    if ( (readcounter % 100000) == 0 )
       cout << "Processed " << readcounter << " reads. Elapsed time: " << (time(NULL) - start_time) << endl;

    if (alignment.IsMapped()) {
      
      
      
      orig_position = alignment.Position;
      mapped_readcounter++;
      aligner.SetClipping(clipping, !alignment.IsReverseStrand());
      if (aligner.verbose_) {
    	cout << endl;
        if (alignment.IsReverseStrand())
          cout << "The read is from the reverse strand." << endl;
        else
          cout << "The read is from the forward strand." << endl;
      }

      if (!alignment.GetTag("MD", md_tag)) {
    	if (aligner.verbose_)
          cout << "Warning: Skipping read " << alignment.Name << ". It is mapped but missing MD tag." << endl;
	if (logf.is_open ())
	  logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MISSMD" << '\n';
	bad_md_tag_readcount++;
      } else if (aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors)) {
	bool clipfail = false;
	if (Realigner::CR_ERR_CLIP_ANCHOR == aligner.GetCreateRefError ())
	{
	  clipfail = true;
	  failed_clip_realigned_readcount ++;
	}

        if (!aligner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) {
          if (aligner.verbose_)
            cout << "Error in the alignment! Not updating read information." << endl;
	  if (logf.is_open ())
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "SWERR" << '\n';
	  error_sw_readcount++;
          writer.SaveAlignment(alignment);  // Write alignment unchanged
          continue;
        }

        if (!aligner.addClippedBasesToTags(new_cigar_data, new_md_data, alignment.QueryBases.size())) {
          if (aligner.verbose_)
            cout << "Error when adding clipped anchors back to tags! Not updating read information." << endl;
	  if (logf.is_open ())
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNCLIPERR" << '\n';
          writer.SaveAlignment(alignment);  // Write alignment unchanged
	  error_unclip_readcount ++;
          continue;
        }
        new_md_tag = aligner.GetMDstring(new_md_data);
        realigned_readcounter++;

        // adjust start position of read
        if (!aligner.LeftAnchorClipped() and start_position_shift != 0) {
          new_position = aligner.updateReadPosition(alignment.CigarData, (int)start_position_shift, alignment.Position);
          if (new_position != alignment.Position) {
            pos_update_readcounter++;
            position_shift = true;
            alignment.Position = new_position;
          }
        }
        
        if (position_shift || alignment.CigarData.size () != new_cigar_data.size () || md_tag != new_md_tag)
	{
	  if (logf.is_open ())
	  {
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MOD";
	    if (position_shift)
	      logf << "-SHIFT";
	    if (clipfail)
	      logf << " NOCLIP";
	    logf << '\n';
	  }
	  modified_alignment_readcounter++;
	}
	else
	{
            if (logf.is_open ())
	    {
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNMOD";
              if (clipfail)
	        logf << " NOCLIP";
	      logf << '\n';
	    }
	}

        if (aligner.verbose_){
          cout << alignment.Name << endl;
          cout << "------------------------------------------" << endl;
          // Wait for input to continue or quit program
          if (input.size() == 0)
            input = 'x';
          else if (input[0] != 'c' and input[0] != 'C')
            getline(cin, input);
          if (input.size()>0){
            if (input[0] == 'q' or input[0] == 'Q')
              return 1;
            else if (input[0] == 's' or input[0] == 'S')
              aligner.verbose_ = false;
          }
        }

        // Finally update alignment information
        alignment.CigarData = new_cigar_data;
        alignment.EditTag("MD", "Z" , new_md_tag);

      } // end of CreateRef else if
      else {
	switch (aligner.GetCreateRefError ())
	{
	  case Realigner::CR_ERR_RECREATE_REF:
            if (logf.is_open ())
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "RECRERR" << '\n';
	    error_recreate_ref_readcount++;
	    break;
	  case Realigner::CR_ERR_CLIP_ANCHOR:
            if (logf.is_open ())
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "CLIPERR" << '\n';
	    error_clip_anchor_readcount++;
	    break;
	  default:
		  //  On a good run this writes way too many reads to the log file - don't want to create a too large txt file
          //  if (logf.is_open ())
	      //logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "PERFECT" << '\n';
	    already_perfect_readcount++;
	    break;
	}
	
	if (aligner.verbose_) {
	  cout << alignment.Name << endl;
	  cout << "------------------------------------------" << endl;
	  // Wait for input to continue or quit program
	  if (input.size() == 0)
	    input = 'x';
	  else if (input[0] != 'c' and input[0] != 'C')
	    getline(cin, input);
	  if (input.size()>0){
	    if (input[0] == 'q' or input[0] == 'Q')
	      return 1;
	    else if (input[0] == 's' or input[0] == 'S')
	      aligner.verbose_ = false;
	  }
	}
      }

      // --- Debug output for Rajesh ---
      if (debug && aligner.invalid_cigar_in_input) {
        aligner.verbose_ = true;
        cout << "Invalid cigar string / md tag pair in read " << alignment.Name << endl;
        // Rerun reference generation to display error
        aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors);

        aligner.verbose_ = verbose;
        aligner.invalid_cigar_in_input = false;
      }
      // --- --- ---


    } // end of if isMapped

    writer.SaveAlignment(alignment);

  } // end while loop over reads

  if (aligner.invalid_cigar_in_input)
    cerr << "WARNING bamrealignment: There were invalid cigar string / md tag pairs in the input bam file." << endl;

  // ----------------------------------------------------------------
  // program end -- output summary information
  cout   << "                            File: " << input_bam    << endl
         << "                     Total reads: " << readcounter  << endl
         << "                    Mapped reads: " << mapped_readcounter << endl;
  if (bad_md_tag_readcount)
    cout << "            Skipped: bad MD tags: " << bad_md_tag_readcount << endl;
  if (error_recreate_ref_readcount)
    cout << " Skipped: unable to recreate ref: " << error_recreate_ref_readcount << endl;
  if (error_clip_anchor_readcount)
    cout << "  Skipped: error clipping anchor: " << error_clip_anchor_readcount << endl;
  cout  <<  "       Skipped:  already perfect: " << already_perfect_readcount << endl
        <<  "           Total reads realigned: " << mapped_readcounter - already_perfect_readcount - bad_md_tag_readcount - error_recreate_ref_readcount - error_clip_anchor_readcount << endl;
  if (failed_clip_realigned_readcount)
    cout << "                      (including  " << failed_clip_realigned_readcount << " that failed to clip)" << endl;
  if (error_sw_readcount)
    cout << " Failed to complete SW alignment: " << error_sw_readcount << endl;
  if (error_unclip_readcount)
    cout << "         Failed to unclip anchor: " << error_unclip_readcount << endl;
  cout   << "           Succesfully realigned: " << realigned_readcounter << endl
         << "             Modified alignments: " << modified_alignment_readcounter << endl
         << "                Shifted position: " << pos_update_readcounter << endl;
  
  cout << "Processing time: " << (time(NULL)-start_time) << " seconds." << endl;
  cout << "INFO: The output BAM file may be unsorted." << endl;
  cout << "------------------------------------------" << endl;
  return 0;
}
Пример #23
0
/**
 * Calibration entry point.
 *
 * Step 1 parses the command line into a CalibrationContext and constructs the
 * master histogram and linear calibration modules. Step 2 runs
 * calib_context.num_threads CalibrationWorker threads to completion. Step 3
 * builds the models, exports them together with timing information to JSON and
 * saves the result to calib_context.filename_json.
 *
 * @return EXIT_SUCCESS; exits with EXIT_FAILURE when a worker thread cannot be started.
 */
int main (int argc, const char *argv[])
{
  time_t program_start_time;
  time(&program_start_time);
  Json::Value calibration_json(Json::objectValue);
  DumpStartingStateOfProgram (argc,argv,program_start_time, calibration_json["Calibration"]);

  //
  // Step 1. Process command line options
  //

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);

  CalibrationContext calib_context;
  if (not calib_context.InitializeFromOpts(opts)){
    PrintHelp_CalModules();
  }

  HistogramCalibration master_histogram(opts, calib_context);
  calib_context.hist_calibration_master = &master_histogram;

  LinearCalibrationModel master_linear_model(opts, calib_context);
  calib_context.linear_model_master = &master_linear_model;

  opts.CheckNoLeftovers();

  //
  // Step 2. Execute threaded calibration
  //

  time_t calibration_start_time;
  time(&calibration_start_time);

  pthread_mutex_init(&calib_context.read_mutex,  NULL);
  pthread_mutex_init(&calib_context.write_mutex, NULL);

  // std::vector instead of a variable-length array: VLAs are a compiler
  // extension in C++ and are undefined for a non-positive length.
  const int num_workers = calib_context.num_threads > 0 ? calib_context.num_threads : 0;
  std::vector<pthread_t> worker_id(num_workers);

  for (int worker = 0; worker < num_workers; worker++) {
    if (pthread_create(&worker_id[worker], NULL, CalibrationWorker, &calib_context)) {
      cerr << "Calibration ERROR: Problem starting thread" << endl;
      exit (EXIT_FAILURE);
    }
  }

  for (int worker = 0; worker < num_workers; worker++)
    pthread_join(worker_id[worker], NULL);

  pthread_mutex_destroy(&calib_context.read_mutex);
  pthread_mutex_destroy(&calib_context.write_mutex);

  time_t calibration_end_time;
  time(&calibration_end_time);


  //
  // Step 3. Create models, write output, and close modules
  //

  // HP histogram calibration
  if (master_histogram.CreateCalibrationModel())
    master_histogram.ExportModelToJson(calibration_json["HPHistogram"]);

  // Linear Model
  if (master_linear_model.CreateCalibrationModel())
    master_linear_model.ExportModelToJson(calibration_json["LinearModel"], "");


  // Transfer stuff from calibration context and close bam reader
  calib_context.Close(calibration_json["Calibration"]);

  time_t program_end_time;
  time(&program_end_time);

  calibration_json["Calibration"]["end_time"] = get_time_iso_string(program_end_time);
  calibration_json["Calibration"]["total_duration"] = (Json::Int)difftime(program_end_time,program_start_time);
  calibration_json["Calibration"]["calibration_duration"] = (Json::Int)difftime(calibration_end_time,calibration_start_time);

  SaveJson(calibration_json, calib_context.filename_json);
  return EXIT_SUCCESS;
}
Пример #24
0
// Entry point of the acquisition-to-sdat trace converter.
//
// Overview of what the code below does:
//   1. Parses command line options into `config`, `inputDir` and `outputDir`
//      and calls usage() when help is requested or a directory is missing.
//   2. Picks a trace compressor for the serializer from --compression and
//      aborts (ION_ABORT) on an unrecognized name.
//   3. Loads two beadfind images (beadfind_pre_0003.dat and
//      beadfind_pre_0001.dat), computes t0 priors for both and merges the
//      per-well and per-region t0 values.
//   4. Converts acq_0000.dat single threaded: estimates sigma / t_mid_nuc,
//      optionally overrides them (from --region-param or the hard estimates),
//      writes acq_0000.sdat, reads it back and prints difference statistics.
//   5. Queues the remaining flows (1 .. numFlows-1) as CreateSDat jobs on a
//      PJobQueue and waits for them.
//
// There is no explicit return statement; in C++ falling off the end of main
// is equivalent to `return 0`.
int main(int argc, const char *argv[]) {
  OptArgs opts;  
  TraceConfig config;
  string inputDir;
  string outputDir;
  bool help;

  // Each GetOption call takes: destination, default (as a string), short
  // option character ('-' is used where only a long name is given) and long name.
  opts.ParseCmdLine(argc, argv);
  opts.GetOption(inputDir, "", '-', "source-dir");
  opts.GetOption(outputDir, "", '-', "output-dir");
  opts.GetOption(config.precision, "5", '-', "precision");
  opts.GetOption(config.numEvec, "7", '-', "num-evec");
  opts.GetOption(config.doDebug, "false", '-', "debug-files");
  opts.GetOption(config.compressionType, "delta", '-', "compression");
  opts.GetOption(config.numFlows, "-1", '-', "num-flows");
  opts.GetOption(config.numCores, "6", '-', "num-cores");
  opts.GetOption(config.errCon,"0",'-',"err-con");
  opts.GetOption(config.rankGood,"0",'-',"rank-good");
  opts.GetOption(config.pivot,"0",'-',"pivot");
  opts.GetOption(help, "false", 'h', "help");
  opts.GetOption(config.isThumbnail, "false", '-', "thumbnail");
  opts.GetOption(config.use_hard_est, "false",'-', "use-hard-est");
  opts.GetOption(config.t0_hard, "0", '-', "t0-hard");
  opts.GetOption(config.tmid_hard, "0", '-', "tmid-hard");
  opts.GetOption(config.sigma_hard, "0", '-', "sigma-hard");
  opts.GetOption(config.row_step, "100", '-', "row-step");
  opts.GetOption(config.col_step, "100", '-', "col-step");
  opts.GetOption(config.bg_param, "", '-', "region-param");
  opts.GetOption(config.grind_acq_0, "0", '-', "grind-acq0");
  // NOTE(review): the code after this check relies on non-empty directories,
  // so usage() is presumably expected to terminate the process - confirm.
  if(help || inputDir.empty() || outputDir.empty()) {
    usage();
  }
  char *explog_path = NULL;
  explog_path = MakeExpLogPathFromDatDir(inputDir.c_str());
  // A negative --num-flows (the default is -1) means: take the flow count
  // from the experiment log.
  int numFlows = config.numFlows;
  if (numFlows < 0) { 
    numFlows = GetTotalFlows(explog_path); 
  }

  // Check and setup our compression type
  // NOTE(review): every compressor below is allocated with `new` and handed to
  // the serializer; who deletes it is not visible from this function.
  TraceChunkSerializer serializer;
  serializer.SetRecklessAbandon(true);
  if (config.compressionType == "svd") {
    SvdDatCompress *dc = new SvdDatCompress(config.precision, config.numEvec);
    serializer.SetCompressor(dc);
    cout << "Doing lossy svd compression. (" << serializer.GetCompressionType() << ")" << endl;
  }
  // else if (config.compressionType == "svd+") {
  //   SvdDatCompressPlus *dc = new SvdDatCompressPlus();
  //   serializer.SetCompressor(dc);
  //   cout << "Doing lossy svd compression. (" << serializer.GetCompressionType() << ")" << endl;
  // }
  // else if (config.compressionType == "svd++") {
  //   SvdDatCompressPlusPlus *dc = new SvdDatCompressPlusPlus();
  //   if (config.errCon >0 )
  //     dc->SetErrCon(config.errCon);
  //   if (config.rankGood > 0 )
  //     dc->SetRankGood(config.rankGood);
  //   if (config.pivot > 0)
  //     dc->SetPivot(config.pivot);
  //   serializer.SetCompressor(dc);
  //   cout << "Doing lossy svd compression for good traces and delta for bad ones. (" << serializer.GetCompressionType() << ")" << endl;
  // }
  else if (config.compressionType == "delta") {
    VencoLossless *venco = new VencoLossless();
    serializer.SetCompressor(venco);
    cout << "Doing lossless delta compression. (" << serializer.GetCompressionType() << ")" << endl;
  }
  else if (config.compressionType == "delta-plain") {
    DeltaComp *delta = new DeltaComp();
    serializer.SetCompressor(delta);
    cout << "Doing lossless delta plain compression. (" << serializer.GetCompressionType() << ")" << endl;
  }
  else if (config.compressionType == "delta-plain-fst") {
    DeltaCompFst *delta = new DeltaCompFst();
    serializer.SetCompressor(delta);
    cout << "Doing lossless delta plain fast compression. (" << serializer.GetCompressionType() << ")" << endl;
  }
  else if (config.compressionType == "delta-plain-fst-smx") {
   DeltaCompFstSmX *delta = new DeltaCompFstSmX();
    serializer.SetCompressor(delta);
    cout << "Doing lossless delta plain fast compression. (" << serializer.GetCompressionType() << ")" << endl;
  }
  else if (config.compressionType == "none") {
    TraceCompressor *vanilla = new TraceNoCompress();
    serializer.SetCompressor(vanilla);
    cout << "Doing no compression. (" << serializer.GetCompressionType() << ")" << endl;
  }
  else {
    ION_ABORT("Don't recognize compression type: " + config.compressionType);
  }

  // The chip id is read from the experiment log before the path buffer is
  // released; explog_path is not used after this point.
  const char *id = GetChipId(explog_path);
  if (explog_path) free (explog_path);
  ChipIdDecoder::SetGlobalChipId(id);
  ImageTransformer::CalibrateChannelXTCorrection(inputDir.c_str(), "lsrowimage.dat");

  // First beadfind image. The pinned-well mask is sized from this image and
  // shared by everything that follows.
  // NOTE(review): the result of LoadRaw is not checked here (nor for the other
  // images loaded below).
  Image bfImg1;
  string bfFile = inputDir + "/beadfind_pre_0003.dat";
  bfImg1.LoadRaw(bfFile.c_str());
  const RawImage *bf1raw = bfImg1.GetImage(); 
  Mask mask(bf1raw->cols, bf1raw->rows);
  ImageTransformer::XTChannelCorrect(bfImg1.raw,bfImg1.results_folder);

  bfImg1.FilterForPinned (&mask, MaskEmpty, false);

  // Second beadfind image, filtered into the same mask.
  // NOTE(review): the correction below is given bfImg1.results_folder rather
  // than bfImg2.results_folder - confirm this is intended.
  Image bfImg2;
  string bfFile2 = inputDir + "/beadfind_pre_0001.dat";
  bfImg2.LoadRaw(bfFile2.c_str());
  ImageTransformer::XTChannelCorrect(bfImg2.raw,bfImg1.results_folder);

  bfImg2.FilterForPinned (&mask, MaskEmpty, false);
  const RawImage *bf2raw = bfImg2.GetImage(); 


  GridMesh<T0Prior> t0Prior;
  T0Calc bfT0;
  /* Calc t0 and get prior. */
  cout << "Doing beadfind t0" << endl;
  GenerateBfT0Prior(config, bf1raw->image, bf1raw->baseFrameRate, bf1raw->rows, bf1raw->cols,
                    bf1raw->frames, bf1raw->timestamps,
                    config.row_step, config.col_step, &mask, bfT0, t0Prior);

  // Same computation for the second beadfind image, into separate objects.
  GridMesh<T0Prior> t0Prior2;
  T0Calc bfT02;
  GenerateBfT0Prior(config, bf2raw->image, bf2raw->baseFrameRate, bf2raw->rows, bf2raw->cols,
                    bf2raw->frames, bf2raw->timestamps,
                    config.row_step, config.col_step, &mask, bfT02, t0Prior2);

  // The config fields passed to Init below are not set by any GetOption call
  // in this function, so they keep whatever values TraceConfig gives them.
  SigmaTMidNucEstimation sigmaEst;
  sigmaEst.Init(config.rate_sigma_intercept, config.rate_sigma_slope, 
                config.t0_tmid_intercept, config.t0_tmid_slope, bf1raw->baseFrameRate);
  GridMesh<SigmaEst> sigmaTMid;
  // bf1raw / bf2raw are not dereferenced after the images are closed.
  bfImg1.Close();
  bfImg2.Close();

  // Calculate individual well t0 by looking at neighboring regions
  vector<float> wellT0;
  bfT0.CalcIndividualT0(wellT0, 0);
  vector<float> wellT02;
  bfT02.CalcIndividualT0(wellT02, 0);
  // Merge the two estimates per well: average them when both are positive,
  // otherwise keep the larger one.
  // NOTE(review): indexes wellT02 with wellT0's size; assumes both vectors
  // have the same length.
  for (size_t i =0; i< wellT0.size();i++) {
    if (wellT0[i] > 0 && wellT02[i] > 0) {
      wellT0[i] = (wellT0[i] + wellT02[i])/2.0f;
    }
    else {
      wellT0[i] = max(wellT0[i], wellT02[i]);
    }
  }

  // Average the region level t0, should we do this first and then just do single well level?
  // Same merge rule as above, applied per region; the result is stored in bfT0.
  for (size_t bIx = 0; bIx < bfT0.GetNumRegions(); bIx++) {
    double t1 = bfT0.GetT0(bIx);
    double t2 = bfT02.GetT0(bIx);
    if (t1 > 0 && t2 > 0) {
      t1 = (t1 + t2)/2.0;
    }
    else {
      t1 = max(t1,t2);
    }
    bfT0.SetT0(bIx, t1);
  }

  // Single thread first dat
  // The loop bound is 1, so the body runs exactly once with datIx == 0 and
  // every `datIx == 0` test inside it is true.
  for (size_t datIx = 0; datIx < 1; ++datIx) {
    cout << "Doing: " << datIx << endl;
    char buffer[2048];
    snprintf(buffer, sizeof(buffer), "%s/acq_%.4d.dat", inputDir.c_str(), (int) datIx);
    string datFile = buffer;
    /* Use prior to calculate t0 and slope. */
    Image datImg;
    T0Calc t0;
    datImg.LoadRaw(datFile.c_str());
    //    ImageTransformer::XTChannelCorrect(datImg.raw,datImg.results_folder);
    const RawImage *datRaw = datImg.GetImage(); 

    /* Estimate sigma and t_mid_nuc */
    if (datIx == 0) {
      cout << "Doing acquisition t0" << endl;

      GenerateAcqT0Prior(config, datRaw->image, datRaw->baseFrameRate, datRaw->rows, datRaw->cols,
                         datRaw->frames, datRaw->timestamps,
                         config.row_step, config.col_step, &mask, t0, t0Prior);
      
      ClockTimer timer;
      cout << "Estimating sigma." << endl;
      sigmaTMid.Init(datRaw->rows, datRaw->cols, config.row_step, config.col_step);
      // Replace the acquisition's region t0 values with the merged beadfind ones.
      for (size_t bIx = 0; bIx < t0.GetNumRegions(); bIx++) {
        t0.SetT0(bIx, bfT0.GetT0(bIx));
      }
      int neighbors = 2;
      if (config.isThumbnail) {
        cout << "Doing thumbnail version of slope." << endl;
        neighbors = 1;
      }
      EstimateSigmaValue(t0, sigmaEst, sigmaTMid, neighbors);
      timer.PrintMilliSeconds(cout,"Sigma Est took:");
      string sigmaFile = outputDir + "/sigma_tmid_est.txt";
      OutputSigmaTmidEstimates(sigmaTMid, sigmaFile.c_str());
    }

    /* For each region do shifting */
    // The ShiftTraces call is commented out, so nothing happens between
    // starting the timer and printing "Shift took:".
    ClockTimer timer;
    cout << "Shifting traces" << endl;
    timer.StartTimer();
    //    ShiftTraces(bfT0, wellT0, datRaw->frames, datRaw->baseFrameRate, datRaw->timestamps, datRaw->image);
    timer.PrintMilliSeconds(cout,"Shift took:");
    if (!config.bg_param.empty()) {
      // Override the estimated tmid / sigma with values read from the file
      // given by --region-param.
      DataCube<int> rowsCols;
      // NOTE(review): tmidSigma is only referenced by the commented-out code below.
      DataCube<float> tmidSigma;
      DataCube<float> fitTmidSigma;
      string path = config.bg_param + ":/region/region_location";
      if (!H5File::ReadDataCube(path, rowsCols)) {
        ION_ABORT("Couldn't read file: " + path);
      }
      path = config.bg_param + ":/region/region_init_param";
      if (!H5File::ReadDataCube(path, fitTmidSigma)) {
        ION_ABORT("Couldn't read file: " + path);
      }
      for (size_t i = 0; i < rowsCols.GetNumX(); i++) {
        // Index 1 of the location cube is used as the row, index 0 as the column.
        int row = rowsCols.At(i,1,0);
        int col = rowsCols.At(i,0,0);
        SigmaEst &est = sigmaTMid.GetItemByRowCol(row, col);
        // Index 0 of the parameter cube is used as tmid, index 1 as sigma.
        float tmid_est =  fitTmidSigma.At(i,0,0);
        float sigma_est = fitTmidSigma.At(i,1,0);
        est.mTMidNuc = tmid_est;
        est.mSigma = sigma_est;
      }
      string fitSigmaFile = outputDir + "/bg_fit_sigma_tmid_est.txt";
      OutputSigmaTmidEstimates(sigmaTMid, fitSigmaFile.c_str());

      // path = config.bg_param + ":/region/region_init_param";
      // if (!H5File::ReadMatrix(path, tmidSigma)) {
      //   ION_ABORT("Couldn't read file: " + path);
      // }
      // for (size_t i = 0; i < rowsCols.n_rows; i++) {
      //   int row = rowsCols.at(i,0);
      //   int col = rowsCols.at(i,1);
      //   SigmaEst &est = sigmaTMid.GetItemByRowCol(row, col);
      //   float tmid_est =  tmidSigma.at(i,0);
      //   float sigma_est = tmidSigma.at(i,1);
      //   est.mTMidNuc = tmid_est;
      //   est.mSigma = sigma_est;
      // }
      // string sigmaFile = outputDir + "/supplied_sigma_tmid_est.txt";
      // OutputSigmaTmidEstimates(sigmaTMid, sigmaFile.c_str());
    }
    else if (config.use_hard_est) {
      // Only considered when no --region-param was given: force every region
      // and every sigma/tmid bin to the values supplied on the command line.
      for (size_t i = 0; i < bfT0.GetNumRegions(); i++) {
        bfT0.SetT0(i,config.t0_hard * datRaw->baseFrameRate + config.time_start_slop);
      }
      for (size_t i = 0; i < sigmaTMid.GetNumBin(); i++) {
        SigmaEst &est = sigmaTMid.GetItem(i);
        est.mTMidNuc = config.tmid_hard;
        est.mSigma = config.sigma_hard;
        est.mT0 = config.t0_hard;
      }
    }
    /* Use t0 and sigma to get the time compression bkgModel wants. */
    cout << "Generating chunks" << endl;
    //    GridMesh<TraceChunk> traceChunks;
    SynchDat sdat;
    if (datIx == 0  && config.grind_acq_0 > 0) {
      // Timing mode (--grind-acq0 N): convert and write the first acquisition
      // N times, print the accumulated timings and exit without doing any of
      // the remaining work. Every iteration writes to the same output path.
      int nTimes = config.grind_acq_0;
      timer.StartTimer();
      size_t processMicroSec = 0;
      size_t hdf5MicroSec = 0;
      size_t compressMicroSec = 0;
      size_t convertMicroSec = 0;
      for (int i = 0; i <nTimes; i++) {
        //GridMesh<TraceChunk> traceChunken;
        SynchDat sdatIn;
        // NOTE(review): the metadata goes into the outer `sdat`, while the
        // object that is filled and written below is `sdatIn` - confirm.
        AddMetaData(sdat, datRaw, datIx);
	ClockTimer convTimer;
        GenerateDataChunks(config, bfT0, datRaw, config.row_step, config.col_step, sigmaTMid, sdatIn.mChunks,datImg);
	convertMicroSec += convTimer.GetMicroSec();
        snprintf(buffer, sizeof(buffer), "%s/acq_%.4d.sdat", outputDir.c_str(), (int)datIx);
        serializer.Write(buffer, sdatIn);
	processMicroSec += serializer.computeMicroSec;
	hdf5MicroSec += serializer.ioMicroSec;
	compressMicroSec += serializer.compressMicroSec;
      }
      size_t usec = timer.GetMicroSec();
      cout << "Took: " << usec / 1.0e6 << " seconds, " << usec / (nTimes * 1.0f) << " usec per write." << endl;
      timer.PrintMilliSeconds(cout,"Chunks took:");
      // NOTE(review): the labels below say "Read took" although the counters
      // were accumulated around GenerateDataChunks / serializer.Write.
      cout << "Read took: " << processMicroSec / (1e3 * nTimes) << " milli seconds per sdat compute." << endl;
      cout << "Read took: " << hdf5MicroSec / (1e3 * nTimes) << " milli seconds per sdat hdf5." << endl;
      cout << "Read took: " << compressMicroSec / (1e3 * nTimes) << " milli seconds per sdat compressing." << endl;
      cout << "Read took: " << convertMicroSec / (1e3 * nTimes) << " milli seconds per sdat converting." << endl;
      exit(0);
    }
    else {
      // Normal path: build the chunks for this acquisition once.
      timer.StartTimer();
      AddMetaData(sdat, datRaw, datIx);
      GenerateDataChunks(config, bfT0, datRaw, config.row_step, config.col_step, sigmaTMid, sdat.mChunks,datImg);
      timer.PrintMilliSeconds(cout,"Chunks took:");
        if (datIx == 0 && config.doDebug) {
          OutputTraceChunks(sdat.mChunks,"flow_0_data_chunks.txt");
        }
    }
    datImg.Close();    

    /* Serialize onto disk. */
    snprintf(buffer, sizeof(buffer), "%s/acq_%.4d.sdat", outputDir.c_str(), (int)datIx);
    serializer.Write(buffer, sdat);
    /* Read back in first flow for checking */
    if (datIx == 0) {
      // Round-trip check: read the file that was just written with a fresh
      // serializer and compare it bin by bin against the in-memory data.
      TraceChunkSerializer readSerializer;
      readSerializer.SetRecklessAbandon(true);
      //      GridMesh<TraceChunk> traceChunksIn;  
      SynchDat sdatIn;
      readSerializer.Read(buffer, sdatIn);
      if (datIx == 0 && config.doDebug) {
        OutputTraceChunks(sdatIn.mChunks, "flow_0_data_chunks_read.txt");
      }
      SampleQuantiles<float> s(50000);
      SampleQuantiles<float> s2(50000);
      SampleQuantiles<float> sAbs(50000);
      SampleStats<double> ss;
      int diffCount = 0;
      for (size_t bIx = 0; bIx < sdatIn.mChunks.mBins.size(); bIx++) {
        // A differing per-bin mT0 is treated as fatal.
        if (sdatIn.mChunks.mBins[bIx].mT0 != sdat.mChunks.mBins[bIx].mT0) {
          cout << "Got: " << sdatIn.mChunks.mBins[bIx].mT0 << " vs: " << sdat.mChunks.mBins[bIx].mT0 << endl;
          exit(1);
        }
        for (size_t i = 0; i < sdatIn.mChunks.mBins[bIx].mData.size(); i++) {
          double diff = (double)sdatIn.mChunks.mBins[bIx].mData[i] - (double)sdat.mChunks.mBins[bIx].mData[i];
          if (!std::isfinite(diff)) {
            cout << "NaNs!!" << endl;
          }
          // Only the first 10 differences above the tolerance are printed.
          if (diffCount < 10 && fabs(diff) > .00001) { // != 0) {
            diffCount++;
            cout << "Bin: " << bIx << " well: " << i << " diff is: " << diff << endl;
          }
          // s collects signed differences; sAbs, ss and s2 all collect the
          // magnitude (sqrt(diff * diff) is |diff|).
          s.AddValue(diff);
          sAbs.AddValue(fabs(diff));
          ss.AddValue(sqrt(diff * diff));
          s2.AddValue(sqrt(diff * diff));
        }
      }
      cout << "Median rms: " << s2.GetMedian()  << " Avg: " << ss.GetMean() << " diff: " << s.GetMedian() << endl;
      cout << "Abs(diff) Quantiles:" << endl;
      // Print the quantiles of |diff| at 0%, 10%, ..., 100%.
      for (size_t i = 0; i <= 100; i+=10) {
        cout << i << "\t" << sAbs.GetQuantile(i/100.0) << endl;
      }
    }      
  }
  // do the next N flows multithreaded
  if (numFlows > 1) {
    // One job per remaining flow: jobs[i-1] converts flow i.
    PJobQueue jQueue (config.numCores, numFlows-1);  
    vector<CreateSDat> jobs(numFlows -1);
    // for (int i = 0; i < 4; i++) {
    //   char buffer[2048];
    //   snprintf(buffer, sizeof(buffer), "%s/beadfind_pre_%.4d.dat", inputDir.c_str(), (int) i);
    //   string input = buffer;
    //   snprintf(buffer, sizeof(buffer), "%s/beadfind_pre_%.4d.sdat", outputDir.c_str(), (int)i);
    //   string output = buffer;
    //   jobs[i].Init(&config, input, output, &wellT0, &bfT0, &sigmaTMid);
    //   jQueue.AddJob(jobs[i]);
    // }

    // jQueue.WaitUntilDone();
    for (int i = 1; i < numFlows; i++) {
      char buffer[2048];
      snprintf(buffer, sizeof(buffer), "%s/acq_%.4d.dat", inputDir.c_str(), (int) i);
      string input = buffer;
      snprintf(buffer, sizeof(buffer), "%s/acq_%.4d.sdat", outputDir.c_str(), (int)i);
      string output = buffer;
      // The jobs receive pointers to config, wellT0, bfT0 and sigmaTMid, all of
      // which are locals of main and stay alive until WaitUntilDone returns.
      jobs[i-1].Init(&config, input, output, &wellT0, &bfT0, &sigmaTMid, i);
      jQueue.AddJob(jobs[i-1]);
    }
    jQueue.WaitUntilDone();
  }
  /* Serialize into background models */
  cout << "Done." << endl;
}
Пример #25
0
/*************************************************************************************************
 *************************************************************************************************
 *
 *  Start of Main Function
 *
 *************************************************************************************************
 ************************************************************************************************/
int main (int argc, char *argv[])
{
  init_salute();

  ofstream null_ostream("/dev/null"); // must stay live for entire scope, or crash when writing
  TheSilenceOfTheArmadillos(null_ostream);

  TrackProgress my_progress;  
  DumpStartingStateOfProgram (argc,argv,my_progress);
   
  if(argc < 2)
  {
      PrintHelp();
  }

  for(int i = 1; i < argc; ++i)
  {
	  string s = argv[i];
	  if(s == "-" || s == "--")
	  {
	      cerr << "ERROR: command line input \"-\" must be followed by a short option name (a letter) and \"--\" must be followed by a long option name." << endl; 
		  exit ( EXIT_FAILURE );
	  }
	  else if(s == "-?" || s == "-h" || s == "--help")
	  {
	      PrintHelp();
	  }
  }

  ValidateOpts validater;
  validater.Validate(argc, argv);

  char** argv2 = new char*[argc];
  int datind = TrapAndDeprecateOldArgs(argc, argv, argv2);

  OptArgs opts;
  opts.ParseCmdLine(argc, (const char**)argv2);

  for(int k = 0; k < argc ; ++k)
  {
	  delete [] argv2[k];
  }
  delete [] argv2;
   
  Json::Value json_params;
  CommandLineOpts inception_state;
  inception_state.SetOpts(opts, json_params);

  if(datind < 0) // there is no "--dat-source-directory"
  {
	  inception_state.sys_context.dat_source_directory = argv[argc - 1];
	  cout << "dat_source_directory = " << inception_state.sys_context.dat_source_directory << endl;
  }

  inception_state.PostProcessArgs(opts);

  SeqListClass my_keys;
  ImageSpecClass my_image_spec;
  SlicedPrequel my_prequel_setup;  

  SetUpOrLoadInitialState(inception_state, my_keys, my_progress, my_image_spec, my_prequel_setup);

  // Start logging process parameters & timing now that we have somewhere to log
  my_progress.InitFPLog(inception_state);

  // Write processParameters.parse file now that processing is about to begin
  my_progress.WriteProcessParameters(inception_state);
  
  // Do separator
  Region wholeChip(0, 0, my_image_spec.cols, my_image_spec.rows);
  IsolatedBeadFind( my_prequel_setup, my_image_spec, wholeChip, inception_state,
        inception_state.sys_context.GetResultsFolder(), inception_state.sys_context.analysisLocation,  my_keys, my_progress);

  exit (EXIT_SUCCESS);
}
Пример #26
0
int PrepareHotspots(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  string input_bed_filename       = opts.GetFirstString ('b', "input-bed", "");
  string input_vcf_filename       = opts.GetFirstString ('v', "input-vcf", "");
  string output_bed_filename      = opts.GetFirstString ('d', "output-bed", "");
  string output_vcf_filename      = opts.GetFirstString ('o', "output-vcf", "");
  string reference_filename       = opts.GetFirstString ('r', "reference", "");
  bool left_alignment             = opts.GetFirstBoolean('a', "left-alignment", false);
  bool filter_bypass              = opts.GetFirstBoolean('f', "filter-bypass", false);
  bool allow_block_substitutions  = opts.GetFirstBoolean('s', "allow-block-substitutions", false);
  opts.CheckNoLeftovers();

  if((input_bed_filename.empty() == input_vcf_filename.empty()) or
      (output_bed_filename.empty() and output_vcf_filename.empty()) or reference_filename.empty()) {
    PrepareHotspotsHelp();
    return 1;
  }


  // Populate chromosome list from reference.fai
  // Use mmap to fetch the entire reference

  int ref_handle = open(reference_filename.c_str(),O_RDONLY);

  struct stat ref_stat;
  fstat(ref_handle, &ref_stat);
  char *ref = (char *)mmap(0, ref_stat.st_size, PROT_READ, MAP_SHARED, ref_handle, 0);


  FILE *fai = fopen((reference_filename+".fai").c_str(), "r");
  if (!fai) {
    fprintf(stderr, "ERROR: Cannot open %s.fai\n", reference_filename.c_str());
    return 1;
  }

  vector<Reference>  ref_index;
  map<string,int> ref_map;
  char line[1024], chrom_name[1024];
  while (fgets(line, 1024, fai) != NULL) {
    Reference ref_entry;
    long chr_start;
    if (5 != sscanf(line, "%s\t%ld\t%ld\t%d\t%d", chrom_name, &ref_entry.size, &chr_start,
                    &ref_entry.bases_per_line, &ref_entry.bytes_per_line))
      continue;
    ref_entry.chr = chrom_name;
    ref_entry.start = ref + chr_start;
    ref_index.push_back(ref_entry);
    ref_map[ref_entry.chr] = (int) ref_index.size() - 1;
  }
  fclose(fai);


  // Load input BED or load input VCF, group by chromosome

  deque<LineStatus> line_status;
  vector<deque<Allele> > alleles(ref_index.size());

  if (!input_bed_filename.empty()) {

    FILE *input = fopen(input_bed_filename.c_str(),"r");
    if (!input) {
      fprintf(stderr,"ERROR: Cannot open %s\n", input_bed_filename.c_str());
      return 1;
    }

    char line2[65536];

    int line_number = 0;
    bool line_overflow = false;
    while (fgets(line2, 65536, input) != NULL) {
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
        continue;
      }
      line_number++;
      if (line_overflow) {
        line_overflow = false;
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: line length exceeds 64K";
        continue;
      }

      if (strncmp(line2, "browser", 7) == 0)
        continue;

      if (strncmp(line2, "track", 5) == 0) {
        if (string::npos != string(line2).find("allowBlockSubstitutions=true"))
          allow_block_substitutions = true;
        continue;
      }

      char *current_chr = strtok(line2, "\t\r\n");
      char *current_start = strtok(NULL, "\t\r\n");
      char *current_end = strtok(NULL, "\t\r\n");
      char *current_id = strtok(NULL, "\t\r\n");
      char *penultimate = strtok(NULL, "\t\r\n");
      char *ultimate = strtok(NULL, "\t\r\n");
      for (char *next = strtok(NULL, "\t\r\n"); next; next = strtok(NULL, "\t\r\n")) {
        penultimate = ultimate;
        ultimate = next;
      }

      if (!current_chr or !current_start or !current_end or !current_id or !penultimate or !ultimate) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: expected at least 6 fields";
        continue;
      }

      Allele allele;

      string string_chr(current_chr);
      if (ref_map.find(string_chr) != ref_map.end())
        allele.chr_idx = ref_map[string_chr];
      else if (ref_map.find("chr"+string_chr) != ref_map.end())
        allele.chr_idx = ref_map["chr"+string_chr];
      else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end())
        allele.chr_idx = ref_map["chrM"];
      else {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Unknown chromosome name: ";
        line_status.back().filter_message = string_chr;
        continue;
      }

      allele.pos = strtol(current_start,NULL,10);
      allele.id = current_id;

      char *current_ref = NULL;
      char *current_alt = NULL;
      for (char *next = strtok(penultimate, ";"); next; next = strtok(NULL, ";")) {
        if (strncmp(next,"REF=",4) == 0)
          current_ref = next;
        else if (strncmp(next,"OBS=",4) == 0)
          current_alt = next;
      }
      if (!current_ref or !current_alt) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: REF and OBS fields required in penultimate column";
        continue;
      }
      for (char *pos = current_ref+4; *pos; ++pos)
        allele.ref += toupper(*pos);
      for (char *pos = current_alt+4; *pos; ++pos)
        allele.alt += toupper(*pos);
      allele.filtered = false;
      line_status.push_back(LineStatus(line_number));
      allele.line_status = &line_status.back();
      allele.opos = allele.pos;
      allele.oref = allele.ref;
      allele.oalt = allele.alt;
      alleles[allele.chr_idx].push_back(allele);
      line_status.back().allele = &alleles[allele.chr_idx].back();
    }

    fclose(input);
  }


  if (!input_vcf_filename.empty()) {

    FILE *input = fopen(input_vcf_filename.c_str(),"r");
    if (!input) {
      fprintf(stderr,"ERROR: Cannot open %s\n", input_vcf_filename.c_str());
      return 1;
    }

    char line2[65536];
    int line_number = 0;
    bool line_overflow = false;
    while (fgets(line2, 65536, input) != NULL) {
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
        continue;
      }
      line_number++;
      if (line_overflow) {
        line_overflow = false;
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot VCF line: line length exceeds 64K";
        continue;
      }

      if (strncmp(line2, "##allowBlockSubstitutions=true", 30) == 0) {
        allow_block_substitutions = true;
        continue;
      }
      if (line2[0] == '#')
        continue;

      char *current_chr = strtok(line2, "\t\r\n");
      char *current_start = strtok(NULL, "\t\r\n");
      char *current_id = strtok(NULL, "\t\r\n");
      char *current_ref = strtok(NULL, "\t\r\n");
      char *current_alt = strtok(NULL, "\t\r\n");

      if (!current_chr or !current_start or !current_id or !current_ref or !current_alt) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot VCF line: expected at least 5 fields";
        continue;
      }


      string string_chr(current_chr);
      int chr_idx = 0;
      if (ref_map.find(string_chr) != ref_map.end())
        chr_idx = ref_map[string_chr];
      else if (ref_map.find("chr"+string_chr) != ref_map.end())
        chr_idx = ref_map["chr"+string_chr];
      else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end())
        chr_idx = ref_map["chrM"];
      else {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Unknown chromosome name: ";
        line_status.back().filter_message = string_chr;
        continue;
      }

      for (char *pos = current_ref; *pos; ++pos)
        *pos = toupper(*pos);
      for (char *pos = current_alt; *pos; ++pos)
        *pos = toupper(*pos);


      for (char *sub_alt = strtok(current_alt,","); sub_alt; sub_alt = strtok(NULL,",")) {

        Allele allele;
        allele.chr_idx = chr_idx;
        allele.ref = current_ref;
        allele.alt = sub_alt;
        allele.pos = strtol(current_start,NULL,10)-1;
        allele.id = current_id;
        if (allele.id == ".")
          allele.id = "hotspot";

        allele.filtered = false;
        line_status.push_back(LineStatus(line_number));
        allele.line_status = &line_status.back();
        allele.opos = allele.pos;
        allele.oref = allele.ref;
        allele.oalt = allele.alt;
        alleles[allele.chr_idx].push_back(allele);
        line_status.back().allele = &alleles[allele.chr_idx].back();
      }
    }

    fclose(input);
  }

  // Process by chromosome:
  //   - Verify reference allele
  //   - Left align
  //   - Sort
  //   - Filter for block substitutions, write

  FILE *output_vcf = NULL;
  if (!output_vcf_filename.empty()) {
    output_vcf = fopen(output_vcf_filename.c_str(), "w");
    if (!output_vcf) {
      fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_vcf_filename.c_str());
      return 1;
    }
    fprintf(output_vcf, "##fileformat=VCFv4.1\n");
    if (allow_block_substitutions)
      fprintf(output_vcf, "##allowBlockSubstitutions=true\n");
    fprintf(output_vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n");
  }
  FILE *output_bed = NULL;
  if (!output_bed_filename.empty()) {
    output_bed = fopen(output_bed_filename.c_str(), "w");
    if (!output_bed) {
      fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_bed_filename.c_str());
      if (output_vcf)
        fclose(output_vcf);
      return 1;
    }
    if (allow_block_substitutions)
      fprintf(output_bed, "track name=\"hotspot\" type=bedDetail allowBlockSubstitutions=true\n");
    else
      fprintf(output_bed, "track name=\"hotspot\" type=bedDetail\n");
  }


  for (int chr_idx = 0; chr_idx < (int)ref_index.size(); ++chr_idx) {

    for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ++A) {

      // Invalid characters

      bool valid = true;
      for (const char *c = A->ref.c_str(); *c ; ++c)
        if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T')
          valid = false;
      for (const char *c = A->alt.c_str(); *c ; ++c)
        if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T')
          valid = false;
      if (not valid) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "REF and/or ALT contain characters other than ACGT: ";
        A->line_status->filter_message = "REF = " + A->ref + " ALT = " + A->alt;
        continue;
      }

      // Filter REF == ALT

      if (A->ref == A->alt) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "REF and ALT alleles equal";
        continue;
      }

      // Confirm reference allele.

      string ref_expected;
      for (int idx = 0; idx < (int) A->ref.size(); ++idx)
        ref_expected += ref_index[chr_idx].base(A->pos + idx);
      if (A->ref != ref_expected) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "Provided REF allele does not match reference: ";
        A->line_status->filter_message = "Expected " + ref_expected + ", found " + A->ref;
        continue;
      }

      // Trim

      int ref_start = 0;
      int ref_end = A->ref.size();
      int alt_end = A->alt.size();

      // Option 1: trim all trailing bases

      //while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) {
      //  --ref_end;
      //  --alt_end;
      //}

      // Option 2: trim all leading basees

      //while (ref_start < ref_end and ref_start < alt_end and A->ref[ref_start] == A->alt[ref_start])
      //  ++ref_start;


      // Option 3: trim anchor base if vcf

      if (!input_vcf_filename.empty()) {
        if (ref_end and alt_end and (ref_end == 1 or alt_end == 1) and A->ref[0] == A->alt[0])
          ref_start = 1;
      }

      A->pos += ref_start;
      A->ref = A->ref.substr(ref_start, ref_end-ref_start);
      A->alt = A->alt.substr(ref_start, alt_end-ref_start);
      ref_end -= ref_start;
      alt_end -= ref_start;

      // Left align
      if (left_alignment) {
        while (A->pos > 0) {
          char nuc = ref_index[chr_idx].base(A->pos-1);
          if (ref_end > 0 and A->ref[ref_end-1] != nuc)
            break;
          if (alt_end > 0 and A->alt[alt_end-1] != nuc)
            break;
          A->ref = string(1,nuc) + A->ref;
          A->alt = string(1,nuc) + A->alt;
          A->pos--;
        }
      }
      A->ref.resize(ref_end);
      A->alt.resize(alt_end);


      // Filter block substitutions: take 1

      if (ref_end > 0 and alt_end > 0 and ref_end != alt_end and not allow_block_substitutions and not filter_bypass) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "Block substitutions not supported";
        continue;
      }

    }



    if (output_bed) {
      // Sort - without anchor base
      sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles);

      // Write
      for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) {
        if (I->filtered)
          continue;
        if (I->pos)
          fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=%c\tNONE\n",
              ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
              I->ref.c_str(), I->alt.c_str(), ref_index[chr_idx].base(I->pos-1));
        else
          fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=\tNONE\n",
              ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
              I->ref.c_str(), I->alt.c_str());
      }
    }


    if (output_vcf) {

      // Add anchor base to indels
      for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) {
        if (I->filtered)
          continue;
        if (not I->ref.empty() and not I->alt.empty())
          continue;
        if (I->pos == 0) {
          I->filtered = true;
          I->line_status->filter_message_prefix = "INDELs at chromosome start not supported";
          continue;
        }
        I->pos--;
        I->ref = string(1,ref_index[chr_idx].base(I->pos)) + I->ref;
        I->alt = string(1,ref_index[chr_idx].base(I->pos)) + I->alt;
      }

      // Sort - with anchor base
      sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles);


      // Merge alleles, remove block substitutions, write
      for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ) {

        string max_ref;
        deque<Allele>::iterator B = A;
        for (; B != alleles[chr_idx].end() and B->pos == A->pos; ++B)
          if (!B->filtered and max_ref.size() < B->ref.size())
            max_ref = B->ref;

        bool filtered = true;
        for (deque<Allele>::iterator I = A; I != B; ++I) {
          if (I->filtered)
            continue;

          string new_alt = I->alt + max_ref.substr(I->ref.size());

          if (new_alt.size() > 1 and max_ref.size() > 1 and new_alt.size() != max_ref.size() and not allow_block_substitutions and not filter_bypass) {
            I->filtered = true;
            I->line_status->filter_message_prefix = "Block substitutions not supported (post-merge)";
            continue;
          }

          I->ref = max_ref;
          I->alt = new_alt;
          filtered = false;
        }

        if (not filtered) {

          fprintf(output_vcf, "%s\t%ld\t.\t%s\t",
              ref_index[chr_idx].chr.c_str(), A->pos+1, max_ref.c_str());

          bool comma = false;
          set<string> unique_alt_alleles;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (unique_alt_alleles.count(I->alt) > 0)
              continue;
            unique_alt_alleles.insert(I->alt);
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->alt.c_str());
          }

          fprintf(output_vcf, "\t.\t.\tOID=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->id.c_str());
          }

          fprintf(output_vcf, ";OPOS=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%ld", I->opos+1);
          }

          fprintf(output_vcf, ";OREF=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->oref.c_str());
          }

          fprintf(output_vcf, ";OALT=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->oalt.c_str());
          }

          fprintf(output_vcf, ";OMAPALT=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->alt.c_str());
          }

          fprintf(output_vcf, "\n");
        }

        A = B;
      }
    }
  }



  if (output_bed) {
    fflush(output_bed);
    fclose(output_bed);
  }
  if (output_vcf) {
    fflush(output_vcf);
    fclose(output_vcf);
  }


  int lines_ignored = 0;
  for (deque<LineStatus>::iterator L = line_status.begin(); L != line_status.end(); ++L) {
    if (L->filter_message_prefix) {
      if (L->allele)
        printf("Line %d ignored: [%s:%ld %s] %s%s\n", L->line_number, ref_index[L->allele->chr_idx].chr.c_str(), L->allele->opos+1, L->allele->id.c_str(),
            L->filter_message_prefix, L->filter_message.c_str());
      else
        printf("Line %d ignored: %s%s\n", L->line_number, L->filter_message_prefix, L->filter_message.c_str());
      lines_ignored++;
    }
  }
  printf("Ignored %d out of %d lines\n", lines_ignored, (int)line_status.size());


  munmap(ref, ref_stat.st_size);
  close(ref_handle);

  return 0;
}
Пример #27
0
int PrepareHotspots(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  string input_bed_filename       = opts.GetFirstString ('b', "input-bed", "");
  string input_vcf_filename       = opts.GetFirstString ('v', "input-vcf", "");
  string input_real_vcf_filename  = opts.GetFirstString ('p', "input-real-vcf", "");
  string output_hot_vcf		  = opts.GetFirstString ('q', "output-fake-hot-vcf", "");
  string output_bed_filename      = opts.GetFirstString ('d', "output-bed", "");
  string output_vcf_filename      = opts.GetFirstString ('o', "output-vcf", "");
  string reference_filename       = opts.GetFirstString ('r', "reference", "");
  string unmerged_bed 		  = opts.GetFirstString ('u', "unmerged-bed", "");
  bool left_alignment             = opts.GetFirstBoolean('a', "left-alignment", false);
  bool filter_bypass              = opts.GetFirstBoolean('f', "filter-bypass", false);
  bool allow_block_substitutions  = opts.GetFirstBoolean('s', "allow-block-substitutions", true);
  bool strict_check               = opts.GetFirstBoolean('S', "strict-check", true);
  opts.CheckNoLeftovers();

  if((input_bed_filename.empty() == (input_vcf_filename.empty() and input_real_vcf_filename.empty())) or
      (output_bed_filename.empty() and output_vcf_filename.empty()) or reference_filename.empty()) {
    PrepareHotspotsHelp();
    return 1;
  }
  if ((not input_real_vcf_filename.empty()) and (output_vcf_filename.empty() or not input_vcf_filename.empty())) {
    PrepareHotspotsHelp();
    return 1;
  }


  // Populate chromosome list from reference.fai
  // Use mmap to fetch the entire reference

  int ref_handle = open(reference_filename.c_str(),O_RDONLY);

  struct stat ref_stat;
  fstat(ref_handle, &ref_stat);
  char *ref = (char *)mmap(0, ref_stat.st_size, PROT_READ, MAP_SHARED, ref_handle, 0);


  FILE *fai = fopen((reference_filename+".fai").c_str(), "r");
  if (!fai) {
    fprintf(stderr, "ERROR: Cannot open %s.fai\n", reference_filename.c_str());
    return 1;
  }

  vector<Reference>  ref_index;
  map<string,int> ref_map;
  char line[1024], chrom_name[1024];
  while (fgets(line, 1024, fai) != NULL) {
    Reference ref_entry;
    long chr_start;
    if (5 != sscanf(line, "%1020s\t%ld\t%ld\t%d\t%d", chrom_name, &ref_entry.size, &chr_start,
                    &ref_entry.bases_per_line, &ref_entry.bytes_per_line))
      continue;
    ref_entry.chr = chrom_name;
    ref_entry.start = ref + chr_start;
    ref_index.push_back(ref_entry);
    ref_map[ref_entry.chr] = (int) ref_index.size() - 1;
  }
  fclose(fai);
  junction junc;
  if (!unmerged_bed.empty()) {
    FILE *fp = fopen(unmerged_bed.c_str(), "r");
    if (!fp) {
	fprintf(stderr, "ERROR: Cannot open %s\n", unmerged_bed.c_str());
	return 1;
    }
    char line2[65536];

    junc.init(ref_index.size());
    bool line_overflow = false;
    while (fgets(line2, 65536, fp) != NULL) {
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
	continue;
      }
      if (line_overflow) {
        line_overflow = false;
        continue;
      }
     if (strstr(line2, "track")) continue;
      char chr[100];
      int b, e;
      sscanf(line2, "%s %d %d", chr,  &b, &e);
      junc.add(ref_map[chr], b, e);
    }
    fclose(fp);
  }

  // Load input BED or load input VCF, group by chromosome

  deque<LineStatus> line_status;
  vector<deque<Allele> > alleles(ref_index.size());

  if (!input_bed_filename.empty()) {

    FILE *input = fopen(input_bed_filename.c_str(),"r");
    if (!input) {
      fprintf(stderr,"ERROR: Cannot open %s\n", input_bed_filename.c_str());
      return 1;
    }

    char line2[65536];

    int line_number = 0;
    bool line_overflow = false;
    while (fgets(line2, 65536, input) != NULL) {
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
        continue;
      }
      line_number++;
      if (line_overflow) {
        line_overflow = false;
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: line length exceeds 64K";
        continue;
      }

      if (strncmp(line2, "browser", 7) == 0)
        continue;

      if (strncmp(line2, "track", 5) == 0) {
        if (string::npos != string(line2).find("allowBlockSubstitutions=true"))
          allow_block_substitutions = true;
        continue;
      }

      // OID= table has special meaning
      if (string::npos != string(line2).find("OID=")) {
	line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Bed line contains OID=";
        continue;
      }

      char *current_chr = strtok(line2, "\t\r\n");
      char *current_start = strtok(NULL, "\t\r\n");
      char *current_end = strtok(NULL, "\t\r\n");
      char *current_id = strtok(NULL, "\t\r\n");
      char *penultimate = strtok(NULL, "\t\r\n");
      char *ultimate = strtok(NULL, "\t\r\n");
      for (char *next = strtok(NULL, "\t\r\n"); next; next = strtok(NULL, "\t\r\n")) {
        penultimate = ultimate;
        ultimate = next;
      }

      if (!current_chr or !current_start or !current_end or !current_id or !penultimate or !ultimate) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: expected at least 6 fields";
        continue;
      }

      Allele allele;

      string string_chr(current_chr);
      if (ref_map.find(string_chr) != ref_map.end())
        allele.chr_idx = ref_map[string_chr];
      else if (ref_map.find("chr"+string_chr) != ref_map.end())
        allele.chr_idx = ref_map["chr"+string_chr];
      else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end())
        allele.chr_idx = ref_map["chrM"];
      else {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Unknown chromosome name: ";
        line_status.back().filter_message = string_chr;
        continue;
      }

      allele.pos = strtol(current_start,NULL,10);
      allele.id = current_id;

      char *current_ref = NULL;
      char *current_alt = NULL;
      for (char *next = strtok(penultimate, ";"); next; next = strtok(NULL, ";")) {
        if (strncmp(next,"REF=",4) == 0)
          current_ref = next;
        else if (strncmp(next,"OBS=",4) == 0)
          current_alt = next;
        else if (strncmp(next,"ANCHOR=",7) == 0) {
          // ignore ANCHOR
        } else {
          char *value = next;
          while (*value and *value != '=')
            ++value;
          if (*value == '=')
            *value++ = 0;
          allele.custom_tags[next] = value;
        }
      }
      if (!current_ref or !current_alt) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: REF and OBS fields required in penultimate column";
        continue;
      }
      for (char *pos = current_ref+4; *pos; ++pos)
        allele.ref += toupper(*pos);
      for (char *pos = current_alt+4; *pos; ++pos)
        allele.alt += toupper(*pos);
      // here is the place to check the length of the hotspot cover the amplicon junction. ZZ
      /*
      if (junc.contain(allele.chr_idx, allele.pos, (unsigned int) allele.ref.size())) {
	line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "hotspot BED line contain the complete overlapping region of two amplicon, the variant cannot be detected by tvc";
        continue;
      }
      if (not junc.contained_in_ampl(allele.chr_idx, allele.pos, (unsigned int) allele.ref.size())) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "hotspot BED line is not contained in any amplicon, the variant cannot be detected by tvc";
        continue;
      }
      */

      allele.filtered = false;
      line_status.push_back(LineStatus(line_number));
      allele.line_status = &line_status.back();
      allele.opos = allele.pos;
      allele.oref = allele.ref;
      allele.oalt = allele.alt;
      alleles[allele.chr_idx].push_back(allele);
      //line_status.back().allele = &alleles[allele.chr_idx].back();
      line_status.back().chr_idx = allele.chr_idx;
      line_status.back().opos = allele.opos;
      line_status.back().id = allele.id;
    }

    fclose(input);
  }



  if (!input_vcf_filename.empty() or !input_real_vcf_filename.empty()) {

    bool real_vcf = false;
    FILE *input;
    FILE *out_real = NULL;
    FILE *out_hot = NULL;
    int fake_ = 0;
    int hn = 1;
    if (!input_real_vcf_filename.empty()) {
	real_vcf = true;
	input = fopen(input_real_vcf_filename.c_str(),"r");
	if (!input) {
	    fprintf(stderr,"ERROR: Cannot open %s\n", input_real_vcf_filename.c_str());
            return 1;
	}
	out_real = fopen(output_vcf_filename.c_str(), "w");
	if (!out_real) {
            fprintf(stderr,"ERROR: Cannot open %s\n", output_vcf_filename.c_str());
            return 1;
        }
	if (!output_hot_vcf.empty()) {
	    out_hot = fopen(output_hot_vcf.c_str(), "w");
	    if (!out_hot) {
		fprintf(stderr,"ERROR: Cannot open %s\n", output_hot_vcf.c_str());
		return 1;
	    } 
   	} else out_hot = stdout;
	fprintf(out_hot, "##fileformat=VCFv4.1\n##allowBlockSubstitutions=true\n#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO\n");
    } else {
        input = fopen(input_vcf_filename.c_str(),"r");
        if (!input) {
            fprintf(stderr,"ERROR: Cannot open %s\n", input_vcf_filename.c_str());
            return 1;
    	}
    }

    char line2[65536];
    char line3[65536];
    int line_number = 0;
    bool line_overflow = false;
    list<one_vcfline> vcflist;

    char last_chr[1024] = "";
    while (fgets(line2, 65536, input) != NULL) {
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
        continue;
      }
      line_number++;
      if (line_overflow) {
        line_overflow = false;
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot VCF line: line length exceeds 64K";
        continue;
      }

      if (strncmp(line2, "##allowBlockSubstitutions=true", 30) == 0) {
        allow_block_substitutions = true;
        continue;
      }
      if (line2[0] == '#') {
	if (out_real) { fprintf(out_real, "%s", line2);}
        continue;
      }

      if (real_vcf) strcpy(line3, line2);
      char *current_chr = strtok(line2, "\t\r\n");
      char *current_start = strtok(NULL, "\t\r\n");
      char *current_id = strtok(NULL, "\t\r\n");
      char *current_ref = strtok(NULL, "\t\r\n");
      char *current_alt = strtok(NULL, "\t\r\n");
      strtok(NULL, "\t\r\n"); // Ignore QUAL
      strtok(NULL, "\t\r\n"); // Ignore FILTER
      char *current_info = strtok(NULL, "\t\r\n");
      strtok(NULL, "\t\r\n");
      char *gt = strtok(NULL, "\t\r\n");

      if (!current_chr or !current_start or !current_id or !current_ref or !current_alt) {
        line_status.push_back(LineStatus(line_number));
        if (real_vcf) line_status.back().filter_message_prefix = "Malformed real VCF line: expected at least 5 fields";
	else line_status.back().filter_message_prefix = "Malformed hotspot VCF line: expected at least 5 fields";
        continue;
      }


      string string_chr(current_chr);
      int chr_idx = 0;
      if (ref_map.find(string_chr) != ref_map.end())
        chr_idx = ref_map[string_chr];
      else if (ref_map.find("chr"+string_chr) != ref_map.end())
        chr_idx = ref_map["chr"+string_chr];
      else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end())
        chr_idx = ref_map["chrM"];
      else {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Unknown chromosome name: ";
        line_status.back().filter_message = string_chr;
        continue;
      }

      for (char *pos = current_ref; *pos; ++pos)
        *pos = toupper(*pos);
      for (char *pos = current_alt; *pos; ++pos)
        *pos = toupper(*pos);


      // Process custom tags
      vector<string>  bstrand;
      vector<string>  hp_max_length;
      string raw_oid;
      string raw_omapalt;
      string raw_oalt;
      string raw_oref;
      string raw_opos;

      if (current_info) {
        string raw_bstrand;
        string raw_hp_max_length;
        for (char *next = strtok(current_info, ";"); next; next = strtok(NULL, ";")) {

          char *value = next;
          while (*value and *value != '=')
            ++value;
          if (*value == '=')
            *value++ = 0;

          if (strcmp(next, "TYPE") == 0)
            continue;
          if (strcmp(next, "HRUN") == 0)
            continue;
          if (strcmp(next, "HBASE") == 0)
            continue;
          if (strcmp(next, "FR") == 0)
            continue;
          if (strcmp(next, "OPOS") == 0) {
	    raw_opos = value;
            continue;
	  }
          if (strcmp(next, "OREF") == 0) {
	    raw_oref = value;
            continue;
	  }
          if (strcmp(next, "OALT") == 0) {
	    raw_oalt = value;
            continue;
	  }
          if (strcmp(next, "OID") == 0) {
            raw_oid = value;
            continue;
          }
          if (strcmp(next, "OMAPALT") == 0) {
            raw_omapalt = value;
            continue;
          }
          if (strcmp(next, "BSTRAND") == 0) {
            raw_bstrand = value;
            continue;
          }
          if (strcmp(next, "hp_max_length") == 0) {
            raw_hp_max_length = value;
            continue;
          }
        }

        if (not raw_bstrand.empty())
          split(raw_bstrand, ',', bstrand);
        if (not raw_hp_max_length.empty())
          split(raw_hp_max_length, ',', hp_max_length);

      }

      if (real_vcf) {
	//fprintf(stderr, "%s\n", gt);
        if (gt == NULL) continue;
	// get gt
	int g1 = atoi(gt), g2;
	gt = strchr(gt, '/');
	if (gt) g2 = atoi(gt+1);
	else {fprintf(stderr, "GT not formatted right\n"); exit(1);}
	//if (g1 == 0 and g2 == 0) continue;
	unsigned int cur_pos = atoi(current_start);
	one_vcfline newline(current_ref, current_alt, cur_pos, g1, g2, line3);
	bool new_chr = false;
	if (strcmp(current_chr, last_chr) != 0) {
	    new_chr = true;
	}
	while (not vcflist.empty()) {
	    if ((not new_chr) and vcflist.front().pos+strlen(vcflist.front().ref) > cur_pos) break;
	    if (vcflist.front().produce_hot_vcf(last_chr, out_real, hn, out_hot)) fake_++;
	    vcflist.pop_front();
	}
	if (new_chr) strcpy(last_chr, current_chr);
	for (list<one_vcfline>::iterator it = vcflist.begin(); it != vcflist.end(); it++) {
	    it->check_subset(newline);
	}
	if (not newline.alts.empty()) vcflist.push_back(newline);
	continue;
      } 
      unsigned int allele_idx = 0;
      for (char *sub_alt = strtok(current_alt,","); sub_alt; sub_alt = strtok(NULL,",")) {

        Allele allele;
        allele.chr_idx = chr_idx;
        allele.ref = current_ref;
        allele.alt = sub_alt;
        allele.pos = strtol(current_start,NULL,10)-1;
        allele.id = current_id;
        if (allele.id == ".")
          allele.id = "hotspot";

        allele.filtered = false;
        line_status.push_back(LineStatus(line_number));
        allele.line_status = &line_status.back();
        allele.opos = allele.pos;
        allele.oref = allele.ref;
        allele.oalt = allele.alt;

        if (allele_idx < bstrand.size()) {
          if (bstrand[allele_idx] != ".")
            allele.custom_tags["BSTRAND"] = bstrand[allele_idx];
        }

        if (allele_idx < hp_max_length.size()) {
          if (hp_max_length[allele_idx] != ".")
            allele.custom_tags["hp_max_length"] = hp_max_length[allele_idx];
        }

        alleles[allele.chr_idx].push_back(allele);
        //line_status.back().allele = &alleles[allele.chr_idx].back();
        line_status.back().chr_idx = allele.chr_idx;
        line_status.back().opos = allele.opos;
        line_status.back().id = allele.id;
        allele_idx++;
      }
    }

    fclose(input);
    if (real_vcf) {
        while (not vcflist.empty()) {
            if (vcflist.front().produce_hot_vcf(last_chr, out_real, hn, out_hot)) fake_++;
            vcflist.pop_front();
        }
	fclose(out_real);
	fclose(out_hot);
	if (fake_ > 0) 
            return 0;
	else return 1;
    }
  }


  // Process by chromosome:
  //   - Verify reference allele
  //   - Left align
  //   - Sort
  //   - Filter for block substitutions, write

  FILE *output_vcf = NULL;
  if (!output_vcf_filename.empty()) {
    output_vcf = fopen(output_vcf_filename.c_str(), "w");
    if (!output_vcf) {
      fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_vcf_filename.c_str());
      return 1;
    }
    fprintf(output_vcf, "##fileformat=VCFv4.1\n");
    if (allow_block_substitutions)
      fprintf(output_vcf, "##allowBlockSubstitutions=true\n");
    fprintf(output_vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n");
  }
  FILE *output_bed = NULL;
  if (!output_bed_filename.empty()) {
    output_bed = fopen(output_bed_filename.c_str(), "w");
    if (!output_bed) {
      fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_bed_filename.c_str());
      if (output_vcf)
        fclose(output_vcf);
      return 1;
    }
    if (allow_block_substitutions)
      fprintf(output_bed, "track name=\"hotspot\" type=bedDetail allowBlockSubstitutions=true\n");
    else
      fprintf(output_bed, "track name=\"hotspot\" type=bedDetail\n");
  }


  for (int chr_idx = 0; chr_idx < (int)ref_index.size(); ++chr_idx) {

    for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ++A) {

      // check bed file
      if (junc.contain(A->chr_idx, A->pos, (unsigned int) A->ref.size())) {
	A->filtered = true;
        A->line_status->filter_message_prefix = "hotspot BED line contain the complete overlapping region of two amplicon, the variant cannot be detected by tvc";
        continue;
      }
      if (not junc.contained_in_ampl(A->chr_idx, A->pos, (unsigned int) A->ref.size())) {
	A->filtered = true;
        A->line_status->filter_message_prefix = "hotspot BED line is not contained in any amplicon, the variant cannot be detected by tvc";
        continue;
      }


      // Invalid characters

      bool valid = true;
      for (const char *c = A->ref.c_str(); *c ; ++c)
        if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T')
          valid = false;
      for (const char *c = A->alt.c_str(); *c ; ++c)
        if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T')
          valid = false;
      if (not valid) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "REF and/or ALT contain characters other than ACGT: ";
        A->line_status->filter_message = "REF = " + A->ref + " ALT = " + A->alt;
        continue;
      }

      // Filter REF == ALT

      if (A->ref == A->alt) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "REF and ALT alleles equal";
        continue;
      }

      // Confirm reference allele.

      string ref_expected;
      for (int idx = 0; idx < (int) A->ref.size(); ++idx)
        ref_expected += ref_index[chr_idx].base(A->pos + idx);
      if (A->ref != ref_expected) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "Provided REF allele does not match reference: ";
        A->line_status->filter_message = "Expected " + ref_expected + ", found " + A->ref;
        continue;
      }

      // Trim

      int ref_start = 0;
      int ref_end = A->ref.size();
      int alt_end = A->alt.size();

      // Option 1: trim all trailing bases;

      //while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) {
      //  --ref_end;
      //  --alt_end;
      //}

      // Option 2: trim all leading bases;

      //while (ref_start < ref_end and ref_start < alt_end and A->ref[ref_start] == A->alt[ref_start])
      //  ++ref_start;

      // Option 3: trim anchor base if vcf

      if (!input_vcf_filename.empty()) {
        if (ref_end and alt_end and (ref_end == 1 or alt_end == 1) and A->ref[0] == A->alt[0])
          ref_start = 1;
      }

      A->pos += ref_start;
      A->ref = A->ref.substr(ref_start, ref_end-ref_start);
      A->alt = A->alt.substr(ref_start, alt_end-ref_start);
      ref_end -= ref_start;
      alt_end -= ref_start;
      // Left align
      if (left_alignment && A->custom_tags.find("BSTRAND") == A->custom_tags.end()) { // black list variant not to be left aligned.
	string trailing;
	int can_do = 0, need_do = 0;
	int ref_end_orig= ref_end, alt_end_orig = alt_end;
	while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) {
	    ref_end--; alt_end--;
	} 
	if (ref_end == 0 || alt_end == 0) {
	    can_do = need_do = 1; // indel type, ZZ
	} else {
	    int tmp_start = ref_start;
	    int ref_end_0 = ref_end, alt_end_0 = alt_end; // end after remove trailing match ZZ
	    while (tmp_start < ref_end and tmp_start < alt_end and A->ref[tmp_start] == A->alt[tmp_start])
     		++tmp_start;
	    if (tmp_start == ref_end || tmp_start == alt_end) {
		can_do = 1; need_do = 0; // indel but indel is not at the left. ZZ
	    } else {
		ref_end--; alt_end--;
		while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) {
            	    ref_end--; alt_end--;
        	}
		if (ref_end == 0 || alt_end == 0) {
		   // complex with 1 bp MM at right end
		    can_do = need_do = 1;
		    if (ref_end + alt_end == 0) need_do = 0; // SNP
		} else {
		  int tmp_start0 = tmp_start; // start after removing leading matches
		  tmp_start++;
		  while (tmp_start < ref_end_orig and tmp_start < alt_end_orig and A->ref[tmp_start] == A->alt[tmp_start])
			tmp_start++;
		  if (tmp_start >= ref_end_0 || tmp_start >= alt_end_0 || ref_end <= tmp_start0 || alt_end <= tmp_start0) {
			// 1MM plus indel in middle, by definition cannot move the indel left enough to change A->pos
		    	can_do = 1; need_do = 0;
		  } // else real complex 
		}
	    }
	}
	if (!can_do or !need_do) {
	    // do nothing
	    // if !can_do need add some more DP
	    ref_end = ref_end_orig;
	    alt_end = alt_end_orig;
	} else {
	 // left align the indel part, here either ref_end = 0 or alt_end = 0
	  int opos = A->pos;
          while (A->pos > 0) {
            char nuc = ref_index[chr_idx].base(A->pos-1);
            if (ref_end > 0 and A->ref[ref_end-1] != nuc)
              break;
            if (alt_end > 0 and A->alt[alt_end-1] != nuc)
              break;
            A->ref = string(1,nuc) + A->ref;
            A->alt = string(1,nuc) + A->alt;
            A->pos--;
          }
	  if (ref_end != ref_end_orig) {
	    // trailing part is aligned, the whole ref and alt need to be kept. ZZ
	    ref_end = A->ref.size();
	    alt_end = A->alt.size();
	  } 
	  if (junc.contain(chr_idx, A->pos, ref_end) or not junc.contained_in_ampl(chr_idx, A->pos, ref_end)) {
		// after left alignment the hotspot contains an overlap region or is no longer contained in any amplicon; revert to the original position ZZ
		if (opos != A->pos) {
		    A->ref.erase(0, opos-A->pos);
		    A->alt.erase(0, opos-A->pos);
		    A->pos = opos;
		    ref_end = ref_end_orig;
		    alt_end = alt_end_orig;
		}
	  }
       }
      }
      A->ref.resize(ref_end);
      A->alt.resize(alt_end);


      // Filter block substitutions: take 1

      if (ref_end > 0 and alt_end > 0 and ref_end != alt_end and not allow_block_substitutions and not filter_bypass) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "Block substitutions not supported";
        continue;
      }

    }



    if (output_bed) {
      // Sort - without anchor base
      stable_sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles);

      // Write
      for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) {
        if (I->filtered)
          continue;

        fprintf(output_bed, "%s\t%ld\t%ld\t%s\tREF=%s;OBS=%s",
            ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
            I->ref.c_str(), I->alt.c_str());

        for (map<string,string>::iterator C = I->custom_tags.begin(); C != I->custom_tags.end(); ++C)
          fprintf(output_bed, ";%s=%s", C->first.c_str(), C->second.c_str());

        fprintf(output_bed, "\tNONE\n");

        /*
        if (I->pos)
          fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=%c\tNONE\n",
              ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
              I->ref.c_str(), I->alt.c_str(), ref_index[chr_idx].base(I->pos-1));
        else
          fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=\tNONE\n",
              ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
              I->ref.c_str(), I->alt.c_str());
        */
      }
    }


    if (output_vcf) {

      // Add anchor base to indels
      for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) {
        if (I->filtered)
          continue;
        if (not I->ref.empty() and not I->alt.empty())
          continue;
        if (I->pos == 0) {
          I->filtered = true;
          I->line_status->filter_message_prefix = "INDELs at chromosome start not supported";
          continue;
        }
        I->pos--;
        I->ref = string(1,ref_index[chr_idx].base(I->pos)) + I->ref;
        I->alt = string(1,ref_index[chr_idx].base(I->pos)) + I->alt;
      }

      // Sort - with anchor base
      stable_sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles);


      // Merge alleles, remove block substitutions, write
      for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ) {

        string max_ref;
        deque<Allele>::iterator B = A;
        for (; B != alleles[chr_idx].end() and B->pos == A->pos; ++B)
          if (!B->filtered and max_ref.size() < B->ref.size())
            max_ref = B->ref;

        bool filtered = true;
        map<string,set<string> > unique_alts_and_ids;
        for (deque<Allele>::iterator I = A; I != B; ++I) {
          if (I->filtered)
            continue;

          string new_alt = I->alt + max_ref.substr(I->ref.size());

          if (new_alt.size() > 1 and max_ref.size() > 1 and new_alt.size() != max_ref.size() and not allow_block_substitutions and not filter_bypass) {
            I->filtered = true;
            I->line_status->filter_message_prefix = "Block substitutions not supported (post-merge)";
            continue;
          }

          I->ref = max_ref;
          I->alt = new_alt;

          // Filter alleles with duplicate ALT + ID pairs
          map<string,set<string> >::iterator alt_iter = unique_alts_and_ids.find(new_alt);
          if (alt_iter != unique_alts_and_ids.end()) {
            if (alt_iter->second.count(I->id) > 0) {
              I->filtered = true;
              I->line_status->filter_message_prefix = "Duplicate allele and ID";
              continue;
            }
          }
          unique_alts_and_ids[new_alt].insert(I->id);

          filtered = false;
        }

        if (not filtered) {



          fprintf(output_vcf, "%s\t%ld\t.\t%s\t",
              ref_index[chr_idx].chr.c_str(), A->pos+1, max_ref.c_str());

          bool comma = false;

          map<string,map<string,string> > unique_alts_and_tags;
          set<string> unique_tags;
	  set<string> unique_alt_alleles;

          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            unique_alts_and_tags[I->alt].insert(I->custom_tags.begin(), I->custom_tags.end());
            for (map<string,string>::iterator S = I->custom_tags.begin(); S != I->custom_tags.end(); ++S)
              unique_tags.insert(S->first);
            if (unique_alt_alleles.count(I->alt) > 0)
              continue;
            unique_alt_alleles.insert(I->alt);
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->alt.c_str());
          }
	  /*
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            map<string,map<string,string> >::iterator Q = unique_alts_and_tags.find(I->alt);
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            if (Q == unique_alts_and_tags.end()) {fprintf(output_vcf, "."); continue;}
            fprintf(output_vcf, "%s", Q->first.c_str());
          }
          */

          fprintf(output_vcf, "\t.\t.\tOID=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->id.c_str());
          }

          fprintf(output_vcf, ";OPOS=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%ld", I->opos+1);
          }

          fprintf(output_vcf, ";OREF=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->oref.c_str());
          }

          fprintf(output_vcf, ";OALT=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->oalt.c_str());
          }

          fprintf(output_vcf, ";OMAPALT=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->alt.c_str());
          }

          for (set<string>::iterator S = unique_tags.begin(); S != unique_tags.end(); ++S) {
            fprintf(output_vcf, ";%s=", S->c_str());
            comma=false;
            for (deque<Allele>::iterator I = A; I != B; ++I) {
              if (I->filtered)
                continue;
              map<string,map<string,string> >::iterator Q = unique_alts_and_tags.find(I->alt);
              if (comma)
                fprintf(output_vcf, ",");
              comma = true;
              if (Q == unique_alts_and_tags.end()) {fprintf(output_vcf, "."); continue;}
              map<string,string>::iterator W = Q->second.find(*S);
              if (W == Q->second.end())
                fprintf(output_vcf, ".");
              else
                fprintf(output_vcf, "%s", W->second.c_str());
            }
          }
//            fprintf(output_vcf, ";%s=%s", S->first.c_str(), S->second.c_str());

          fprintf(output_vcf, "\n");
        }

        A = B;
      }
    }
  }



  if (output_bed) {
    fflush(output_bed);
    fclose(output_bed);
  }
  if (output_vcf) {
    fflush(output_vcf);
    fclose(output_vcf);
  }


  int lines_ignored = 0;
  for (deque<LineStatus>::iterator L = line_status.begin(); L != line_status.end(); ++L) {
    if (L->filter_message_prefix) {
      if (L->chr_idx >= 0)
        printf("Line %d ignored: [%s:%ld %s] %s%s\n", L->line_number, ref_index[L->chr_idx].chr.c_str(), L->opos+1, L->id.c_str(),
            L->filter_message_prefix, L->filter_message.c_str());
      else
        printf("Line %d ignored: %s%s\n", L->line_number, L->filter_message_prefix, L->filter_message.c_str());
      lines_ignored++;
    }
  }
  printf("Ignored %d out of %d lines\n", lines_ignored, (int)line_status.size());


  munmap(ref, ref_stat.st_size);
  close(ref_handle);
  if (lines_ignored > 0 and strict_check) return 1;

  return 0;
}
Пример #28
0
int main(int argc, const char *argv[])
{
    OptArgs opts;
    opts.ParseCmdLine(argc, argv);
    bool help, combineSffs;
    string sffFile;
    string bamFile;
    vector<string> infiles;
    opts.GetOption(help,"false", 'h', "help");
    opts.GetOption(combineSffs,"false", 'c', "combine-sffs");
    opts.GetOption(bamFile,"",'o',"out-filename");
    opts.GetLeftoverArguments(infiles);

    if(help || infiles.empty())
    {
        usage();
    }

	if((!combineSffs) && infiles.size() > 1)
	{
        cerr << "sff2bam ERROR: if you want to combine all sff files into a single bam file, please use option -c true." << endl;
        usage();
	}

    sffFile= infiles.front();

    if(bamFile.length() < 1)
    {
        bamFile = sffFile.substr(0, sffFile.length() - 3);
        bamFile += "bam";
    }

    sff_file_t* sff_file = sff_fopen(sffFile.c_str(), "r", NULL, NULL);
    if(!sff_file)
    {
        cerr << "sff2bam ERROR: fail to open " << sffFile << endl;
        exit(1);
    }

	// All sff files must have the same flow and key
	if(combineSffs && infiles.size() > 1)
	{
        for(size_t n = 1; n < infiles.size(); ++n)
		{
			sff_file_t* sff_file2 = sff_fopen(infiles[n].c_str(), "r", NULL, NULL);
			if(!sff_file2)
			{
				sff_fclose(sff_file);
				cerr << "sff2bam ERROR: fail to open " << infiles[n] << endl;
				exit(1);
			}

			if(strcmp(sff_file2->header->flow->s, sff_file->header->flow->s) != 0 ||
				strcmp(sff_file2->header->key->s, sff_file->header->key->s) != 0)
			{
				sff_fclose(sff_file);
				sff_fclose(sff_file2);
				cerr << "sff2bam ERROR: " << sffFile << " and " << infiles[n] << " have different flows or keys." << endl;
				exit(1);
			}

			sff_fclose(sff_file2);
		}
	}

    sff_t* sff = NULL;
    // Open 1st read for read group name
    sff = sff_read(sff_file);
    if(!sff)
    {
        sff_fclose(sff_file);
        cerr << "sff2bam ERROR: fail to read " << sffFile << endl;
        exit(1);
    }

    // Set up BAM header
    SamHeader sam_header;
    sam_header.Version = "1.4";
    sam_header.SortOrder = "unsorted";

    SamProgram sam_program("sff2bam");
    sam_program.Name = "sff2bam";
    sam_program.Version = SFF2BAM_VERSION;
    sam_program.CommandLine = "sff2bam";
    sam_header.Programs.Add(sam_program);

    string rgname = sff->rheader->name->s;
    int index = rgname.find(":");
    rgname = rgname.substr(0, index);

    SamReadGroup read_group(rgname);
    read_group.FlowOrder = sff->gheader->flow->s;
    read_group.KeySequence = sff->gheader->key->s;

    sam_header.ReadGroups.Add(read_group);

    RefVector refvec;
    BamWriter bamWriter;
    bamWriter.SetCompressionMode(BamWriter::Compressed);

    if(!bamWriter.Open(bamFile, sam_header, refvec))
    {
        sff_fclose(sff_file);
        cerr << "sff2bam ERROR: failed to open " << bamFile << endl;
        exit(1);
    }

    // Save 1st read
    BamAlignment bam_alignment0;
    bam_alignment0.SetIsMapped(false);
    bam_alignment0.Name = sff->rheader->name->s;
    size_t nBases = sff->rheader->n_bases + 1 - sff->rheader->clip_qual_left;
    if(sff->rheader->clip_qual_right > 0)
    {
        nBases = sff->rheader->clip_qual_right - sff->rheader->clip_qual_left;
    }
    if(nBases > 0)
    {
        bam_alignment0.QueryBases.reserve(nBases);
        bam_alignment0.Qualities.reserve(nBases);
        for (int base = sff->rheader->clip_qual_left - 1; base < sff->rheader->clip_qual_right - 1; ++base)
        {
            bam_alignment0.QueryBases.push_back(sff->read->bases->s[base]);
            bam_alignment0.Qualities.push_back(sff->read->quality->s[base] + 33);
        }
    }

    int clip_flow = 0;
    for (unsigned int base = 0; base < sff->rheader->clip_qual_left && base < sff->rheader->n_bases; ++base)
    {
        clip_flow += sff->read->flow_index[base];
    }
    if (clip_flow > 0)
    {
        clip_flow--;
    }

    bam_alignment0.AddTag("RG","Z", rgname);
    bam_alignment0.AddTag("PG","Z", string("sff2bam"));
    bam_alignment0.AddTag("ZF","i", clip_flow); // TODO: trim flow
    vector<uint16_t> flowgram0(sff->gheader->flow_length);
    copy(sff->read->flowgram, sff->read->flowgram + sff->gheader->flow_length, flowgram0.begin());
    bam_alignment0.AddTag("FZ", flowgram0);
    sff_destroy(sff);
    sff = NULL;

    bamWriter.SaveAlignment(bam_alignment0);

    // Save rest reads
    while(NULL != (sff = sff_read(sff_file)))
    {
        BamAlignment bam_alignment;
        bam_alignment.SetIsMapped(false);
        bam_alignment.Name = sff->rheader->name->s;   
        nBases = sff->rheader->n_bases + 1 - sff->rheader->clip_qual_left;
        if(sff->rheader->clip_qual_right > 0)
        {
            nBases = sff->rheader->clip_qual_right - sff->rheader->clip_qual_left;
        }
        if(nBases > 0)
        {
            bam_alignment.QueryBases.reserve(nBases);
            bam_alignment.Qualities.reserve(nBases);
            for (int base = sff->rheader->clip_qual_left - 1; base < sff->rheader->clip_qual_right - 1; ++base)
            {
                bam_alignment.QueryBases.push_back(sff->read->bases->s[base]);
                bam_alignment.Qualities.push_back(sff->read->quality->s[base] + 33);
            }
        }

        clip_flow = 0;
        for (unsigned int base = 0; base <= sff->rheader->clip_qual_left && base < sff->rheader->n_bases; ++base)
        {
            clip_flow += sff->read->flow_index[base];
        }
        if (clip_flow > 0)
        {
            clip_flow--;
        }

        bam_alignment.AddTag("RG","Z", rgname);
        bam_alignment.AddTag("PG","Z", string("sff2bam"));
        bam_alignment.AddTag("ZF","i", clip_flow); // TODO: trim flow
        vector<uint16_t> flowgram(sff->gheader->flow_length);
        copy(sff->read->flowgram, sff->read->flowgram + sff->gheader->flow_length, flowgram.begin());
        bam_alignment.AddTag("FZ", flowgram);
        sff_destroy(sff);
        sff = NULL;

        bamWriter.SaveAlignment(bam_alignment);
    }

	sff_fclose(sff_file);

	if(combineSffs && infiles.size() > 1)
	{
        for(size_t n = 1; n < infiles.size(); ++n)
		{
			sff_file_t* sff_file2 = sff_fopen(infiles[n].c_str(), "r", NULL, NULL);

			while(NULL != (sff = sff_read(sff_file2)))
			{
				BamAlignment bam_alignment;
				bam_alignment.SetIsMapped(false);
				bam_alignment.Name = sff->rheader->name->s;   
				nBases = sff->rheader->n_bases + 1 - sff->rheader->clip_qual_left;
				if(sff->rheader->clip_qual_right > 0)
				{
					nBases = sff->rheader->clip_qual_right - sff->rheader->clip_qual_left;
				}
				if(nBases > 0)
				{
					bam_alignment.QueryBases.reserve(nBases);
					bam_alignment.Qualities.reserve(nBases);
					for (int base = sff->rheader->clip_qual_left - 1; base < sff->rheader->clip_qual_right - 1; ++base)
					{
						bam_alignment.QueryBases.push_back(sff->read->bases->s[base]);
						bam_alignment.Qualities.push_back(sff->read->quality->s[base] + 33);
					}
				}

				clip_flow = 0;
				for (unsigned int base = 0; base <= sff->rheader->clip_qual_left && base < sff->rheader->n_bases; ++base)
				{
					clip_flow += sff->read->flow_index[base];
				}
				if (clip_flow > 0)
				{
					clip_flow--;
				}

				bam_alignment.AddTag("RG","Z", rgname);
				bam_alignment.AddTag("PG","Z", string("sff2bam"));
				bam_alignment.AddTag("ZF","i", clip_flow); // TODO: trim flow
				vector<uint16_t> flowgram(sff->gheader->flow_length);
				copy(sff->read->flowgram, sff->read->flowgram + sff->gheader->flow_length, flowgram.begin());
				bam_alignment.AddTag("FZ", flowgram);
				sff_destroy(sff);
				sff = NULL;

				bamWriter.SaveAlignment(bam_alignment);
			}

			sff_fclose(sff_file2);
		}
	}

    bamWriter.Close();    

    return 0;
}
Пример #29
0
// Populates context_vars and the thread counts from the command line options.
// Requires that the file options (bc_files) have already been set, because the
// fallback run id is generated from the output directory string.
// Always returns true and marks context_vars.options_set.
bool BaseCallerParameters::InitContextVarsFromOptArgs(OptArgs& opts)
{
    assert(bc_files.options_set);

    // Fallback run identifier generated from the full output directory string
    char run_id_fallback[6];
    ion_run_to_readname(run_id_fallback,
                        const_cast<char*>(bc_files.output_directory.c_str()),
                        bc_files.output_directory.length());
    context_vars.run_id = opts.GetFirstString('-', "run-id", run_id_fallback);

    // Threading: default is twice the core count, but never fewer than 4 threads
    const int default_num_threads = max(2*numCores(), 4);
    num_threads_           = opts.GetFirstInt('n', "num-threads", default_num_threads);
    num_bamwriter_threads_ = opts.GetFirstInt('-', "num-threads-bamwriter", 6);

    // General read processing options
    context_vars.flow_signals_type = opts.GetFirstString('-', "flow-signals-type", "none");
    context_vars.extra_trim_left = opts.GetFirstInt('-', "extra-trim-left", 0);
    context_vars.only_process_unfiltered_set = opts.GetFirstBoolean('-', "only-process-unfiltered-set", false);

    // Treephaser options
    context_vars.dephaser = opts.GetFirstString('-', "dephaser", "treephaser-sse");
    context_vars.keynormalizer = opts.GetFirstString('-', "keynormalizer", "gain");
    context_vars.windowSize = opts.GetFirstInt('-', "window-size", DPTreephaser::kWindowSizeDefault_);
    context_vars.skip_droop = opts.GetFirstBoolean('-', "skip-droop", true);
    context_vars.skip_recal_during_norm = opts.GetFirstBoolean('-', "skip-recal-during-normalization", false);
    context_vars.diagonal_state_prog = opts.GetFirstBoolean('-', "diagonal-state-prog", false);

    // Diagonal state progression is only paired with treephaser-swan:
    // any other dephaser choice is overridden, with a notice on stdout.
    if (context_vars.diagonal_state_prog && context_vars.dephaser != "treephaser-swan")
    {
        cout << " === BaseCaller Option Incompatibility: Using dephaser treephaser-swan with diagonal state progression instead of "
             << context_vars.dephaser << endl;
        context_vars.dephaser = "treephaser-swan";
    }

    context_vars.process_tfs = true;
    context_vars.options_set = true;
    return true;
}
Пример #30
0
int main (int argc, const char *argv[])
{
  time_t program_start_time;
  time(&program_start_time);
  Json::Value calibration_json(Json::objectValue);
  DumpStartingStateOfProgram (argc,argv,program_start_time, calibration_json["Calibration"]);

  //
  // Step 1. Process command line options
  //

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);

  // enable floating point exceptions during program execution
  if (opts.GetFirstBoolean('-', "float-exceptions", true)) {
    cout << "Calibration: Floating point exceptions enabled." << endl;
    feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
  } //*/

  CalibrationContext calib_context;
  if (not calib_context.InitializeFromOpts(opts)){
    PrintHelp_CalModules();
  }

  HistogramCalibration master_histogram(opts, calib_context);
  calib_context.hist_calibration_master = &master_histogram;

  LinearCalibrationModel master_linear_model(opts, calib_context);
  calib_context.linear_model_master = &master_linear_model;

  opts.CheckNoLeftovers();

  //
  // Step 2. Execute threaded calibration
  //
  int calibration_thread_time = 0;

  if (calib_context.successive_fit) {

    // first train linear model
    if (master_linear_model.DoTraining()) {
      int l_thread_time = 0;
      for (int i_iteration=0; i_iteration<calib_context.num_train_iterations; i_iteration++) {
        cout << " -Training Iteration " << i_iteration+1;
        l_thread_time = ExecuteThreadedCalibrationTraining(calib_context);

        // Activate master linear model after every round of training
        master_linear_model.CreateCalibrationModel(false); // make linear model
        master_linear_model.SetModelGainsAndOffsets(); // expand for use in basecalling

        calibration_thread_time += l_thread_time;
        calib_context.bam_reader.Rewind(); // reset all files for another pass
        cout << " Duration = " << l_thread_time << endl;
      }
    }

    // Then apply it during polish model training
    if (master_histogram.DoTraining()) {
      calib_context.local_fit_linear_model = false;
      calib_context.local_fit_polish_model = true;
      calibration_thread_time += ExecuteThreadedCalibrationTraining(calib_context);
    }
  }
  else {
    // Single pass in which both models are fit jointly
    calibration_thread_time=ExecuteThreadedCalibrationTraining(calib_context);
  }


  //
  // Step 3. Create models, write output, and close modules
  //

  // Linear Model
  if (master_linear_model.CreateCalibrationModel())
    master_linear_model.ExportModelToJson(calibration_json["LinearModel"], "");

  // HP histogram calibration
  if (master_histogram.CreateCalibrationModel())
    master_histogram.ExportModelToJson(calibration_json["HPHistogram"]);


  // Transfer stuff from calibration context and close bam reader
  calib_context.Close(calibration_json["Calibration"]);

  time_t program_end_time;
  time(&program_end_time);

  calibration_json["Calibration"]["end_time"] = get_time_iso_string(program_end_time);
  calibration_json["Calibration"]["total_duration"] = (Json::Int)difftime(program_end_time,program_start_time);
  calibration_json["Calibration"]["calibration_duration"] = (Json::Int)calibration_thread_time;

  SaveJson(calibration_json, calib_context.filename_json);
  return EXIT_SUCCESS;
}