// Applies chip-dependent defaults once the command line has been parsed.
//
// Locates the experiment log, passes its path to SetGlobalChipID and, when
// the chip is reported as a Proton chip, overrides a handful of settings.
// A setting is only overridden when its option was NOT given on the command
// line, so explicit user choices are left alone.
//
// @param opts  Parsed command line, queried with HasOption().
void CommandLineOpts::PostProcessArgs(OptArgs &opts)
{
  sys_context.FindExpLogPath();
  SetGlobalChipID(sys_context.explog_path);

  // Nothing to adjust for non-Proton chips.
  if (!ChipIdDecoder::IsProtonChip())
    return;

  // Background-model features that default to "off" on Proton.
  if (!opts.HasOption('-', "clonal-filter-bkgmodel"))
    bkg_control.polyclonal_filter.enable = false;

  if (!opts.HasOption('-', "xtalk-correction"))
    bkg_control.enable_trace_xtalk_correction = false;

  // Image corrections that default to "on" on Proton.
  if (!opts.HasOption('-', "col-flicker-correct"))
    img_control.col_flicker_correct = true;

  if (!opts.HasOption('-', "col-flicker-correct-aggressive"))
    img_control.aggressive_cnc = true;

  if (!opts.HasOption('-', "img-gain-correct"))
    img_control.gain_correct_images = true;
}
int main(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); bool help; string topFile, bottomFile, outFile; opts.GetOption(topFile, "", '-', "top"); opts.GetOption(bottomFile, "", '-', "bottom"); opts.GetOption(outFile, "", '-', "merged"); opts.GetOption(help, "false", 'h', "help"); if (help || argc == 1) { usage(); } ION_ASSERT(!topFile.empty() && !bottomFile.empty() && !outFile.empty(), "Need top, bottom and merged files. use --help for details."); MergeAcq merger; Image top; Image bottom; Image combo; cout << "Loading images." << endl; ION_ASSERT(top.LoadRaw(topFile.c_str()), "Couldn't load file."); ION_ASSERT(bottom.LoadRaw(bottomFile.c_str()), "Couldn't load file."); merger.SetFirstImage(&bottom); merger.SetSecondImage(&top, bottom.GetRows(), 0); // starting vertically raised but columns the same. cout << "Merging." << endl; merger.Merge(combo); Acq acq; cout << "Saving. " << endl; acq.SetData(&combo); acq.WriteVFC(outFile.c_str(), 0, 0, combo.GetCols(), combo.GetRows()); cout << "Done." << endl; return 0; }
// Resolves a boolean parameter and reports where its value came from.
//
// Sources, in increasing priority:
//   1. default_value                                  ("builtin default")
//   2. json[<long name with '-' replaced by '_'>]     ("parameters json file")
//   3. the command line option short_name/long_name   ("command line option")
//
// A JSON string value may be spelled "true" or "false"; any other string is
// interpreted numerically with atoi() (non-zero means true).  Non-string JSON
// values are read with asInt().  Previously "true" went through atoi() and
// silently became false.
//
// @param opts               Parsed command line.
// @param json               Parameter object from the parameters file.
// @param short_name         Short option character.
// @param long_name_hyphens  Long option name, hyphen separated.
// @param default_value      Value used when neither source provides one.
// @return The resolved value; it is also printed to cout with its source.
bool RetrieveParameterBool(OptArgs &opts, Json::Value& json, char short_name, const string& long_name_hyphens, bool default_value)
{
  // JSON keys use underscores where the command line uses hyphens.
  string long_name_underscores = long_name_hyphens;
  for (string::size_type i = 0; i < long_name_underscores.size(); ++i)
    if (long_name_underscores[i] == '-')
      long_name_underscores[i] = '_';

  bool value = default_value;
  string source = "builtin default";

  if (json.isMember(long_name_underscores)) {
    const Json::Value& entry = json[long_name_underscores];
    if (entry.isString()) {
      const string text = entry.asString();
      if (text == "true")
        value = true;
      else if (text == "false")
        value = false;
      else
        value = (atoi(text.c_str()) != 0);
    } else {
      value = (entry.asInt() != 0);
    }
    source = "parameters json file";
  }

  // The command line wins over the JSON file.
  if (opts.HasOption(short_name, long_name_hyphens)) {
    value = opts.GetFirstBoolean(short_name, long_name_hyphens, value);
    source = "command line option";
  }

  cout << setw(35) << long_name_hyphens << " = " << setw(10) << (value ? "true" : "false") << " (boolean, " << source << ")" << endl;
  return value;
}
// Reads the read-sampling options (unfiltered sample size, downsampling,
// calibration training) into sampling_opts.
//
// Requires context_vars.options_set (asserted).  When only phase estimation
// is requested nothing is read and nothing is printed.
//
// @param opts       Parsed command line.
// @param num_wells  Well count used to turn --downsample-fraction into a
//                   read count.
// @return Always true; exits the process when calibration training is
//         combined with diagonal state progression.
bool BaseCallerParameters::InitializeSamplingFromOptArgs(OptArgs& opts, const int num_wells)
{
  assert(context_vars.options_set);

  // If we are just doing phase estimation none of the options matter, so don't spam output
  if (context_vars.just_phase_estimation) {
    sampling_opts.options_set = true;
    return true;
  }

  sampling_opts.num_unfiltered       = opts.GetFirstInt    ('-', "num-unfiltered", 100000);
  sampling_opts.downsample_size      = opts.GetFirstInt    ('-', "downsample-size", 0);
  sampling_opts.downsample_fraction  = opts.GetFirstDouble ('-', "downsample-fraction", 1.0);
  sampling_opts.calibration_training = opts.GetFirstInt    ('-', "calibration-training", -1);
  sampling_opts.have_calib_panel     = (not bc_files.calibration_panel_file.empty());
  sampling_opts.MaskNotWanted        = MaskNone;

  // Reconcile parameters downsample_size and downsample_fraction.
  // The "activated" flag reflects the options as given, before reconciliation.
  const bool fraction_given = sampling_opts.downsample_fraction < 1.0;
  const bool downsample     = sampling_opts.downsample_size > 0 or fraction_given;

  if (fraction_given) {
    const int fraction_size = (int)((float)num_wells*sampling_opts.downsample_fraction);
    if (sampling_opts.downsample_size == 0)
      sampling_opts.downsample_size = fraction_size;
    else
      sampling_opts.downsample_size = min(sampling_opts.downsample_size, fraction_size);
  }

  if (downsample)
    cout << "Downsampling activated: Randomly choosing " << sampling_opts.downsample_size << " reads on this chip." << endl;

  // Without calibration training we are done.
  if (sampling_opts.calibration_training < 0) {
    sampling_opts.options_set = true;
    return true;
  }

  // Calibration training requires additional changes & overwrites command line options
  if (context_vars.diagonal_state_prog) {
    cerr << " === BaseCaller Option Incompatibility: Calibration training not supported for diagonal state progression. Aborting!"
         << endl;
    exit(EXIT_FAILURE);
  }

  // The training set never exceeds an explicitly requested downsample size.
  if (sampling_opts.downsample_size > 0)
    sampling_opts.calibration_training = min(sampling_opts.calibration_training, sampling_opts.downsample_size);

  sampling_opts.downsample_size  = max(sampling_opts.calibration_training, 0);
  sampling_opts.MaskNotWanted    = (MaskType)(MaskFilteredBadResidual|MaskFilteredBadPPF|MaskFilteredBadKey);
  sampling_opts.num_unfiltered   = 0;
  context_vars.process_tfs       = false;
  context_vars.flow_signals_type = "scaled-residual";

  cout << "=== BaseCaller Calibration Training ===" << endl;
  cout << " - Generating a training set up to " << sampling_opts.downsample_size << " randomly selected reads." << endl;
  if (sampling_opts.have_calib_panel)
    cout << " - Adding calibration panel reads specified in " << bc_files.calibration_panel_file << endl;
  cout << endl;

  sampling_opts.options_set = true;
  return true;
}
// Initializes the recalibration model from command line options and, on
// success, optionally records the model file in the BAM comments.
//
// Options read: --model-file, --recal-model-hp-thres (default 4),
// --save-hpmodel (default true), --diagonal-state-prog (default false).
// With diagonal state progression the model file name is cleared before
// InitializeModel() is called.
//
// @param opts          Parsed command line.
// @param bam_comments  Passed to SaveModelFileToBamComments().
// @param run_id        Passed to SaveModelFileToBamComments().
// @param chip_subset   Supplies the column/row offsets passed along.
void RecalibrationModel::Initialize(OptArgs& opts, vector<string> &bam_comments, const string & run_id, const ion::ChipSubset & chip_subset)
{
  string model_file_name   = opts.GetFirstString ('-', "model-file", "");
  int    model_threshold   = opts.GetFirstInt    ('-', "recal-model-hp-thres", 4);
  bool   save_hpmodel      = opts.GetFirstBoolean('-', "save-hpmodel", true);
  bool   diagonal_state_prog = opts.GetFirstBoolean('-', "diagonal-state-prog", false);

  if (diagonal_state_prog)
    model_file_name.clear();

  // The model is always initialized; saving depends on success and the flag.
  if (not InitializeModel(model_file_name, model_threshold))
    return;
  if (not save_hpmodel)
    return;

  SaveModelFileToBamComments(model_file_name, bam_comments, run_id,
                             chip_subset.GetColOffset(), chip_subset.GetRowOffset());
}
// Builds a TagTrimmerParameters object from the command line.
//
// Options read: --min-tag-fam-size (default 3), --suppress-mol-tags,
// --prefix-mol-tag, --suffix-mol-tag, --tag-trim-method
// (sloppy-trim | strict-trim) and --tag-filter-method
// (need-all | need-prefix | need-suffix).
//
// A family size of 0 disables molecular tagging; a negative size or an
// unknown method name prints an error and exits the process.
//
// @param opts  Parsed command line.
// @return The populated parameter set.
TagTrimmerParameters MolecularTagTrimmer::ReadOpts(OptArgs& opts)
{
  TagTrimmerParameters my_params;

  // Reading command line options to set tag structures
  my_params.min_family_size   = opts.GetFirstInt     ('-', "min-tag-fam-size", 3);
  my_params.suppress_mol_tags = opts.GetFirstBoolean ('-', "suppress-mol-tags", false);
  //my_params.cl_a_handle     = opts.GetFirstString  ('-', "tag-handle", "");
  //my_params.handle_cutoff   = opts.GetFirstInt     ('-', "handle-cutoff", 2);

  my_params.master_tags.prefix_mol_tag = opts.GetFirstString ('-', "prefix-mol-tag", "");
  my_params.master_tags.suffix_mol_tag = opts.GetFirstString ('-', "suffix-mol-tag", "");
  ValidateTagString(my_params.master_tags.prefix_mol_tag);
  ValidateTagString(my_params.master_tags.suffix_mol_tag);

  // Overload to disable molecular tagging: 0 switches tagging off, anything
  // below that is rejected.
  if (my_params.min_family_size < 0) {
    cerr << "MolecularTagTrimmer Error: min-tag-fam-size must be at least 1. " << endl;
    exit(EXIT_FAILURE);
  }
  if (my_params.min_family_size == 0)
    my_params.suppress_mol_tags = true;

  my_params.command_line_tags = my_params.master_tags.HasTags();

  // Options for read filtering & and trimming method selection
  const string trim_method = opts.GetFirstString ('-', "tag-trim-method", "sloppy-trim");
  if (trim_method == "sloppy-trim") {
    my_params.tag_trim_method = kSloppyTrim;
  } else if (trim_method == "strict-trim") {
    my_params.tag_trim_method = kStrictTrim;
  } else {
    cerr << "MolecularTagTrimmer Error: Unknown tag trimming option " << trim_method << endl;
    exit(EXIT_FAILURE);
  }

  const string filter_method = opts.GetFirstString ('-', "tag-filter-method", "need-all");
  if (filter_method == "need-all") {
    my_params.tag_filter_method = kneed_all_tags;
  } else if (filter_method == "need-prefix") {
    my_params.tag_filter_method = kneed_only_prefix_tag;
  } else if (filter_method == "need-suffix") {
    my_params.tag_filter_method = kneed_only_suffix_tag;
  } else {
    cerr << "MolecularTagTrimmer Error: Unknown tag filtering option " << filter_method << endl;
    exit(EXIT_FAILURE);
  }

  return my_params;
}
// Loads the recalibration model named by --model-file.
//
// The model stays disabled (is_enabled_ == false) when no file is given, the
// name is "off", the file cannot be opened, or its header cannot be parsed.
//
// File layout as read here: one comment line, then a header of ten integers
// (flow start/end/span, x min/max/span, y min/max/span, max calibrated HP),
// then records of
//   flowBase flowStart flowEnd xMin xMax yMin yMax refHP paramA paramB
// which are passed to FillIndexes().
//
// Fixes over the previous version:
//  - Records are consumed with `while (stream >> ...)`.  The old
//    `while (model_file.good())` loop ran its body once more after the final
//    failed extraction, feeding stale (or, for an empty record section,
//    uninitialized) values to FillIndexes().
//  - The header extraction is checked, so uninitialized integers are never
//    passed to SetupRegion()/SetupStratification().
//
// @param opts  Parsed command line (--model-file, --recal-model-hp-thres).
void RecalibrationModel::Initialize(OptArgs& opts)
{
  is_enabled_ = false;

  string model_file_name = opts.GetFirstString ('-', "model-file", "");
  if (model_file_name.empty() or model_file_name == "off") {
    printf("RecalibrationModel: disabled\n\n");
    return;
  }

  ifstream model_file;
  model_file.open(model_file_name.c_str());
  if (model_file.fail()) {
    printf("RecalibrationModel: disabled (cannot open %s)\n\n", model_file_name.c_str());
    model_file.close();
    return;
  }

  recalModelHPThres = opts.GetFirstInt('-', "recal-model-hp-thres", 4);

  string comment_line;
  getline(model_file, comment_line); // skip the comment line

  int flowStart = 0, flowEnd = 0, flowSpan = 0;
  int xMin = 0, xMax = 0, xSpan = 0;
  int yMin = 0, yMax = 0, ySpan = 0;
  int max_hp_calibrated = 0;
  if (!(model_file >> flowStart >> flowEnd >> flowSpan
                   >> xMin >> xMax >> xSpan
                   >> yMin >> yMax >> ySpan
                   >> max_hp_calibrated)) {
    printf("RecalibrationModel: disabled (cannot parse header of %s)\n\n", model_file_name.c_str());
    model_file.close();
    return;
  }

  stratification.SetupRegion(xMin, xMax, xSpan, yMin, yMax, ySpan);
  // calculate number of partitions and initialize the stratifiedAs and stratifiedBs
  SetupStratification(flowStart, flowEnd, flowSpan, xMin, xMax, xSpan, yMin, yMax, ySpan, max_hp_calibrated);

  // Parse the records into stratifiedAs and stratifiedBs.  The header
  // variables are reused as per-record scratch, as in the original code.
  float paramA = 0.0f, paramB = 0.0f;
  int refHP = 0;
  char flowBase = 0;
  while (model_file >> flowBase >> flowStart >> flowEnd
                    >> xMin >> xMax >> yMin >> yMax
                    >> refHP >> paramA >> paramB) {
    // NOTE(review): the original carried a "boundary check" remark here, but
    // no check on nucInd / offsetRegion is performed.
    int nucInd = NuctoInt(flowBase);
    int offsetRegion = stratification.OffsetRegion(xMin, yMin);
    FillIndexes(offsetRegion, nucInd, refHP, flowStart, flowEnd, paramA, paramB);
  }
  model_file.close();

  printf("Recalibration: enabled (using calibration file %s)\n\n", model_file_name.c_str());
  is_enabled_ = true;

  // A threshold above the supported HP length switches the model off again.
  if (recalModelHPThres > MAX_HPXLEN)
    is_enabled_ = false;
}
// Reads all input/output file locations from the command line.
//
// Mandatory: -r/--reference, -b/--input-bam, -o/--output-vcf; a missing one
// prints an error and exits.  Optional: -c/--input-vcf, -e/--error-motifs
// (a note is printed to cerr when absent), -O/--output-dir (default "."),
// --postprocessed-bam, -g/--sample-name, --force-sample-name.
// Paths that are given are passed through ValidateAndCanonicalizePath(),
// except the output VCF name and the last three options.
//
// @param opts  Parsed command line.
void ExtendParameters::SetupFileIO(OptArgs &opts)
{
  // Reference (freeBayes slot)
  fasta = opts.GetFirstString('r', "reference", "");
  if (fasta.empty()) {
    cerr << "Fatal ERROR: Reference file not specified via -r" << endl;
    exit(1);
  }
  ValidateAndCanonicalizePath(fasta);

  // Hotspot / input VCF (freeBayes slot)
  variantPriorsFile = opts.GetFirstString('c', "input-vcf", "");
  if (not variantPriorsFile.empty())
    ValidateAndCanonicalizePath(variantPriorsFile);
  else
    cerr << "INFO: No input VCF (Hotspot) file specified via -c,--input-vcf" << endl;

  // Systematic error motifs
  sseMotifsFileName = opts.GetFirstString('e', "error-motifs", "");
  sseMotifsProvided = not sseMotifsFileName.empty();
  if (sseMotifsProvided)
    ValidateAndCanonicalizePath(sseMotifsFileName);
  else
    cerr << "INFO: Systematic error motif file not specified via -e" << endl;

  // Input BAM files
  opts.GetOption(bams, "", 'b', "input-bam");
  if (bams.empty()) {
    cerr << "FATAL ERROR: BAM file not specified via -b" << endl;
    exit(-1);
  }
  for (unsigned int idx = 0; idx < bams.size(); ++idx)
    ValidateAndCanonicalizePath(bams[idx]);

  // Output location
  outputDir = opts.GetFirstString('O', "output-dir", ".");
  ValidateAndCanonicalizePath(outputDir);

  outputFile = opts.GetFirstString('o', "output-vcf", "");
  if (outputFile.empty()) {
    cerr << "Fatal ERROR: Output VCF filename not specified via -o" << endl;
    exit(1);
  }

  // Are those file names?
  postprocessed_bam = opts.GetFirstString('-', "postprocessed-bam", "");
  sampleName        = opts.GetFirstString('g', "sample-name", "");
  force_sample_name = opts.GetFirstString('-', "force-sample-name", "");
}
void ExtendParameters::SetFreeBayesParameters(OptArgs &opts, Json::Value& fb_params) { // FreeBayes parameters // primarily used in candidate generation targets = opts.GetFirstString('t', "target-file", ""); trim_ampliseq_primers = opts.GetFirstBoolean('-', "trim-ampliseq-primers", false); if (targets.empty() and trim_ampliseq_primers) { cerr << "ERROR: --trim-ampliseq-primers enabled but no --target-file provided" << endl; exit(1); } allowIndels = RetrieveParameterBool (opts, fb_params, '-', "allow-indels", true); allowSNPs = RetrieveParameterBool (opts, fb_params, '-', "allow-snps", true); allowMNPs = RetrieveParameterBool (opts, fb_params, '-', "allow-mnps", true); allowComplex = RetrieveParameterBool (opts, fb_params, '-', "allow-complex", false); // deprecated: // leftAlignIndels = RetrieveParameterBool (opts, fb_params, '-', "left-align-indels", false); RetrieveParameterBool (opts, fb_params, '-', "left-align-indels", false); //useBestNAlleles = 0; useBestNAlleles = RetrieveParameterInt (opts, fb_params, 'm', "use-best-n-alleles", 2); onlyUseInputAlleles = RetrieveParameterBool (opts, fb_params, '-', "use-input-allele-only", false); min_mapping_qv = RetrieveParameterInt (opts, fb_params, 'M', "min-mapping-qv", 4); read_snp_limit = RetrieveParameterInt (opts, fb_params, 'U', "read-snp-limit", 10); readMaxMismatchFraction = RetrieveParameterDouble(opts, fb_params, 'z', "read-max-mismatch-fraction", 1.0); maxComplexGap = RetrieveParameterInt (opts, fb_params, '!', "max-complex-gap", 1); // read from json or command line, otherwise default to snp frequency minAltFraction = RetrieveParameterDouble(opts, fb_params, '-', "gen-min-alt-allele-freq", my_controls.filter_snps.min_allele_freq); minCoverage = RetrieveParameterInt (opts, fb_params, '-', "gen-min-coverage", my_controls.filter_snps.min_cov); minIndelAltFraction = RetrieveParameterDouble(opts, fb_params, '-', "gen-min-indel-alt-allele-freq", my_controls.filter_hp_indel.min_allele_freq); //set up debug levels 
if (program_flow.DEBUG > 0) debug = true; if (program_flow.inputPositionsOnly) { processInputPositionsOnly = true; } if (variantPriorsFile.empty() && (processInputPositionsOnly || onlyUseInputAlleles) ) { cerr << "ERROR: Parameter error - Process-input-positions-only: " << processInputPositionsOnly << " use-input-allele-only: " << onlyUseInputAlleles << " : Specified without Input VCF File " << endl; exit(1); } }
// Resolves a string parameter and reports where its value came from.
//
// Sources, in increasing priority:
//   1. default_value                                          ("builtin default")
//   2. json[GetRidOfDomainAndHyphens(long_name_hyphens)]      ("parameters json file")
//   3. the command line option short_name/long_name           ("command line option")
//
// The JSON value is read with asString() rather than asCString():
// asCString() hands back the raw character pointer, which is not a valid
// std::string initializer when the member is not a string (or holds a null
// string), whereas asString() returns a proper std::string.
//
// @param opts               Parsed command line.
// @param json               Parameter object from the parameters file.
// @param short_name         Short option character.
// @param long_name_hyphens  Long option name, hyphen separated.
// @param default_value      Value used when neither source provides one.
// @return The resolved value; it is also printed to cout with its source.
string RetrieveParameterString(OptArgs &opts, Json::Value& json, char short_name, const string& long_name_hyphens, const string& default_value)
{
  string long_name_underscores = GetRidOfDomainAndHyphens(long_name_hyphens);
  string value = default_value;
  string source = "builtin default";

  if (json.isMember(long_name_underscores)) {
    value = json[long_name_underscores].asString();
    source = "parameters json file";
  }

  // The command line wins over the JSON file.
  if (opts.HasOption(short_name, long_name_hyphens)) {
    value = opts.GetFirstString(short_name, long_name_hyphens, value);
    source = "command line option";
  }

  cout << setw(35) << long_name_hyphens << " = " << setw(10) << value << " (string, " << source << ")" << endl;
  return value;
}
int IonstatsReduceH5(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc-1, argv+1); string output_h5_filename = opts.GetFirstString ('o', "output", ""); bool merge_proton_blocks = opts.GetFirstBoolean ('b', "merge-proton-blocks", "true"); vector<string> input_h5_filename; opts.GetLeftoverArguments(input_h5_filename); if(input_h5_filename.empty() or output_h5_filename.empty()) { IonstatsReduceH5Help(); return 1; } if(merge_proton_blocks) cout << "NOTE:" << argv[0] << " " << argv[1] << ": --merge-proton-blocks=true so any Proton block-specific read group suffixes will be merged" << endl; return IonstatsAlignmentReduceH5(output_h5_filename, input_h5_filename, merge_proton_blocks); }
int IonstatsReduce(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string output_json_filename = opts.GetFirstString('o', "output", ""); vector<string> input_jsons; opts.GetLeftoverArguments(input_jsons); if(input_jsons.empty() or output_json_filename.empty()) { IonstatsReduceHelp(); return 1; } ifstream in(input_jsons[0].c_str(), ifstream::in); if (!in.good()) { fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_jsons[0].c_str()); return 1; } Json::Value first_input_json; in >> first_input_json; in.close(); if (!first_input_json.isMember("meta")) { fprintf(stderr, "[ionstats] ERROR: %s is not a valid input file for ionstats reduce\n", input_jsons[0].c_str()); return 1; } string format_name = first_input_json["meta"].get("format_name","").asString(); if (format_name == "ionstats_basecaller") return IonstatsBasecallerReduce(output_json_filename, input_jsons); if (format_name == "ionstats_tf") return IonstatsTestFragmentsReduce(output_json_filename, input_jsons); if (format_name == "ionstats_alignment") return IonstatsAlignmentReduce(output_json_filename, input_jsons); fprintf(stderr, "[ionstats] ERROR: %s is not a valid input file for ionstats reduce\n", input_jsons[0].c_str()); return 1; }
// Reads the program-flow settings.
//
// The debug level comes from the command line only (-d/--debug); every other
// setting is resolved through RetrieveParameterInt/Bool, with tvc_params as
// the JSON source.  The retrieval order is kept as in the original.
//
// @param opts        Parsed command line.
// @param tvc_params  JSON object holding the variant caller parameters.
void ProgramControlSettings::SetOpts(OptArgs &opts, Json::Value &tvc_params)
{
  DEBUG = opts.GetFirstInt('d', "debug", 0);

  // Threading
  nThreads           = RetrieveParameterInt (opts, tvc_params, 'n', "num-threads", 12);
  nVariantsPerThread = RetrieveParameterInt (opts, tvc_params, 'N', "num-variants-per-thread", 250);

  use_SSE_basecaller = RetrieveParameterBool(opts, tvc_params, '-', "use-sse-basecaller", true);

  // decide diagnostic
  rich_json_diagnostic = RetrieveParameterBool(opts, tvc_params, '-', "do-json-diagnostic", false);
  minimal_diagnostic   = RetrieveParameterBool(opts, tvc_params, '-', "do-minimal-diagnostic", false);

  // Processing switches
  inputPositionsOnly     = RetrieveParameterBool(opts, tvc_params, '-', "process-input-positions-only", false);
  suppress_recalibration = RetrieveParameterBool(opts, tvc_params, '-', "suppress-recalibration", true);
  resolve_clipped_bases  = RetrieveParameterBool(opts, tvc_params, '-', "resolve-clipped-bases", false);
}
void ExtendParameters::ParametersFromJSON(OptArgs &opts, Json::Value &tvc_params, Json::Value &freebayes_params, Json::Value ¶ms_meta) { string parameters_file = opts.GetFirstString('-', "parameters-file", ""); Json::Value parameters_json(Json::objectValue); if (not parameters_file.empty()) { ifstream in(parameters_file.c_str(), ifstream::in); if (!in.good()) { fprintf(stderr, "[tvc] FATAL ERROR: cannot open %s\n", parameters_file.c_str()); exit(-1); } in >> parameters_json; in.close(); if (parameters_json.isMember("pluginconfig")) parameters_json = parameters_json["pluginconfig"]; tvc_params = parameters_json.get("torrent_variant_caller", Json::objectValue); freebayes_params = parameters_json.get("freebayes", Json::objectValue); params_meta = parameters_json.get("meta", Json::objectValue); }
// Resolves every input/output location used by the base caller and prints a
// summary of them.
//
// Options read: -i/--input-dir, -o/--output-dir (both default "."), --wells,
// --mask, --datasets, --calibration-panel.  CreateResultsFolder() is called
// for the output directory and its two "unfiltered" sub-directories before
// the paths are passed to ValidateAndCanonicalizePath().
//
// @param opts  Parsed command line.
// @return Always true; bc_files.options_set is set on the way out.
bool BaseCallerParameters::InitializeFilesFromOptArgs(OptArgs& opts)
{
  // Directories
  bc_files.input_directory  = opts.GetFirstString ('i', "input-dir", ".");
  bc_files.output_directory = opts.GetFirstString ('o', "output-dir", ".");
  bc_files.unfiltered_untrimmed_directory = bc_files.output_directory + "/unfiltered.untrimmed";
  bc_files.unfiltered_trimmed_directory   = bc_files.output_directory + "/unfiltered.trimmed";

  CreateResultsFolder ((char*)bc_files.output_directory.c_str());
  CreateResultsFolder ((char*)bc_files.unfiltered_untrimmed_directory.c_str());
  CreateResultsFolder ((char*)bc_files.unfiltered_trimmed_directory.c_str());

  ValidateAndCanonicalizePath(bc_files.input_directory);
  ValidateAndCanonicalizePath(bc_files.output_directory);
  ValidateAndCanonicalizePath(bc_files.unfiltered_untrimmed_directory);
  ValidateAndCanonicalizePath(bc_files.unfiltered_trimmed_directory);

  // Input files; the defaults live in the (canonicalized) input directory.
  bc_files.filename_wells = opts.GetFirstString ('-', "wells", bc_files.input_directory + "/1.wells");
  bc_files.filename_mask  = opts.GetFirstString ('-', "mask", bc_files.input_directory + "/analysis.bfmask.bin");
  ValidateAndCanonicalizePath(bc_files.filename_wells);
  // The second argument names bfmask.bin in the input directory.
  // NOTE(review): presumably a fallback location -- confirm against the helper.
  ValidateAndCanonicalizePath(bc_files.filename_mask, bc_files.input_directory + "/bfmask.bin");

  // Output files
  bc_files.filename_filter_mask = bc_files.output_directory + "/bfmask.bin";
  bc_files.filename_json        = bc_files.output_directory + "/BaseCaller.json";
  bc_files.filename_phase       = bc_files.output_directory + "/PhaseEstimates.json";

  printf("\n");
  printf("Input files summary:\n");
  printf(" --input-dir %s\n", bc_files.input_directory.c_str());
  printf(" --wells %s\n", bc_files.filename_wells.c_str());
  printf(" --mask %s\n", bc_files.filename_mask.c_str());
  printf("\n");
  printf("Output directories summary:\n");
  printf(" --output-dir %s\n", bc_files.output_directory.c_str());
  printf(" unf.untr %s\n", bc_files.unfiltered_untrimmed_directory.c_str());
  printf(" unf.tr %s\n", bc_files.unfiltered_trimmed_directory.c_str());
  printf("\n");

  // Optional files: only validated when actually given.
  bc_files.lib_datasets_file      = opts.GetFirstString ('-', "datasets", "");
  bc_files.calibration_panel_file = opts.GetFirstString ('-', "calibration-panel", "");
  if (not bc_files.lib_datasets_file.empty())
    ValidateAndCanonicalizePath(bc_files.lib_datasets_file);
  if (not bc_files.calibration_panel_file.empty())
    ValidateAndCanonicalizePath(bc_files.calibration_panel_file);

  bc_files.options_set = true;
  return true;
}
// Sets up the flow order and the library / test-fragment key sequences.
//
// --flow-order and -f/--flowlimit override the supplied defaults; the number
// of flows is capped at NumFlows.  Keys come from --lib-key / --tf-key
// (defaults "TCAG" / "ATCG"); the older spellings --librarykey / --tfkey are
// read afterwards and take precedence when present.
//
// @param opts       Parsed command line.
// @param FlowOrder  Default flow order.
// @param NumFlows   Default and maximum number of flows.
// @return Always true.
bool BaseCallerContext::SetKeyAndFlowOrder(OptArgs& opts, const char * FlowOrder, const int NumFlows)
{
  flow_order.SetFlowOrder(opts.GetFirstString ('-', "flow-order", FlowOrder),
                          opts.GetFirstInt    ('f', "flowlimit", NumFlows));

  // Never use more flows than were offered to us.
  if (flow_order.num_flows() > NumFlows)
    flow_order.SetNumFlows(NumFlows);
  assert(flow_order.is_ok());

  //! @todo Get default key from wells
  string lib_key = opts.GetFirstString ('-', "lib-key", "TCAG");
  string tf_key  = opts.GetFirstString ('-', "tf-key", "ATCG");

  // Backward compatible opts
  lib_key = opts.GetFirstString ('-', "librarykey", lib_key);
  tf_key  = opts.GetFirstString ('-', "tfkey", tf_key);

  keys.resize(2);
  keys[0].Set(flow_order, lib_key, "lib");
  keys[1].Set(flow_order, tf_key, "tf");
  return true;
}
// Reads the phasing-estimation options.
//
// Options: --phasing-estimator, --libcf-ie-dr, --phasing-residual-filter,
// --max-phasing-levels, --keynormalizer, --window-size.
//
// When --libcf-ie-dr is given as "cf,ie,dr", the estimator is switched to
// "override" with a single region holding those three values; a value that
// does not parse as three floats prints an error and exits.
//
// @param opts  Parsed command line.
void PhaseEstimator::InitializeFromOptArgs(OptArgs& opts)
{
  phasing_estimator_         = opts.GetFirstString ('-', "phasing-estimator", "spatial-refiner-2");
  const string arg_cf_ie_dr  = opts.GetFirstString ('-', "libcf-ie-dr", "");
  residual_threshold_        = opts.GetFirstDouble ('-', "phasing-residual-filter", 1.0);
  max_phasing_levels_        = opts.GetFirstInt    ('-', "max-phasing-levels", max_phasing_levels_default_);
  use_pid_norm_              = opts.GetFirstString ('-', "keynormalizer", "keynorm-old") == "keynorm-new";
  windowSize_                = opts.GetFirstInt    ('-', "window-size", DPTreephaser::kWindowSizeDefault_);

  if (arg_cf_ie_dr.empty())
    return;

  // --libcf-ie-dr overrides other phasing-related options
  phasing_estimator_ = "override";
  result_regions_x_ = 1;
  result_regions_y_ = 1;
  result_cf_.assign(1, 0.0);
  result_ie_.assign(1, 0.0);
  result_dr_.assign(1, 0.0);

  const int num_parsed = sscanf (arg_cf_ie_dr.c_str(), "%f,%f,%f", &result_cf_[0], &result_ie_[0], &result_dr_[0]);
  if (num_parsed != 3) {
    fprintf (stderr, "Option Error: libcf-ie-dr %s\n", arg_cf_ie_dr.c_str());
    exit (EXIT_FAILURE);
  }
}
int IonstatsTestFragments(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bam_filename = opts.GetFirstString('i', "input", ""); string fasta_filename = opts.GetFirstString('r', "ref", ""); string output_json_filename = opts.GetFirstString('o', "output", "ionstats_tf.json"); int histogram_length = opts.GetFirstInt ('h', "histogram-length", 400); if(argc < 2 or input_bam_filename.empty() or fasta_filename.empty()) { IonstatsTestFragmentsHelp(); return 1; } // // Prepare for metric calculation // map<string,string> tf_sequences; PopulateReferenceSequences(tf_sequences, fasta_filename); BamReader input_bam; if (!input_bam.Open(input_bam_filename)) { fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str()); return 1; } int num_tfs = input_bam.GetReferenceCount(); SamHeader sam_header = input_bam.GetHeader(); if(!sam_header.HasReadGroups()) { fprintf(stderr, "[ionstats] ERROR: no read groups in %s\n", input_bam_filename.c_str()); return 1; } string flow_order; string key; for (SamReadGroupIterator rg = sam_header.ReadGroups.Begin(); rg != sam_header.ReadGroups.End(); ++rg) { if(rg->HasFlowOrder()) flow_order = rg->FlowOrder; if(rg->HasKeySequence()) key = rg->KeySequence; } // Need these metrics stratified by TF. 
vector<ReadLengthHistogram> called_histogram(num_tfs); vector<ReadLengthHistogram> aligned_histogram(num_tfs); vector<ReadLengthHistogram> AQ10_histogram(num_tfs); vector<ReadLengthHistogram> AQ17_histogram(num_tfs); vector<SimpleHistogram> error_by_position(num_tfs); vector<MetricGeneratorSNR> system_snr(num_tfs); vector<MetricGeneratorHPAccuracy> hp_accuracy(num_tfs); for (int tf = 0; tf < num_tfs; ++tf) { called_histogram[tf].Initialize(histogram_length); aligned_histogram[tf].Initialize(histogram_length); AQ10_histogram[tf].Initialize(histogram_length); AQ17_histogram[tf].Initialize(histogram_length); error_by_position[tf].Initialize(histogram_length); } vector<uint16_t> flow_signal_fz(flow_order.length()); vector<int16_t> flow_signal_zm(flow_order.length()); const RefVector& refs = input_bam.GetReferenceData(); // Missing: // - hp accuracy - tough, copy verbatim from TFMapper? BamAlignment alignment; vector<char> MD_op; vector<int> MD_len; MD_op.reserve(1024); MD_len.reserve(1024); string MD_tag; // // Main loop over mapped reads in the input BAM // while(input_bam.GetNextAlignment(alignment)) { if (!alignment.IsMapped() or !alignment.GetTag("MD",MD_tag)) continue; // The check below eliminates unexpected alignments if (alignment.IsReverseStrand() or alignment.Position > 5) continue; int current_tf = alignment.RefID; // // Step 1. Parse MD tag // MD_op.clear(); MD_len.clear(); for (const char *MD_ptr = MD_tag.c_str(); *MD_ptr;) { int item_length = 0; if (*MD_ptr >= '0' and *MD_ptr <= '9') { // Its a match MD_op.push_back('M'); for (; *MD_ptr and *MD_ptr >= '0' and *MD_ptr <= '9'; ++MD_ptr) item_length = 10*item_length + *MD_ptr - '0'; } else { if (*MD_ptr == '^') { // Its a deletion MD_ptr++; MD_op.push_back('D'); } else // Its a substitution MD_op.push_back('X'); for (; *MD_ptr and *MD_ptr >= 'A' and *MD_ptr <= 'Z'; ++MD_ptr) item_length++; } MD_len.push_back(item_length); } // // Step 2. 
Synchronously scan through Cigar and MD, doing error accounting // int MD_idx = alignment.IsReverseStrand() ? MD_op.size()-1 : 0; int cigar_idx = alignment.IsReverseStrand() ? alignment.CigarData.size()-1 : 0; int increment = alignment.IsReverseStrand() ? -1 : 1; int AQ10_bases = 0; int AQ17_bases = 0; int num_bases = 0; int num_errors = 0; while (cigar_idx < (int)alignment.CigarData.size() and MD_idx < (int) MD_op.size() and cigar_idx >= 0 and MD_idx >= 0) { if (alignment.CigarData[cigar_idx].Length == 0) { // Try advancing cigar cigar_idx += increment; continue; } if (MD_len[MD_idx] == 0) { // Try advancing MD MD_idx += increment; continue; } // Match if (alignment.CigarData[cigar_idx].Type == 'M' and MD_op[MD_idx] == 'M') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); num_bases += advance; alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Insertion (read has a base, reference doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'I') { int advance = alignment.CigarData[cigar_idx].Length; for (int cnt = 0; cnt < advance; ++cnt) { error_by_position[current_tf].Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; // Deletion (reference has a base, read doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'D' and MD_op[MD_idx] == 'D') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position[current_tf].Add(num_bases); num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Substitution } else if (MD_op[MD_idx] == 'X') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position[current_tf].Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; } else { printf("ionstats tf: Unexpected 
OP combination: %s Cigar=%c, MD=%c !\n", alignment.Name.c_str(), alignment.CigarData[cigar_idx].Type, MD_op[MD_idx]); break; } if (num_errors*10 <= num_bases) AQ10_bases = num_bases; if (num_errors*50 <= num_bases) AQ17_bases = num_bases; } // // Step 3. Profit // called_histogram[current_tf].Add(alignment.Length); aligned_histogram[current_tf].Add(num_bases); AQ10_histogram[current_tf].Add(AQ10_bases); AQ17_histogram[current_tf].Add(AQ17_bases); if(alignment.GetTag("ZM", flow_signal_zm)) system_snr[current_tf].Add(flow_signal_zm, key.c_str(), flow_order); else if(alignment.GetTag("FZ", flow_signal_fz)) system_snr[current_tf].Add(flow_signal_fz, key.c_str(), flow_order); // HP accuracy - keeping it simple if (!alignment.IsReverseStrand()) { string genome = key + tf_sequences[refs[current_tf].RefName]; string calls = key + alignment.QueryBases; const char *genome_ptr = genome.c_str(); const char *calls_ptr = calls.c_str(); for (int flow = 0; flow < (int)flow_order.length() and *genome_ptr and *calls_ptr; ++flow) { int genome_hp = 0; int calls_hp = 0; while (*genome_ptr == flow_order[flow]) { genome_hp++; genome_ptr++; } while (*calls_ptr == flow_order[flow]) { calls_hp++; calls_ptr++; } hp_accuracy[current_tf].Add(genome_hp, calls_hp); } } } // // Processing complete, generate ionstats_tf.json // Json::Value output_json(Json::objectValue); output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL)); output_json["meta"]["format_name"] = "ionstats_tf"; output_json["meta"]["format_version"] = "1.0"; output_json["results_by_tf"] = Json::objectValue; for (int tf = 0; tf < num_tfs; ++tf) { if (aligned_histogram[tf].num_reads() < 1000) continue; called_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["full"]); aligned_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["aligned"]); AQ10_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ10"]); 
AQ17_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ17"]); error_by_position[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["error_by_position"]); system_snr[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]); hp_accuracy[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]); output_json["results_by_tf"][refs[tf].RefName]["sequence"] = tf_sequences[refs[tf].RefName]; } input_bam.Close(); ofstream out(output_json_filename.c_str(), ios::out); if (out.good()) { out << output_json.toStyledString(); return 0; } else { fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str()); return 1; } }
int main(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); int hpLength; string statsOut; string alignmentOut; string pairedOut; string flowsOut; string summaryOut; string samFile; string qScoreCol; string wellsFile; string bfmaskFile; string snrFile; string binnedHpSigFile; string flowErrFile; string gcErrFile; int gcWin; string flowOrder; string keySeq; int numFlows; bool help; int qLength; double colCenter; double rowCenter; int colSize; int rowSize; int sampleSize; string wellsToUse; string run1, run2; opts.GetOption(run1, "", '-', "sff1"); opts.GetOption(run2, "", '-', "sff2"); opts.GetOption(wellsToUse, "", '-', "use-wells"); opts.GetOption(samFile, "", '-', "sam-parsed"); opts.GetOption(statsOut, "", '-', "stats-out"); opts.GetOption(flowsOut, "", '-', "flows-out"); opts.GetOption(alignmentOut, "", '-', "align-out"); opts.GetOption(summaryOut, "", '-', "summary-out"); opts.GetOption(pairedOut, "", '-', "paired-out"); opts.GetOption(numFlows, "40", '-', "num-flows"); opts.GetOption(hpLength, "6", '-', "max-hp"); opts.GetOption(qScoreCol, "q7Len", '-', "qscore-col"); opts.GetOption(qLength, "25", '-', "min-qlength"); opts.GetOption(help, "false", 'h', "help"); opts.GetOption(wellsFile, "", '-', "wells-file"); opts.GetOption(bfmaskFile, "", '-', "bfmask-file"); opts.GetOption(snrFile, "", '-', "snr-file"); opts.GetOption(binnedHpSigFile, "", '-', "binned-hp-sig-file"); opts.GetOption(flowErrFile, "", '-', "flow-err-file"); opts.GetOption(gcErrFile, "", '-', "gc-err-file"); opts.GetOption(flowOrder, "", '-', "flow-order"); opts.GetOption(keySeq, "", '-', "key-seq"); opts.GetOption(colCenter, "0.5", '-', "col-center"); opts.GetOption(rowCenter, "0.5", '-', "row-center"); opts.GetOption(colSize, "0", '-', "col-size"); opts.GetOption(rowSize, "0", '-', "row-size"); opts.GetOption(gcErrFile, "", '-', "gc-err-file"); opts.GetOption(gcWin, "40", '-', "gc-win"); opts.GetOption(sampleSize, "100000", '-', "sample-size"); if (help || 
samFile.empty() || statsOut.empty() || summaryOut.empty()) { usage(); } opts.CheckNoLeftovers(); // Some checks to make sure sensible bounds have been set if(colCenter < 0 || colCenter > 1) { cerr << "AnalyzeHPErrs - col-center must be in the range [0,1]" << endl; exit(1); } if(rowCenter < 0 || rowCenter > 1) { cerr << "AnalyzeHPErrs - row-center must be in the range [0,1]" << endl; exit(1); } if(colSize < 0) { cerr << "AnalyzeHPErrs - col-size cannot be negative." << endl; exit(1); } if(rowSize < 0) { cerr << "AnalyzeHPErrs - row-size cannot be negative." << endl; exit(1); } // Determine rows & cols if a bfmask file was supplied int nRow=0; int nCol=0; if(!bfmaskFile.empty()) { if(GetRowColFromBfmask(bfmaskFile, &nRow, &nCol)) { cerr << "AnalyzeHPErrs - problem determining rows & columns from bfmask file " << bfmaskFile << endl; exit(1); } } // Set up fds object FlowDiffStats* fds; if (!run1.empty()) { SffDiffStats* sds = new SffDiffStats(hpLength, nCol, nRow, qScoreCol, run1, run2); if (!pairedOut.empty()) sds->SetPairedOut(pairedOut); fds = dynamic_cast<FlowDiffStats*>(sds); } else { GenomeDiffStats* gds = new GenomeDiffStats(hpLength, nCol, nRow, qScoreCol); if(alignmentOut != "") { gds->SetAlignmentsOut(alignmentOut); } if (!flowsOut.empty()) { gds->SetFlowsOut(flowsOut); } fds = dynamic_cast<FlowDiffStats*>(gds); } if (gcErrFile != "") { fds->SetFlowGCOut(gcErrFile); fds->SetGCWindowSize(gcWin); } if(keySeq != "") { fds->SetKeySeq(keySeq); } if(flowOrder != "") { fds->SetFlowOrder(flowOrder); } fds->SetStatsOut(statsOut); if (!wellsToUse.empty()) { std::vector<int> wells; std::vector<bool> use; ReadSetFromFile(wellsToUse, 0, wells); use.resize(nRow * nCol, false); int count = 0; ReservoirSample<int> wellSample(sampleSize); for (size_t i = 0; i < wells.size(); i++) { wellSample.Add(wells[i]); } wells = wellSample.GetData(); for (size_t i = 0; i < wells.size(); i++) { use[wells[i]] = true; count++; } cout << "Read: " << count << " reads." 
<< endl; fds->SetWellToAnalyze(use); } // Set integer-value row & column bounds int minRow=-1; int maxRow=-1; int minCol=-1; int maxCol=-1; if(colSize > 0 || rowSize > 0) { if(bfmaskFile.empty()) { cerr << "AnalyzeHPErrs - must specify bfmask file when restricting row or column ranges" << endl; exit(1); } if(rowSize > 0) { minRow = floor(nRow * rowCenter - rowSize / 2.0); maxRow = minRow + rowSize; minRow = std::max(0,minRow); maxRow = std::min(nRow,maxRow); } if(colSize > 0) { minCol = floor(nCol * colCenter - colSize / 2.0); maxCol = minCol + colSize; minCol = std::max(0,minCol); maxCol = std::min(nCol,maxCol); } } if (wellsFile != "") { std::vector<int32_t> xSubset, ySubset; fds->FillInSubset(samFile, qLength, minRow, maxRow, minCol, maxCol, xSubset, ySubset); if(bfmaskFile.empty()) { cerr << "AnalyzeHPErrs - must specify bfmask file when specifying wells file" << endl; exit(1); } fds->SetWellsFile(wellsFile, nRow, nCol, numFlows, xSubset, ySubset); } if (snrFile != "") { cout << "Opening snr file: " << snrFile << endl; fds->SetSNROut(snrFile); } if (binnedHpSigFile != "") { cout << "Opening binned HP signal file: " << binnedHpSigFile << endl; fds->SetBinnedHpSigOut(binnedHpSigFile); } if (flowErrFile != "") { cout << "Opening flow err file: " << flowErrFile << endl; fds->SetFlowErrOut(flowErrFile); } ofstream summary; summary.open(summaryOut.c_str()); cout << "Reading and analyzing alignments from: " << samFile << endl; if(minCol > -1 || maxCol > -1) cout << " Restricting to " << (maxCol-minCol) << " cols in the range [" << minCol << "," << maxCol << ")" << endl; if(minRow > -1 || maxRow > -1) cout << " Restricting to " << (maxRow-minRow) << " rows in the range [" << minRow << "," << maxRow << ")" << endl; fds->SetAlignmentInFile(samFile); fds->FilterAndCompare(numFlows, summary, qLength, minRow, maxRow, minCol, maxCol); summary.close(); delete fds; cout << "Done." << endl; return 0; }
void PerBaseQual::Init(OptArgs& opts, const string& chip_type, const string &output_directory, bool recalib) { if(phred_table_) { delete [] phred_table_; phred_table_ = 0; } string phred_table_file = opts.GetFirstString ('-', "phred-table-file", ""); save_predictors_ = opts.GetFirstBoolean('-', "save-predictors", false); // Determine the correct phred table filename to use bool binTable = true; if (phred_table_file.empty()) { ChipIdDecoder::SetGlobalChipId(chip_type.c_str()); ChipIdEnum chip_id = ChipIdDecoder::GetGlobalChipId(); switch(chip_id){ case ChipId314: phred_table_file = "phredTable.txt_314.binary"; break; case ChipId316: phred_table_file = "phredTable.txt_316.binary"; break; case ChipId316v2: phred_table_file = "phredTable.txt_318.binary"; break; case ChipId318: phred_table_file = "phredTable.txt_318.binary"; break; case ChipId900: // Proton chip phred_table_file = "phredTable.txt_900.binary"; break; default: phred_table_file = "phredTable.txt_314.binary"; fprintf(stderr, "PerBaseQual: No default phred table for chip_type=%s, trying %s instead\n", chip_type.c_str(), phred_table_file.c_str()); break; } if (recalib) { phred_table_file = phred_table_file.substr(0, phred_table_file.length() - 7); phred_table_file += ".Recal.binary"; } char* full_filename = GetIonConfigFile(phred_table_file.c_str()); if(!full_filename) { printf("WARNING: cannot find binary phred table file %s, try to use non-binary phred table\n", phred_table_file.c_str()); phred_table_file = phred_table_file.substr(0, phred_table_file.length() - 7); // get rid of .binary binTable = false; char* full_filename2 = GetIonConfigFile(phred_table_file.c_str()); if(!full_filename2) ION_ABORT("ERROR: Can't find phred table file " + phred_table_file); phred_table_file = full_filename2; free(full_filename2); } else { phred_table_file = full_filename; free(full_filename); } } cout << endl << "PerBaseQual::Init... 
phred_table_file=" << phred_table_file << endl; binTable = hasBinaryExtension(phred_table_file); // Load the phred table if(binTable) { cout << endl << "PerBaseQual::Init... load binary phred_table_file=" << phred_table_file << endl; vector<size_t> vNumCuts(kNumPredictors, 0); if(H5Fis_hdf5(phred_table_file.c_str()) > 0) { hid_t root = H5Fopen(phred_table_file.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); if(root < 0) { ION_ABORT("ERROR: cannot open HDF5 file " + phred_table_file); } hid_t grpQvTable = H5Gopen(root, "/QvTable", H5P_DEFAULT); if (grpQvTable < 0) { H5Fclose(root); ION_ABORT("ERROR: fail to open HDF5 group QvTable"); } if(H5Aexists(grpQvTable, "NumPredictors") <= 0) { H5Gclose(grpQvTable); H5Fclose(root); ION_ABORT("ERROR: HDF5 attribute NumPredictors does not exist"); } hid_t attrNumPreds = H5Aopen(grpQvTable, "NumPredictors", H5P_DEFAULT); if (attrNumPreds < 0) { H5Gclose(grpQvTable); H5Fclose(root); ION_ABORT("ERROR: fail to open HDF5 attribute NumPredictors"); } unsigned int numPredictors = 0; herr_t ret = H5Aread(attrNumPreds, H5T_NATIVE_UINT, &numPredictors); H5Aclose(attrNumPreds); if(ret < 0 || numPredictors != (unsigned int)kNumPredictors) { H5Gclose(grpQvTable); H5Fclose(root); ION_ABORT("ERROR: HDF5 attribute NumPredictors is wrong"); } char buf[100]; for(size_t i = 0; i < (size_t)kNumPredictors; ++i) { offsets_.push_back(1); sprintf(buf, "ThresholdsOfPredictor%d", (int)i); if(H5Aexists(grpQvTable, buf) <= 0) { H5Gclose(grpQvTable); H5Fclose(root); ION_ABORT("ERROR: HDF5 attribute ThresholdsOfPredictor does not exist"); } hid_t attrCuts = H5Aopen(grpQvTable, buf, H5P_DEFAULT); if (attrCuts < 0) { H5Gclose(grpQvTable); H5Fclose(root); ION_ABORT("ERROR: fail to open HDF5 attribute ThresholdsOfPredictor"); } hsize_t size = H5Aget_storage_size(attrCuts); size /= sizeof(float); float* fcuts = new float[size]; ret = H5Aread(attrCuts, H5T_NATIVE_FLOAT, fcuts); H5Aclose(attrCuts); if(ret < 0) { H5Gclose(grpQvTable); H5Fclose(root); ION_ABORT("ERROR: fail 
to read HDF5 attribute ThresholdsOfPredictor"); } vector<float> vCuts(size); copy(fcuts, fcuts + size, vCuts.begin()); phred_cuts_.push_back(vCuts); delete [] fcuts; fcuts = 0; } hid_t dsQvs = H5Dopen(grpQvTable, "Qvs", H5P_DEFAULT); if (dsQvs < 0) { H5Gclose(grpQvTable); H5Fclose(root); ION_ABORT("ERROR: fail to open HDF5 dataset Qvs"); } hsize_t tbSize = H5Dget_storage_size(dsQvs); phred_table_ = new unsigned char[tbSize]; ret = H5Dread(dsQvs, H5T_NATIVE_UCHAR, H5S_ALL, H5S_ALL, H5P_DEFAULT, phred_table_); H5Dclose(dsQvs); H5Gclose(grpQvTable); H5Fclose(root); if (ret < 0) { delete [] phred_table_; phred_table_ = 0; ION_ABORT("ERROR: fail to read HDF5 dataset Qvs"); } } else { printf("WARNING: binary phred table file %s is not a HDF5 file, try binary file mode.\n", phred_table_file.c_str()); ifstream source; source.open(phred_table_file.c_str(), ios::in|ios::binary|ios::ate); if (!source.is_open()) ION_ABORT("ERROR: Cannot open file: " + phred_table_file); long totalSize = source.tellg(); char* tbBlock = new char [totalSize]; source.seekg (0, ios::beg); source.read (tbBlock, totalSize); source.close(); long headerSize = 0; char* ptr = tbBlock; int numPredictors = ptr[0]; //kNumPredictors if(numPredictors != kNumPredictors) { delete [] tbBlock; tbBlock = 0; ION_ABORT("ERROR: Wrong number of predictors load from " + phred_table_file); } ptr += 4; headerSize += 4; for(int i = 0; i < kNumPredictors; ++i) { vNumCuts[i] = ptr[0]; ptr += 4; headerSize += 4; offsets_.push_back(1); } long tbSize = 1; for(int i = 0; i < kNumPredictors; ++i) { vector<float> vCuts; tbSize *= vNumCuts[i]; for(size_t j = 0; j < vNumCuts[i]; ++j) { float tmp; memcpy(&tmp, ptr, 4); vCuts.push_back(tmp); ptr += 4; headerSize += 4; } phred_cuts_.push_back(vCuts); } if(tbSize != (totalSize - headerSize)) { delete [] tbBlock; tbBlock = 0; ION_ABORT("ERROR: Wrong QV table size"); } phred_table_ = new unsigned char[tbSize]; memcpy(phred_table_, ptr, tbSize * sizeof(unsigned char)); delete [] tbBlock; 
tbBlock = 0; } for(size_t i = kNumPredictors - 2; i > 0; --i) { offsets_[i] *= phred_cuts_[i + 1].size(); offsets_[i - 1] = offsets_[i]; } offsets_[0] *= phred_cuts_[1].size(); } else { ifstream source; source.open(phred_table_file.c_str()); if (!source.is_open()) ION_ABORT("ERROR: Cannot open file: " + phred_table_file); while (!source.eof()) { string line; getline(source, line); if (line.empty()) break; if (line[0] == '#') continue; stringstream strs(line); float temp; for (int k = 0; k < kNumPredictors; ++k) { strs >> temp; phred_thresholds_[k].push_back(temp); } strs >> temp; //skip n-th entry strs >> temp; phred_quality_.push_back(temp); } source.close(); for (int k = 0; k < kNumPredictors; ++k) phred_thresholds_max_[k] = *max_element(phred_thresholds_[k].begin(), phred_thresholds_[k].end()); } // Prepare for predictor dump here if (save_predictors_) { string predictors_filename = output_directory + "/Predictors.txt"; cout << endl << "Saving PerBaseQual predictors to file " << predictors_filename << endl << endl; predictor_dump_.open(predictors_filename.c_str()); if (!predictor_dump_.is_open()) ION_ABORT("ERROR: Cannot open file: " + predictors_filename); } }
int main(int argc, const char* argv[]) { printf ("tvcvalidator %s-%s (%s) - Prototype tvc validation tool\n\n", IonVersion::GetVersion().c_str(), IonVersion::GetRelease().c_str(), IonVersion::GetSvnRev().c_str()); if (argc == 1) { VariantValidatorHelp(); return 1; } OptArgs opts; opts.ParseCmdLine(argc, argv); if (opts.GetFirstBoolean('v', "version", false)) { return 0; } if (opts.GetFirstBoolean('h', "help", false)) { VariantValidatorHelp(); return 0; } string input_vcf_filename = opts.GetFirstString ('i', "input-vcf", ""); string truth_filename = opts.GetFirstString ('t', "truth-file", ""); string truth_dir = opts.GetFirstString ('d', "truth-dir", "/results/plugins/validateVariantCaller/files"); // TODO: reference optional, only used to verify reference allele in input-vcf and truth files //string reference_filename = opts.GetFirstString ('r', "reference", ""); opts.CheckNoLeftovers(); // // Step 1. Load input VCF file into memory // if (input_vcf_filename.empty()) { VariantValidatorHelp(); cerr << "ERROR: Input VCF file not specified " << endl; return 1; } VariantCallerResults results_vcf; results_vcf.load_vcf(input_vcf_filename); printf("Loaded VCF %s with %d variant calls\n", input_vcf_filename.c_str(), (int)results_vcf.variants.size()); // // Step 2. Parse truth files, compare them to the input vcf, and compute match scores // if (not truth_filename.empty()) { ValidatorTruth truth; truth.ReadTruthFile(truth_filename); truth.CompareToCalls(results_vcf); return 0; } truth_dir += "/*.bed"; glob_t glob_result; glob(truth_dir.c_str(), GLOB_TILDE, NULL, &glob_result); for(unsigned int i = 0; i < glob_result.gl_pathc; ++i) { ValidatorTruth truth; truth.ReadTruthFile(string(glob_result.gl_pathv[i])); truth.CompareToCalls(results_vcf); } globfree(&glob_result); return 0; }
int main (int argc, const char *argv[]) { printf ("------------- bamrealignment --------------\n"); OptArgs opts; opts.ParseCmdLine(argc, argv); vector<int> score_vals(4); string input_bam = opts.GetFirstString ('i', "input", ""); string output_bam = opts.GetFirstString ('o', "output", ""); opts.GetOption(score_vals, "4,-6,-5,-2", 's', "scores"); int clipping = opts.GetFirstInt ('c', "clipping", 2); bool anchors = opts.GetFirstBoolean ('a', "anchors", true); int bandwidth = opts.GetFirstInt ('b', "bandwidth", 10); bool verbose = opts.GetFirstBoolean ('v', "verbose", false); bool debug = opts.GetFirstBoolean ('d', "debug", false); int format = opts.GetFirstInt ('f', "format", 1); int num_threads = opts.GetFirstInt ('t', "threads", 8); string log_fname = opts.GetFirstString ('l', "log", ""); if (input_bam.empty() or output_bam.empty()) return PrintHelp(); opts.CheckNoLeftovers(); std::ofstream logf; if (log_fname.size ()) { logf.open (log_fname.c_str ()); if (!logf.is_open ()) { fprintf (stderr, "bamrealignment: Failed to open log file %s\n", log_fname.c_str()); return 1; } } BamReader reader; if (!reader.Open(input_bam)) { fprintf(stderr, "bamrealignment: Failed to open input file %s\n", input_bam.c_str()); return 1; } SamHeader header = reader.GetHeader(); RefVector refs = reader.GetReferenceData(); BamWriter writer; writer.SetNumThreads(num_threads); if (format == 1) writer.SetCompressionMode(BamWriter::Uncompressed); else writer.SetCompressionMode(BamWriter::Compressed); if (!writer.Open(output_bam, header, refs)) { fprintf(stderr, "bamrealignment: Failed to open output file %s\n", output_bam.c_str()); return 1; } // The meat starts here ------------------------------------ if (verbose) cout << "Verbose option is activated, each alignment will print to screen." 
<< endl << " After a read hit RETURN to continue to the next one," << endl << " or press q RETURN to quit the program," << endl << " or press s Return to silence verbose," << endl << " or press c RETURN to continue printing without further prompt." << endl << endl; unsigned int readcounter = 0; unsigned int mapped_readcounter = 0; unsigned int realigned_readcounter = 0; unsigned int modified_alignment_readcounter = 0; unsigned int pos_update_readcounter = 0; unsigned int failed_clip_realigned_readcount = 0; unsigned int already_perfect_readcount = 0; unsigned int bad_md_tag_readcount = 0; unsigned int error_recreate_ref_readcount = 0; unsigned int error_clip_anchor_readcount = 0; unsigned int error_sw_readcount = 0; unsigned int error_unclip_readcount = 0; unsigned int start_position_shift; int orig_position; int new_position; string md_tag, new_md_tag, input = "x"; vector<CigarOp> new_cigar_data; vector<MDelement> new_md_data; bool position_shift = false; time_t start_time = time(NULL); Realigner aligner; aligner.verbose_ = verbose; aligner.debug_ = debug; if (!aligner.SetScores(score_vals)) cout << "bamrealignment: Four scores need to be provided: match, mismatch, gap open, gap extend score!" << endl; aligner.SetAlignmentBandwidth(bandwidth); BamAlignment alignment; while(reader.GetNextAlignment(alignment)){ readcounter ++; position_shift = false; if ( (readcounter % 100000) == 0 ) cout << "Processed " << readcounter << " reads. Elapsed time: " << (time(NULL) - start_time) << endl; if (alignment.IsMapped()) { orig_position = alignment.Position; mapped_readcounter++; aligner.SetClipping(clipping, !alignment.IsReverseStrand()); if (aligner.verbose_) { cout << endl; if (alignment.IsReverseStrand()) cout << "The read is from the reverse strand." << endl; else cout << "The read is from the forward strand." << endl; } if (!alignment.GetTag("MD", md_tag)) { if (aligner.verbose_) cout << "Warning: Skipping read " << alignment.Name << ". It is mapped but missing MD tag." 
<< endl; if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MISSMD" << '\n'; bad_md_tag_readcount++; } else if (aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors)) { bool clipfail = false; if (Realigner::CR_ERR_CLIP_ANCHOR == aligner.GetCreateRefError ()) { clipfail = true; failed_clip_realigned_readcount ++; } if (!aligner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) { if (aligner.verbose_) cout << "Error in the alignment! Not updating read information." << endl; if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "SWERR" << '\n'; error_sw_readcount++; writer.SaveAlignment(alignment); // Write alignment unchanged continue; } if (!aligner.addClippedBasesToTags(new_cigar_data, new_md_data, alignment.QueryBases.size())) { if (aligner.verbose_) cout << "Error when adding clipped anchors back to tags! Not updating read information." 
<< endl; if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNCLIPERR" << '\n'; writer.SaveAlignment(alignment); // Write alignment unchanged error_unclip_readcount ++; continue; } new_md_tag = aligner.GetMDstring(new_md_data); realigned_readcounter++; // adjust start position of read if (!aligner.LeftAnchorClipped() and start_position_shift != 0) { new_position = aligner.updateReadPosition(alignment.CigarData, (int)start_position_shift, alignment.Position); if (new_position != alignment.Position) { pos_update_readcounter++; position_shift = true; alignment.Position = new_position; } } if (position_shift || alignment.CigarData.size () != new_cigar_data.size () || md_tag != new_md_tag) { if (logf.is_open ()) { logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MOD"; if (position_shift) logf << "-SHIFT"; if (clipfail) logf << " NOCLIP"; logf << '\n'; } modified_alignment_readcounter++; } else { if (logf.is_open ()) { logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNMOD"; if (clipfail) logf << " NOCLIP"; logf << '\n'; } } if (aligner.verbose_){ cout << alignment.Name << endl; cout << "------------------------------------------" << endl; // Wait for input to continue or quit program if (input.size() == 0) input = 'x'; else if (input[0] != 'c' and input[0] != 'C') getline(cin, input); if (input.size()>0){ if (input[0] == 'q' or input[0] == 'Q') return 1; else if (input[0] == 's' or input[0] == 'S') aligner.verbose_ = false; } } // Finally update alignment information alignment.CigarData = new_cigar_data; alignment.EditTag("MD", "Z" , new_md_tag); } // end of CreateRef else if else { switch (aligner.GetCreateRefError ()) { case 
Realigner::CR_ERR_RECREATE_REF: if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "RECRERR" << '\n'; error_recreate_ref_readcount++; break; case Realigner::CR_ERR_CLIP_ANCHOR: if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "CLIPERR" << '\n'; error_clip_anchor_readcount++; break; default: // On a good run this writes way too many reads to the log file - don't want to create a too large txt file // if (logf.is_open ()) //logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "PERFECT" << '\n'; already_perfect_readcount++; break; } if (aligner.verbose_) { cout << alignment.Name << endl; cout << "------------------------------------------" << endl; // Wait for input to continue or quit program if (input.size() == 0) input = 'x'; else if (input[0] != 'c' and input[0] != 'C') getline(cin, input); if (input.size()>0){ if (input[0] == 'q' or input[0] == 'Q') return 1; else if (input[0] == 's' or input[0] == 'S') aligner.verbose_ = false; } } } // --- Debug output for Rajesh --- if (debug && aligner.invalid_cigar_in_input) { aligner.verbose_ = true; cout << "Invalid cigar string / md tag pair in read " << alignment.Name << endl; // Rerun reference generation to display error aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors); aligner.verbose_ = verbose; aligner.invalid_cigar_in_input = false; } // --- --- --- } // end of if isMapped writer.SaveAlignment(alignment); } // end while loop over reads if (aligner.invalid_cigar_in_input) cerr << "WARNING bamrealignment: There were invalid cigar string / md tag pairs in the input bam file." 
<< endl; // ---------------------------------------------------------------- // program end -- output summary information cout << " File: " << input_bam << endl << " Total reads: " << readcounter << endl << " Mapped reads: " << mapped_readcounter << endl; if (bad_md_tag_readcount) cout << " Skipped: bad MD tags: " << bad_md_tag_readcount << endl; if (error_recreate_ref_readcount) cout << " Skipped: unable to recreate ref: " << error_recreate_ref_readcount << endl; if (error_clip_anchor_readcount) cout << " Skipped: error clipping anchor: " << error_clip_anchor_readcount << endl; cout << " Skipped: already perfect: " << already_perfect_readcount << endl << " Total reads realigned: " << mapped_readcounter - already_perfect_readcount - bad_md_tag_readcount - error_recreate_ref_readcount - error_clip_anchor_readcount << endl; if (failed_clip_realigned_readcount) cout << " (including " << failed_clip_realigned_readcount << " that failed to clip)" << endl; if (error_sw_readcount) cout << " Failed to complete SW alignment: " << error_sw_readcount << endl; if (error_unclip_readcount) cout << " Failed to unclip anchor: " << error_unclip_readcount << endl; cout << " Succesfully realigned: " << realigned_readcounter << endl << " Modified alignments: " << modified_alignment_readcounter << endl << " Shifted position: " << pos_update_readcounter << endl; cout << "Processing time: " << (time(NULL)-start_time) << " seconds." << endl; cout << "INFO: The output BAM file may be unsorted." << endl; cout << "------------------------------------------" << endl; return 0; }
int main (int argc, const char *argv[]) { time_t program_start_time; time(&program_start_time); Json::Value calibration_json(Json::objectValue); DumpStartingStateOfProgram (argc,argv,program_start_time, calibration_json["Calibration"]); // // Step 1. Process command line options // OptArgs opts; opts.ParseCmdLine(argc, argv); CalibrationContext calib_context; if (not calib_context.InitializeFromOpts(opts)){ PrintHelp_CalModules(); } HistogramCalibration master_histogram(opts, calib_context); calib_context.hist_calibration_master = &master_histogram; LinearCalibrationModel master_linear_model(opts, calib_context); calib_context.linear_model_master = &master_linear_model; opts.CheckNoLeftovers(); // // Step 2. Execute threaded calibration // time_t calibration_start_time; time(&calibration_start_time); pthread_mutex_init(&calib_context.read_mutex, NULL); pthread_mutex_init(&calib_context.write_mutex, NULL); pthread_t worker_id[calib_context.num_threads]; for (int worker = 0; worker < calib_context.num_threads; worker++) if (pthread_create(&worker_id[worker], NULL, CalibrationWorker, &calib_context)) { cerr << "Calibration ERROR: Problem starting thread" << endl; exit (EXIT_FAILURE); } for (int worker = 0; worker < calib_context.num_threads; worker++) pthread_join(worker_id[worker], NULL); pthread_mutex_destroy(&calib_context.read_mutex); pthread_mutex_destroy(&calib_context.write_mutex); time_t calibration_end_time; time(&calibration_end_time); // // Step 3. 
Create models, write output, and close modules // // HP histogram calibration if (master_histogram.CreateCalibrationModel()) master_histogram.ExportModelToJson(calibration_json["HPHistogram"]); // Linear Model if (master_linear_model.CreateCalibrationModel()) master_linear_model.ExportModelToJson(calibration_json["LinearModel"], ""); // Transfer stuff from calibration context and close bam reader calib_context.Close(calibration_json["Calibration"]); time_t program_end_time; time(&program_end_time); calibration_json["Calibration"]["end_time"] = get_time_iso_string(program_end_time); calibration_json["Calibration"]["total_duration"] = (Json::Int)difftime(program_end_time,program_start_time); calibration_json["Calibration"]["calibration_duration"] = (Json::Int)difftime(calibration_end_time,calibration_start_time); SaveJson(calibration_json, calib_context.filename_json); return EXIT_SUCCESS; }
int main(int argc, const char *argv[]) { OptArgs opts; TraceConfig config; string inputDir; string outputDir; bool help; opts.ParseCmdLine(argc, argv); opts.GetOption(inputDir, "", '-', "source-dir"); opts.GetOption(outputDir, "", '-', "output-dir"); opts.GetOption(config.precision, "5", '-', "precision"); opts.GetOption(config.numEvec, "7", '-', "num-evec"); opts.GetOption(config.doDebug, "false", '-', "debug-files"); opts.GetOption(config.compressionType, "delta", '-', "compression"); opts.GetOption(config.numFlows, "-1", '-', "num-flows"); opts.GetOption(config.numCores, "6", '-', "num-cores"); opts.GetOption(config.errCon,"0",'-',"err-con"); opts.GetOption(config.rankGood,"0",'-',"rank-good"); opts.GetOption(config.pivot,"0",'-',"pivot"); opts.GetOption(help, "false", 'h', "help"); opts.GetOption(config.isThumbnail, "false", '-', "thumbnail"); opts.GetOption(config.use_hard_est, "false",'-', "use-hard-est"); opts.GetOption(config.t0_hard, "0", '-', "t0-hard"); opts.GetOption(config.tmid_hard, "0", '-', "tmid-hard"); opts.GetOption(config.sigma_hard, "0", '-', "sigma-hard"); opts.GetOption(config.row_step, "100", '-', "row-step"); opts.GetOption(config.col_step, "100", '-', "col-step"); opts.GetOption(config.bg_param, "", '-', "region-param"); opts.GetOption(config.grind_acq_0, "0", '-', "grind-acq0"); if(help || inputDir.empty() || outputDir.empty()) { usage(); } char *explog_path = NULL; explog_path = MakeExpLogPathFromDatDir(inputDir.c_str()); int numFlows = config.numFlows; if (numFlows < 0) { numFlows = GetTotalFlows(explog_path); } // Check and setup our compression type TraceChunkSerializer serializer; serializer.SetRecklessAbandon(true); if (config.compressionType == "svd") { SvdDatCompress *dc = new SvdDatCompress(config.precision, config.numEvec); serializer.SetCompressor(dc); cout << "Doing lossy svd compression. 
(" << serializer.GetCompressionType() << ")" << endl; } // else if (config.compressionType == "svd+") { // SvdDatCompressPlus *dc = new SvdDatCompressPlus(); // serializer.SetCompressor(dc); // cout << "Doing lossy svd compression. (" << serializer.GetCompressionType() << ")" << endl; // } // else if (config.compressionType == "svd++") { // SvdDatCompressPlusPlus *dc = new SvdDatCompressPlusPlus(); // if (config.errCon >0 ) // dc->SetErrCon(config.errCon); // if (config.rankGood > 0 ) // dc->SetRankGood(config.rankGood); // if (config.pivot > 0) // dc->SetPivot(config.pivot); // serializer.SetCompressor(dc); // cout << "Doing lossy svd compression for good traces and delta for bad ones. (" << serializer.GetCompressionType() << ")" << endl; // } else if (config.compressionType == "delta") { VencoLossless *venco = new VencoLossless(); serializer.SetCompressor(venco); cout << "Doing lossless delta compression. (" << serializer.GetCompressionType() << ")" << endl; } else if (config.compressionType == "delta-plain") { DeltaComp *delta = new DeltaComp(); serializer.SetCompressor(delta); cout << "Doing lossless delta plain compression. (" << serializer.GetCompressionType() << ")" << endl; } else if (config.compressionType == "delta-plain-fst") { DeltaCompFst *delta = new DeltaCompFst(); serializer.SetCompressor(delta); cout << "Doing lossless delta plain fast compression. (" << serializer.GetCompressionType() << ")" << endl; } else if (config.compressionType == "delta-plain-fst-smx") { DeltaCompFstSmX *delta = new DeltaCompFstSmX(); serializer.SetCompressor(delta); cout << "Doing lossless delta plain fast compression. (" << serializer.GetCompressionType() << ")" << endl; } else if (config.compressionType == "none") { TraceCompressor *vanilla = new TraceNoCompress(); serializer.SetCompressor(vanilla); cout << "Doing no compression. 
(" << serializer.GetCompressionType() << ")" << endl; } else { ION_ABORT("Don't recognize compression type: " + config.compressionType); } const char *id = GetChipId(explog_path); if (explog_path) free (explog_path); ChipIdDecoder::SetGlobalChipId(id); ImageTransformer::CalibrateChannelXTCorrection(inputDir.c_str(), "lsrowimage.dat"); Image bfImg1; string bfFile = inputDir + "/beadfind_pre_0003.dat"; bfImg1.LoadRaw(bfFile.c_str()); const RawImage *bf1raw = bfImg1.GetImage(); Mask mask(bf1raw->cols, bf1raw->rows); ImageTransformer::XTChannelCorrect(bfImg1.raw,bfImg1.results_folder); bfImg1.FilterForPinned (&mask, MaskEmpty, false); Image bfImg2; string bfFile2 = inputDir + "/beadfind_pre_0001.dat"; bfImg2.LoadRaw(bfFile2.c_str()); ImageTransformer::XTChannelCorrect(bfImg2.raw,bfImg1.results_folder); bfImg2.FilterForPinned (&mask, MaskEmpty, false); const RawImage *bf2raw = bfImg2.GetImage(); GridMesh<T0Prior> t0Prior; T0Calc bfT0; /* Calc t0 and get prior. */ cout << "Doing beadfind t0" << endl; GenerateBfT0Prior(config, bf1raw->image, bf1raw->baseFrameRate, bf1raw->rows, bf1raw->cols, bf1raw->frames, bf1raw->timestamps, config.row_step, config.col_step, &mask, bfT0, t0Prior); GridMesh<T0Prior> t0Prior2; T0Calc bfT02; GenerateBfT0Prior(config, bf2raw->image, bf2raw->baseFrameRate, bf2raw->rows, bf2raw->cols, bf2raw->frames, bf2raw->timestamps, config.row_step, config.col_step, &mask, bfT02, t0Prior2); SigmaTMidNucEstimation sigmaEst; sigmaEst.Init(config.rate_sigma_intercept, config.rate_sigma_slope, config.t0_tmid_intercept, config.t0_tmid_slope, bf1raw->baseFrameRate); GridMesh<SigmaEst> sigmaTMid; bfImg1.Close(); bfImg2.Close(); // Calculate individual well t0 by looking at neighboring regions vector<float> wellT0; bfT0.CalcIndividualT0(wellT0, 0); vector<float> wellT02; bfT02.CalcIndividualT0(wellT02, 0); for (size_t i =0; i< wellT0.size();i++) { if (wellT0[i] > 0 && wellT02[i] > 0) { wellT0[i] = (wellT0[i] + wellT02[i])/2.0f; } else { wellT0[i] = max(wellT0[i], 
wellT02[i]); } } // Average the region level t0, should we do this first and then just do sinle well level? for (size_t bIx = 0; bIx < bfT0.GetNumRegions(); bIx++) { double t1 = bfT0.GetT0(bIx); double t2 = bfT02.GetT0(bIx); if (t1 > 0 && t2 > 0) { t1 = (t1 + t2)/2.0; } else { t1 = max(t1,t2); } bfT0.SetT0(bIx, t1); } // Single thread first dat for (size_t datIx = 0; datIx < 1; ++datIx) { cout << "Doing: " << datIx << endl; char buffer[2048]; snprintf(buffer, sizeof(buffer), "%s/acq_%.4d.dat", inputDir.c_str(), (int) datIx); string datFile = buffer; /* Use prior to calculate t0 and slope. */ Image datImg; T0Calc t0; datImg.LoadRaw(datFile.c_str()); // ImageTransformer::XTChannelCorrect(datImg.raw,datImg.results_folder); const RawImage *datRaw = datImg.GetImage(); /* Estimate sigma and t_mid_nuc */ if (datIx == 0) { cout << "Doing acquisition t0" << endl; GenerateAcqT0Prior(config, datRaw->image, datRaw->baseFrameRate, datRaw->rows, datRaw->cols, datRaw->frames, datRaw->timestamps, config.row_step, config.col_step, &mask, t0, t0Prior); ClockTimer timer; cout << "Estimating sigma." << endl; sigmaTMid.Init(datRaw->rows, datRaw->cols, config.row_step, config.col_step); for (size_t bIx = 0; bIx < t0.GetNumRegions(); bIx++) { t0.SetT0(bIx, bfT0.GetT0(bIx)); } int neighbors = 2; if (config.isThumbnail) { cout << "Doing thumbnail version of slope." 
<< endl; neighbors = 1; } EstimateSigmaValue(t0, sigmaEst, sigmaTMid, neighbors); timer.PrintMilliSeconds(cout,"Sigma Est took:"); string sigmaFile = outputDir + "/sigma_tmid_est.txt"; OutputSigmaTmidEstimates(sigmaTMid, sigmaFile.c_str()); } /* For each region do shifting */ ClockTimer timer; cout << "Shifting traces" << endl; timer.StartTimer(); // ShiftTraces(bfT0, wellT0, datRaw->frames, datRaw->baseFrameRate, datRaw->timestamps, datRaw->image); timer.PrintMilliSeconds(cout,"Shift took:"); if (!config.bg_param.empty()) { DataCube<int> rowsCols; DataCube<float> tmidSigma; DataCube<float> fitTmidSigma; string path = config.bg_param + ":/region/region_location"; if (!H5File::ReadDataCube(path, rowsCols)) { ION_ABORT("Couldn't read file: " + path); } path = config.bg_param + ":/region/region_init_param"; if (!H5File::ReadDataCube(path, fitTmidSigma)) { ION_ABORT("Couldn't read file: " + path); } for (size_t i = 0; i < rowsCols.GetNumX(); i++) { int row = rowsCols.At(i,1,0); int col = rowsCols.At(i,0,0); SigmaEst &est = sigmaTMid.GetItemByRowCol(row, col); float tmid_est = fitTmidSigma.At(i,0,0); float sigma_est = fitTmidSigma.At(i,1,0); est.mTMidNuc = tmid_est; est.mSigma = sigma_est; } string fitSigmaFile = outputDir + "/bg_fit_sigma_tmid_est.txt"; OutputSigmaTmidEstimates(sigmaTMid, fitSigmaFile.c_str()); // path = config.bg_param + ":/region/region_init_param"; // if (!H5File::ReadMatrix(path, tmidSigma)) { // ION_ABORT("Couldn't read file: " + path); // } // for (size_t i = 0; i < rowsCols.n_rows; i++) { // int row = rowsCols.at(i,0); // int col = rowsCols.at(i,1); // SigmaEst &est = sigmaTMid.GetItemByRowCol(row, col); // float tmid_est = tmidSigma.at(i,0); // float sigma_est = tmidSigma.at(i,1); // est.mTMidNuc = tmid_est; // est.mSigma = sigma_est; // } // string sigmaFile = outputDir + "/supplied_sigma_tmid_est.txt"; // OutputSigmaTmidEstimates(sigmaTMid, sigmaFile.c_str()); } else if (config.use_hard_est) { for (size_t i = 0; i < bfT0.GetNumRegions(); i++) 
{ bfT0.SetT0(i,config.t0_hard * datRaw->baseFrameRate + config.time_start_slop); } for (size_t i = 0; i < sigmaTMid.GetNumBin(); i++) { SigmaEst &est = sigmaTMid.GetItem(i); est.mTMidNuc = config.tmid_hard; est.mSigma = config.sigma_hard; est.mT0 = config.t0_hard; } } /* Use t0 and sigma to get the time compression bkgModel wants. */ cout << "Generating chunks" << endl; // GridMesh<TraceChunk> traceChunks; SynchDat sdat; if (datIx == 0 && config.grind_acq_0 > 0) { int nTimes = config.grind_acq_0; timer.StartTimer(); size_t processMicroSec = 0; size_t hdf5MicroSec = 0; size_t compressMicroSec = 0; size_t convertMicroSec = 0; for (int i = 0; i <nTimes; i++) { //GridMesh<TraceChunk> traceChunken; SynchDat sdatIn; AddMetaData(sdat, datRaw, datIx); ClockTimer convTimer; GenerateDataChunks(config, bfT0, datRaw, config.row_step, config.col_step, sigmaTMid, sdatIn.mChunks,datImg); convertMicroSec += convTimer.GetMicroSec(); snprintf(buffer, sizeof(buffer), "%s/acq_%.4d.sdat", outputDir.c_str(), (int)datIx); serializer.Write(buffer, sdatIn); processMicroSec += serializer.computeMicroSec; hdf5MicroSec += serializer.ioMicroSec; compressMicroSec += serializer.compressMicroSec; } size_t usec = timer.GetMicroSec(); cout << "Took: " << usec / 1.0e6 << " seconds, " << usec / (nTimes * 1.0f) << " usec per write." << endl; timer.PrintMilliSeconds(cout,"Chunks took:"); cout << "Read took: " << processMicroSec / (1e3 * nTimes) << " milli seconds per sdat compute." << endl; cout << "Read took: " << hdf5MicroSec / (1e3 * nTimes) << " milli seconds per sdat hdf5." << endl; cout << "Read took: " << compressMicroSec / (1e3 * nTimes) << " milli seconds per sdat compressing." << endl; cout << "Read took: " << convertMicroSec / (1e3 * nTimes) << " milli seconds per sdat converting." 
<< endl; exit(0); } else { timer.StartTimer(); AddMetaData(sdat, datRaw, datIx); GenerateDataChunks(config, bfT0, datRaw, config.row_step, config.col_step, sigmaTMid, sdat.mChunks,datImg); timer.PrintMilliSeconds(cout,"Chunks took:"); if (datIx == 0 && config.doDebug) { OutputTraceChunks(sdat.mChunks,"flow_0_data_chunks.txt"); } } datImg.Close(); /* Serialize onto disk. */ snprintf(buffer, sizeof(buffer), "%s/acq_%.4d.sdat", outputDir.c_str(), (int)datIx); serializer.Write(buffer, sdat); /* Read back in first flow for checking */ if (datIx == 0) { TraceChunkSerializer readSerializer; readSerializer.SetRecklessAbandon(true); // GridMesh<TraceChunk> traceChunksIn; SynchDat sdatIn; readSerializer.Read(buffer, sdatIn); if (datIx == 0 && config.doDebug) { OutputTraceChunks(sdatIn.mChunks, "flow_0_data_chunks_read.txt"); } SampleQuantiles<float> s(50000); SampleQuantiles<float> s2(50000); SampleQuantiles<float> sAbs(50000); SampleStats<double> ss; int diffCount = 0; for (size_t bIx = 0; bIx < sdatIn.mChunks.mBins.size(); bIx++) { if (sdatIn.mChunks.mBins[bIx].mT0 != sdat.mChunks.mBins[bIx].mT0) { cout << "Got: " << sdatIn.mChunks.mBins[bIx].mT0 << " vs: " << sdat.mChunks.mBins[bIx].mT0 << endl; exit(1); } for (size_t i = 0; i < sdatIn.mChunks.mBins[bIx].mData.size(); i++) { double diff = (double)sdatIn.mChunks.mBins[bIx].mData[i] - (double)sdat.mChunks.mBins[bIx].mData[i]; if (!std::isfinite(diff)) { cout << "NaNs!!" 
<< endl; } if (diffCount < 10 && fabs(diff) > .00001) { // != 0) { diffCount++; cout << "Bin: " << bIx << " well: " << i << " diff is: " << diff << endl; } s.AddValue(diff); sAbs.AddValue(fabs(diff)); ss.AddValue(sqrt(diff * diff)); s2.AddValue(sqrt(diff * diff)); } } cout << "Median rms: " << s2.GetMedian() << " Avg: " << ss.GetMean() << " diff: " << s.GetMedian() << endl; cout << "Abs(diff) Quantiles:" << endl; for (size_t i = 0; i <= 100; i+=10) { cout << i << "\t" << sAbs.GetQuantile(i/100.0) << endl; } } } // do the next N flows multithreaded if (numFlows > 1) { PJobQueue jQueue (config.numCores, numFlows-1); vector<CreateSDat> jobs(numFlows -1); // for (int i = 0; i < 4; i++) { // char buffer[2048]; // snprintf(buffer, sizeof(buffer), "%s/beadfind_pre_%.4d.dat", inputDir.c_str(), (int) i); // string input = buffer; // snprintf(buffer, sizeof(buffer), "%s/beadfind_pre_%.4d.sdat", outputDir.c_str(), (int)i); // string output = buffer; // jobs[i].Init(&config, input, output, &wellT0, &bfT0, &sigmaTMid); // jQueue.AddJob(jobs[i]); // } // jQueue.WaitUntilDone(); for (int i = 1; i < numFlows; i++) { char buffer[2048]; snprintf(buffer, sizeof(buffer), "%s/acq_%.4d.dat", inputDir.c_str(), (int) i); string input = buffer; snprintf(buffer, sizeof(buffer), "%s/acq_%.4d.sdat", outputDir.c_str(), (int)i); string output = buffer; jobs[i-1].Init(&config, input, output, &wellT0, &bfT0, &sigmaTMid, i); jQueue.AddJob(jobs[i-1]); } jQueue.WaitUntilDone(); } /* Serialize into backbround models */ cout << "Done." << endl; }
/************************************************************************************************* ************************************************************************************************* * * Start of Main Function * ************************************************************************************************* ************************************************************************************************/ int main (int argc, char *argv[]) { init_salute(); ofstream null_ostream("/dev/null"); // must stay live for entire scope, or crash when writing TheSilenceOfTheArmadillos(null_ostream); TrackProgress my_progress; DumpStartingStateOfProgram (argc,argv,my_progress); if(argc < 2) { PrintHelp(); } for(int i = 1; i < argc; ++i) { string s = argv[i]; if(s == "-" || s == "--") { cerr << "ERROR: command line input \"-\" must be followed by a short option name (a letter) and \"--\" must be followed by a long option name." << endl; exit ( EXIT_FAILURE ); } else if(s == "-?" || s == "-h" || s == "--help") { PrintHelp(); } } ValidateOpts validater; validater.Validate(argc, argv); char** argv2 = new char*[argc]; int datind = TrapAndDeprecateOldArgs(argc, argv, argv2); OptArgs opts; opts.ParseCmdLine(argc, (const char**)argv2); for(int k = 0; k < argc ; ++k) { delete [] argv2[k]; } delete [] argv2; Json::Value json_params; CommandLineOpts inception_state; inception_state.SetOpts(opts, json_params); if(datind < 0) // there is no "--dat-source-directory" { inception_state.sys_context.dat_source_directory = argv[argc - 1]; cout << "dat_source_directory = " << inception_state.sys_context.dat_source_directory << endl; } inception_state.PostProcessArgs(opts); SeqListClass my_keys; ImageSpecClass my_image_spec; SlicedPrequel my_prequel_setup; SetUpOrLoadInitialState(inception_state, my_keys, my_progress, my_image_spec, my_prequel_setup); // Start logging process parameters & timing now that we have somewhere to log my_progress.InitFPLog(inception_state); // Write 
processParameters.parse file now that processing is about to begin my_progress.WriteProcessParameters(inception_state); // Do separator Region wholeChip(0, 0, my_image_spec.cols, my_image_spec.rows); IsolatedBeadFind( my_prequel_setup, my_image_spec, wholeChip, inception_state, inception_state.sys_context.GetResultsFolder(), inception_state.sys_context.analysisLocation, my_keys, my_progress); exit (EXIT_SUCCESS); }
int PrepareHotspots(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bed_filename = opts.GetFirstString ('b', "input-bed", ""); string input_vcf_filename = opts.GetFirstString ('v', "input-vcf", ""); string output_bed_filename = opts.GetFirstString ('d', "output-bed", ""); string output_vcf_filename = opts.GetFirstString ('o', "output-vcf", ""); string reference_filename = opts.GetFirstString ('r', "reference", ""); bool left_alignment = opts.GetFirstBoolean('a', "left-alignment", false); bool filter_bypass = opts.GetFirstBoolean('f', "filter-bypass", false); bool allow_block_substitutions = opts.GetFirstBoolean('s', "allow-block-substitutions", false); opts.CheckNoLeftovers(); if((input_bed_filename.empty() == input_vcf_filename.empty()) or (output_bed_filename.empty() and output_vcf_filename.empty()) or reference_filename.empty()) { PrepareHotspotsHelp(); return 1; } // Populate chromosome list from reference.fai // Use mmap to fetch the entire reference int ref_handle = open(reference_filename.c_str(),O_RDONLY); struct stat ref_stat; fstat(ref_handle, &ref_stat); char *ref = (char *)mmap(0, ref_stat.st_size, PROT_READ, MAP_SHARED, ref_handle, 0); FILE *fai = fopen((reference_filename+".fai").c_str(), "r"); if (!fai) { fprintf(stderr, "ERROR: Cannot open %s.fai\n", reference_filename.c_str()); return 1; } vector<Reference> ref_index; map<string,int> ref_map; char line[1024], chrom_name[1024]; while (fgets(line, 1024, fai) != NULL) { Reference ref_entry; long chr_start; if (5 != sscanf(line, "%s\t%ld\t%ld\t%d\t%d", chrom_name, &ref_entry.size, &chr_start, &ref_entry.bases_per_line, &ref_entry.bytes_per_line)) continue; ref_entry.chr = chrom_name; ref_entry.start = ref + chr_start; ref_index.push_back(ref_entry); ref_map[ref_entry.chr] = (int) ref_index.size() - 1; } fclose(fai); // Load input BED or load input VCF, group by chromosome deque<LineStatus> line_status; vector<deque<Allele> > alleles(ref_index.size()); if 
(!input_bed_filename.empty()) { FILE *input = fopen(input_bed_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_bed_filename.c_str()); return 1; } char line2[65536]; int line_number = 0; bool line_overflow = false; while (fgets(line2, 65536, input) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } line_number++; if (line_overflow) { line_overflow = false; line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: line length exceeds 64K"; continue; } if (strncmp(line2, "browser", 7) == 0) continue; if (strncmp(line2, "track", 5) == 0) { if (string::npos != string(line2).find("allowBlockSubstitutions=true")) allow_block_substitutions = true; continue; } char *current_chr = strtok(line2, "\t\r\n"); char *current_start = strtok(NULL, "\t\r\n"); char *current_end = strtok(NULL, "\t\r\n"); char *current_id = strtok(NULL, "\t\r\n"); char *penultimate = strtok(NULL, "\t\r\n"); char *ultimate = strtok(NULL, "\t\r\n"); for (char *next = strtok(NULL, "\t\r\n"); next; next = strtok(NULL, "\t\r\n")) { penultimate = ultimate; ultimate = next; } if (!current_chr or !current_start or !current_end or !current_id or !penultimate or !ultimate) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: expected at least 6 fields"; continue; } Allele allele; string string_chr(current_chr); if (ref_map.find(string_chr) != ref_map.end()) allele.chr_idx = ref_map[string_chr]; else if (ref_map.find("chr"+string_chr) != ref_map.end()) allele.chr_idx = ref_map["chr"+string_chr]; else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end()) allele.chr_idx = ref_map["chrM"]; else { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Unknown chromosome name: "; line_status.back().filter_message = string_chr; continue; 
} allele.pos = strtol(current_start,NULL,10); allele.id = current_id; char *current_ref = NULL; char *current_alt = NULL; for (char *next = strtok(penultimate, ";"); next; next = strtok(NULL, ";")) { if (strncmp(next,"REF=",4) == 0) current_ref = next; else if (strncmp(next,"OBS=",4) == 0) current_alt = next; } if (!current_ref or !current_alt) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: REF and OBS fields required in penultimate column"; continue; } for (char *pos = current_ref+4; *pos; ++pos) allele.ref += toupper(*pos); for (char *pos = current_alt+4; *pos; ++pos) allele.alt += toupper(*pos); allele.filtered = false; line_status.push_back(LineStatus(line_number)); allele.line_status = &line_status.back(); allele.opos = allele.pos; allele.oref = allele.ref; allele.oalt = allele.alt; alleles[allele.chr_idx].push_back(allele); line_status.back().allele = &alleles[allele.chr_idx].back(); } fclose(input); } if (!input_vcf_filename.empty()) { FILE *input = fopen(input_vcf_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_vcf_filename.c_str()); return 1; } char line2[65536]; int line_number = 0; bool line_overflow = false; while (fgets(line2, 65536, input) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } line_number++; if (line_overflow) { line_overflow = false; line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot VCF line: line length exceeds 64K"; continue; } if (strncmp(line2, "##allowBlockSubstitutions=true", 30) == 0) { allow_block_substitutions = true; continue; } if (line2[0] == '#') continue; char *current_chr = strtok(line2, "\t\r\n"); char *current_start = strtok(NULL, "\t\r\n"); char *current_id = strtok(NULL, "\t\r\n"); char *current_ref = strtok(NULL, "\t\r\n"); char *current_alt = strtok(NULL, "\t\r\n"); if 
(!current_chr or !current_start or !current_id or !current_ref or !current_alt) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot VCF line: expected at least 5 fields"; continue; } string string_chr(current_chr); int chr_idx = 0; if (ref_map.find(string_chr) != ref_map.end()) chr_idx = ref_map[string_chr]; else if (ref_map.find("chr"+string_chr) != ref_map.end()) chr_idx = ref_map["chr"+string_chr]; else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end()) chr_idx = ref_map["chrM"]; else { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Unknown chromosome name: "; line_status.back().filter_message = string_chr; continue; } for (char *pos = current_ref; *pos; ++pos) *pos = toupper(*pos); for (char *pos = current_alt; *pos; ++pos) *pos = toupper(*pos); for (char *sub_alt = strtok(current_alt,","); sub_alt; sub_alt = strtok(NULL,",")) { Allele allele; allele.chr_idx = chr_idx; allele.ref = current_ref; allele.alt = sub_alt; allele.pos = strtol(current_start,NULL,10)-1; allele.id = current_id; if (allele.id == ".") allele.id = "hotspot"; allele.filtered = false; line_status.push_back(LineStatus(line_number)); allele.line_status = &line_status.back(); allele.opos = allele.pos; allele.oref = allele.ref; allele.oalt = allele.alt; alleles[allele.chr_idx].push_back(allele); line_status.back().allele = &alleles[allele.chr_idx].back(); } } fclose(input); } // Process by chromosome: // - Verify reference allele // - Left align // - Sort // - Filter for block substitutions, write FILE *output_vcf = NULL; if (!output_vcf_filename.empty()) { output_vcf = fopen(output_vcf_filename.c_str(), "w"); if (!output_vcf) { fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_vcf_filename.c_str()); return 1; } fprintf(output_vcf, "##fileformat=VCFv4.1\n"); if (allow_block_substitutions) fprintf(output_vcf, "##allowBlockSubstitutions=true\n"); fprintf(output_vcf, 
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"); } FILE *output_bed = NULL; if (!output_bed_filename.empty()) { output_bed = fopen(output_bed_filename.c_str(), "w"); if (!output_bed) { fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_bed_filename.c_str()); if (output_vcf) fclose(output_vcf); return 1; } if (allow_block_substitutions) fprintf(output_bed, "track name=\"hotspot\" type=bedDetail allowBlockSubstitutions=true\n"); else fprintf(output_bed, "track name=\"hotspot\" type=bedDetail\n"); } for (int chr_idx = 0; chr_idx < (int)ref_index.size(); ++chr_idx) { for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ++A) { // Invalid characters bool valid = true; for (const char *c = A->ref.c_str(); *c ; ++c) if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T') valid = false; for (const char *c = A->alt.c_str(); *c ; ++c) if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T') valid = false; if (not valid) { A->filtered = true; A->line_status->filter_message_prefix = "REF and/or ALT contain characters other than ACGT: "; A->line_status->filter_message = "REF = " + A->ref + " ALT = " + A->alt; continue; } // Filter REF == ALT if (A->ref == A->alt) { A->filtered = true; A->line_status->filter_message_prefix = "REF and ALT alleles equal"; continue; } // Confirm reference allele. 
string ref_expected; for (int idx = 0; idx < (int) A->ref.size(); ++idx) ref_expected += ref_index[chr_idx].base(A->pos + idx); if (A->ref != ref_expected) { A->filtered = true; A->line_status->filter_message_prefix = "Provided REF allele does not match reference: "; A->line_status->filter_message = "Expected " + ref_expected + ", found " + A->ref; continue; } // Trim int ref_start = 0; int ref_end = A->ref.size(); int alt_end = A->alt.size(); // Option 1: trim all trailing bases //while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) { // --ref_end; // --alt_end; //} // Option 2: trim all leading basees //while (ref_start < ref_end and ref_start < alt_end and A->ref[ref_start] == A->alt[ref_start]) // ++ref_start; // Option 3: trim anchor base if vcf if (!input_vcf_filename.empty()) { if (ref_end and alt_end and (ref_end == 1 or alt_end == 1) and A->ref[0] == A->alt[0]) ref_start = 1; } A->pos += ref_start; A->ref = A->ref.substr(ref_start, ref_end-ref_start); A->alt = A->alt.substr(ref_start, alt_end-ref_start); ref_end -= ref_start; alt_end -= ref_start; // Left align if (left_alignment) { while (A->pos > 0) { char nuc = ref_index[chr_idx].base(A->pos-1); if (ref_end > 0 and A->ref[ref_end-1] != nuc) break; if (alt_end > 0 and A->alt[alt_end-1] != nuc) break; A->ref = string(1,nuc) + A->ref; A->alt = string(1,nuc) + A->alt; A->pos--; } } A->ref.resize(ref_end); A->alt.resize(alt_end); // Filter block substitutions: take 1 if (ref_end > 0 and alt_end > 0 and ref_end != alt_end and not allow_block_substitutions and not filter_bypass) { A->filtered = true; A->line_status->filter_message_prefix = "Block substitutions not supported"; continue; } } if (output_bed) { // Sort - without anchor base sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles); // Write for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) { if (I->filtered) continue; if (I->pos) fprintf(output_bed, 
"%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=%c\tNONE\n", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str(), ref_index[chr_idx].base(I->pos-1)); else fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=\tNONE\n", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str()); } } if (output_vcf) { // Add anchor base to indels for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) { if (I->filtered) continue; if (not I->ref.empty() and not I->alt.empty()) continue; if (I->pos == 0) { I->filtered = true; I->line_status->filter_message_prefix = "INDELs at chromosome start not supported"; continue; } I->pos--; I->ref = string(1,ref_index[chr_idx].base(I->pos)) + I->ref; I->alt = string(1,ref_index[chr_idx].base(I->pos)) + I->alt; } // Sort - with anchor base sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles); // Merge alleles, remove block substitutions, write for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ) { string max_ref; deque<Allele>::iterator B = A; for (; B != alleles[chr_idx].end() and B->pos == A->pos; ++B) if (!B->filtered and max_ref.size() < B->ref.size()) max_ref = B->ref; bool filtered = true; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; string new_alt = I->alt + max_ref.substr(I->ref.size()); if (new_alt.size() > 1 and max_ref.size() > 1 and new_alt.size() != max_ref.size() and not allow_block_substitutions and not filter_bypass) { I->filtered = true; I->line_status->filter_message_prefix = "Block substitutions not supported (post-merge)"; continue; } I->ref = max_ref; I->alt = new_alt; filtered = false; } if (not filtered) { fprintf(output_vcf, "%s\t%ld\t.\t%s\t", ref_index[chr_idx].chr.c_str(), A->pos+1, max_ref.c_str()); bool comma = false; set<string> unique_alt_alleles; for 
(deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (unique_alt_alleles.count(I->alt) > 0) continue; unique_alt_alleles.insert(I->alt); if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->alt.c_str()); } fprintf(output_vcf, "\t.\t.\tOID="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->id.c_str()); } fprintf(output_vcf, ";OPOS="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%ld", I->opos+1); } fprintf(output_vcf, ";OREF="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->oref.c_str()); } fprintf(output_vcf, ";OALT="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->oalt.c_str()); } fprintf(output_vcf, ";OMAPALT="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->alt.c_str()); } fprintf(output_vcf, "\n"); } A = B; } } } if (output_bed) { fflush(output_bed); fclose(output_bed); } if (output_vcf) { fflush(output_vcf); fclose(output_vcf); } int lines_ignored = 0; for (deque<LineStatus>::iterator L = line_status.begin(); L != line_status.end(); ++L) { if (L->filter_message_prefix) { if (L->allele) printf("Line %d ignored: [%s:%ld %s] %s%s\n", L->line_number, ref_index[L->allele->chr_idx].chr.c_str(), L->allele->opos+1, L->allele->id.c_str(), L->filter_message_prefix, L->filter_message.c_str()); else printf("Line %d ignored: %s%s\n", L->line_number, L->filter_message_prefix, L->filter_message.c_str()); 
lines_ignored++; } } printf("Ignored %d out of %d lines\n", lines_ignored, (int)line_status.size()); munmap(ref, ref_stat.st_size); close(ref_handle); return 0; }
int PrepareHotspots(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bed_filename = opts.GetFirstString ('b', "input-bed", ""); string input_vcf_filename = opts.GetFirstString ('v', "input-vcf", ""); string input_real_vcf_filename = opts.GetFirstString ('p', "input-real-vcf", ""); string output_hot_vcf = opts.GetFirstString ('q', "output-fake-hot-vcf", ""); string output_bed_filename = opts.GetFirstString ('d', "output-bed", ""); string output_vcf_filename = opts.GetFirstString ('o', "output-vcf", ""); string reference_filename = opts.GetFirstString ('r', "reference", ""); string unmerged_bed = opts.GetFirstString ('u', "unmerged-bed", ""); bool left_alignment = opts.GetFirstBoolean('a', "left-alignment", false); bool filter_bypass = opts.GetFirstBoolean('f', "filter-bypass", false); bool allow_block_substitutions = opts.GetFirstBoolean('s', "allow-block-substitutions", true); bool strict_check = opts.GetFirstBoolean('S', "strict-check", true); opts.CheckNoLeftovers(); if((input_bed_filename.empty() == (input_vcf_filename.empty() and input_real_vcf_filename.empty())) or (output_bed_filename.empty() and output_vcf_filename.empty()) or reference_filename.empty()) { PrepareHotspotsHelp(); return 1; } if ((not input_real_vcf_filename.empty()) and (output_vcf_filename.empty() or not input_vcf_filename.empty())) { PrepareHotspotsHelp(); return 1; } // Populate chromosome list from reference.fai // Use mmap to fetch the entire reference int ref_handle = open(reference_filename.c_str(),O_RDONLY); struct stat ref_stat; fstat(ref_handle, &ref_stat); char *ref = (char *)mmap(0, ref_stat.st_size, PROT_READ, MAP_SHARED, ref_handle, 0); FILE *fai = fopen((reference_filename+".fai").c_str(), "r"); if (!fai) { fprintf(stderr, "ERROR: Cannot open %s.fai\n", reference_filename.c_str()); return 1; } vector<Reference> ref_index; map<string,int> ref_map; char line[1024], chrom_name[1024]; while (fgets(line, 1024, fai) != NULL) { Reference 
ref_entry; long chr_start; if (5 != sscanf(line, "%1020s\t%ld\t%ld\t%d\t%d", chrom_name, &ref_entry.size, &chr_start, &ref_entry.bases_per_line, &ref_entry.bytes_per_line)) continue; ref_entry.chr = chrom_name; ref_entry.start = ref + chr_start; ref_index.push_back(ref_entry); ref_map[ref_entry.chr] = (int) ref_index.size() - 1; } fclose(fai); junction junc; if (!unmerged_bed.empty()) { FILE *fp = fopen(unmerged_bed.c_str(), "r"); if (!fp) { fprintf(stderr, "ERROR: Cannot open %s\n", unmerged_bed.c_str()); return 1; } char line2[65536]; junc.init(ref_index.size()); bool line_overflow = false; while (fgets(line2, 65536, fp) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } if (line_overflow) { line_overflow = false; continue; } if (strstr(line2, "track")) continue; char chr[100]; int b, e; sscanf(line2, "%s %d %d", chr, &b, &e); junc.add(ref_map[chr], b, e); } fclose(fp); } // Load input BED or load input VCF, group by chromosome deque<LineStatus> line_status; vector<deque<Allele> > alleles(ref_index.size()); if (!input_bed_filename.empty()) { FILE *input = fopen(input_bed_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_bed_filename.c_str()); return 1; } char line2[65536]; int line_number = 0; bool line_overflow = false; while (fgets(line2, 65536, input) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } line_number++; if (line_overflow) { line_overflow = false; line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: line length exceeds 64K"; continue; } if (strncmp(line2, "browser", 7) == 0) continue; if (strncmp(line2, "track", 5) == 0) { if (string::npos != string(line2).find("allowBlockSubstitutions=true")) allow_block_substitutions = true; continue; } // OID= table has special meaning if (string::npos != 
string(line2).find("OID=")) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Bed line contains OID="; continue; } char *current_chr = strtok(line2, "\t\r\n"); char *current_start = strtok(NULL, "\t\r\n"); char *current_end = strtok(NULL, "\t\r\n"); char *current_id = strtok(NULL, "\t\r\n"); char *penultimate = strtok(NULL, "\t\r\n"); char *ultimate = strtok(NULL, "\t\r\n"); for (char *next = strtok(NULL, "\t\r\n"); next; next = strtok(NULL, "\t\r\n")) { penultimate = ultimate; ultimate = next; } if (!current_chr or !current_start or !current_end or !current_id or !penultimate or !ultimate) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: expected at least 6 fields"; continue; } Allele allele; string string_chr(current_chr); if (ref_map.find(string_chr) != ref_map.end()) allele.chr_idx = ref_map[string_chr]; else if (ref_map.find("chr"+string_chr) != ref_map.end()) allele.chr_idx = ref_map["chr"+string_chr]; else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end()) allele.chr_idx = ref_map["chrM"]; else { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Unknown chromosome name: "; line_status.back().filter_message = string_chr; continue; } allele.pos = strtol(current_start,NULL,10); allele.id = current_id; char *current_ref = NULL; char *current_alt = NULL; for (char *next = strtok(penultimate, ";"); next; next = strtok(NULL, ";")) { if (strncmp(next,"REF=",4) == 0) current_ref = next; else if (strncmp(next,"OBS=",4) == 0) current_alt = next; else if (strncmp(next,"ANCHOR=",7) == 0) { // ignore ANCHOR } else { char *value = next; while (*value and *value != '=') ++value; if (*value == '=') *value++ = 0; allele.custom_tags[next] = value; } } if (!current_ref or !current_alt) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: 
REF and OBS fields required in penultimate column"; continue; } for (char *pos = current_ref+4; *pos; ++pos) allele.ref += toupper(*pos); for (char *pos = current_alt+4; *pos; ++pos) allele.alt += toupper(*pos); // here is the place to check the length of the hotspot cover the amplicon junction. ZZ /* if (junc.contain(allele.chr_idx, allele.pos, (unsigned int) allele.ref.size())) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "hotspot BED line contain the complete overlapping region of two amplicon, the variant cannot be detected by tvc"; continue; } if (not junc.contained_in_ampl(allele.chr_idx, allele.pos, (unsigned int) allele.ref.size())) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "hotspot BED line is not contained in any amplicon, the variant cannot be detected by tvc"; continue; } */ allele.filtered = false; line_status.push_back(LineStatus(line_number)); allele.line_status = &line_status.back(); allele.opos = allele.pos; allele.oref = allele.ref; allele.oalt = allele.alt; alleles[allele.chr_idx].push_back(allele); //line_status.back().allele = &alleles[allele.chr_idx].back(); line_status.back().chr_idx = allele.chr_idx; line_status.back().opos = allele.opos; line_status.back().id = allele.id; } fclose(input); } if (!input_vcf_filename.empty() or !input_real_vcf_filename.empty()) { bool real_vcf = false; FILE *input; FILE *out_real = NULL; FILE *out_hot = NULL; int fake_ = 0; int hn = 1; if (!input_real_vcf_filename.empty()) { real_vcf = true; input = fopen(input_real_vcf_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_real_vcf_filename.c_str()); return 1; } out_real = fopen(output_vcf_filename.c_str(), "w"); if (!out_real) { fprintf(stderr,"ERROR: Cannot open %s\n", output_vcf_filename.c_str()); return 1; } if (!output_hot_vcf.empty()) { out_hot = fopen(output_hot_vcf.c_str(), "w"); if (!out_hot) { fprintf(stderr,"ERROR: Cannot 
open %s\n", output_hot_vcf.c_str()); return 1; } } else out_hot = stdout; fprintf(out_hot, "##fileformat=VCFv4.1\n##allowBlockSubstitutions=true\n#CHROM POS ID REF ALT QUAL FILTER INFO\n"); } else { input = fopen(input_vcf_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_vcf_filename.c_str()); return 1; } } char line2[65536]; char line3[65536]; int line_number = 0; bool line_overflow = false; list<one_vcfline> vcflist; char last_chr[1024] = ""; while (fgets(line2, 65536, input) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } line_number++; if (line_overflow) { line_overflow = false; line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot VCF line: line length exceeds 64K"; continue; } if (strncmp(line2, "##allowBlockSubstitutions=true", 30) == 0) { allow_block_substitutions = true; continue; } if (line2[0] == '#') { if (out_real) { fprintf(out_real, "%s", line2);} continue; } if (real_vcf) strcpy(line3, line2); char *current_chr = strtok(line2, "\t\r\n"); char *current_start = strtok(NULL, "\t\r\n"); char *current_id = strtok(NULL, "\t\r\n"); char *current_ref = strtok(NULL, "\t\r\n"); char *current_alt = strtok(NULL, "\t\r\n"); strtok(NULL, "\t\r\n"); // Ignore QUAL strtok(NULL, "\t\r\n"); // Ignore FILTER char *current_info = strtok(NULL, "\t\r\n"); strtok(NULL, "\t\r\n"); char *gt = strtok(NULL, "\t\r\n"); if (!current_chr or !current_start or !current_id or !current_ref or !current_alt) { line_status.push_back(LineStatus(line_number)); if (real_vcf) line_status.back().filter_message_prefix = "Malformed real VCF line: expected at least 5 fields"; else line_status.back().filter_message_prefix = "Malformed hotspot VCF line: expected at least 5 fields"; continue; } string string_chr(current_chr); int chr_idx = 0; if (ref_map.find(string_chr) != ref_map.end()) chr_idx = ref_map[string_chr]; else if 
(ref_map.find("chr"+string_chr) != ref_map.end()) chr_idx = ref_map["chr"+string_chr]; else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end()) chr_idx = ref_map["chrM"]; else { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Unknown chromosome name: "; line_status.back().filter_message = string_chr; continue; } for (char *pos = current_ref; *pos; ++pos) *pos = toupper(*pos); for (char *pos = current_alt; *pos; ++pos) *pos = toupper(*pos); // Process custom tags vector<string> bstrand; vector<string> hp_max_length; string raw_oid; string raw_omapalt; string raw_oalt; string raw_oref; string raw_opos; if (current_info) { string raw_bstrand; string raw_hp_max_length; for (char *next = strtok(current_info, ";"); next; next = strtok(NULL, ";")) { char *value = next; while (*value and *value != '=') ++value; if (*value == '=') *value++ = 0; if (strcmp(next, "TYPE") == 0) continue; if (strcmp(next, "HRUN") == 0) continue; if (strcmp(next, "HBASE") == 0) continue; if (strcmp(next, "FR") == 0) continue; if (strcmp(next, "OPOS") == 0) { raw_opos = value; continue; } if (strcmp(next, "OREF") == 0) { raw_oref = value; continue; } if (strcmp(next, "OALT") == 0) { raw_oalt = value; continue; } if (strcmp(next, "OID") == 0) { raw_oid = value; continue; } if (strcmp(next, "OMAPALT") == 0) { raw_omapalt = value; continue; } if (strcmp(next, "BSTRAND") == 0) { raw_bstrand = value; continue; } if (strcmp(next, "hp_max_length") == 0) { raw_hp_max_length = value; continue; } } if (not raw_bstrand.empty()) split(raw_bstrand, ',', bstrand); if (not raw_hp_max_length.empty()) split(raw_hp_max_length, ',', hp_max_length); } if (real_vcf) { //fprintf(stderr, "%s\n", gt); if (gt == NULL) continue; // get gt int g1 = atoi(gt), g2; gt = strchr(gt, '/'); if (gt) g2 = atoi(gt+1); else {fprintf(stderr, "GT not formatted right\n"); exit(1);} //if (g1 == 0 and g2 == 0) continue; unsigned int cur_pos = atoi(current_start); one_vcfline 
newline(current_ref, current_alt, cur_pos, g1, g2, line3); bool new_chr = false; if (strcmp(current_chr, last_chr) != 0) { new_chr = true; } while (not vcflist.empty()) { if ((not new_chr) and vcflist.front().pos+strlen(vcflist.front().ref) > cur_pos) break; if (vcflist.front().produce_hot_vcf(last_chr, out_real, hn, out_hot)) fake_++; vcflist.pop_front(); } if (new_chr) strcpy(last_chr, current_chr); for (list<one_vcfline>::iterator it = vcflist.begin(); it != vcflist.end(); it++) { it->check_subset(newline); } if (not newline.alts.empty()) vcflist.push_back(newline); continue; } unsigned int allele_idx = 0; for (char *sub_alt = strtok(current_alt,","); sub_alt; sub_alt = strtok(NULL,",")) { Allele allele; allele.chr_idx = chr_idx; allele.ref = current_ref; allele.alt = sub_alt; allele.pos = strtol(current_start,NULL,10)-1; allele.id = current_id; if (allele.id == ".") allele.id = "hotspot"; allele.filtered = false; line_status.push_back(LineStatus(line_number)); allele.line_status = &line_status.back(); allele.opos = allele.pos; allele.oref = allele.ref; allele.oalt = allele.alt; if (allele_idx < bstrand.size()) { if (bstrand[allele_idx] != ".") allele.custom_tags["BSTRAND"] = bstrand[allele_idx]; } if (allele_idx < hp_max_length.size()) { if (hp_max_length[allele_idx] != ".") allele.custom_tags["hp_max_length"] = hp_max_length[allele_idx]; } alleles[allele.chr_idx].push_back(allele); //line_status.back().allele = &alleles[allele.chr_idx].back(); line_status.back().chr_idx = allele.chr_idx; line_status.back().opos = allele.opos; line_status.back().id = allele.id; allele_idx++; } } fclose(input); if (real_vcf) { while (not vcflist.empty()) { if (vcflist.front().produce_hot_vcf(last_chr, out_real, hn, out_hot)) fake_++; vcflist.pop_front(); } fclose(out_real); fclose(out_hot); if (fake_ > 0) return 0; else return 1; } } // Process by chromosome: // - Verify reference allele // - Left align // - Sort // - Filter for block substitutions, write FILE *output_vcf = 
NULL; if (!output_vcf_filename.empty()) { output_vcf = fopen(output_vcf_filename.c_str(), "w"); if (!output_vcf) { fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_vcf_filename.c_str()); return 1; } fprintf(output_vcf, "##fileformat=VCFv4.1\n"); if (allow_block_substitutions) fprintf(output_vcf, "##allowBlockSubstitutions=true\n"); fprintf(output_vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"); } FILE *output_bed = NULL; if (!output_bed_filename.empty()) { output_bed = fopen(output_bed_filename.c_str(), "w"); if (!output_bed) { fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_bed_filename.c_str()); if (output_vcf) fclose(output_vcf); return 1; } if (allow_block_substitutions) fprintf(output_bed, "track name=\"hotspot\" type=bedDetail allowBlockSubstitutions=true\n"); else fprintf(output_bed, "track name=\"hotspot\" type=bedDetail\n"); } for (int chr_idx = 0; chr_idx < (int)ref_index.size(); ++chr_idx) { for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ++A) { // check bed file if (junc.contain(A->chr_idx, A->pos, (unsigned int) A->ref.size())) { A->filtered = true; A->line_status->filter_message_prefix = "hotspot BED line contain the complete overlapping region of two amplicon, the variant cannot be detected by tvc"; continue; } if (not junc.contained_in_ampl(A->chr_idx, A->pos, (unsigned int) A->ref.size())) { A->filtered = true; A->line_status->filter_message_prefix = "hotspot BED line is not contained in any amplicon, the variant cannot be detected by tvc"; continue; } // Invalid characters bool valid = true; for (const char *c = A->ref.c_str(); *c ; ++c) if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T') valid = false; for (const char *c = A->alt.c_str(); *c ; ++c) if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T') valid = false; if (not valid) { A->filtered = true; A->line_status->filter_message_prefix = "REF and/or ALT contain characters other than ACGT: "; 
A->line_status->filter_message = "REF = " + A->ref + " ALT = " + A->alt; continue; } // Filter REF == ALT if (A->ref == A->alt) { A->filtered = true; A->line_status->filter_message_prefix = "REF and ALT alleles equal"; continue; } // Confirm reference allele. string ref_expected; for (int idx = 0; idx < (int) A->ref.size(); ++idx) ref_expected += ref_index[chr_idx].base(A->pos + idx); if (A->ref != ref_expected) { A->filtered = true; A->line_status->filter_message_prefix = "Provided REF allele does not match reference: "; A->line_status->filter_message = "Expected " + ref_expected + ", found " + A->ref; continue; } // Trim int ref_start = 0; int ref_end = A->ref.size(); int alt_end = A->alt.size(); // Option 1: trim all trailing bases; //while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) { // --ref_end; // --alt_end; //} // Option 2: trim all leading basees; //while (ref_start < ref_end and ref_start < alt_end and A->ref[ref_start] == A->alt[ref_start]) // ++ref_start; // Option 3: trim anchor base if vcf if (!input_vcf_filename.empty()) { if (ref_end and alt_end and (ref_end == 1 or alt_end == 1) and A->ref[0] == A->alt[0]) ref_start = 1; } A->pos += ref_start; A->ref = A->ref.substr(ref_start, ref_end-ref_start); A->alt = A->alt.substr(ref_start, alt_end-ref_start); ref_end -= ref_start; alt_end -= ref_start; // Left align if (left_alignment && A->custom_tags.find("BSTRAND") == A->custom_tags.end()) { // black list variant not to be left aligned. 
string trailing; int can_do = 0, need_do = 0; int ref_end_orig= ref_end, alt_end_orig = alt_end; while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) { ref_end--; alt_end--; } if (ref_end == 0 || alt_end == 0) { can_do = need_do = 1; // indel type, ZZ } else { int tmp_start = ref_start; int ref_end_0 = ref_end, alt_end_0 = alt_end; // end after remove trailing match ZZ while (tmp_start < ref_end and tmp_start < alt_end and A->ref[tmp_start] == A->alt[tmp_start]) ++tmp_start; if (tmp_start == ref_end || tmp_start == alt_end) { can_do = 1; need_do = 0; // indel but indel is not at the left. ZZ } else { ref_end--; alt_end--; while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) { ref_end--; alt_end--; } if (ref_end == 0 || alt_end == 0) { // complex with 1 bp MM at right end can_do = need_do = 1; if (ref_end + alt_end == 0) need_do = 0; // SNP } else { int tmp_start0 = tmp_start; // start after removing leading matches tmp_start++; while (tmp_start < ref_end_orig and tmp_start < alt_end_orig and A->ref[tmp_start] == A->alt[tmp_start]) tmp_start++; if (tmp_start >= ref_end_0 || tmp_start >= alt_end_0 || ref_end <= tmp_start0 || alt_end <= tmp_start0) { // 1MM plus indel in middle, by definition cannot move the indel left enough to change A->pos can_do = 1; need_do = 0; } // else real complex } } } if (!can_do or !need_do) { // do nothing // if !can_do need add some more DP ref_end = ref_end_orig; alt_end = alt_end_orig; } else { // left align the indel part, here either ref_end = 0 or alt_end = 0 int opos = A->pos; while (A->pos > 0) { char nuc = ref_index[chr_idx].base(A->pos-1); if (ref_end > 0 and A->ref[ref_end-1] != nuc) break; if (alt_end > 0 and A->alt[alt_end-1] != nuc) break; A->ref = string(1,nuc) + A->ref; A->alt = string(1,nuc) + A->alt; A->pos--; } if (ref_end != ref_end_orig) { // trailing part is aligned, the whole ref and alt need to be kept. 
ZZ ref_end = A->ref.size(); alt_end = A->alt.size(); } if (junc.contain(chr_idx, A->pos, ref_end) or not junc.contained_in_ampl(chr_idx, A->pos, ref_end)) { // after left align the hotspot contain an overlap region, revert to the original ZZ if (opos != A->pos) { A->ref.erase(0, opos-A->pos); A->alt.erase(0, opos-A->pos); A->pos = opos; ref_end = ref_end_orig; alt_end = alt_end_orig; } } } } A->ref.resize(ref_end); A->alt.resize(alt_end); // Filter block substitutions: take 1 if (ref_end > 0 and alt_end > 0 and ref_end != alt_end and not allow_block_substitutions and not filter_bypass) { A->filtered = true; A->line_status->filter_message_prefix = "Block substitutions not supported"; continue; } } if (output_bed) { // Sort - without anchor base stable_sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles); // Write for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) { if (I->filtered) continue; fprintf(output_bed, "%s\t%ld\t%ld\t%s\tREF=%s;OBS=%s", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str()); for (map<string,string>::iterator C = I->custom_tags.begin(); C != I->custom_tags.end(); ++C) fprintf(output_bed, ";%s=%s", C->first.c_str(), C->second.c_str()); fprintf(output_bed, "\tNONE\n"); /* if (I->pos) fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=%c\tNONE\n", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str(), ref_index[chr_idx].base(I->pos-1)); else fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=\tNONE\n", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str()); */ } } if (output_vcf) { // Add anchor base to indels for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) { if (I->filtered) continue; if (not I->ref.empty() and not I->alt.empty()) continue; if 
(I->pos == 0) { I->filtered = true; I->line_status->filter_message_prefix = "INDELs at chromosome start not supported"; continue; } I->pos--; I->ref = string(1,ref_index[chr_idx].base(I->pos)) + I->ref; I->alt = string(1,ref_index[chr_idx].base(I->pos)) + I->alt; } // Sort - with anchor base stable_sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles); // Merge alleles, remove block substitutions, write for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ) { string max_ref; deque<Allele>::iterator B = A; for (; B != alleles[chr_idx].end() and B->pos == A->pos; ++B) if (!B->filtered and max_ref.size() < B->ref.size()) max_ref = B->ref; bool filtered = true; map<string,set<string> > unique_alts_and_ids; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; string new_alt = I->alt + max_ref.substr(I->ref.size()); if (new_alt.size() > 1 and max_ref.size() > 1 and new_alt.size() != max_ref.size() and not allow_block_substitutions and not filter_bypass) { I->filtered = true; I->line_status->filter_message_prefix = "Block substitutions not supported (post-merge)"; continue; } I->ref = max_ref; I->alt = new_alt; // Filter alleles with duplicate ALT + ID pairs map<string,set<string> >::iterator alt_iter = unique_alts_and_ids.find(new_alt); if (alt_iter != unique_alts_and_ids.end()) { if (alt_iter->second.count(I->id) > 0) { I->filtered = true; I->line_status->filter_message_prefix = "Duplicate allele and ID"; continue; } } unique_alts_and_ids[new_alt].insert(I->id); filtered = false; } if (not filtered) { fprintf(output_vcf, "%s\t%ld\t.\t%s\t", ref_index[chr_idx].chr.c_str(), A->pos+1, max_ref.c_str()); bool comma = false; map<string,map<string,string> > unique_alts_and_tags; set<string> unique_tags; set<string> unique_alt_alleles; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; unique_alts_and_tags[I->alt].insert(I->custom_tags.begin(), I->custom_tags.end()); for 
(map<string,string>::iterator S = I->custom_tags.begin(); S != I->custom_tags.end(); ++S) unique_tags.insert(S->first); if (unique_alt_alleles.count(I->alt) > 0) continue; unique_alt_alleles.insert(I->alt); if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->alt.c_str()); } /* for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; map<string,map<string,string> >::iterator Q = unique_alts_and_tags.find(I->alt); if (comma) fprintf(output_vcf, ","); comma = true; if (Q == unique_alts_and_tags.end()) {fprintf(output_vcf, "."); continue;} fprintf(output_vcf, "%s", Q->first.c_str()); } */ fprintf(output_vcf, "\t.\t.\tOID="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->id.c_str()); } fprintf(output_vcf, ";OPOS="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%ld", I->opos+1); } fprintf(output_vcf, ";OREF="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->oref.c_str()); } fprintf(output_vcf, ";OALT="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->oalt.c_str()); } fprintf(output_vcf, ";OMAPALT="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->alt.c_str()); } for (set<string>::iterator S = unique_tags.begin(); S != unique_tags.end(); ++S) { fprintf(output_vcf, ";%s=", S->c_str()); comma=false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; map<string,map<string,string> 
>::iterator Q = unique_alts_and_tags.find(I->alt); if (comma) fprintf(output_vcf, ","); comma = true; if (Q == unique_alts_and_tags.end()) {fprintf(output_vcf, "."); continue;} map<string,string>::iterator W = Q->second.find(*S); if (W == Q->second.end()) fprintf(output_vcf, "."); else fprintf(output_vcf, "%s", W->second.c_str()); } } // fprintf(output_vcf, ";%s=%s", S->first.c_str(), S->second.c_str()); fprintf(output_vcf, "\n"); } A = B; } } } if (output_bed) { fflush(output_bed); fclose(output_bed); } if (output_vcf) { fflush(output_vcf); fclose(output_vcf); } int lines_ignored = 0; for (deque<LineStatus>::iterator L = line_status.begin(); L != line_status.end(); ++L) { if (L->filter_message_prefix) { if (L->chr_idx >= 0) printf("Line %d ignored: [%s:%ld %s] %s%s\n", L->line_number, ref_index[L->chr_idx].chr.c_str(), L->opos+1, L->id.c_str(), L->filter_message_prefix, L->filter_message.c_str()); else printf("Line %d ignored: %s%s\n", L->line_number, L->filter_message_prefix, L->filter_message.c_str()); lines_ignored++; } } printf("Ignored %d out of %d lines\n", lines_ignored, (int)line_status.size()); munmap(ref, ref_stat.st_size); close(ref_handle); if (lines_ignored > 0 and strict_check) return 1; return 0; }
int main(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); bool help, combineSffs; string sffFile; string bamFile; vector<string> infiles; opts.GetOption(help,"false", 'h', "help"); opts.GetOption(combineSffs,"false", 'c', "combine-sffs"); opts.GetOption(bamFile,"",'o',"out-filename"); opts.GetLeftoverArguments(infiles); if(help || infiles.empty()) { usage(); } if((!combineSffs) && infiles.size() > 1) { cerr << "sff2bam ERROR: if you want to combine all sff files into a single bam file, please use option -c true." << endl; usage(); } sffFile= infiles.front(); if(bamFile.length() < 1) { bamFile = sffFile.substr(0, sffFile.length() - 3); bamFile += "bam"; } sff_file_t* sff_file = sff_fopen(sffFile.c_str(), "r", NULL, NULL); if(!sff_file) { cerr << "sff2bam ERROR: fail to open " << sffFile << endl; exit(1); } // All sff files must have the same flow and key if(combineSffs && infiles.size() > 1) { for(size_t n = 1; n < infiles.size(); ++n) { sff_file_t* sff_file2 = sff_fopen(infiles[n].c_str(), "r", NULL, NULL); if(!sff_file2) { sff_fclose(sff_file); cerr << "sff2bam ERROR: fail to open " << infiles[n] << endl; exit(1); } if(strcmp(sff_file2->header->flow->s, sff_file->header->flow->s) != 0 || strcmp(sff_file2->header->key->s, sff_file->header->key->s) != 0) { sff_fclose(sff_file); sff_fclose(sff_file2); cerr << "sff2bam ERROR: " << sffFile << " and " << infiles[n] << " have different flows or keys." 
<< endl; exit(1); } sff_fclose(sff_file2); } } sff_t* sff = NULL; // Open 1st read for read group name sff = sff_read(sff_file); if(!sff) { sff_fclose(sff_file); cerr << "sff2bam ERROR: fail to read " << sffFile << endl; exit(1); } // Set up BAM header SamHeader sam_header; sam_header.Version = "1.4"; sam_header.SortOrder = "unsorted"; SamProgram sam_program("sff2bam"); sam_program.Name = "sff2bam"; sam_program.Version = SFF2BAM_VERSION; sam_program.CommandLine = "sff2bam"; sam_header.Programs.Add(sam_program); string rgname = sff->rheader->name->s; int index = rgname.find(":"); rgname = rgname.substr(0, index); SamReadGroup read_group(rgname); read_group.FlowOrder = sff->gheader->flow->s; read_group.KeySequence = sff->gheader->key->s; sam_header.ReadGroups.Add(read_group); RefVector refvec; BamWriter bamWriter; bamWriter.SetCompressionMode(BamWriter::Compressed); if(!bamWriter.Open(bamFile, sam_header, refvec)) { sff_fclose(sff_file); cerr << "sff2bam ERROR: failed to open " << bamFile << endl; exit(1); } // Save 1st read BamAlignment bam_alignment0; bam_alignment0.SetIsMapped(false); bam_alignment0.Name = sff->rheader->name->s; size_t nBases = sff->rheader->n_bases + 1 - sff->rheader->clip_qual_left; if(sff->rheader->clip_qual_right > 0) { nBases = sff->rheader->clip_qual_right - sff->rheader->clip_qual_left; } if(nBases > 0) { bam_alignment0.QueryBases.reserve(nBases); bam_alignment0.Qualities.reserve(nBases); for (int base = sff->rheader->clip_qual_left - 1; base < sff->rheader->clip_qual_right - 1; ++base) { bam_alignment0.QueryBases.push_back(sff->read->bases->s[base]); bam_alignment0.Qualities.push_back(sff->read->quality->s[base] + 33); } } int clip_flow = 0; for (unsigned int base = 0; base < sff->rheader->clip_qual_left && base < sff->rheader->n_bases; ++base) { clip_flow += sff->read->flow_index[base]; } if (clip_flow > 0) { clip_flow--; } bam_alignment0.AddTag("RG","Z", rgname); bam_alignment0.AddTag("PG","Z", string("sff2bam")); 
bam_alignment0.AddTag("ZF","i", clip_flow); // TODO: trim flow vector<uint16_t> flowgram0(sff->gheader->flow_length); copy(sff->read->flowgram, sff->read->flowgram + sff->gheader->flow_length, flowgram0.begin()); bam_alignment0.AddTag("FZ", flowgram0); sff_destroy(sff); sff = NULL; bamWriter.SaveAlignment(bam_alignment0); // Save rest reads while(NULL != (sff = sff_read(sff_file))) { BamAlignment bam_alignment; bam_alignment.SetIsMapped(false); bam_alignment.Name = sff->rheader->name->s; nBases = sff->rheader->n_bases + 1 - sff->rheader->clip_qual_left; if(sff->rheader->clip_qual_right > 0) { nBases = sff->rheader->clip_qual_right - sff->rheader->clip_qual_left; } if(nBases > 0) { bam_alignment.QueryBases.reserve(nBases); bam_alignment.Qualities.reserve(nBases); for (int base = sff->rheader->clip_qual_left - 1; base < sff->rheader->clip_qual_right - 1; ++base) { bam_alignment.QueryBases.push_back(sff->read->bases->s[base]); bam_alignment.Qualities.push_back(sff->read->quality->s[base] + 33); } } clip_flow = 0; for (unsigned int base = 0; base <= sff->rheader->clip_qual_left && base < sff->rheader->n_bases; ++base) { clip_flow += sff->read->flow_index[base]; } if (clip_flow > 0) { clip_flow--; } bam_alignment.AddTag("RG","Z", rgname); bam_alignment.AddTag("PG","Z", string("sff2bam")); bam_alignment.AddTag("ZF","i", clip_flow); // TODO: trim flow vector<uint16_t> flowgram(sff->gheader->flow_length); copy(sff->read->flowgram, sff->read->flowgram + sff->gheader->flow_length, flowgram.begin()); bam_alignment.AddTag("FZ", flowgram); sff_destroy(sff); sff = NULL; bamWriter.SaveAlignment(bam_alignment); } sff_fclose(sff_file); if(combineSffs && infiles.size() > 1) { for(size_t n = 1; n < infiles.size(); ++n) { sff_file_t* sff_file2 = sff_fopen(infiles[n].c_str(), "r", NULL, NULL); while(NULL != (sff = sff_read(sff_file2))) { BamAlignment bam_alignment; bam_alignment.SetIsMapped(false); bam_alignment.Name = sff->rheader->name->s; nBases = sff->rheader->n_bases + 1 - 
sff->rheader->clip_qual_left; if(sff->rheader->clip_qual_right > 0) { nBases = sff->rheader->clip_qual_right - sff->rheader->clip_qual_left; } if(nBases > 0) { bam_alignment.QueryBases.reserve(nBases); bam_alignment.Qualities.reserve(nBases); for (int base = sff->rheader->clip_qual_left - 1; base < sff->rheader->clip_qual_right - 1; ++base) { bam_alignment.QueryBases.push_back(sff->read->bases->s[base]); bam_alignment.Qualities.push_back(sff->read->quality->s[base] + 33); } } clip_flow = 0; for (unsigned int base = 0; base <= sff->rheader->clip_qual_left && base < sff->rheader->n_bases; ++base) { clip_flow += sff->read->flow_index[base]; } if (clip_flow > 0) { clip_flow--; } bam_alignment.AddTag("RG","Z", rgname); bam_alignment.AddTag("PG","Z", string("sff2bam")); bam_alignment.AddTag("ZF","i", clip_flow); // TODO: trim flow vector<uint16_t> flowgram(sff->gheader->flow_length); copy(sff->read->flowgram, sff->read->flowgram + sff->gheader->flow_length, flowgram.begin()); bam_alignment.AddTag("FZ", flowgram); sff_destroy(sff); sff = NULL; bamWriter.SaveAlignment(bam_alignment); } sff_fclose(sff_file2); } } bamWriter.Close(); return 0; }
// Fills context_vars and the thread counts from the command line options.
// Requires that bc_files has already been populated (asserted). Always
// returns true and marks context_vars.options_set.
bool BaseCallerParameters::InitContextVarsFromOptArgs(OptArgs& opts)
{
  assert(bc_files.options_set);

  // The fallback run id is derived from the full output directory string.
  char run_id_fallback[6];
  ion_run_to_readname(run_id_fallback,
                      const_cast<char*>(bc_files.output_directory.c_str()),
                      bc_files.output_directory.length());

  // General options
  context_vars.run_id                      = opts.GetFirstString ('-', "run-id", run_id_fallback);
  num_threads_                             = opts.GetFirstInt    ('n', "num-threads", max(2*numCores(), 4));
  num_bamwriter_threads_                   = opts.GetFirstInt    ('-', "num-threads-bamwriter", 6);
  context_vars.flow_signals_type           = opts.GetFirstString ('-', "flow-signals-type", "none");
  context_vars.extra_trim_left             = opts.GetFirstInt    ('-', "extra-trim-left", 0);
  context_vars.only_process_unfiltered_set = opts.GetFirstBoolean('-', "only-process-unfiltered-set", false);

  // Treephaser options
  context_vars.dephaser                    = opts.GetFirstString ('-', "dephaser", "treephaser-sse");
  context_vars.keynormalizer               = opts.GetFirstString ('-', "keynormalizer", "gain");
  context_vars.windowSize                  = opts.GetFirstInt    ('-', "window-size", DPTreephaser::kWindowSizeDefault_);
  context_vars.skip_droop                  = opts.GetFirstBoolean('-', "skip-droop", true);
  context_vars.skip_recal_during_norm      = opts.GetFirstBoolean('-', "skip-recal-during-normalization", false);
  context_vars.diagonal_state_prog         = opts.GetFirstBoolean('-', "diagonal-state-prog", false);

  // Diagonal state progression is only paired with treephaser-swan here: any
  // other requested dephaser is reported and then overridden.
  const bool override_dephaser =
      context_vars.diagonal_state_prog && context_vars.dephaser != "treephaser-swan";
  if (override_dephaser) {
    cout << " === BaseCaller Option Incompatibility: Using dephaser treephaser-swan with diagonal state progression instead of "
         << context_vars.dephaser << endl;
    context_vars.dephaser = "treephaser-swan";
  }

  context_vars.process_tfs = true;
  context_vars.options_set = true;
  return true;
}
int main (int argc, const char *argv[]) { time_t program_start_time; time(&program_start_time); Json::Value calibration_json(Json::objectValue); DumpStartingStateOfProgram (argc,argv,program_start_time, calibration_json["Calibration"]); // // Step 1. Process command line options // OptArgs opts; opts.ParseCmdLine(argc, argv); // enable floating point exceptions during program execution if (opts.GetFirstBoolean('-', "float-exceptions", true)) { cout << "Calibration: Floating point exceptions enabled." << endl; feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); } //*/ CalibrationContext calib_context; if (not calib_context.InitializeFromOpts(opts)){ PrintHelp_CalModules(); } HistogramCalibration master_histogram(opts, calib_context); calib_context.hist_calibration_master = &master_histogram; LinearCalibrationModel master_linear_model(opts, calib_context); calib_context.linear_model_master = &master_linear_model; opts.CheckNoLeftovers(); // // Step 2. Execute threaded calibration // int calibration_thread_time = 0; if (calib_context.successive_fit) { // first train linear model if (master_linear_model.DoTraining()) { int l_thread_time = 0; for (int i_iteration=0; i_iteration<calib_context.num_train_iterations; i_iteration++) { cout << " -Training Iteration " << i_iteration+1; l_thread_time = ExecuteThreadedCalibrationTraining(calib_context); // Activate master linear model after every round of training master_linear_model.CreateCalibrationModel(false); // make linear model master_linear_model.SetModelGainsAndOffsets(); // expand for use in basecalling calibration_thread_time += l_thread_time; calib_context.bam_reader.Rewind(); // reset all files for another pass cout << " Duration = " << l_thread_time << endl; } } // Then apply it during polish model training if (master_histogram.DoTraining()) { calib_context.local_fit_linear_model = false; calib_context.local_fit_polish_model = true; calibration_thread_time += ExecuteThreadedCalibrationTraining(calib_context); 
} } else { // Single pass in which both models are fit jointly calibration_thread_time=ExecuteThreadedCalibrationTraining(calib_context); } // // Step 3. Create models, write output, and close modules // // Linear Model if (master_linear_model.CreateCalibrationModel()) master_linear_model.ExportModelToJson(calibration_json["LinearModel"], ""); // HP histogram calibration if (master_histogram.CreateCalibrationModel()) master_histogram.ExportModelToJson(calibration_json["HPHistogram"]); // Transfer stuff from calibration context and close bam reader calib_context.Close(calibration_json["Calibration"]); time_t program_end_time; time(&program_end_time); calibration_json["Calibration"]["end_time"] = get_time_iso_string(program_end_time); calibration_json["Calibration"]["total_duration"] = (Json::Int)difftime(program_end_time,program_start_time); calibration_json["Calibration"]["calibration_duration"] = (Json::Int)calibration_thread_time; SaveJson(calibration_json, calib_context.filename_json); return EXIT_SUCCESS; }