/**
 * Fills context_vars and the thread counts from the command line.
 *
 * Asserts that the file options were set beforehand: the default run id is
 * created from the output directory string stored in bc_files.
 *
 * @param opts  Parsed command line options.
 * @return      Always true.
 */
bool BaseCallerParameters::InitContextVarsFromOptArgs(OptArgs& opts)
{
  assert(bc_files.options_set);

  // Create a run identifier from full output directory string
  char default_run_id[6];
  const string& out_dir = bc_files.output_directory;
  ion_run_to_readname(default_run_id, const_cast<char*>(out_dir.c_str()), out_dir.length());
  context_vars.run_id = opts.GetFirstString('-', "run-id", default_run_id);

  // Threading
  const int default_num_threads = max(2*numCores(), 4);
  num_threads_           = opts.GetFirstInt('n', "num-threads", default_num_threads);
  num_bamwriter_threads_ = opts.GetFirstInt('-', "num-threads-bamwriter", 6);

  // General processing options
  context_vars.flow_signals_type           = opts.GetFirstString ('-', "flow-signals-type", "none");
  context_vars.extra_trim_left             = opts.GetFirstInt    ('-', "extra-trim-left", 0);
  context_vars.only_process_unfiltered_set = opts.GetFirstBoolean('-', "only-process-unfiltered-set", false);

  // Treephaser options
  context_vars.dephaser               = opts.GetFirstString ('-', "dephaser", "treephaser-sse");
  context_vars.keynormalizer          = opts.GetFirstString ('-', "keynormalizer", "gain");
  context_vars.windowSize             = opts.GetFirstInt    ('-', "window-size", DPTreephaser::kWindowSizeDefault_);
  context_vars.skip_droop             = opts.GetFirstBoolean('-', "skip-droop", true);
  context_vars.skip_recal_during_norm = opts.GetFirstBoolean('-', "skip-recal-during-normalization", false);
  context_vars.diagonal_state_prog    = opts.GetFirstBoolean('-', "diagonal-state-prog", false);

  // Not every combination of options is possible here:
  // diagonal state progression forces the treephaser-swan dephaser.
  const bool dephaser_is_swan = (context_vars.dephaser == "treephaser-swan");
  if (context_vars.diagonal_state_prog && !dephaser_is_swan) {
    cout << " === BaseCaller Option Incompatibility: Using dephaser treephaser-swan with diagonal state progression instead of "
         << context_vars.dephaser << endl;
    context_vars.dephaser = "treephaser-swan";
  }

  context_vars.process_tfs = true;
  context_vars.options_set = true;
  return true;
}
/**
 * Reads the molecular-tag command line options into a TagTrimmerParameters object.
 *
 * Terminates the process with EXIT_FAILURE when min-tag-fam-size is below 1
 * (other than 0) or when an unknown trimming / filtering method is requested.
 *
 * @param opts  Parsed command line options.
 * @return      The populated parameter structure.
 */
TagTrimmerParameters MolecularTagTrimmer::ReadOpts(OptArgs& opts)
{
  TagTrimmerParameters params;

  params.min_family_size   = opts.GetFirstInt    ('-', "min-tag-fam-size", 3);
  params.suppress_mol_tags = opts.GetFirstBoolean('-', "suppress-mol-tags", false);
  //params.cl_a_handle     = opts.GetFirstString ('-', "tag-handle", "");
  //params.handle_cutoff   = opts.GetFirstInt    ('-', "handle-cutoff", 2);

  // Tag structures given on the command line
  params.master_tags.prefix_mol_tag = opts.GetFirstString('-', "prefix-mol-tag", "");
  params.master_tags.suffix_mol_tag = opts.GetFirstString('-', "suffix-mol-tag", "");
  ValidateTagString(params.master_tags.prefix_mol_tag);
  ValidateTagString(params.master_tags.suffix_mol_tag);

  // A family size of zero is overloaded to disable molecular tagging
  if (params.min_family_size == 0) {
    params.suppress_mol_tags = true;
  }
  else if (params.min_family_size < 1) {
    cerr << "MolecularTagTrimmer Error: min-tag-fam-size must be at least 1. " << endl;
    exit(EXIT_FAILURE);
  }
  params.command_line_tags = params.master_tags.HasTags();

  // Trimming method selection
  const string trim_method = opts.GetFirstString('-', "tag-trim-method", "sloppy-trim");
  if (trim_method == "strict-trim") {
    params.tag_trim_method = kStrictTrim;
  }
  else if (trim_method == "sloppy-trim") {
    params.tag_trim_method = kSloppyTrim;
  }
  else {
    cerr << "MolecularTagTrimmer Error: Unknown tag trimming option " << trim_method << endl;
    exit(EXIT_FAILURE);
  }

  // Read filtering method selection
  const string filter_method = opts.GetFirstString('-', "tag-filter-method", "need-all");
  if (filter_method == "need-prefix") {
    params.tag_filter_method = kneed_only_prefix_tag;
  }
  else if (filter_method == "need-suffix") {
    params.tag_filter_method = kneed_only_suffix_tag;
  }
  else if (filter_method == "need-all") {
    params.tag_filter_method = kneed_all_tags;
  }
  else {
    cerr << "MolecularTagTrimmer Error: Unknown tag filtering option " << filter_method << endl;
    exit(EXIT_FAILURE);
  }

  return params;
}
/**
 * Sets up the flow order and the library / test-fragment key sequences.
 *
 * @param opts       Parsed command line options; may override flow order,
 *                   flow limit and both keys.
 * @param FlowOrder  Default flow order string.
 * @param NumFlows   Default flow count; also the upper limit applied to the
 *                   resulting flow order.
 * @return           Always true.
 */
bool BaseCallerContext::SetKeyAndFlowOrder(OptArgs& opts, const char * FlowOrder, const int NumFlows)
{
  const string requested_order = opts.GetFirstString('-', "flow-order", FlowOrder);
  const int    requested_flows = opts.GetFirstInt   ('f', "flowlimit", NumFlows);
  flow_order.SetFlowOrder(requested_order, requested_flows);

  // Never use more flows than NumFlows
  if (flow_order.num_flows() > NumFlows)
    flow_order.SetNumFlows(NumFlows);
  assert(flow_order.is_ok());

  //! @todo Get default key from wells
  string library_key       = opts.GetFirstString('-', "lib-key", "TCAG");
  string test_fragment_key = opts.GetFirstString('-', "tf-key", "ATCG");

  // Backward compatible opts: when present they take precedence over the values above
  library_key       = opts.GetFirstString('-', "librarykey", library_key);
  test_fragment_key = opts.GetFirstString('-', "tfkey", test_fragment_key);

  keys.resize(2);
  keys[0].Set(flow_order, library_key, "lib");
  keys[1].Set(flow_order, test_fragment_key, "tf");
  return true;
}
/**
 * Initializes the recalibration model from command line options and, when
 * requested, stores the model file in the BAM comments.
 *
 * @param opts          Parsed command line options.
 * @param bam_comments  Passed on to SaveModelFileToBamComments.
 * @param run_id        Passed on to SaveModelFileToBamComments.
 * @param chip_subset   Supplies the column / row offsets passed on to
 *                      SaveModelFileToBamComments.
 */
void RecalibrationModel::Initialize(OptArgs& opts, vector<string> &bam_comments, const string & run_id, const ion::ChipSubset & chip_subset)
{
  string     model_file_name  = opts.GetFirstString ('-', "model-file", "");
  const int  hp_threshold     = opts.GetFirstInt    ('-', "recal-model-hp-thres", 4);
  const bool save_hpmodel     = opts.GetFirstBoolean('-', "save-hpmodel", true);
  const bool diagonal_states  = opts.GetFirstBoolean('-', "diagonal-state-prog", false);

  // With diagonal state progression the model file name is discarded
  if (diagonal_states)
    model_file_name.clear();

  // InitializeModel is always invoked; saving is an optional follow-up step
  const bool model_initialized = InitializeModel(model_file_name, hp_threshold);
  if (!model_initialized || !save_hpmodel)
    return;

  SaveModelFileToBamComments(model_file_name, bam_comments, run_id,
                             chip_subset.GetColOffset(), chip_subset.GetRowOffset());
}
/**
 * Reads all input / output file options from the command line.
 *
 * The reference (-r), at least one BAM (-b) and the output VCF name (-o) are
 * mandatory; the process exits when one of them is missing. The input VCF
 * and the error motif file are optional and only reported as INFO when absent.
 *
 * @param opts  Parsed command line options.
 */
void ExtendParameters::SetupFileIO(OptArgs &opts)
{
  // freeBayes slot: reference
  fasta = opts.GetFirstString('r', "reference", "");
  if (fasta.empty()) {
    cerr << "Fatal ERROR: Reference file not specified via -r" << endl;
    exit(1);
  }
  ValidateAndCanonicalizePath(fasta);

  // freeBayes slot: input (hotspot) VCF
  variantPriorsFile = opts.GetFirstString('c', "input-vcf", "");
  if (not variantPriorsFile.empty())
    ValidateAndCanonicalizePath(variantPriorsFile);
  else
    cerr << "INFO: No input VCF (Hotspot) file specified via -c,--input-vcf" << endl;

  // Systematic error motifs
  sseMotifsFileName = opts.GetFirstString('e', "error-motifs", "");
  sseMotifsProvided = not sseMotifsFileName.empty();
  if (sseMotifsProvided)
    ValidateAndCanonicalizePath(sseMotifsFileName);
  else
    cerr << "INFO: Systematic error motif file not specified via -e" << endl;

  // Input BAM file(s)
  opts.GetOption(bams, "", 'b', "input-bam");
  if (bams.empty()) {
    cerr << "FATAL ERROR: BAM file not specified via -b" << endl;
    exit(-1);
  }
  for (size_t idx = 0; idx != bams.size(); ++idx)
    ValidateAndCanonicalizePath(bams[idx]);

  // Output location
  outputDir = opts.GetFirstString('O', "output-dir", ".");
  ValidateAndCanonicalizePath(outputDir);

  outputFile = opts.GetFirstString('o', "output-vcf", "");
  if (outputFile.empty()) {
    cerr << "Fatal ERROR: Output VCF filename not specified via -o" << endl;
    exit(1);
  }

  // Are those file names?
  postprocessed_bam = opts.GetFirstString('-', "postprocessed-bam", "");
  sampleName        = opts.GetFirstString('g', "sample-name", "");
  force_sample_name = opts.GetFirstString('-', "force-sample-name", "");
}
/**
 * Reads the phasing estimation options from the command line.
 *
 * When --libcf-ie-dr is supplied, the estimator is switched to "override"
 * and a single region is set up with the parsed cf,ie,dr triple. A value
 * that does not parse as three comma-separated numbers terminates the
 * process with EXIT_FAILURE.
 *
 * @param opts  Parsed command line options.
 */
void PhaseEstimator::InitializeFromOptArgs(OptArgs& opts)
{
  phasing_estimator_  = opts.GetFirstString('-', "phasing-estimator", "spatial-refiner-2");
  const string cf_ie_dr_override = opts.GetFirstString('-', "libcf-ie-dr", "");
  residual_threshold_ = opts.GetFirstDouble('-', "phasing-residual-filter", 1.0);
  max_phasing_levels_ = opts.GetFirstInt   ('-', "max-phasing-levels", max_phasing_levels_default_);

  const string key_normalizer = opts.GetFirstString('-', "keynormalizer", "keynorm-old");
  use_pid_norm_ = (key_normalizer == "keynorm-new");
  windowSize_   = opts.GetFirstInt('-', "window-size", DPTreephaser::kWindowSizeDefault_);

  if (cf_ie_dr_override.empty())
    return;

  // --libcf-ie-dr overrides other phasing-related options
  phasing_estimator_ = "override";
  result_regions_x_ = 1;
  result_regions_y_ = 1;
  result_cf_.assign(1, 0.0);
  result_ie_.assign(1, 0.0);
  result_dr_.assign(1, 0.0);

  const int num_parsed = sscanf(cf_ie_dr_override.c_str(), "%f,%f,%f",
                                &result_cf_[0], &result_ie_[0], &result_dr_[0]);
  if (num_parsed != 3) {
    fprintf(stderr, "Option Error: libcf-ie-dr %s\n", cf_ie_dr_override.c_str());
    exit(EXIT_FAILURE);
  }
}
/**
 * Loads the recalibration model named by --model-file.
 *
 * The model stays disabled (is_enabled_ == false) when no file is given,
 * the file name is "off", the file cannot be opened, its header cannot be
 * parsed, or --recal-model-hp-thres exceeds MAX_HPXLEN.
 *
 * File layout as read here: one comment line, one header line with ten
 * integers (flow start/end/span, x min/max/span, y min/max/span, max
 * calibrated HP), followed by records of
 *   flowBase flowStart flowEnd xMin xMax yMin yMax refHP paramA paramB
 *
 * @param opts  Parsed command line options.
 */
void RecalibrationModel::Initialize(OptArgs& opts)
{
  is_enabled_ = false;

  string model_file_name = opts.GetFirstString ('-', "model-file", "");
  if (model_file_name.empty() or model_file_name == "off") {
    printf("RecalibrationModel: disabled\n\n");
    return;
  }

  ifstream model_file;
  model_file.open(model_file_name.c_str());
  if (model_file.fail()) {
    printf("RecalibrationModel: disabled (cannot open %s)\n\n", model_file_name.c_str());
    model_file.close();
    return;
  }

  recalModelHPThres = opts.GetFirstInt('-', "recal-model-hp-thres", 4);

  string comment_line;
  getline(model_file, comment_line); //skip the comment line

  int flowStart = 0, flowEnd = 0, flowSpan = 0;
  int xMin = 0, xMax = 0, xSpan = 0;
  int yMin = 0, yMax = 0, ySpan = 0;
  int max_hp_calibrated = 0;

  // A header that does not parse would set up the stratification from
  // meaningless values, so the model is left disabled instead.
  if (!(model_file >> flowStart >> flowEnd >> flowSpan
                   >> xMin >> xMax >> xSpan
                   >> yMin >> yMax >> ySpan
                   >> max_hp_calibrated)) {
    printf("RecalibrationModel: disabled (cannot parse header of %s)\n\n", model_file_name.c_str());
    model_file.close();
    return;
  }

  stratification.SetupRegion(xMin, xMax, xSpan, yMin, yMax, ySpan);
  //calculate number of partitions and initialize the stratifiedAs and stratifiedBs
  SetupStratification(flowStart,flowEnd, flowSpan,xMin,xMax,xSpan,yMin,yMax,ySpan,max_hp_calibrated);

  // Parse model_file into stratifiedAs and stratifiedBs.
  // The extraction itself is the loop condition: a record is only used when
  // every field was read successfully. Testing good() before reading would
  // run the body one extra time with stale values after the last record.
  float paramA = 0.0f, paramB = 0.0f;
  int refHP = 0;
  char flowBase = 0;
  while (model_file >> flowBase >> flowStart >> flowEnd
                    >> xMin >> xMax >> yMin >> yMax
                    >> refHP >> paramA >> paramB) {
    //populate it to stratifiedAs and stratifiedBs
    int nucInd = NuctoInt(flowBase);
    //boundary check
    int offsetRegion = stratification.OffsetRegion(xMin,yMin);
    FillIndexes(offsetRegion,nucInd, refHP, flowStart, flowEnd, paramA, paramB);
  }

  model_file.close();

  printf("Recalibration: enabled (using calibration file %s)\n\n", model_file_name.c_str());
  is_enabled_ = true;
  // NOTE(review): the "enabled" message above is printed even when the
  // threshold check below switches the model off again.
  if (recalModelHPThres > MAX_HPXLEN)
    is_enabled_ = false;
}
void ExtendParameters::SetFreeBayesParameters(OptArgs &opts, Json::Value& fb_params) { // FreeBayes parameters // primarily used in candidate generation targets = opts.GetFirstString('t', "target-file", ""); trim_ampliseq_primers = opts.GetFirstBoolean('-', "trim-ampliseq-primers", false); if (targets.empty() and trim_ampliseq_primers) { cerr << "ERROR: --trim-ampliseq-primers enabled but no --target-file provided" << endl; exit(1); } allowIndels = RetrieveParameterBool (opts, fb_params, '-', "allow-indels", true); allowSNPs = RetrieveParameterBool (opts, fb_params, '-', "allow-snps", true); allowMNPs = RetrieveParameterBool (opts, fb_params, '-', "allow-mnps", true); allowComplex = RetrieveParameterBool (opts, fb_params, '-', "allow-complex", false); // deprecated: // leftAlignIndels = RetrieveParameterBool (opts, fb_params, '-', "left-align-indels", false); RetrieveParameterBool (opts, fb_params, '-', "left-align-indels", false); //useBestNAlleles = 0; useBestNAlleles = RetrieveParameterInt (opts, fb_params, 'm', "use-best-n-alleles", 2); onlyUseInputAlleles = RetrieveParameterBool (opts, fb_params, '-', "use-input-allele-only", false); min_mapping_qv = RetrieveParameterInt (opts, fb_params, 'M', "min-mapping-qv", 4); read_snp_limit = RetrieveParameterInt (opts, fb_params, 'U', "read-snp-limit", 10); readMaxMismatchFraction = RetrieveParameterDouble(opts, fb_params, 'z', "read-max-mismatch-fraction", 1.0); maxComplexGap = RetrieveParameterInt (opts, fb_params, '!', "max-complex-gap", 1); // read from json or command line, otherwise default to snp frequency minAltFraction = RetrieveParameterDouble(opts, fb_params, '-', "gen-min-alt-allele-freq", my_controls.filter_snps.min_allele_freq); minCoverage = RetrieveParameterInt (opts, fb_params, '-', "gen-min-coverage", my_controls.filter_snps.min_cov); minIndelAltFraction = RetrieveParameterDouble(opts, fb_params, '-', "gen-min-indel-alt-allele-freq", my_controls.filter_hp_indel.min_allele_freq); //set up debug levels 
if (program_flow.DEBUG > 0) debug = true; if (program_flow.inputPositionsOnly) { processInputPositionsOnly = true; } if (variantPriorsFile.empty() && (processInputPositionsOnly || onlyUseInputAlleles) ) { cerr << "ERROR: Parameter error - Process-input-positions-only: " << processInputPositionsOnly << " use-input-allele-only: " << onlyUseInputAlleles << " : Specified without Input VCF File " << endl; exit(1); } }
/**
 * Resolves a string parameter and prints the value together with its origin.
 *
 * Precedence, lowest to highest: builtin default, parameters json file,
 * command line option.
 *
 * @param opts               Parsed command line options.
 * @param json               JSON object searched for the parameter; the key is
 *                           long_name_hyphens passed through
 *                           GetRidOfDomainAndHyphens.
 * @param short_name         Short option character.
 * @param long_name_hyphens  Long option name as used on the command line.
 * @param default_value      Value used when neither source provides one.
 * @return                   The resolved value.
 */
string RetrieveParameterString(OptArgs &opts, Json::Value& json, char short_name, const string& long_name_hyphens, const string& default_value)
{
  const string json_key = GetRidOfDomainAndHyphens(long_name_hyphens);

  string      result = default_value;
  const char* origin = "builtin default";

  if (json.isMember(json_key)) {
    result = json[json_key].asCString();
    origin = "parameters json file";
  }

  // The command line has the final word; the current value serves as its default
  if (opts.HasOption(short_name, long_name_hyphens)) {
    result = opts.GetFirstString(short_name, long_name_hyphens, result);
    origin = "command line option";
  }

  cout << setw(35) << long_name_hyphens << " = " << setw(10) << result
       << " (string, " << origin << ")" << endl;
  return result;
}
void ExtendParameters::ParametersFromJSON(OptArgs &opts, Json::Value &tvc_params, Json::Value &freebayes_params, Json::Value ¶ms_meta) { string parameters_file = opts.GetFirstString('-', "parameters-file", ""); Json::Value parameters_json(Json::objectValue); if (not parameters_file.empty()) { ifstream in(parameters_file.c_str(), ifstream::in); if (!in.good()) { fprintf(stderr, "[tvc] FATAL ERROR: cannot open %s\n", parameters_file.c_str()); exit(-1); } in >> parameters_json; in.close(); if (parameters_json.isMember("pluginconfig")) parameters_json = parameters_json["pluginconfig"]; tvc_params = parameters_json.get("torrent_variant_caller", Json::objectValue); freebayes_params = parameters_json.get("freebayes", Json::objectValue); params_meta = parameters_json.get("meta", Json::objectValue); }
/**
 * Entry point of the reduce-h5 sub-command: merges several input HDF5 files
 * into one output file.
 *
 * Options are parsed from argv+1, i.e. the first argument is skipped.
 *
 * @param argc  Argument count as passed to the program.
 * @param argv  Argument vector as passed to the program.
 * @return      1 after printing the help text when no output file or no
 *              input file was given, otherwise the result of
 *              IonstatsAlignmentReduceH5.
 */
int IonstatsReduceH5(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc-1, argv+1);

  string output_h5_filename = opts.GetFirstString  ('o', "output", "");
  // The default is a real boolean, as in every other GetFirstBoolean call,
  // rather than the string literal "true".
  bool merge_proton_blocks  = opts.GetFirstBoolean ('b', "merge-proton-blocks", true);
  vector<string> input_h5_filename;
  opts.GetLeftoverArguments(input_h5_filename);

  if (input_h5_filename.empty() or output_h5_filename.empty()) {
    IonstatsReduceH5Help();
    return 1;
  }

  if (merge_proton_blocks)
    cout << "NOTE:" << argv[0] << " " << argv[1] << ": --merge-proton-blocks=true so any Proton block-specific read group suffixes will be merged" << endl;

  return IonstatsAlignmentReduceH5(output_h5_filename, input_h5_filename, merge_proton_blocks);
}
/**
 * Resolves all input / output locations of the BaseCaller from the command
 * line, creates the output folders and prints a summary to stdout.
 *
 * @param opts  Parsed command line options.
 * @return      Always true.
 */
bool BaseCallerParameters::InitializeFilesFromOptArgs(OptArgs& opts)
{
  // Directories
  bc_files.input_directory  = opts.GetFirstString('i', "input-dir", ".");
  bc_files.output_directory = opts.GetFirstString('o', "output-dir", ".");
  bc_files.unfiltered_untrimmed_directory = bc_files.output_directory + "/unfiltered.untrimmed";
  bc_files.unfiltered_trimmed_directory   = bc_files.output_directory + "/unfiltered.trimmed";

  // The folders are created first and canonicalized afterwards
  CreateResultsFolder(const_cast<char*>(bc_files.output_directory.c_str()));
  CreateResultsFolder(const_cast<char*>(bc_files.unfiltered_untrimmed_directory.c_str()));
  CreateResultsFolder(const_cast<char*>(bc_files.unfiltered_trimmed_directory.c_str()));

  ValidateAndCanonicalizePath(bc_files.input_directory);
  ValidateAndCanonicalizePath(bc_files.output_directory);
  ValidateAndCanonicalizePath(bc_files.unfiltered_untrimmed_directory);
  ValidateAndCanonicalizePath(bc_files.unfiltered_trimmed_directory);

  // Input files; the defaults live inside the input directory
  const string default_wells = bc_files.input_directory + "/1.wells";
  const string default_mask  = bc_files.input_directory + "/analysis.bfmask.bin";
  bc_files.filename_wells = opts.GetFirstString('-', "wells", default_wells);
  bc_files.filename_mask  = opts.GetFirstString('-', "mask", default_mask);

  ValidateAndCanonicalizePath(bc_files.filename_wells);
  ValidateAndCanonicalizePath(bc_files.filename_mask, bc_files.input_directory + "/bfmask.bin");

  // Output files
  bc_files.filename_filter_mask = bc_files.output_directory + "/bfmask.bin";
  bc_files.filename_json        = bc_files.output_directory + "/BaseCaller.json";
  bc_files.filename_phase       = bc_files.output_directory + "/PhaseEstimates.json";

  // Summary
  printf("\n");
  printf("Input files summary:\n");
  printf(" --input-dir %s\n", bc_files.input_directory.c_str());
  printf(" --wells %s\n", bc_files.filename_wells.c_str());
  printf(" --mask %s\n", bc_files.filename_mask.c_str());
  printf("\n");

  printf("Output directories summary:\n");
  printf(" --output-dir %s\n", bc_files.output_directory.c_str());
  printf(" unf.untr %s\n", bc_files.unfiltered_untrimmed_directory.c_str());
  printf(" unf.tr %s\n", bc_files.unfiltered_trimmed_directory.c_str());
  printf("\n");

  // Optional files: only validated when a name was given
  bc_files.lib_datasets_file      = opts.GetFirstString('-', "datasets", "");
  bc_files.calibration_panel_file = opts.GetFirstString('-', "calibration-panel", "");

  if (!bc_files.lib_datasets_file.empty())
    ValidateAndCanonicalizePath(bc_files.lib_datasets_file);
  if (!bc_files.calibration_panel_file.empty())
    ValidateAndCanonicalizePath(bc_files.calibration_panel_file);

  bc_files.options_set = true;
  return true;
}
int IonstatsReduce(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string output_json_filename = opts.GetFirstString('o', "output", ""); vector<string> input_jsons; opts.GetLeftoverArguments(input_jsons); if(input_jsons.empty() or output_json_filename.empty()) { IonstatsReduceHelp(); return 1; } ifstream in(input_jsons[0].c_str(), ifstream::in); if (!in.good()) { fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_jsons[0].c_str()); return 1; } Json::Value first_input_json; in >> first_input_json; in.close(); if (!first_input_json.isMember("meta")) { fprintf(stderr, "[ionstats] ERROR: %s is not a valid input file for ionstats reduce\n", input_jsons[0].c_str()); return 1; } string format_name = first_input_json["meta"].get("format_name","").asString(); if (format_name == "ionstats_basecaller") return IonstatsBasecallerReduce(output_json_filename, input_jsons); if (format_name == "ionstats_tf") return IonstatsTestFragmentsReduce(output_json_filename, input_jsons); if (format_name == "ionstats_alignment") return IonstatsAlignmentReduce(output_json_filename, input_jsons); fprintf(stderr, "[ionstats] ERROR: %s is not a valid input file for ionstats reduce\n", input_jsons[0].c_str()); return 1; }
int main(int argc, const char* argv[]) { printf ("tvcvalidator %s-%s (%s) - Prototype tvc validation tool\n\n", IonVersion::GetVersion().c_str(), IonVersion::GetRelease().c_str(), IonVersion::GetSvnRev().c_str()); if (argc == 1) { VariantValidatorHelp(); return 1; } OptArgs opts; opts.ParseCmdLine(argc, argv); if (opts.GetFirstBoolean('v', "version", false)) { return 0; } if (opts.GetFirstBoolean('h', "help", false)) { VariantValidatorHelp(); return 0; } string input_vcf_filename = opts.GetFirstString ('i', "input-vcf", ""); string truth_filename = opts.GetFirstString ('t', "truth-file", ""); string truth_dir = opts.GetFirstString ('d', "truth-dir", "/results/plugins/validateVariantCaller/files"); // TODO: reference optional, only used to verify reference allele in input-vcf and truth files //string reference_filename = opts.GetFirstString ('r', "reference", ""); opts.CheckNoLeftovers(); // // Step 1. Load input VCF file into memory // if (input_vcf_filename.empty()) { VariantValidatorHelp(); cerr << "ERROR: Input VCF file not specified " << endl; return 1; } VariantCallerResults results_vcf; results_vcf.load_vcf(input_vcf_filename); printf("Loaded VCF %s with %d variant calls\n", input_vcf_filename.c_str(), (int)results_vcf.variants.size()); // // Step 2. Parse truth files, compare them to the input vcf, and compute match scores // if (not truth_filename.empty()) { ValidatorTruth truth; truth.ReadTruthFile(truth_filename); truth.CompareToCalls(results_vcf); return 0; } truth_dir += "/*.bed"; glob_t glob_result; glob(truth_dir.c_str(), GLOB_TILDE, NULL, &glob_result); for(unsigned int i = 0; i < glob_result.gl_pathc; ++i) { ValidatorTruth truth; truth.ReadTruthFile(string(glob_result.gl_pathv[i])); truth.CompareToCalls(results_vcf); } globfree(&glob_result); return 0; }
int PrepareHotspots(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bed_filename = opts.GetFirstString ('b', "input-bed", ""); string input_vcf_filename = opts.GetFirstString ('v', "input-vcf", ""); string input_real_vcf_filename = opts.GetFirstString ('p', "input-real-vcf", ""); string output_hot_vcf = opts.GetFirstString ('q', "output-fake-hot-vcf", ""); string output_bed_filename = opts.GetFirstString ('d', "output-bed", ""); string output_vcf_filename = opts.GetFirstString ('o', "output-vcf", ""); string reference_filename = opts.GetFirstString ('r', "reference", ""); string unmerged_bed = opts.GetFirstString ('u', "unmerged-bed", ""); bool left_alignment = opts.GetFirstBoolean('a', "left-alignment", false); bool filter_bypass = opts.GetFirstBoolean('f', "filter-bypass", false); bool allow_block_substitutions = opts.GetFirstBoolean('s', "allow-block-substitutions", true); bool strict_check = opts.GetFirstBoolean('S', "strict-check", true); opts.CheckNoLeftovers(); if((input_bed_filename.empty() == (input_vcf_filename.empty() and input_real_vcf_filename.empty())) or (output_bed_filename.empty() and output_vcf_filename.empty()) or reference_filename.empty()) { PrepareHotspotsHelp(); return 1; } if ((not input_real_vcf_filename.empty()) and (output_vcf_filename.empty() or not input_vcf_filename.empty())) { PrepareHotspotsHelp(); return 1; } // Populate chromosome list from reference.fai // Use mmap to fetch the entire reference int ref_handle = open(reference_filename.c_str(),O_RDONLY); struct stat ref_stat; fstat(ref_handle, &ref_stat); char *ref = (char *)mmap(0, ref_stat.st_size, PROT_READ, MAP_SHARED, ref_handle, 0); FILE *fai = fopen((reference_filename+".fai").c_str(), "r"); if (!fai) { fprintf(stderr, "ERROR: Cannot open %s.fai\n", reference_filename.c_str()); return 1; } vector<Reference> ref_index; map<string,int> ref_map; char line[1024], chrom_name[1024]; while (fgets(line, 1024, fai) != NULL) { Reference 
ref_entry; long chr_start; if (5 != sscanf(line, "%1020s\t%ld\t%ld\t%d\t%d", chrom_name, &ref_entry.size, &chr_start, &ref_entry.bases_per_line, &ref_entry.bytes_per_line)) continue; ref_entry.chr = chrom_name; ref_entry.start = ref + chr_start; ref_index.push_back(ref_entry); ref_map[ref_entry.chr] = (int) ref_index.size() - 1; } fclose(fai); junction junc; if (!unmerged_bed.empty()) { FILE *fp = fopen(unmerged_bed.c_str(), "r"); if (!fp) { fprintf(stderr, "ERROR: Cannot open %s\n", unmerged_bed.c_str()); return 1; } char line2[65536]; junc.init(ref_index.size()); bool line_overflow = false; while (fgets(line2, 65536, fp) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } if (line_overflow) { line_overflow = false; continue; } if (strstr(line2, "track")) continue; char chr[100]; int b, e; sscanf(line2, "%s %d %d", chr, &b, &e); junc.add(ref_map[chr], b, e); } fclose(fp); } // Load input BED or load input VCF, group by chromosome deque<LineStatus> line_status; vector<deque<Allele> > alleles(ref_index.size()); if (!input_bed_filename.empty()) { FILE *input = fopen(input_bed_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_bed_filename.c_str()); return 1; } char line2[65536]; int line_number = 0; bool line_overflow = false; while (fgets(line2, 65536, input) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } line_number++; if (line_overflow) { line_overflow = false; line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: line length exceeds 64K"; continue; } if (strncmp(line2, "browser", 7) == 0) continue; if (strncmp(line2, "track", 5) == 0) { if (string::npos != string(line2).find("allowBlockSubstitutions=true")) allow_block_substitutions = true; continue; } // OID= table has special meaning if (string::npos != 
string(line2).find("OID=")) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Bed line contains OID="; continue; } char *current_chr = strtok(line2, "\t\r\n"); char *current_start = strtok(NULL, "\t\r\n"); char *current_end = strtok(NULL, "\t\r\n"); char *current_id = strtok(NULL, "\t\r\n"); char *penultimate = strtok(NULL, "\t\r\n"); char *ultimate = strtok(NULL, "\t\r\n"); for (char *next = strtok(NULL, "\t\r\n"); next; next = strtok(NULL, "\t\r\n")) { penultimate = ultimate; ultimate = next; } if (!current_chr or !current_start or !current_end or !current_id or !penultimate or !ultimate) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: expected at least 6 fields"; continue; } Allele allele; string string_chr(current_chr); if (ref_map.find(string_chr) != ref_map.end()) allele.chr_idx = ref_map[string_chr]; else if (ref_map.find("chr"+string_chr) != ref_map.end()) allele.chr_idx = ref_map["chr"+string_chr]; else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end()) allele.chr_idx = ref_map["chrM"]; else { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Unknown chromosome name: "; line_status.back().filter_message = string_chr; continue; } allele.pos = strtol(current_start,NULL,10); allele.id = current_id; char *current_ref = NULL; char *current_alt = NULL; for (char *next = strtok(penultimate, ";"); next; next = strtok(NULL, ";")) { if (strncmp(next,"REF=",4) == 0) current_ref = next; else if (strncmp(next,"OBS=",4) == 0) current_alt = next; else if (strncmp(next,"ANCHOR=",7) == 0) { // ignore ANCHOR } else { char *value = next; while (*value and *value != '=') ++value; if (*value == '=') *value++ = 0; allele.custom_tags[next] = value; } } if (!current_ref or !current_alt) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: 
REF and OBS fields required in penultimate column"; continue; } for (char *pos = current_ref+4; *pos; ++pos) allele.ref += toupper(*pos); for (char *pos = current_alt+4; *pos; ++pos) allele.alt += toupper(*pos); // here is the place to check the length of the hotspot cover the amplicon junction. ZZ /* if (junc.contain(allele.chr_idx, allele.pos, (unsigned int) allele.ref.size())) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "hotspot BED line contain the complete overlapping region of two amplicon, the variant cannot be detected by tvc"; continue; } if (not junc.contained_in_ampl(allele.chr_idx, allele.pos, (unsigned int) allele.ref.size())) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "hotspot BED line is not contained in any amplicon, the variant cannot be detected by tvc"; continue; } */ allele.filtered = false; line_status.push_back(LineStatus(line_number)); allele.line_status = &line_status.back(); allele.opos = allele.pos; allele.oref = allele.ref; allele.oalt = allele.alt; alleles[allele.chr_idx].push_back(allele); //line_status.back().allele = &alleles[allele.chr_idx].back(); line_status.back().chr_idx = allele.chr_idx; line_status.back().opos = allele.opos; line_status.back().id = allele.id; } fclose(input); } if (!input_vcf_filename.empty() or !input_real_vcf_filename.empty()) { bool real_vcf = false; FILE *input; FILE *out_real = NULL; FILE *out_hot = NULL; int fake_ = 0; int hn = 1; if (!input_real_vcf_filename.empty()) { real_vcf = true; input = fopen(input_real_vcf_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_real_vcf_filename.c_str()); return 1; } out_real = fopen(output_vcf_filename.c_str(), "w"); if (!out_real) { fprintf(stderr,"ERROR: Cannot open %s\n", output_vcf_filename.c_str()); return 1; } if (!output_hot_vcf.empty()) { out_hot = fopen(output_hot_vcf.c_str(), "w"); if (!out_hot) { fprintf(stderr,"ERROR: Cannot 
open %s\n", output_hot_vcf.c_str()); return 1; } } else out_hot = stdout; fprintf(out_hot, "##fileformat=VCFv4.1\n##allowBlockSubstitutions=true\n#CHROM POS ID REF ALT QUAL FILTER INFO\n"); } else { input = fopen(input_vcf_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_vcf_filename.c_str()); return 1; } } char line2[65536]; char line3[65536]; int line_number = 0; bool line_overflow = false; list<one_vcfline> vcflist; char last_chr[1024] = ""; while (fgets(line2, 65536, input) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } line_number++; if (line_overflow) { line_overflow = false; line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot VCF line: line length exceeds 64K"; continue; } if (strncmp(line2, "##allowBlockSubstitutions=true", 30) == 0) { allow_block_substitutions = true; continue; } if (line2[0] == '#') { if (out_real) { fprintf(out_real, "%s", line2);} continue; } if (real_vcf) strcpy(line3, line2); char *current_chr = strtok(line2, "\t\r\n"); char *current_start = strtok(NULL, "\t\r\n"); char *current_id = strtok(NULL, "\t\r\n"); char *current_ref = strtok(NULL, "\t\r\n"); char *current_alt = strtok(NULL, "\t\r\n"); strtok(NULL, "\t\r\n"); // Ignore QUAL strtok(NULL, "\t\r\n"); // Ignore FILTER char *current_info = strtok(NULL, "\t\r\n"); strtok(NULL, "\t\r\n"); char *gt = strtok(NULL, "\t\r\n"); if (!current_chr or !current_start or !current_id or !current_ref or !current_alt) { line_status.push_back(LineStatus(line_number)); if (real_vcf) line_status.back().filter_message_prefix = "Malformed real VCF line: expected at least 5 fields"; else line_status.back().filter_message_prefix = "Malformed hotspot VCF line: expected at least 5 fields"; continue; } string string_chr(current_chr); int chr_idx = 0; if (ref_map.find(string_chr) != ref_map.end()) chr_idx = ref_map[string_chr]; else if 
(ref_map.find("chr"+string_chr) != ref_map.end()) chr_idx = ref_map["chr"+string_chr]; else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end()) chr_idx = ref_map["chrM"]; else { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Unknown chromosome name: "; line_status.back().filter_message = string_chr; continue; } for (char *pos = current_ref; *pos; ++pos) *pos = toupper(*pos); for (char *pos = current_alt; *pos; ++pos) *pos = toupper(*pos); // Process custom tags vector<string> bstrand; vector<string> hp_max_length; string raw_oid; string raw_omapalt; string raw_oalt; string raw_oref; string raw_opos; if (current_info) { string raw_bstrand; string raw_hp_max_length; for (char *next = strtok(current_info, ";"); next; next = strtok(NULL, ";")) { char *value = next; while (*value and *value != '=') ++value; if (*value == '=') *value++ = 0; if (strcmp(next, "TYPE") == 0) continue; if (strcmp(next, "HRUN") == 0) continue; if (strcmp(next, "HBASE") == 0) continue; if (strcmp(next, "FR") == 0) continue; if (strcmp(next, "OPOS") == 0) { raw_opos = value; continue; } if (strcmp(next, "OREF") == 0) { raw_oref = value; continue; } if (strcmp(next, "OALT") == 0) { raw_oalt = value; continue; } if (strcmp(next, "OID") == 0) { raw_oid = value; continue; } if (strcmp(next, "OMAPALT") == 0) { raw_omapalt = value; continue; } if (strcmp(next, "BSTRAND") == 0) { raw_bstrand = value; continue; } if (strcmp(next, "hp_max_length") == 0) { raw_hp_max_length = value; continue; } } if (not raw_bstrand.empty()) split(raw_bstrand, ',', bstrand); if (not raw_hp_max_length.empty()) split(raw_hp_max_length, ',', hp_max_length); } if (real_vcf) { //fprintf(stderr, "%s\n", gt); if (gt == NULL) continue; // get gt int g1 = atoi(gt), g2; gt = strchr(gt, '/'); if (gt) g2 = atoi(gt+1); else {fprintf(stderr, "GT not formatted right\n"); exit(1);} //if (g1 == 0 and g2 == 0) continue; unsigned int cur_pos = atoi(current_start); one_vcfline 
newline(current_ref, current_alt, cur_pos, g1, g2, line3); bool new_chr = false; if (strcmp(current_chr, last_chr) != 0) { new_chr = true; } while (not vcflist.empty()) { if ((not new_chr) and vcflist.front().pos+strlen(vcflist.front().ref) > cur_pos) break; if (vcflist.front().produce_hot_vcf(last_chr, out_real, hn, out_hot)) fake_++; vcflist.pop_front(); } if (new_chr) strcpy(last_chr, current_chr); for (list<one_vcfline>::iterator it = vcflist.begin(); it != vcflist.end(); it++) { it->check_subset(newline); } if (not newline.alts.empty()) vcflist.push_back(newline); continue; } unsigned int allele_idx = 0; for (char *sub_alt = strtok(current_alt,","); sub_alt; sub_alt = strtok(NULL,",")) { Allele allele; allele.chr_idx = chr_idx; allele.ref = current_ref; allele.alt = sub_alt; allele.pos = strtol(current_start,NULL,10)-1; allele.id = current_id; if (allele.id == ".") allele.id = "hotspot"; allele.filtered = false; line_status.push_back(LineStatus(line_number)); allele.line_status = &line_status.back(); allele.opos = allele.pos; allele.oref = allele.ref; allele.oalt = allele.alt; if (allele_idx < bstrand.size()) { if (bstrand[allele_idx] != ".") allele.custom_tags["BSTRAND"] = bstrand[allele_idx]; } if (allele_idx < hp_max_length.size()) { if (hp_max_length[allele_idx] != ".") allele.custom_tags["hp_max_length"] = hp_max_length[allele_idx]; } alleles[allele.chr_idx].push_back(allele); //line_status.back().allele = &alleles[allele.chr_idx].back(); line_status.back().chr_idx = allele.chr_idx; line_status.back().opos = allele.opos; line_status.back().id = allele.id; allele_idx++; } } fclose(input); if (real_vcf) { while (not vcflist.empty()) { if (vcflist.front().produce_hot_vcf(last_chr, out_real, hn, out_hot)) fake_++; vcflist.pop_front(); } fclose(out_real); fclose(out_hot); if (fake_ > 0) return 0; else return 1; } } // Process by chromosome: // - Verify reference allele // - Left align // - Sort // - Filter for block substitutions, write FILE *output_vcf = 
NULL; if (!output_vcf_filename.empty()) { output_vcf = fopen(output_vcf_filename.c_str(), "w"); if (!output_vcf) { fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_vcf_filename.c_str()); return 1; } fprintf(output_vcf, "##fileformat=VCFv4.1\n"); if (allow_block_substitutions) fprintf(output_vcf, "##allowBlockSubstitutions=true\n"); fprintf(output_vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"); } FILE *output_bed = NULL; if (!output_bed_filename.empty()) { output_bed = fopen(output_bed_filename.c_str(), "w"); if (!output_bed) { fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_bed_filename.c_str()); if (output_vcf) fclose(output_vcf); return 1; } if (allow_block_substitutions) fprintf(output_bed, "track name=\"hotspot\" type=bedDetail allowBlockSubstitutions=true\n"); else fprintf(output_bed, "track name=\"hotspot\" type=bedDetail\n"); } for (int chr_idx = 0; chr_idx < (int)ref_index.size(); ++chr_idx) { for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ++A) { // check bed file if (junc.contain(A->chr_idx, A->pos, (unsigned int) A->ref.size())) { A->filtered = true; A->line_status->filter_message_prefix = "hotspot BED line contain the complete overlapping region of two amplicon, the variant cannot be detected by tvc"; continue; } if (not junc.contained_in_ampl(A->chr_idx, A->pos, (unsigned int) A->ref.size())) { A->filtered = true; A->line_status->filter_message_prefix = "hotspot BED line is not contained in any amplicon, the variant cannot be detected by tvc"; continue; } // Invalid characters bool valid = true; for (const char *c = A->ref.c_str(); *c ; ++c) if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T') valid = false; for (const char *c = A->alt.c_str(); *c ; ++c) if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T') valid = false; if (not valid) { A->filtered = true; A->line_status->filter_message_prefix = "REF and/or ALT contain characters other than ACGT: "; 
A->line_status->filter_message = "REF = " + A->ref + " ALT = " + A->alt; continue; } // Filter REF == ALT if (A->ref == A->alt) { A->filtered = true; A->line_status->filter_message_prefix = "REF and ALT alleles equal"; continue; } // Confirm reference allele. string ref_expected; for (int idx = 0; idx < (int) A->ref.size(); ++idx) ref_expected += ref_index[chr_idx].base(A->pos + idx); if (A->ref != ref_expected) { A->filtered = true; A->line_status->filter_message_prefix = "Provided REF allele does not match reference: "; A->line_status->filter_message = "Expected " + ref_expected + ", found " + A->ref; continue; } // Trim int ref_start = 0; int ref_end = A->ref.size(); int alt_end = A->alt.size(); // Option 1: trim all trailing bases; //while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) { // --ref_end; // --alt_end; //} // Option 2: trim all leading basees; //while (ref_start < ref_end and ref_start < alt_end and A->ref[ref_start] == A->alt[ref_start]) // ++ref_start; // Option 3: trim anchor base if vcf if (!input_vcf_filename.empty()) { if (ref_end and alt_end and (ref_end == 1 or alt_end == 1) and A->ref[0] == A->alt[0]) ref_start = 1; } A->pos += ref_start; A->ref = A->ref.substr(ref_start, ref_end-ref_start); A->alt = A->alt.substr(ref_start, alt_end-ref_start); ref_end -= ref_start; alt_end -= ref_start; // Left align if (left_alignment && A->custom_tags.find("BSTRAND") == A->custom_tags.end()) { // black list variant not to be left aligned. 
string trailing; int can_do = 0, need_do = 0; int ref_end_orig= ref_end, alt_end_orig = alt_end; while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) { ref_end--; alt_end--; } if (ref_end == 0 || alt_end == 0) { can_do = need_do = 1; // indel type, ZZ } else { int tmp_start = ref_start; int ref_end_0 = ref_end, alt_end_0 = alt_end; // end after remove trailing match ZZ while (tmp_start < ref_end and tmp_start < alt_end and A->ref[tmp_start] == A->alt[tmp_start]) ++tmp_start; if (tmp_start == ref_end || tmp_start == alt_end) { can_do = 1; need_do = 0; // indel but indel is not at the left. ZZ } else { ref_end--; alt_end--; while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) { ref_end--; alt_end--; } if (ref_end == 0 || alt_end == 0) { // complex with 1 bp MM at right end can_do = need_do = 1; if (ref_end + alt_end == 0) need_do = 0; // SNP } else { int tmp_start0 = tmp_start; // start after removing leading matches tmp_start++; while (tmp_start < ref_end_orig and tmp_start < alt_end_orig and A->ref[tmp_start] == A->alt[tmp_start]) tmp_start++; if (tmp_start >= ref_end_0 || tmp_start >= alt_end_0 || ref_end <= tmp_start0 || alt_end <= tmp_start0) { // 1MM plus indel in middle, by definition cannot move the indel left enough to change A->pos can_do = 1; need_do = 0; } // else real complex } } } if (!can_do or !need_do) { // do nothing // if !can_do need add some more DP ref_end = ref_end_orig; alt_end = alt_end_orig; } else { // left align the indel part, here either ref_end = 0 or alt_end = 0 int opos = A->pos; while (A->pos > 0) { char nuc = ref_index[chr_idx].base(A->pos-1); if (ref_end > 0 and A->ref[ref_end-1] != nuc) break; if (alt_end > 0 and A->alt[alt_end-1] != nuc) break; A->ref = string(1,nuc) + A->ref; A->alt = string(1,nuc) + A->alt; A->pos--; } if (ref_end != ref_end_orig) { // trailing part is aligned, the whole ref and alt need to be kept. 
ZZ ref_end = A->ref.size(); alt_end = A->alt.size(); } if (junc.contain(chr_idx, A->pos, ref_end) or not junc.contained_in_ampl(chr_idx, A->pos, ref_end)) { // after left align the hotspot contain an overlap region, revert to the original ZZ if (opos != A->pos) { A->ref.erase(0, opos-A->pos); A->alt.erase(0, opos-A->pos); A->pos = opos; ref_end = ref_end_orig; alt_end = alt_end_orig; } } } } A->ref.resize(ref_end); A->alt.resize(alt_end); // Filter block substitutions: take 1 if (ref_end > 0 and alt_end > 0 and ref_end != alt_end and not allow_block_substitutions and not filter_bypass) { A->filtered = true; A->line_status->filter_message_prefix = "Block substitutions not supported"; continue; } } if (output_bed) { // Sort - without anchor base stable_sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles); // Write for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) { if (I->filtered) continue; fprintf(output_bed, "%s\t%ld\t%ld\t%s\tREF=%s;OBS=%s", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str()); for (map<string,string>::iterator C = I->custom_tags.begin(); C != I->custom_tags.end(); ++C) fprintf(output_bed, ";%s=%s", C->first.c_str(), C->second.c_str()); fprintf(output_bed, "\tNONE\n"); /* if (I->pos) fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=%c\tNONE\n", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str(), ref_index[chr_idx].base(I->pos-1)); else fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=\tNONE\n", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str()); */ } } if (output_vcf) { // Add anchor base to indels for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) { if (I->filtered) continue; if (not I->ref.empty() and not I->alt.empty()) continue; if 
(I->pos == 0) { I->filtered = true; I->line_status->filter_message_prefix = "INDELs at chromosome start not supported"; continue; } I->pos--; I->ref = string(1,ref_index[chr_idx].base(I->pos)) + I->ref; I->alt = string(1,ref_index[chr_idx].base(I->pos)) + I->alt; } // Sort - with anchor base stable_sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles); // Merge alleles, remove block substitutions, write for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ) { string max_ref; deque<Allele>::iterator B = A; for (; B != alleles[chr_idx].end() and B->pos == A->pos; ++B) if (!B->filtered and max_ref.size() < B->ref.size()) max_ref = B->ref; bool filtered = true; map<string,set<string> > unique_alts_and_ids; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; string new_alt = I->alt + max_ref.substr(I->ref.size()); if (new_alt.size() > 1 and max_ref.size() > 1 and new_alt.size() != max_ref.size() and not allow_block_substitutions and not filter_bypass) { I->filtered = true; I->line_status->filter_message_prefix = "Block substitutions not supported (post-merge)"; continue; } I->ref = max_ref; I->alt = new_alt; // Filter alleles with duplicate ALT + ID pairs map<string,set<string> >::iterator alt_iter = unique_alts_and_ids.find(new_alt); if (alt_iter != unique_alts_and_ids.end()) { if (alt_iter->second.count(I->id) > 0) { I->filtered = true; I->line_status->filter_message_prefix = "Duplicate allele and ID"; continue; } } unique_alts_and_ids[new_alt].insert(I->id); filtered = false; } if (not filtered) { fprintf(output_vcf, "%s\t%ld\t.\t%s\t", ref_index[chr_idx].chr.c_str(), A->pos+1, max_ref.c_str()); bool comma = false; map<string,map<string,string> > unique_alts_and_tags; set<string> unique_tags; set<string> unique_alt_alleles; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; unique_alts_and_tags[I->alt].insert(I->custom_tags.begin(), I->custom_tags.end()); for 
(map<string,string>::iterator S = I->custom_tags.begin(); S != I->custom_tags.end(); ++S) unique_tags.insert(S->first); if (unique_alt_alleles.count(I->alt) > 0) continue; unique_alt_alleles.insert(I->alt); if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->alt.c_str()); } /* for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; map<string,map<string,string> >::iterator Q = unique_alts_and_tags.find(I->alt); if (comma) fprintf(output_vcf, ","); comma = true; if (Q == unique_alts_and_tags.end()) {fprintf(output_vcf, "."); continue;} fprintf(output_vcf, "%s", Q->first.c_str()); } */ fprintf(output_vcf, "\t.\t.\tOID="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->id.c_str()); } fprintf(output_vcf, ";OPOS="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%ld", I->opos+1); } fprintf(output_vcf, ";OREF="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->oref.c_str()); } fprintf(output_vcf, ";OALT="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->oalt.c_str()); } fprintf(output_vcf, ";OMAPALT="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->alt.c_str()); } for (set<string>::iterator S = unique_tags.begin(); S != unique_tags.end(); ++S) { fprintf(output_vcf, ";%s=", S->c_str()); comma=false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; map<string,map<string,string> 
>::iterator Q = unique_alts_and_tags.find(I->alt); if (comma) fprintf(output_vcf, ","); comma = true; if (Q == unique_alts_and_tags.end()) {fprintf(output_vcf, "."); continue;} map<string,string>::iterator W = Q->second.find(*S); if (W == Q->second.end()) fprintf(output_vcf, "."); else fprintf(output_vcf, "%s", W->second.c_str()); } } // fprintf(output_vcf, ";%s=%s", S->first.c_str(), S->second.c_str()); fprintf(output_vcf, "\n"); } A = B; } } } if (output_bed) { fflush(output_bed); fclose(output_bed); } if (output_vcf) { fflush(output_vcf); fclose(output_vcf); } int lines_ignored = 0; for (deque<LineStatus>::iterator L = line_status.begin(); L != line_status.end(); ++L) { if (L->filter_message_prefix) { if (L->chr_idx >= 0) printf("Line %d ignored: [%s:%ld %s] %s%s\n", L->line_number, ref_index[L->chr_idx].chr.c_str(), L->opos+1, L->id.c_str(), L->filter_message_prefix, L->filter_message.c_str()); else printf("Line %d ignored: %s%s\n", L->line_number, L->filter_message_prefix, L->filter_message.c_str()); lines_ignored++; } } printf("Ignored %d out of %d lines\n", lines_ignored, (int)line_status.size()); munmap(ref, ref_stat.st_size); close(ref_handle); if (lines_ignored > 0 and strict_check) return 1; return 0; }
int main (int argc, const char *argv[]) { printf ("------------- bamrealignment --------------\n"); OptArgs opts; opts.ParseCmdLine(argc, argv); vector<int> score_vals(4); string input_bam = opts.GetFirstString ('i', "input", ""); string output_bam = opts.GetFirstString ('o', "output", ""); opts.GetOption(score_vals, "4,-6,-5,-2", 's', "scores"); int clipping = opts.GetFirstInt ('c', "clipping", 2); bool anchors = opts.GetFirstBoolean ('a', "anchors", true); int bandwidth = opts.GetFirstInt ('b', "bandwidth", 10); bool verbose = opts.GetFirstBoolean ('v', "verbose", false); bool debug = opts.GetFirstBoolean ('d', "debug", false); int format = opts.GetFirstInt ('f', "format", 1); int num_threads = opts.GetFirstInt ('t', "threads", 8); string log_fname = opts.GetFirstString ('l', "log", ""); if (input_bam.empty() or output_bam.empty()) return PrintHelp(); opts.CheckNoLeftovers(); std::ofstream logf; if (log_fname.size ()) { logf.open (log_fname.c_str ()); if (!logf.is_open ()) { fprintf (stderr, "bamrealignment: Failed to open log file %s\n", log_fname.c_str()); return 1; } } BamReader reader; if (!reader.Open(input_bam)) { fprintf(stderr, "bamrealignment: Failed to open input file %s\n", input_bam.c_str()); return 1; } SamHeader header = reader.GetHeader(); RefVector refs = reader.GetReferenceData(); BamWriter writer; writer.SetNumThreads(num_threads); if (format == 1) writer.SetCompressionMode(BamWriter::Uncompressed); else writer.SetCompressionMode(BamWriter::Compressed); if (!writer.Open(output_bam, header, refs)) { fprintf(stderr, "bamrealignment: Failed to open output file %s\n", output_bam.c_str()); return 1; } // The meat starts here ------------------------------------ if (verbose) cout << "Verbose option is activated, each alignment will print to screen." 
<< endl << " After a read hit RETURN to continue to the next one," << endl << " or press q RETURN to quit the program," << endl << " or press s Return to silence verbose," << endl << " or press c RETURN to continue printing without further prompt." << endl << endl; unsigned int readcounter = 0; unsigned int mapped_readcounter = 0; unsigned int realigned_readcounter = 0; unsigned int modified_alignment_readcounter = 0; unsigned int pos_update_readcounter = 0; unsigned int failed_clip_realigned_readcount = 0; unsigned int already_perfect_readcount = 0; unsigned int bad_md_tag_readcount = 0; unsigned int error_recreate_ref_readcount = 0; unsigned int error_clip_anchor_readcount = 0; unsigned int error_sw_readcount = 0; unsigned int error_unclip_readcount = 0; unsigned int start_position_shift; int orig_position; int new_position; string md_tag, new_md_tag, input = "x"; vector<CigarOp> new_cigar_data; vector<MDelement> new_md_data; bool position_shift = false; time_t start_time = time(NULL); Realigner aligner; aligner.verbose_ = verbose; aligner.debug_ = debug; if (!aligner.SetScores(score_vals)) cout << "bamrealignment: Four scores need to be provided: match, mismatch, gap open, gap extend score!" << endl; aligner.SetAlignmentBandwidth(bandwidth); BamAlignment alignment; while(reader.GetNextAlignment(alignment)){ readcounter ++; position_shift = false; if ( (readcounter % 100000) == 0 ) cout << "Processed " << readcounter << " reads. Elapsed time: " << (time(NULL) - start_time) << endl; if (alignment.IsMapped()) { orig_position = alignment.Position; mapped_readcounter++; aligner.SetClipping(clipping, !alignment.IsReverseStrand()); if (aligner.verbose_) { cout << endl; if (alignment.IsReverseStrand()) cout << "The read is from the reverse strand." << endl; else cout << "The read is from the forward strand." << endl; } if (!alignment.GetTag("MD", md_tag)) { if (aligner.verbose_) cout << "Warning: Skipping read " << alignment.Name << ". It is mapped but missing MD tag." 
<< endl; if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MISSMD" << '\n'; bad_md_tag_readcount++; } else if (aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors)) { bool clipfail = false; if (Realigner::CR_ERR_CLIP_ANCHOR == aligner.GetCreateRefError ()) { clipfail = true; failed_clip_realigned_readcount ++; } if (!aligner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) { if (aligner.verbose_) cout << "Error in the alignment! Not updating read information." << endl; if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "SWERR" << '\n'; error_sw_readcount++; writer.SaveAlignment(alignment); // Write alignment unchanged continue; } if (!aligner.addClippedBasesToTags(new_cigar_data, new_md_data, alignment.QueryBases.size())) { if (aligner.verbose_) cout << "Error when adding clipped anchors back to tags! Not updating read information." 
<< endl; if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNCLIPERR" << '\n'; writer.SaveAlignment(alignment); // Write alignment unchanged error_unclip_readcount ++; continue; } new_md_tag = aligner.GetMDstring(new_md_data); realigned_readcounter++; // adjust start position of read if (!aligner.LeftAnchorClipped() and start_position_shift != 0) { new_position = aligner.updateReadPosition(alignment.CigarData, (int)start_position_shift, alignment.Position); if (new_position != alignment.Position) { pos_update_readcounter++; position_shift = true; alignment.Position = new_position; } } if (position_shift || alignment.CigarData.size () != new_cigar_data.size () || md_tag != new_md_tag) { if (logf.is_open ()) { logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MOD"; if (position_shift) logf << "-SHIFT"; if (clipfail) logf << " NOCLIP"; logf << '\n'; } modified_alignment_readcounter++; } else { if (logf.is_open ()) { logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNMOD"; if (clipfail) logf << " NOCLIP"; logf << '\n'; } } if (aligner.verbose_){ cout << alignment.Name << endl; cout << "------------------------------------------" << endl; // Wait for input to continue or quit program if (input.size() == 0) input = 'x'; else if (input[0] != 'c' and input[0] != 'C') getline(cin, input); if (input.size()>0){ if (input[0] == 'q' or input[0] == 'Q') return 1; else if (input[0] == 's' or input[0] == 'S') aligner.verbose_ = false; } } // Finally update alignment information alignment.CigarData = new_cigar_data; alignment.EditTag("MD", "Z" , new_md_tag); } // end of CreateRef else if else { switch (aligner.GetCreateRefError ()) { case 
Realigner::CR_ERR_RECREATE_REF: if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "RECRERR" << '\n'; error_recreate_ref_readcount++; break; case Realigner::CR_ERR_CLIP_ANCHOR: if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "CLIPERR" << '\n'; error_clip_anchor_readcount++; break; default: // On a good run this writes way too many reads to the log file - don't want to create a too large txt file // if (logf.is_open ()) //logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "PERFECT" << '\n'; already_perfect_readcount++; break; } if (aligner.verbose_) { cout << alignment.Name << endl; cout << "------------------------------------------" << endl; // Wait for input to continue or quit program if (input.size() == 0) input = 'x'; else if (input[0] != 'c' and input[0] != 'C') getline(cin, input); if (input.size()>0){ if (input[0] == 'q' or input[0] == 'Q') return 1; else if (input[0] == 's' or input[0] == 'S') aligner.verbose_ = false; } } } // --- Debug output for Rajesh --- if (debug && aligner.invalid_cigar_in_input) { aligner.verbose_ = true; cout << "Invalid cigar string / md tag pair in read " << alignment.Name << endl; // Rerun reference generation to display error aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors); aligner.verbose_ = verbose; aligner.invalid_cigar_in_input = false; } // --- --- --- } // end of if isMapped writer.SaveAlignment(alignment); } // end while loop over reads if (aligner.invalid_cigar_in_input) cerr << "WARNING bamrealignment: There were invalid cigar string / md tag pairs in the input bam file." 
<< endl; // ---------------------------------------------------------------- // program end -- output summary information cout << " File: " << input_bam << endl << " Total reads: " << readcounter << endl << " Mapped reads: " << mapped_readcounter << endl; if (bad_md_tag_readcount) cout << " Skipped: bad MD tags: " << bad_md_tag_readcount << endl; if (error_recreate_ref_readcount) cout << " Skipped: unable to recreate ref: " << error_recreate_ref_readcount << endl; if (error_clip_anchor_readcount) cout << " Skipped: error clipping anchor: " << error_clip_anchor_readcount << endl; cout << " Skipped: already perfect: " << already_perfect_readcount << endl << " Total reads realigned: " << mapped_readcounter - already_perfect_readcount - bad_md_tag_readcount - error_recreate_ref_readcount - error_clip_anchor_readcount << endl; if (failed_clip_realigned_readcount) cout << " (including " << failed_clip_realigned_readcount << " that failed to clip)" << endl; if (error_sw_readcount) cout << " Failed to complete SW alignment: " << error_sw_readcount << endl; if (error_unclip_readcount) cout << " Failed to unclip anchor: " << error_unclip_readcount << endl; cout << " Succesfully realigned: " << realigned_readcounter << endl << " Modified alignments: " << modified_alignment_readcounter << endl << " Shifted position: " << pos_update_readcounter << endl; cout << "Processing time: " << (time(NULL)-start_time) << " seconds." << endl; cout << "INFO: The output BAM file may be unsorted." << endl; cout << "------------------------------------------" << endl; return 0; }
int IonstatsAlignment(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bam_filename = opts.GetFirstString('i', "input", ""); string output_json_filename = opts.GetFirstString('o', "output", "ionstats_alignment.json"); int histogram_length = opts.GetFirstInt ('h', "histogram-length", 400); if(argc < 2 or input_bam_filename.empty()) { IonstatsAlignmentHelp(); return 1; } // // Prepare for metric calculation // BamReader input_bam; if (!input_bam.Open(input_bam_filename)) { fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str()); return 1; } ReadLengthHistogram called_histogram; ReadLengthHistogram aligned_histogram; ReadLengthHistogram AQ7_histogram; ReadLengthHistogram AQ10_histogram; ReadLengthHistogram AQ17_histogram; ReadLengthHistogram AQ20_histogram; ReadLengthHistogram AQ47_histogram; SimpleHistogram error_by_position; called_histogram.Initialize(histogram_length); aligned_histogram.Initialize(histogram_length); AQ7_histogram.Initialize(histogram_length); AQ10_histogram.Initialize(histogram_length); AQ17_histogram.Initialize(histogram_length); AQ20_histogram.Initialize(histogram_length); AQ47_histogram.Initialize(histogram_length); error_by_position.Initialize(histogram_length); BamAlignment alignment; vector<char> MD_op; vector<int> MD_len; MD_op.reserve(1024); MD_len.reserve(1024); string MD_tag; // // Main loop over mapped reads in the input BAM // while(input_bam.GetNextAlignment(alignment)) { // Record read length called_histogram.Add(alignment.Length); if (!alignment.IsMapped() or !alignment.GetTag("MD",MD_tag)) continue; // // Step 1. 
Parse MD tag // MD_op.clear(); MD_len.clear(); for (const char *MD_ptr = MD_tag.c_str(); *MD_ptr;) { int item_length = 0; if (*MD_ptr >= '0' and *MD_ptr <= '9') { // Its a match MD_op.push_back('M'); for (; *MD_ptr and *MD_ptr >= '0' and *MD_ptr <= '9'; ++MD_ptr) item_length = 10*item_length + *MD_ptr - '0'; } else { if (*MD_ptr == '^') { // Its a deletion MD_ptr++; MD_op.push_back('D'); } else // Its a substitution MD_op.push_back('X'); for (; *MD_ptr and *MD_ptr >= 'A' and *MD_ptr <= 'Z'; ++MD_ptr) item_length++; } MD_len.push_back(item_length); } // // Step 2. Synchronously scan through Cigar and MD, doing error accounting // int MD_idx = alignment.IsReverseStrand() ? MD_op.size()-1 : 0; int cigar_idx = alignment.IsReverseStrand() ? alignment.CigarData.size()-1 : 0; int increment = alignment.IsReverseStrand() ? -1 : 1; int AQ7_bases = 0; int AQ10_bases = 0; int AQ17_bases = 0; int AQ20_bases = 0; int AQ47_bases = 0; int num_bases = 0; int num_errors = 0; while (cigar_idx < (int)alignment.CigarData.size() and MD_idx < (int) MD_op.size() and cigar_idx >= 0 and MD_idx >= 0) { if (alignment.CigarData[cigar_idx].Length == 0) { // Try advancing cigar cigar_idx += increment; continue; } if (MD_len[MD_idx] == 0) { // Try advancing MD MD_idx += increment; continue; } // Match if (alignment.CigarData[cigar_idx].Type == 'M' and MD_op[MD_idx] == 'M') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); num_bases += advance; alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Insertion (read has a base, reference doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'I') { int advance = alignment.CigarData[cigar_idx].Length; for (int cnt = 0; cnt < advance; ++cnt) { error_by_position.Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; // Deletion (reference has a base, read doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'D' and MD_op[MD_idx] == 'D') { int 
advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position.Add(num_bases); num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Substitution } else if (MD_op[MD_idx] == 'X') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position.Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; } else { printf("ionstats alignment: Unexpected OP combination: %s Cigar=%c, MD=%c !\n", alignment.Name.c_str(), alignment.CigarData[cigar_idx].Type, MD_op[MD_idx]); break; } if (num_errors*5 <= num_bases) AQ7_bases = num_bases; if (num_errors*10 <= num_bases) AQ10_bases = num_bases; if (num_errors*50 <= num_bases) AQ17_bases = num_bases; if (num_errors*100 <= num_bases) AQ20_bases = num_bases; if (num_errors == 0) AQ47_bases = num_bases; } // // Step 3. 
Profit // if (num_bases >= 20) aligned_histogram.Add(num_bases); if (AQ7_bases >= 20) AQ7_histogram.Add(AQ7_bases); if (AQ10_bases >= 20) AQ10_histogram.Add(AQ10_bases); if (AQ17_bases >= 20) AQ17_histogram.Add(AQ17_bases); if (AQ20_bases >= 20) AQ20_histogram.Add(AQ20_bases); if (AQ47_bases >= 20) AQ47_histogram.Add(AQ47_bases); } input_bam.Close(); // // Processing complete, generate ionstats_alignment.json // Json::Value output_json(Json::objectValue); output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL)); output_json["meta"]["format_name"] = "ionstats_alignment"; output_json["meta"]["format_version"] = "1.0"; called_histogram.SaveToJson(output_json["full"]); aligned_histogram.SaveToJson(output_json["aligned"]); AQ7_histogram.SaveToJson(output_json["AQ7"]); AQ10_histogram.SaveToJson(output_json["AQ10"]); AQ17_histogram.SaveToJson(output_json["AQ17"]); AQ20_histogram.SaveToJson(output_json["AQ20"]); AQ47_histogram.SaveToJson(output_json["AQ47"]); error_by_position.SaveToJson(output_json["error_by_position"]); ofstream out(output_json_filename.c_str(), ios::out); if (out.good()) { out << output_json.toStyledString(); return 0; } else { fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str()); return 1; } return 0; }
int IonstatsBasecaller(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bam_filename = opts.GetFirstString('i', "input", ""); string output_json_filename = opts.GetFirstString('o', "output", "ionstats_basecaller.json"); int histogram_length = opts.GetFirstInt ('h', "histogram-length", 400); if(argc < 2 or input_bam_filename.empty()) { IonstatsBasecallerHelp(); return 1; } BamReader input_bam; if (!input_bam.Open(input_bam_filename)) { fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str()); return 1; } SamHeader sam_header = input_bam.GetHeader(); if(!sam_header.HasReadGroups()) { fprintf(stderr, "[ionstats] ERROR: no read groups in %s\n", input_bam_filename.c_str()); return 1; } ReadLengthHistogram total_full_histo; ReadLengthHistogram total_insert_histo; ReadLengthHistogram total_Q17_histo; ReadLengthHistogram total_Q20_histo; total_full_histo.Initialize(histogram_length); total_insert_histo.Initialize(histogram_length); total_Q17_histo.Initialize(histogram_length); total_Q20_histo.Initialize(histogram_length); MetricGeneratorSNR system_snr; BaseQVHistogram qv_histogram; string flow_order; string key; for (SamReadGroupIterator rg = sam_header.ReadGroups.Begin(); rg != sam_header.ReadGroups.End(); ++rg) { if(rg->HasFlowOrder()) flow_order = rg->FlowOrder; if(rg->HasKeySequence()) key = rg->KeySequence; } double qv_to_error_rate[256]; for (int qv = 0; qv < 256; qv++) qv_to_error_rate[qv] = pow(10.0,-0.1*(double)qv); BamAlignment alignment; string read_group; vector<uint16_t> flow_signal_fz(flow_order.length()); vector<int16_t> flow_signal_zm(flow_order.length()); while(input_bam.GetNextAlignment(alignment)) { // Record read length unsigned int full_length = alignment.Length; total_full_histo.Add(full_length); // Record insert length int insert_length = 0; if (alignment.GetTag("ZA",insert_length)) total_insert_histo.Add(insert_length); // Compute and record Q17 and Q20 int Q17_length = 0; int 
Q20_length = 0; double num_accumulated_errors = 0.0; for(int pos = 0; pos < alignment.Length; ++pos) { num_accumulated_errors += qv_to_error_rate[(int)alignment.Qualities[pos] - 33]; if (num_accumulated_errors / (pos + 1) <= 0.02) Q17_length = pos + 1; if (num_accumulated_errors / (pos + 1) <= 0.01) Q20_length = pos + 1; } total_Q17_histo.Add(Q17_length); total_Q20_histo.Add(Q20_length); // Record data for system snr if(alignment.GetTag("ZM", flow_signal_zm)) system_snr.Add(flow_signal_zm, key.c_str(), flow_order); else if(alignment.GetTag("FZ", flow_signal_fz)) system_snr.Add(flow_signal_fz, key.c_str(), flow_order); // Record qv histogram qv_histogram.Add(alignment.Qualities); } input_bam.Close(); Json::Value output_json(Json::objectValue); output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL)); output_json["meta"]["format_name"] = "ionstats_basecaller"; output_json["meta"]["format_version"] = "1.0"; system_snr.SaveToJson(output_json); qv_histogram.SaveToJson(output_json); total_full_histo.SaveToJson(output_json["full"]); total_insert_histo.SaveToJson(output_json["insert"]); total_Q17_histo.SaveToJson(output_json["Q17"]); total_Q20_histo.SaveToJson(output_json["Q20"]); ofstream out(output_json_filename.c_str(), ios::out); if (out.good()) { out << output_json.toStyledString(); return 0; } else { fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str()); return 1; } }
// Sets up the read filters from the command line.
//
// opts        parsed command line; every filter/trimming option is read here
// _flowOrder  flow order string, stored as-is
// _numFlows   number of flows, stored as-is
// _keys       key sequences; exactly two classes are expected (asserted)
// _maskPtr    mask whose H() x W() dimensions size the per-well filter mask
//
// Terminates the process with EXIT_FAILURE when min-read-length or
// cr-filter-max-value is not positive, or when the beverly-filter argument
// is neither "off" nor three comma separated values.
BaseCallerFilters::BaseCallerFilters(OptArgs& opts, const string& _flowOrder, int _numFlows,
    const vector<KeySequence>& _keys, Mask *_maskPtr)
{
  // Plain copies of the constructor arguments
  flowOrder = _flowOrder;
  numFlows  = _numFlows;
  maskPtr   = _maskPtr;

  // Filter switches
  keypassFilter                 = opts.GetFirstBoolean('k', "keypass-filter", true);
  percentPositiveFlowsFilterTFs = opts.GetFirstBoolean('-', "clonal-filter-tf", false);
  clonalFilterTraining          = opts.GetFirstBoolean('-', "clonal-filter-train", false);
  clonalFilterSolving           = opts.GetFirstBoolean('-', "clonal-filter-solve", false);
  minReadLength                 = opts.GetFirstInt    ('-', "min-read-length", 8);
  cafieResFilterCalling         = opts.GetFirstBoolean('-', "cr-filter", false);
  cafieResFilterTFs             = opts.GetFirstBoolean('-', "cr-filter-tf", false);
  generate_bead_summary_        = opts.GetFirstBoolean('-', "bead-summary", false);

  // TODO: get this to work right. May require "unwound" flow order, so incompatible with current wells.FlowOrder()
  //flt_control.cafieResMaxValueByFlowOrder[std::string ("TACG") ] = 0.06; // regular flow order
  //flt_control.cafieResMaxValueByFlowOrder[std::string ("TACGTACGTCTGAGCATCGATCGATGTACAGC") ] = 0.08; // xdb flow order
  cafieResMaxValue = opts.GetFirstDouble('-', "cr-filter-max-value", 0.08);

  // SFFTrim options
  trim_adapter         = opts.GetFirstString ('-', "trim-adapter", "ATCACCGACTGCCCATAGAGAGGCTGAGAC");
  trim_adapter_cutoff  = opts.GetFirstDouble ('-', "trim-adapter-cutoff", 0.0);
  trim_adapter_closest = opts.GetFirstBoolean('-', "trim-adapter-pick-closest", false);
  trim_qual_wsize      = opts.GetFirstInt    ('-', "trim-qual-window-size", 30);
  trim_qual_cutoff     = opts.GetFirstDouble ('-', "trim-qual-cutoff", 100.0);
  trim_min_read_len    = opts.GetFirstInt    ('-', "trim-min-read-len", 8);

  // Reject option values that make no sense
  if (!(minReadLength >= 1)) {
    fprintf (stderr, "Option Error: min-read-length must specify a positive value (%d invalid).\n", minReadLength);
    exit (EXIT_FAILURE);
  }
  if (!(cafieResMaxValue > 0)) {
    fprintf (stderr, "Option Error: cr-filter-max-value must specify a positive value (%lf invalid).\n", cafieResMaxValue);
    exit (EXIT_FAILURE);
  }

  // Per-class filter flags: index 0 follows the base setting, index 1 is
  // additionally gated by the matching "-tf" option.
  keys = _keys;
  numClasses = keys.size();
  assert(numClasses == 2);

  const bool polyclonal_class1    = clonalFilterSolving && percentPositiveFlowsFilterTFs;
  const bool high_residual_class1 = cafieResFilterCalling && cafieResFilterTFs;

  classFilterPolyclonal.resize(numClasses);
  classFilterPolyclonal[0] = clonalFilterSolving;
  classFilterPolyclonal[1] = polyclonal_class1;

  classFilterHighResidual.resize(numClasses);
  classFilterHighResidual[0] = cafieResFilterCalling;
  classFilterHighResidual[1] = high_residual_class1;

  // Beverly filter: either "off" or "filter_ratio,trim_ratio,min_length"
  const string beverly_spec = opts.GetFirstString('-', "beverly-filter", "0.03,0.03,8");
  if (beverly_spec != "off") {
    const int fields_parsed = sscanf (beverly_spec.c_str(), "%f,%f,%d",
        &filter_beverly_filter_ratio_,
        &filter_beverly_trim_ratio_,
        &filter_beverly_min_read_length_);
    if (fields_parsed != 3) {
      fprintf (stderr, "Option Error: beverly-filter %s\n", beverly_spec.c_str());
      fprintf (stderr, "Usage: --beverly-filter=filter_ratio,trim_ratio,min_length\n");
      exit (EXIT_FAILURE);
    }
    filter_beverly_enabled_ = true;
    printf("Beverly filter: enabled, use --beverly-filter=off to disable\n");
    printf("Beverly filter: filter_ratio = %1.5f\n", filter_beverly_filter_ratio_);
    printf("Beverly filter: trim_ratio = %1.5f\n", filter_beverly_trim_ratio_);
    printf("Beverly filter: min_length = %d\n", filter_beverly_min_read_length_);
  } else {
    filter_beverly_enabled_ = false; // Nothing, really
    printf("Beverly filter: disabled, use --beverly-filter=filter_ratio,trim_ratio,min_length\n");
  }

  // One mask entry per well, all starting out as uninitialized
  filterMask.assign(maskPtr->H()*maskPtr->W(), kUninitialized);
}
void PerBaseQual::Init(OptArgs& opts, const string& chip_type, const string &output_directory, bool recalib) { if(phred_table_) { delete [] phred_table_; phred_table_ = 0; } string phred_table_file = opts.GetFirstString ('-', "phred-table-file", ""); save_predictors_ = opts.GetFirstBoolean('-', "save-predictors", false); // Determine the correct phred table filename to use bool binTable = true; if (phred_table_file.empty()) { ChipIdDecoder::SetGlobalChipId(chip_type.c_str()); ChipIdEnum chip_id = ChipIdDecoder::GetGlobalChipId(); switch(chip_id){ case ChipId314: phred_table_file = "phredTable.txt_314.binary"; break; case ChipId316: phred_table_file = "phredTable.txt_316.binary"; break; case ChipId316v2: phred_table_file = "phredTable.txt_318.binary"; break; case ChipId318: phred_table_file = "phredTable.txt_318.binary"; break; case ChipId900: // Proton chip phred_table_file = "phredTable.txt_900.binary"; break; default: phred_table_file = "phredTable.txt_314.binary"; fprintf(stderr, "PerBaseQual: No default phred table for chip_type=%s, trying %s instead\n", chip_type.c_str(), phred_table_file.c_str()); break; } if (recalib) { phred_table_file = phred_table_file.substr(0, phred_table_file.length() - 7); phred_table_file += ".Recal.binary"; } char* full_filename = GetIonConfigFile(phred_table_file.c_str()); if(!full_filename) { printf("WARNING: cannot find binary phred table file %s, try to use non-binary phred table\n", phred_table_file.c_str()); phred_table_file = phred_table_file.substr(0, phred_table_file.length() - 7); // get rid of .binary binTable = false; char* full_filename2 = GetIonConfigFile(phred_table_file.c_str()); if(!full_filename2) ION_ABORT("ERROR: Can't find phred table file " + phred_table_file); phred_table_file = full_filename2; free(full_filename2); } else { phred_table_file = full_filename; free(full_filename); } } cout << endl << "PerBaseQual::Init... 
phred_table_file=" << phred_table_file << endl; binTable = hasBinaryExtension(phred_table_file); // Load the phred table if(binTable) { cout << endl << "PerBaseQual::Init... load binary phred_table_file=" << phred_table_file << endl; vector<size_t> vNumCuts(kNumPredictors, 0); if(H5Fis_hdf5(phred_table_file.c_str()) > 0) { hid_t root = H5Fopen(phred_table_file.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); if(root < 0) { ION_ABORT("ERROR: cannot open HDF5 file " + phred_table_file); } hid_t grpQvTable = H5Gopen(root, "/QvTable", H5P_DEFAULT); if (grpQvTable < 0) { H5Fclose(root); ION_ABORT("ERROR: fail to open HDF5 group QvTable"); } if(H5Aexists(grpQvTable, "NumPredictors") <= 0) { H5Gclose(grpQvTable); H5Fclose(root); ION_ABORT("ERROR: HDF5 attribute NumPredictors does not exist"); } hid_t attrNumPreds = H5Aopen(grpQvTable, "NumPredictors", H5P_DEFAULT); if (attrNumPreds < 0) { H5Gclose(grpQvTable); H5Fclose(root); ION_ABORT("ERROR: fail to open HDF5 attribute NumPredictors"); } unsigned int numPredictors = 0; herr_t ret = H5Aread(attrNumPreds, H5T_NATIVE_UINT, &numPredictors); H5Aclose(attrNumPreds); if(ret < 0 || numPredictors != (unsigned int)kNumPredictors) { H5Gclose(grpQvTable); H5Fclose(root); ION_ABORT("ERROR: HDF5 attribute NumPredictors is wrong"); } char buf[100]; for(size_t i = 0; i < (size_t)kNumPredictors; ++i) { offsets_.push_back(1); sprintf(buf, "ThresholdsOfPredictor%d", (int)i); if(H5Aexists(grpQvTable, buf) <= 0) { H5Gclose(grpQvTable); H5Fclose(root); ION_ABORT("ERROR: HDF5 attribute ThresholdsOfPredictor does not exist"); } hid_t attrCuts = H5Aopen(grpQvTable, buf, H5P_DEFAULT); if (attrCuts < 0) { H5Gclose(grpQvTable); H5Fclose(root); ION_ABORT("ERROR: fail to open HDF5 attribute ThresholdsOfPredictor"); } hsize_t size = H5Aget_storage_size(attrCuts); size /= sizeof(float); float* fcuts = new float[size]; ret = H5Aread(attrCuts, H5T_NATIVE_FLOAT, fcuts); H5Aclose(attrCuts); if(ret < 0) { H5Gclose(grpQvTable); H5Fclose(root); ION_ABORT("ERROR: fail 
to read HDF5 attribute ThresholdsOfPredictor"); } vector<float> vCuts(size); copy(fcuts, fcuts + size, vCuts.begin()); phred_cuts_.push_back(vCuts); delete [] fcuts; fcuts = 0; } hid_t dsQvs = H5Dopen(grpQvTable, "Qvs", H5P_DEFAULT); if (dsQvs < 0) { H5Gclose(grpQvTable); H5Fclose(root); ION_ABORT("ERROR: fail to open HDF5 dataset Qvs"); } hsize_t tbSize = H5Dget_storage_size(dsQvs); phred_table_ = new unsigned char[tbSize]; ret = H5Dread(dsQvs, H5T_NATIVE_UCHAR, H5S_ALL, H5S_ALL, H5P_DEFAULT, phred_table_); H5Dclose(dsQvs); H5Gclose(grpQvTable); H5Fclose(root); if (ret < 0) { delete [] phred_table_; phred_table_ = 0; ION_ABORT("ERROR: fail to read HDF5 dataset Qvs"); } } else { printf("WARNING: binary phred table file %s is not a HDF5 file, try binary file mode.\n", phred_table_file.c_str()); ifstream source; source.open(phred_table_file.c_str(), ios::in|ios::binary|ios::ate); if (!source.is_open()) ION_ABORT("ERROR: Cannot open file: " + phred_table_file); long totalSize = source.tellg(); char* tbBlock = new char [totalSize]; source.seekg (0, ios::beg); source.read (tbBlock, totalSize); source.close(); long headerSize = 0; char* ptr = tbBlock; int numPredictors = ptr[0]; //kNumPredictors if(numPredictors != kNumPredictors) { delete [] tbBlock; tbBlock = 0; ION_ABORT("ERROR: Wrong number of predictors load from " + phred_table_file); } ptr += 4; headerSize += 4; for(int i = 0; i < kNumPredictors; ++i) { vNumCuts[i] = ptr[0]; ptr += 4; headerSize += 4; offsets_.push_back(1); } long tbSize = 1; for(int i = 0; i < kNumPredictors; ++i) { vector<float> vCuts; tbSize *= vNumCuts[i]; for(size_t j = 0; j < vNumCuts[i]; ++j) { float tmp; memcpy(&tmp, ptr, 4); vCuts.push_back(tmp); ptr += 4; headerSize += 4; } phred_cuts_.push_back(vCuts); } if(tbSize != (totalSize - headerSize)) { delete [] tbBlock; tbBlock = 0; ION_ABORT("ERROR: Wrong QV table size"); } phred_table_ = new unsigned char[tbSize]; memcpy(phred_table_, ptr, tbSize * sizeof(unsigned char)); delete [] tbBlock; 
tbBlock = 0; } for(size_t i = kNumPredictors - 2; i > 0; --i) { offsets_[i] *= phred_cuts_[i + 1].size(); offsets_[i - 1] = offsets_[i]; } offsets_[0] *= phred_cuts_[1].size(); } else { ifstream source; source.open(phred_table_file.c_str()); if (!source.is_open()) ION_ABORT("ERROR: Cannot open file: " + phred_table_file); while (!source.eof()) { string line; getline(source, line); if (line.empty()) break; if (line[0] == '#') continue; stringstream strs(line); float temp; for (int k = 0; k < kNumPredictors; ++k) { strs >> temp; phred_thresholds_[k].push_back(temp); } strs >> temp; //skip n-th entry strs >> temp; phred_quality_.push_back(temp); } source.close(); for (int k = 0; k < kNumPredictors; ++k) phred_thresholds_max_[k] = *max_element(phred_thresholds_[k].begin(), phred_thresholds_[k].end()); } // Prepare for predictor dump here if (save_predictors_) { string predictors_filename = output_directory + "/Predictors.txt"; cout << endl << "Saving PerBaseQual predictors to file " << predictors_filename << endl << endl; predictor_dump_.open(predictors_filename.c_str()); if (!predictor_dump_.is_open()) ION_ABORT("ERROR: Cannot open file: " + predictors_filename); } }
int PrepareHotspots(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bed_filename = opts.GetFirstString ('b', "input-bed", ""); string input_vcf_filename = opts.GetFirstString ('v', "input-vcf", ""); string output_bed_filename = opts.GetFirstString ('d', "output-bed", ""); string output_vcf_filename = opts.GetFirstString ('o', "output-vcf", ""); string reference_filename = opts.GetFirstString ('r', "reference", ""); bool left_alignment = opts.GetFirstBoolean('a', "left-alignment", false); bool filter_bypass = opts.GetFirstBoolean('f', "filter-bypass", false); bool allow_block_substitutions = opts.GetFirstBoolean('s', "allow-block-substitutions", false); opts.CheckNoLeftovers(); if((input_bed_filename.empty() == input_vcf_filename.empty()) or (output_bed_filename.empty() and output_vcf_filename.empty()) or reference_filename.empty()) { PrepareHotspotsHelp(); return 1; } // Populate chromosome list from reference.fai // Use mmap to fetch the entire reference int ref_handle = open(reference_filename.c_str(),O_RDONLY); struct stat ref_stat; fstat(ref_handle, &ref_stat); char *ref = (char *)mmap(0, ref_stat.st_size, PROT_READ, MAP_SHARED, ref_handle, 0); FILE *fai = fopen((reference_filename+".fai").c_str(), "r"); if (!fai) { fprintf(stderr, "ERROR: Cannot open %s.fai\n", reference_filename.c_str()); return 1; } vector<Reference> ref_index; map<string,int> ref_map; char line[1024], chrom_name[1024]; while (fgets(line, 1024, fai) != NULL) { Reference ref_entry; long chr_start; if (5 != sscanf(line, "%s\t%ld\t%ld\t%d\t%d", chrom_name, &ref_entry.size, &chr_start, &ref_entry.bases_per_line, &ref_entry.bytes_per_line)) continue; ref_entry.chr = chrom_name; ref_entry.start = ref + chr_start; ref_index.push_back(ref_entry); ref_map[ref_entry.chr] = (int) ref_index.size() - 1; } fclose(fai); // Load input BED or load input VCF, group by chromosome deque<LineStatus> line_status; vector<deque<Allele> > alleles(ref_index.size()); if 
(!input_bed_filename.empty()) { FILE *input = fopen(input_bed_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_bed_filename.c_str()); return 1; } char line2[65536]; int line_number = 0; bool line_overflow = false; while (fgets(line2, 65536, input) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } line_number++; if (line_overflow) { line_overflow = false; line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: line length exceeds 64K"; continue; } if (strncmp(line2, "browser", 7) == 0) continue; if (strncmp(line2, "track", 5) == 0) { if (string::npos != string(line2).find("allowBlockSubstitutions=true")) allow_block_substitutions = true; continue; } char *current_chr = strtok(line2, "\t\r\n"); char *current_start = strtok(NULL, "\t\r\n"); char *current_end = strtok(NULL, "\t\r\n"); char *current_id = strtok(NULL, "\t\r\n"); char *penultimate = strtok(NULL, "\t\r\n"); char *ultimate = strtok(NULL, "\t\r\n"); for (char *next = strtok(NULL, "\t\r\n"); next; next = strtok(NULL, "\t\r\n")) { penultimate = ultimate; ultimate = next; } if (!current_chr or !current_start or !current_end or !current_id or !penultimate or !ultimate) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: expected at least 6 fields"; continue; } Allele allele; string string_chr(current_chr); if (ref_map.find(string_chr) != ref_map.end()) allele.chr_idx = ref_map[string_chr]; else if (ref_map.find("chr"+string_chr) != ref_map.end()) allele.chr_idx = ref_map["chr"+string_chr]; else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end()) allele.chr_idx = ref_map["chrM"]; else { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Unknown chromosome name: "; line_status.back().filter_message = string_chr; continue; 
} allele.pos = strtol(current_start,NULL,10); allele.id = current_id; char *current_ref = NULL; char *current_alt = NULL; for (char *next = strtok(penultimate, ";"); next; next = strtok(NULL, ";")) { if (strncmp(next,"REF=",4) == 0) current_ref = next; else if (strncmp(next,"OBS=",4) == 0) current_alt = next; } if (!current_ref or !current_alt) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: REF and OBS fields required in penultimate column"; continue; } for (char *pos = current_ref+4; *pos; ++pos) allele.ref += toupper(*pos); for (char *pos = current_alt+4; *pos; ++pos) allele.alt += toupper(*pos); allele.filtered = false; line_status.push_back(LineStatus(line_number)); allele.line_status = &line_status.back(); allele.opos = allele.pos; allele.oref = allele.ref; allele.oalt = allele.alt; alleles[allele.chr_idx].push_back(allele); line_status.back().allele = &alleles[allele.chr_idx].back(); } fclose(input); } if (!input_vcf_filename.empty()) { FILE *input = fopen(input_vcf_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_vcf_filename.c_str()); return 1; } char line2[65536]; int line_number = 0; bool line_overflow = false; while (fgets(line2, 65536, input) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } line_number++; if (line_overflow) { line_overflow = false; line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot VCF line: line length exceeds 64K"; continue; } if (strncmp(line2, "##allowBlockSubstitutions=true", 30) == 0) { allow_block_substitutions = true; continue; } if (line2[0] == '#') continue; char *current_chr = strtok(line2, "\t\r\n"); char *current_start = strtok(NULL, "\t\r\n"); char *current_id = strtok(NULL, "\t\r\n"); char *current_ref = strtok(NULL, "\t\r\n"); char *current_alt = strtok(NULL, "\t\r\n"); if 
(!current_chr or !current_start or !current_id or !current_ref or !current_alt) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot VCF line: expected at least 5 fields"; continue; } string string_chr(current_chr); int chr_idx = 0; if (ref_map.find(string_chr) != ref_map.end()) chr_idx = ref_map[string_chr]; else if (ref_map.find("chr"+string_chr) != ref_map.end()) chr_idx = ref_map["chr"+string_chr]; else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end()) chr_idx = ref_map["chrM"]; else { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Unknown chromosome name: "; line_status.back().filter_message = string_chr; continue; } for (char *pos = current_ref; *pos; ++pos) *pos = toupper(*pos); for (char *pos = current_alt; *pos; ++pos) *pos = toupper(*pos); for (char *sub_alt = strtok(current_alt,","); sub_alt; sub_alt = strtok(NULL,",")) { Allele allele; allele.chr_idx = chr_idx; allele.ref = current_ref; allele.alt = sub_alt; allele.pos = strtol(current_start,NULL,10)-1; allele.id = current_id; if (allele.id == ".") allele.id = "hotspot"; allele.filtered = false; line_status.push_back(LineStatus(line_number)); allele.line_status = &line_status.back(); allele.opos = allele.pos; allele.oref = allele.ref; allele.oalt = allele.alt; alleles[allele.chr_idx].push_back(allele); line_status.back().allele = &alleles[allele.chr_idx].back(); } } fclose(input); } // Process by chromosome: // - Verify reference allele // - Left align // - Sort // - Filter for block substitutions, write FILE *output_vcf = NULL; if (!output_vcf_filename.empty()) { output_vcf = fopen(output_vcf_filename.c_str(), "w"); if (!output_vcf) { fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_vcf_filename.c_str()); return 1; } fprintf(output_vcf, "##fileformat=VCFv4.1\n"); if (allow_block_substitutions) fprintf(output_vcf, "##allowBlockSubstitutions=true\n"); fprintf(output_vcf, 
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"); } FILE *output_bed = NULL; if (!output_bed_filename.empty()) { output_bed = fopen(output_bed_filename.c_str(), "w"); if (!output_bed) { fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_bed_filename.c_str()); if (output_vcf) fclose(output_vcf); return 1; } if (allow_block_substitutions) fprintf(output_bed, "track name=\"hotspot\" type=bedDetail allowBlockSubstitutions=true\n"); else fprintf(output_bed, "track name=\"hotspot\" type=bedDetail\n"); } for (int chr_idx = 0; chr_idx < (int)ref_index.size(); ++chr_idx) { for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ++A) { // Invalid characters bool valid = true; for (const char *c = A->ref.c_str(); *c ; ++c) if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T') valid = false; for (const char *c = A->alt.c_str(); *c ; ++c) if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T') valid = false; if (not valid) { A->filtered = true; A->line_status->filter_message_prefix = "REF and/or ALT contain characters other than ACGT: "; A->line_status->filter_message = "REF = " + A->ref + " ALT = " + A->alt; continue; } // Filter REF == ALT if (A->ref == A->alt) { A->filtered = true; A->line_status->filter_message_prefix = "REF and ALT alleles equal"; continue; } // Confirm reference allele. 
string ref_expected; for (int idx = 0; idx < (int) A->ref.size(); ++idx) ref_expected += ref_index[chr_idx].base(A->pos + idx); if (A->ref != ref_expected) { A->filtered = true; A->line_status->filter_message_prefix = "Provided REF allele does not match reference: "; A->line_status->filter_message = "Expected " + ref_expected + ", found " + A->ref; continue; } // Trim int ref_start = 0; int ref_end = A->ref.size(); int alt_end = A->alt.size(); // Option 1: trim all trailing bases //while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) { // --ref_end; // --alt_end; //} // Option 2: trim all leading basees //while (ref_start < ref_end and ref_start < alt_end and A->ref[ref_start] == A->alt[ref_start]) // ++ref_start; // Option 3: trim anchor base if vcf if (!input_vcf_filename.empty()) { if (ref_end and alt_end and (ref_end == 1 or alt_end == 1) and A->ref[0] == A->alt[0]) ref_start = 1; } A->pos += ref_start; A->ref = A->ref.substr(ref_start, ref_end-ref_start); A->alt = A->alt.substr(ref_start, alt_end-ref_start); ref_end -= ref_start; alt_end -= ref_start; // Left align if (left_alignment) { while (A->pos > 0) { char nuc = ref_index[chr_idx].base(A->pos-1); if (ref_end > 0 and A->ref[ref_end-1] != nuc) break; if (alt_end > 0 and A->alt[alt_end-1] != nuc) break; A->ref = string(1,nuc) + A->ref; A->alt = string(1,nuc) + A->alt; A->pos--; } } A->ref.resize(ref_end); A->alt.resize(alt_end); // Filter block substitutions: take 1 if (ref_end > 0 and alt_end > 0 and ref_end != alt_end and not allow_block_substitutions and not filter_bypass) { A->filtered = true; A->line_status->filter_message_prefix = "Block substitutions not supported"; continue; } } if (output_bed) { // Sort - without anchor base sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles); // Write for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) { if (I->filtered) continue; if (I->pos) fprintf(output_bed, 
"%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=%c\tNONE\n", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str(), ref_index[chr_idx].base(I->pos-1)); else fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=\tNONE\n", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str()); } } if (output_vcf) { // Add anchor base to indels for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) { if (I->filtered) continue; if (not I->ref.empty() and not I->alt.empty()) continue; if (I->pos == 0) { I->filtered = true; I->line_status->filter_message_prefix = "INDELs at chromosome start not supported"; continue; } I->pos--; I->ref = string(1,ref_index[chr_idx].base(I->pos)) + I->ref; I->alt = string(1,ref_index[chr_idx].base(I->pos)) + I->alt; } // Sort - with anchor base sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles); // Merge alleles, remove block substitutions, write for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ) { string max_ref; deque<Allele>::iterator B = A; for (; B != alleles[chr_idx].end() and B->pos == A->pos; ++B) if (!B->filtered and max_ref.size() < B->ref.size()) max_ref = B->ref; bool filtered = true; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; string new_alt = I->alt + max_ref.substr(I->ref.size()); if (new_alt.size() > 1 and max_ref.size() > 1 and new_alt.size() != max_ref.size() and not allow_block_substitutions and not filter_bypass) { I->filtered = true; I->line_status->filter_message_prefix = "Block substitutions not supported (post-merge)"; continue; } I->ref = max_ref; I->alt = new_alt; filtered = false; } if (not filtered) { fprintf(output_vcf, "%s\t%ld\t.\t%s\t", ref_index[chr_idx].chr.c_str(), A->pos+1, max_ref.c_str()); bool comma = false; set<string> unique_alt_alleles; for 
(deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (unique_alt_alleles.count(I->alt) > 0) continue; unique_alt_alleles.insert(I->alt); if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->alt.c_str()); } fprintf(output_vcf, "\t.\t.\tOID="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->id.c_str()); } fprintf(output_vcf, ";OPOS="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%ld", I->opos+1); } fprintf(output_vcf, ";OREF="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->oref.c_str()); } fprintf(output_vcf, ";OALT="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->oalt.c_str()); } fprintf(output_vcf, ";OMAPALT="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->alt.c_str()); } fprintf(output_vcf, "\n"); } A = B; } } } if (output_bed) { fflush(output_bed); fclose(output_bed); } if (output_vcf) { fflush(output_vcf); fclose(output_vcf); } int lines_ignored = 0; for (deque<LineStatus>::iterator L = line_status.begin(); L != line_status.end(); ++L) { if (L->filter_message_prefix) { if (L->allele) printf("Line %d ignored: [%s:%ld %s] %s%s\n", L->line_number, ref_index[L->allele->chr_idx].chr.c_str(), L->allele->opos+1, L->allele->id.c_str(), L->filter_message_prefix, L->filter_message.c_str()); else printf("Line %d ignored: %s%s\n", L->line_number, L->filter_message_prefix, L->filter_message.c_str()); 
lines_ignored++; } } printf("Ignored %d out of %d lines\n", lines_ignored, (int)line_status.size()); munmap(ref, ref_stat.st_size); close(ref_handle); return 0; }
int IonstatsTestFragments(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bam_filename = opts.GetFirstString('i', "input", ""); string fasta_filename = opts.GetFirstString('r', "ref", ""); string output_json_filename = opts.GetFirstString('o', "output", "ionstats_tf.json"); int histogram_length = opts.GetFirstInt ('h', "histogram-length", 400); if(argc < 2 or input_bam_filename.empty() or fasta_filename.empty()) { IonstatsTestFragmentsHelp(); return 1; } // // Prepare for metric calculation // map<string,string> tf_sequences; PopulateReferenceSequences(tf_sequences, fasta_filename); BamReader input_bam; if (!input_bam.Open(input_bam_filename)) { fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str()); return 1; } int num_tfs = input_bam.GetReferenceCount(); SamHeader sam_header = input_bam.GetHeader(); if(!sam_header.HasReadGroups()) { fprintf(stderr, "[ionstats] ERROR: no read groups in %s\n", input_bam_filename.c_str()); return 1; } string flow_order; string key; for (SamReadGroupIterator rg = sam_header.ReadGroups.Begin(); rg != sam_header.ReadGroups.End(); ++rg) { if(rg->HasFlowOrder()) flow_order = rg->FlowOrder; if(rg->HasKeySequence()) key = rg->KeySequence; } // Need these metrics stratified by TF. 
vector<ReadLengthHistogram> called_histogram(num_tfs); vector<ReadLengthHistogram> aligned_histogram(num_tfs); vector<ReadLengthHistogram> AQ10_histogram(num_tfs); vector<ReadLengthHistogram> AQ17_histogram(num_tfs); vector<SimpleHistogram> error_by_position(num_tfs); vector<MetricGeneratorSNR> system_snr(num_tfs); vector<MetricGeneratorHPAccuracy> hp_accuracy(num_tfs); for (int tf = 0; tf < num_tfs; ++tf) { called_histogram[tf].Initialize(histogram_length); aligned_histogram[tf].Initialize(histogram_length); AQ10_histogram[tf].Initialize(histogram_length); AQ17_histogram[tf].Initialize(histogram_length); error_by_position[tf].Initialize(histogram_length); } vector<uint16_t> flow_signal_fz(flow_order.length()); vector<int16_t> flow_signal_zm(flow_order.length()); const RefVector& refs = input_bam.GetReferenceData(); // Missing: // - hp accuracy - tough, copy verbatim from TFMapper? BamAlignment alignment; vector<char> MD_op; vector<int> MD_len; MD_op.reserve(1024); MD_len.reserve(1024); string MD_tag; // // Main loop over mapped reads in the input BAM // while(input_bam.GetNextAlignment(alignment)) { if (!alignment.IsMapped() or !alignment.GetTag("MD",MD_tag)) continue; // The check below eliminates unexpected alignments if (alignment.IsReverseStrand() or alignment.Position > 5) continue; int current_tf = alignment.RefID; // // Step 1. Parse MD tag // MD_op.clear(); MD_len.clear(); for (const char *MD_ptr = MD_tag.c_str(); *MD_ptr;) { int item_length = 0; if (*MD_ptr >= '0' and *MD_ptr <= '9') { // Its a match MD_op.push_back('M'); for (; *MD_ptr and *MD_ptr >= '0' and *MD_ptr <= '9'; ++MD_ptr) item_length = 10*item_length + *MD_ptr - '0'; } else { if (*MD_ptr == '^') { // Its a deletion MD_ptr++; MD_op.push_back('D'); } else // Its a substitution MD_op.push_back('X'); for (; *MD_ptr and *MD_ptr >= 'A' and *MD_ptr <= 'Z'; ++MD_ptr) item_length++; } MD_len.push_back(item_length); } // // Step 2. 
Synchronously scan through Cigar and MD, doing error accounting // int MD_idx = alignment.IsReverseStrand() ? MD_op.size()-1 : 0; int cigar_idx = alignment.IsReverseStrand() ? alignment.CigarData.size()-1 : 0; int increment = alignment.IsReverseStrand() ? -1 : 1; int AQ10_bases = 0; int AQ17_bases = 0; int num_bases = 0; int num_errors = 0; while (cigar_idx < (int)alignment.CigarData.size() and MD_idx < (int) MD_op.size() and cigar_idx >= 0 and MD_idx >= 0) { if (alignment.CigarData[cigar_idx].Length == 0) { // Try advancing cigar cigar_idx += increment; continue; } if (MD_len[MD_idx] == 0) { // Try advancing MD MD_idx += increment; continue; } // Match if (alignment.CigarData[cigar_idx].Type == 'M' and MD_op[MD_idx] == 'M') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); num_bases += advance; alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Insertion (read has a base, reference doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'I') { int advance = alignment.CigarData[cigar_idx].Length; for (int cnt = 0; cnt < advance; ++cnt) { error_by_position[current_tf].Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; // Deletion (reference has a base, read doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'D' and MD_op[MD_idx] == 'D') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position[current_tf].Add(num_bases); num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Substitution } else if (MD_op[MD_idx] == 'X') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position[current_tf].Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; } else { printf("ionstats tf: Unexpected 
OP combination: %s Cigar=%c, MD=%c !\n", alignment.Name.c_str(), alignment.CigarData[cigar_idx].Type, MD_op[MD_idx]); break; } if (num_errors*10 <= num_bases) AQ10_bases = num_bases; if (num_errors*50 <= num_bases) AQ17_bases = num_bases; } // // Step 3. Profit // called_histogram[current_tf].Add(alignment.Length); aligned_histogram[current_tf].Add(num_bases); AQ10_histogram[current_tf].Add(AQ10_bases); AQ17_histogram[current_tf].Add(AQ17_bases); if(alignment.GetTag("ZM", flow_signal_zm)) system_snr[current_tf].Add(flow_signal_zm, key.c_str(), flow_order); else if(alignment.GetTag("FZ", flow_signal_fz)) system_snr[current_tf].Add(flow_signal_fz, key.c_str(), flow_order); // HP accuracy - keeping it simple if (!alignment.IsReverseStrand()) { string genome = key + tf_sequences[refs[current_tf].RefName]; string calls = key + alignment.QueryBases; const char *genome_ptr = genome.c_str(); const char *calls_ptr = calls.c_str(); for (int flow = 0; flow < (int)flow_order.length() and *genome_ptr and *calls_ptr; ++flow) { int genome_hp = 0; int calls_hp = 0; while (*genome_ptr == flow_order[flow]) { genome_hp++; genome_ptr++; } while (*calls_ptr == flow_order[flow]) { calls_hp++; calls_ptr++; } hp_accuracy[current_tf].Add(genome_hp, calls_hp); } } } // // Processing complete, generate ionstats_tf.json // Json::Value output_json(Json::objectValue); output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL)); output_json["meta"]["format_name"] = "ionstats_tf"; output_json["meta"]["format_version"] = "1.0"; output_json["results_by_tf"] = Json::objectValue; for (int tf = 0; tf < num_tfs; ++tf) { if (aligned_histogram[tf].num_reads() < 1000) continue; called_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["full"]); aligned_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["aligned"]); AQ10_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ10"]); 
AQ17_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ17"]); error_by_position[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["error_by_position"]); system_snr[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]); hp_accuracy[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]); output_json["results_by_tf"][refs[tf].RefName]["sequence"] = tf_sequences[refs[tf].RefName]; } input_bam.Close(); ofstream out(output_json_filename.c_str(), ios::out); if (out.good()) { out << output_json.toStyledString(); return 0; } else { fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str()); return 1; } }
// Configures the phase estimator from command line options and the chip layout.
//
//   opts            - option parser; all phasing related options are read from it
//   chip_subset     - supplies chip size, region size and region counts
//   key_norm_method - stored in key_norm_method_
//
// Precedence of the phasing sources:
//   1. --phase-estimation-file: if the file loads successfully the estimator mode
//      is set to "override" and the function returns immediately.
//   2. --libcf-ie-dr: fixed cf,ie,dr values are set and the function returns.
//   3. Otherwise estimation is set up, optionally seeded by --initcf-ie-dr.
//
// Exits the process with EXIT_FAILURE if --libcf-ie-dr or --initcf-ie-dr do not
// hold exactly 3 values, or if the phasing flow window is invalid.
void PhaseEstimator::InitializeFromOptArgs(OptArgs& opts, const ion::ChipSubset & chip_subset, const string & key_norm_method)
{
  // Parse command line options
  phasing_estimator_      = opts.GetFirstString ('-', "phasing-estimator", "spatial-refiner-2");
  vector<double> cf_ie_dr = opts.GetFirstDoubleVector('-', "libcf-ie-dr", "");
  vector<double> init_cf_ie_dr = opts.GetFirstDoubleVector('-', "initcf-ie-dr", "");
  residual_threshold_     = opts.GetFirstDouble ('-', "phasing-residual-filter", 1.0);
  max_phasing_levels_     = opts.GetFirstInt    ('-', "max-phasing-levels", max_phasing_levels_default_);
  num_fullchip_iterations_= opts.GetFirstInt    ('-', "phasing-fullchip-iterations", 3);
  num_region_iterations_  = opts.GetFirstInt    ('-', "phasing-region-iterations", 1);
  num_reads_per_region_   = opts.GetFirstInt    ('-', "phasing-num-reads", 5000);
  min_reads_per_region_   = opts.GetFirstInt    ('-', "phasing-min-reads", 1000);
  phase_file_name_        = opts.GetFirstString ('-', "phase-estimation-file", "");
  normalization_string_   = opts.GetFirstString ('-', "phase-normalization", "adaptive");
  key_norm_method_        = key_norm_method;

  // Static member variables
  norm_during_param_eval_ = opts.GetFirstBoolean('-', "phase-norm-during-eval", false);
  windowSize_             = opts.GetFirstInt    ('-', "window-size", DPTreephaser::kWindowSizeDefault_);
  phasing_start_flow_     = opts.GetFirstInt    ('-', "phasing-start-flow", 70);
  phasing_end_flow_       = opts.GetFirstInt    ('-', "phasing-end-flow", 150);
  inclusion_threshold_    = opts.GetFirstDouble ('-', "phasing-signal-cutoff", 1.4);
  maxfrac_negative_flows_ = opts.GetFirstDouble ('-', "phasing-norm-threshold", 0.2);

  // Initialize chip size - needed for loading phase parameters
  chip_size_x_   = chip_subset.GetChipSizeX();
  chip_size_y_   = chip_subset.GetChipSizeY();
  region_size_x_ = chip_subset.GetRegionSizeX();
  region_size_y_ = chip_subset.GetRegionSizeY();
  num_regions_x_ = chip_subset.GetNumRegionsX();
  num_regions_y_ = chip_subset.GetNumRegionsY();
  num_regions_   = chip_subset.NumRegions();

  // Loading existing phase estimates from a file takes precedence over all other options
  if (not phase_file_name_.empty()) {
    have_phase_estimates_ = LoadPhaseEstimationTrainSubset(phase_file_name_);
    if (have_phase_estimates_) {
      phasing_estimator_ = "override";
      printf("Phase estimator settings:\n");
      printf("  phase file name        : %s\n", phase_file_name_.c_str());
      printf("  phase estimation mode  : %s\n\n", phasing_estimator_.c_str());
      return;
    } else
      // A failed load is reported but not fatal: fall through to the other options.
      cout << "PhaseEstimator Error loading TrainSubset from file " << phase_file_name_ << endl;
  }

  // Set phase parameters if provided by command line
  if (!cf_ie_dr.empty()) {
    if (cf_ie_dr.size() != 3){
      cerr << "BaseCaller Option Error: libcf-ie-dr needs to be a comma separated vector of 3 values." << endl;
      exit (EXIT_FAILURE);
    }
    SetPhaseParameters(cf_ie_dr.at(0), cf_ie_dr.at(1), cf_ie_dr.at(2));
    return; // --libcf-ie-dr overrides other phasing-related options
  }

  // Set starting values for estimation
  if (!init_cf_ie_dr.empty()) {
    if (init_cf_ie_dr.size() != 3){
      cerr << "BaseCaller Option Error: initcf-ie-dr needs to be a comma separated vector of 3 values." << endl;
      exit (EXIT_FAILURE);
    }
    init_cf_ = init_cf_ie_dr.at(0);
    init_ie_ = init_cf_ie_dr.at(1);
    init_dr_ = init_cf_ie_dr.at(2);
  }

  // The flow window must satisfy 0 <= start < end.
  if (phasing_start_flow_ >= phasing_end_flow_ or phasing_start_flow_ < 0) {
    cerr << "BaseCaller Option Error: phasing-start-flow " << phasing_start_flow_
         << " needs to be non-negative and smaller than phasing-end-flow " << phasing_end_flow_ << endl;
    exit (EXIT_FAILURE);
  }

  // Translate the normalization name into its numeric method code
  if (normalization_string_ == "adaptive")
    norm_method_ = 1;
  else if (normalization_string_ == "pid")
    norm_method_ = 2;
  else if (normalization_string_ == "variable")
    norm_method_ = 3;
  else if (normalization_string_ == "off")
    norm_method_ = 4;
  else
    norm_method_ = 0; // "gain" and anything else is default

  printf("Phase estimator settings:\n");
  printf("  phase file name        : %s\n", phase_file_name_.c_str());
  printf("  phase estimation mode  : %s\n", phasing_estimator_.c_str());
  printf("  initial cf,ie,dr values: %f,%f,%f\n", init_cf_,init_ie_,init_dr_);
  printf("  reads per region target: %d-%d\n", min_reads_per_region_, num_reads_per_region_);
  printf("  normalization method   : %s\n", normalization_string_.c_str());
  printf("  variable norm threshold: %f\n", maxfrac_negative_flows_);
  printf("\n");
}