// Fills context_vars (and the thread-count members) from command line options.
// Requires bc_files to have been populated already (asserted below).
// Always returns true; context_vars.options_set is raised on completion.
bool BaseCallerParameters::InitContextVarsFromOptArgs(OptArgs& opts)
{
    assert(bc_files.options_set);

    // The fallback run id is derived from the full output directory string
    // and is only used when --run-id is not supplied.
    char fallback_run_id[6];
    ion_run_to_readname (fallback_run_id,
                         const_cast<char*>(bc_files.output_directory.c_str()),
                         bc_files.output_directory.length());
    context_vars.run_id = opts.GetFirstString ('-', "run-id", fallback_run_id);

    // Threading
    const int fallback_num_threads = max(2*numCores(), 4);
    num_threads_           = opts.GetFirstInt ('n', "num-threads", fallback_num_threads);
    num_bamwriter_threads_ = opts.GetFirstInt ('-', "num-threads-bamwriter", 6);

    // General processing switches
    context_vars.flow_signals_type           = opts.GetFirstString ('-', "flow-signals-type", "none");
    context_vars.extra_trim_left             = opts.GetFirstInt    ('-', "extra-trim-left", 0);
    context_vars.only_process_unfiltered_set = opts.GetFirstBoolean('-', "only-process-unfiltered-set", false);

    // Treephaser options
    context_vars.dephaser               = opts.GetFirstString ('-', "dephaser", "treephaser-sse");
    context_vars.keynormalizer          = opts.GetFirstString ('-', "keynormalizer", "gain");
    context_vars.windowSize             = opts.GetFirstInt    ('-', "window-size", DPTreephaser::kWindowSizeDefault_);
    context_vars.skip_droop             = opts.GetFirstBoolean('-', "skip-droop", true);
    context_vars.skip_recal_during_norm = opts.GetFirstBoolean('-', "skip-recal-during-normalization", false);
    context_vars.diagonal_state_prog    = opts.GetFirstBoolean('-', "diagonal-state-prog", false);

    // Diagonal state progression is forced onto treephaser-swan: any other
    // requested dephaser is overridden after telling the user.
    const bool dephaser_conflict =
        context_vars.diagonal_state_prog && context_vars.dephaser != "treephaser-swan";
    if (dephaser_conflict) {
        cout << " === BaseCaller Option Incompatibility: Using dephaser treephaser-swan with diagonal state progression instead of "
             << context_vars.dephaser << endl;
        context_vars.dephaser = "treephaser-swan";
    }

    context_vars.process_tfs = true;
    context_vars.options_set = true;
    return true;
}
// Reads the recalibration-model options, initializes the model and, when the
// model was initialized and --save-hpmodel is on, records the model file in
// the BAM comments together with the run id and the chip subset offsets.
void RecalibrationModel::Initialize(OptArgs& opts, vector<string> &bam_comments, const string & run_id, const ion::ChipSubset & chip_subset)
{
    string hp_model_path         = opts.GetFirstString ('-', "model-file", "");
    int    hp_threshold          = opts.GetFirstInt    ('-', "recal-model-hp-thres", 4);
    bool   store_model_in_bam    = opts.GetFirstBoolean('-', "save-hpmodel", true);
    bool   use_diagonal_states   = opts.GetFirstBoolean('-', "diagonal-state-prog", false);

    // With diagonal state progression the model file name is discarded,
    // so InitializeModel is called with an empty file name.
    if (use_diagonal_states)
        hp_model_path.clear();

    if (!InitializeModel(hp_model_path, hp_threshold))
        return;
    if (!store_model_in_bam)
        return;

    SaveModelFileToBamComments(hp_model_path, bam_comments, run_id,
                               chip_subset.GetColOffset(), chip_subset.GetRowOffset());
}
// Resolves a boolean parameter with the precedence
//   command line option  >  parameters json file  >  builtin default
// and prints the resolved value together with its source to stdout.
//
// The json key is the long option name with every '-' replaced by '_'.
// A json string value is interpreted through atoi(); any other json value
// through asInt(). In both cases non-zero means true.
bool RetrieveParameterBool(OptArgs &opts, Json::Value& json, char short_name, const string& long_name_hyphens, bool default_value)
{
    // Build the json key: same name, underscores instead of hyphens
    string json_key = long_name_hyphens;
    for (string::size_type pos = json_key.find('-');
         pos != string::npos;
         pos = json_key.find('-', pos + 1)) {
        json_key[pos] = '_';
    }

    bool   value  = default_value;
    string origin = "builtin default";

    if (json.isMember(json_key)) {
        Json::Value& entry = json[json_key];
        if (entry.isString())
            value = (atoi(entry.asCString()) != 0);
        else
            value = (entry.asInt() != 0);
        origin = "parameters json file";
    }

    // The command line wins; the value found so far serves as its default.
    if (opts.HasOption(short_name, long_name_hyphens)) {
        value  = opts.GetFirstBoolean(short_name, long_name_hyphens, value);
        origin = "command line option";
    }

    cout << setw(35) << long_name_hyphens << " = "
         << setw(10) << (value ? "true" : "false")
         << " (boolean, " << origin << ")" << endl;
    return value;
}
// Builds a TagTrimmerParameters object from the command line options.
// Terminates the program with EXIT_FAILURE on a negative --min-tag-fam-size
// or on an unknown --tag-trim-method / --tag-filter-method value.
TagTrimmerParameters MolecularTagTrimmer::ReadOpts(OptArgs& opts)
{
    TagTrimmerParameters my_params;

    // --- Tag structure options
    my_params.min_family_size              = opts.GetFirstInt     ('-', "min-tag-fam-size", 3);
    my_params.suppress_mol_tags            = opts.GetFirstBoolean ('-', "suppress-mol-tags", false);
    //my_params.cl_a_handle                = opts.GetFirstString  ('-', "tag-handle", "");
    //my_params.handle_cutoff              = opts.GetFirstInt     ('-', "handle-cutoff", 2);
    my_params.master_tags.prefix_mol_tag   = opts.GetFirstString  ('-', "prefix-mol-tag", "");
    my_params.master_tags.suffix_mol_tag   = opts.GetFirstString  ('-', "suffix-mol-tag", "");

    ValidateTagString(my_params.master_tags.prefix_mol_tag);
    ValidateTagString(my_params.master_tags.suffix_mol_tag);

    // A negative family size is an error; a family size of exactly zero is
    // accepted and doubles as a switch that suppresses molecular tagging.
    if (my_params.min_family_size < 0) {
        cerr << "MolecularTagTrimmer Error: min-tag-fam-size must be at least 1. " << endl;
        exit(EXIT_FAILURE);
    }
    if (my_params.min_family_size == 0)
        my_params.suppress_mol_tags = true;

    my_params.command_line_tags = my_params.master_tags.HasTags();

    // --- Trimming method selection
    const string trim_choice = opts.GetFirstString ('-', "tag-trim-method", "sloppy-trim");
    if (trim_choice == "strict-trim") {
        my_params.tag_trim_method = kStrictTrim;
    }
    else if (trim_choice == "sloppy-trim") {
        my_params.tag_trim_method = kSloppyTrim;
    }
    else {
        cerr << "MolecularTagTrimmer Error: Unknown tag trimming option " << trim_choice << endl;
        exit(EXIT_FAILURE);
    }

    // --- Read filtering method selection
    const string filter_choice = opts.GetFirstString ('-', "tag-filter-method", "need-all");
    if (filter_choice == "need-prefix") {
        my_params.tag_filter_method = kneed_only_prefix_tag;
    }
    else if (filter_choice == "need-suffix") {
        my_params.tag_filter_method = kneed_only_suffix_tag;
    }
    else if (filter_choice == "need-all") {
        my_params.tag_filter_method = kneed_all_tags;
    }
    else {
        cerr << "MolecularTagTrimmer Error: Unknown tag filtering option " << filter_choice << endl;
        exit(EXIT_FAILURE);
    }

    return my_params;
}
void ExtendParameters::SetFreeBayesParameters(OptArgs &opts, Json::Value& fb_params) { // FreeBayes parameters // primarily used in candidate generation targets = opts.GetFirstString('t', "target-file", ""); trim_ampliseq_primers = opts.GetFirstBoolean('-', "trim-ampliseq-primers", false); if (targets.empty() and trim_ampliseq_primers) { cerr << "ERROR: --trim-ampliseq-primers enabled but no --target-file provided" << endl; exit(1); } allowIndels = RetrieveParameterBool (opts, fb_params, '-', "allow-indels", true); allowSNPs = RetrieveParameterBool (opts, fb_params, '-', "allow-snps", true); allowMNPs = RetrieveParameterBool (opts, fb_params, '-', "allow-mnps", true); allowComplex = RetrieveParameterBool (opts, fb_params, '-', "allow-complex", false); // deprecated: // leftAlignIndels = RetrieveParameterBool (opts, fb_params, '-', "left-align-indels", false); RetrieveParameterBool (opts, fb_params, '-', "left-align-indels", false); //useBestNAlleles = 0; useBestNAlleles = RetrieveParameterInt (opts, fb_params, 'm', "use-best-n-alleles", 2); onlyUseInputAlleles = RetrieveParameterBool (opts, fb_params, '-', "use-input-allele-only", false); min_mapping_qv = RetrieveParameterInt (opts, fb_params, 'M', "min-mapping-qv", 4); read_snp_limit = RetrieveParameterInt (opts, fb_params, 'U', "read-snp-limit", 10); readMaxMismatchFraction = RetrieveParameterDouble(opts, fb_params, 'z', "read-max-mismatch-fraction", 1.0); maxComplexGap = RetrieveParameterInt (opts, fb_params, '!', "max-complex-gap", 1); // read from json or command line, otherwise default to snp frequency minAltFraction = RetrieveParameterDouble(opts, fb_params, '-', "gen-min-alt-allele-freq", my_controls.filter_snps.min_allele_freq); minCoverage = RetrieveParameterInt (opts, fb_params, '-', "gen-min-coverage", my_controls.filter_snps.min_cov); minIndelAltFraction = RetrieveParameterDouble(opts, fb_params, '-', "gen-min-indel-alt-allele-freq", my_controls.filter_hp_indel.min_allele_freq); //set up debug levels 
if (program_flow.DEBUG > 0) debug = true; if (program_flow.inputPositionsOnly) { processInputPositionsOnly = true; } if (variantPriorsFile.empty() && (processInputPositionsOnly || onlyUseInputAlleles) ) { cerr << "ERROR: Parameter error - Process-input-positions-only: " << processInputPositionsOnly << " use-input-allele-only: " << onlyUseInputAlleles << " : Specified without Input VCF File " << endl; exit(1); } }
// Entry point of the H5 reduce sub-command.
// argv[0] and argv[1] are both echoed in the NOTE message below; option
// parsing starts at argv[1]. Leftover (non-option) arguments are the input
// H5 files. Returns 1 after printing the help text when no input or no
// output file is given, otherwise the result of IonstatsAlignmentReduceH5.
int IonstatsReduceH5(int argc, const char *argv[])
{
    OptArgs opts;
    opts.ParseCmdLine(argc-1, argv+1);

    string merged_h5_path = opts.GetFirstString ('o', "output", "");
    // NOTE(review): the default is passed as the string literal "true" rather
    // than a bool - confirm which OptArgs::GetFirstBoolean overload this selects.
    bool merge_proton_blocks = opts.GetFirstBoolean ('b', "merge-proton-blocks", "true");

    vector<string> source_h5_paths;
    opts.GetLeftoverArguments(source_h5_paths);

    const bool arguments_missing = source_h5_paths.empty() || merged_h5_path.empty();
    if (arguments_missing) {
        IonstatsReduceH5Help();
        return 1;
    }

    if (merge_proton_blocks) {
        cout << "NOTE:" << argv[0] << " " << argv[1]
             << ": --merge-proton-blocks=true so any Proton block-specific read group suffixes will be merged"
             << endl;
    }

    return IonstatsAlignmentReduceH5(merged_h5_path, source_h5_paths, merge_proton_blocks);
}
// Loads the phred (quality value) lookup table used by PerBaseQual.
//
// Parameters:
//   opts             - command line options; reads --phred-table-file and --save-predictors
//   chip_type        - chip name, used to pick a default table when no file is given
//   output_directory - directory that receives Predictors.txt when --save-predictors is set
//   recalib          - when true, the default table name gets a ".Recal.binary" suffix
//
// Three on-disk formats are handled, selected by hasBinaryExtension() and H5Fis_hdf5():
//   1. HDF5:       group /QvTable with attribute NumPredictors, one attribute
//                  ThresholdsOfPredictor<i> per predictor, and dataset Qvs.
//   2. Raw binary: 4-byte header fields (predictor count, then one cut count per
//                  predictor), the float thresholds, then the table bytes.
//   3. Text:       one line per table row: kNumPredictors thresholds, one ignored
//                  value, then the quality value. Lines starting with '#' are skipped.
//
// Every unrecoverable problem is reported through ION_ABORT.
// NOTE(review): ION_ABORT presumably terminates the process - confirm; if it can
// return, several error paths below would continue with invalid state.
// NOTE(review): phred_table_ is released on entry, but offsets_, phred_cuts_,
// phred_thresholds_ and phred_quality_ are only appended to here - verify that
// Init is never called twice on the same object, or that they are cleared elsewhere.
void PerBaseQual::Init(OptArgs& opts, const string& chip_type, const string &output_directory, bool recalib)
{
  // Drop a table left over from a previous call
  if(phred_table_) {
    delete [] phred_table_;
    phred_table_ = 0;
  }

  string phred_table_file = opts.GetFirstString ('-', "phred-table-file", "");
  save_predictors_ = opts.GetFirstBoolean('-', "save-predictors", false);

  // Determine the correct phred table filename to use
  bool binTable = true;
  if (phred_table_file.empty()) {
    // No explicit file: choose a default by chip type
    ChipIdDecoder::SetGlobalChipId(chip_type.c_str());
    ChipIdEnum chip_id = ChipIdDecoder::GetGlobalChipId();
    switch(chip_id){
    case ChipId314:
      phred_table_file = "phredTable.txt_314.binary";
      break;
    case ChipId316:
      phred_table_file = "phredTable.txt_316.binary";
      break;
    case ChipId316v2: // shares the 318 table
      phred_table_file = "phredTable.txt_318.binary";
      break;
    case ChipId318:
      phred_table_file = "phredTable.txt_318.binary";
      break;
    case ChipId900: // Proton chip
      phred_table_file = "phredTable.txt_900.binary";
      break;
    default:
      // Unknown chip: fall back to the 314 table and say so
      phred_table_file = "phredTable.txt_314.binary";
      fprintf(stderr, "PerBaseQual: No default phred table for chip_type=%s, trying %s instead\n", chip_type.c_str(), phred_table_file.c_str());
      break;
    }
    if (recalib) {
      // Replace the trailing ".binary" (7 characters) by ".Recal.binary"
      phred_table_file = phred_table_file.substr(0, phred_table_file.length() - 7);
      phred_table_file += ".Recal.binary";
    }
    // GetIonConfigFile returns a buffer that is released with free() below,
    // or a null pointer when the file was not found.
    char* full_filename = GetIonConfigFile(phred_table_file.c_str());
    if(!full_filename) {
      printf("WARNING: cannot find binary phred table file %s, try to use non-binary phred table\n", phred_table_file.c_str());
      phred_table_file = phred_table_file.substr(0, phred_table_file.length() - 7); // get rid of .binary
      binTable = false;
      char* full_filename2 = GetIonConfigFile(phred_table_file.c_str());
      if(!full_filename2)
        ION_ABORT("ERROR: Can't find phred table file " + phred_table_file);
      phred_table_file = full_filename2;
      free(full_filename2);
    } else {
      phred_table_file = full_filename;
      free(full_filename);
    }
  }

  cout << endl << "PerBaseQual::Init... phred_table_file=" << phred_table_file << endl;
  // The final decision is taken from the resolved file name; this overrides
  // whatever binTable was set to above.
  binTable = hasBinaryExtension(phred_table_file);

  // Load the phred table
  if(binTable) {
    cout << endl << "PerBaseQual::Init... load binary phred_table_file=" << phred_table_file << endl;
    vector<size_t> vNumCuts(kNumPredictors, 0); // only filled in the raw binary branch
    if(H5Fis_hdf5(phred_table_file.c_str()) > 0) {
      // ---- HDF5 format ----
      hid_t root = H5Fopen(phred_table_file.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
      if(root < 0) {
        ION_ABORT("ERROR: cannot open HDF5 file " + phred_table_file);
      }
      hid_t grpQvTable = H5Gopen(root, "/QvTable", H5P_DEFAULT);
      if (grpQvTable < 0) {
        H5Fclose(root);
        ION_ABORT("ERROR: fail to open HDF5 group QvTable");
      }
      // The stored predictor count must match the compiled-in kNumPredictors
      if(H5Aexists(grpQvTable, "NumPredictors") <= 0) {
        H5Gclose(grpQvTable);
        H5Fclose(root);
        ION_ABORT("ERROR: HDF5 attribute NumPredictors does not exist");
      }
      hid_t attrNumPreds = H5Aopen(grpQvTable, "NumPredictors", H5P_DEFAULT);
      if (attrNumPreds < 0) {
        H5Gclose(grpQvTable);
        H5Fclose(root);
        ION_ABORT("ERROR: fail to open HDF5 attribute NumPredictors");
      }
      unsigned int numPredictors = 0;
      herr_t ret = H5Aread(attrNumPreds, H5T_NATIVE_UINT, &numPredictors);
      H5Aclose(attrNumPreds);
      if(ret < 0 || numPredictors != (unsigned int)kNumPredictors) {
        H5Gclose(grpQvTable);
        H5Fclose(root);
        ION_ABORT("ERROR: HDF5 attribute NumPredictors is wrong");
      }
      // Read the threshold ("cut") list of every predictor
      char buf[100];
      for(size_t i = 0; i < (size_t)kNumPredictors; ++i) {
        offsets_.push_back(1); // placeholder, turned into a stride after loading
        sprintf(buf, "ThresholdsOfPredictor%d", (int)i);
        if(H5Aexists(grpQvTable, buf) <= 0) {
          H5Gclose(grpQvTable);
          H5Fclose(root);
          ION_ABORT("ERROR: HDF5 attribute ThresholdsOfPredictor does not exist");
        }
        hid_t attrCuts = H5Aopen(grpQvTable, buf, H5P_DEFAULT);
        if (attrCuts < 0) {
          H5Gclose(grpQvTable);
          H5Fclose(root);
          ION_ABORT("ERROR: fail to open HDF5 attribute ThresholdsOfPredictor");
        }
        // Element count = storage size in bytes / sizeof(float)
        hsize_t size = H5Aget_storage_size(attrCuts);
        size /= sizeof(float);
        float* fcuts = new float[size];
        ret = H5Aread(attrCuts, H5T_NATIVE_FLOAT, fcuts);
        H5Aclose(attrCuts);
        if(ret < 0) {
          // fcuts is not released on this path
          H5Gclose(grpQvTable);
          H5Fclose(root);
          ION_ABORT("ERROR: fail to read HDF5 attribute ThresholdsOfPredictor");
        }
        vector<float> vCuts(size);
        copy(fcuts, fcuts + size, vCuts.begin());
        phred_cuts_.push_back(vCuts);
        delete [] fcuts;
        fcuts = 0;
      }
      // Read the quality value table itself
      hid_t dsQvs = H5Dopen(grpQvTable, "Qvs", H5P_DEFAULT);
      if (dsQvs < 0) {
        H5Gclose(grpQvTable);
        H5Fclose(root);
        ION_ABORT("ERROR: fail to open HDF5 dataset Qvs");
      }
      hsize_t tbSize = H5Dget_storage_size(dsQvs);
      phred_table_ = new unsigned char[tbSize];
      ret = H5Dread(dsQvs, H5T_NATIVE_UCHAR, H5S_ALL, H5S_ALL, H5P_DEFAULT, phred_table_);
      H5Dclose(dsQvs);
      H5Gclose(grpQvTable);
      H5Fclose(root);
      if (ret < 0) {
        delete [] phred_table_;
        phred_table_ = 0;
        ION_ABORT("ERROR: fail to read HDF5 dataset Qvs");
      }
    } else {
      // ---- Raw binary format ----
      printf("WARNING: binary phred table file %s is not a HDF5 file, try binary file mode.\n", phred_table_file.c_str());
      ifstream source;
      // Opened at the end (ios::ate) so tellg() yields the file size
      source.open(phred_table_file.c_str(), ios::in|ios::binary|ios::ate);
      if (!source.is_open())
        ION_ABORT("ERROR: Cannot open file: " + phred_table_file);
      // NOTE(review): totalSize is not checked for a tellg() failure, nor
      // against the header size before the header is parsed - verify inputs
      // are trusted.
      long totalSize = source.tellg();
      char* tbBlock = new char [totalSize];
      source.seekg (0, ios::beg);
      source.read (tbBlock, totalSize);
      source.close();
      // headerSize counts every byte consumed before the table payload
      long headerSize = 0;
      char* ptr = tbBlock;
      // Only the first byte of each 4-byte header field is read
      int numPredictors = ptr[0]; //kNumPredictors
      if(numPredictors != kNumPredictors) {
        delete [] tbBlock;
        tbBlock = 0;
        ION_ABORT("ERROR: Wrong number of predictors load from " + phred_table_file);
      }
      ptr += 4;
      headerSize += 4;
      // Number of cuts per predictor
      for(int i = 0; i < kNumPredictors; ++i) {
        vNumCuts[i] = ptr[0];
        ptr += 4;
        headerSize += 4;
        offsets_.push_back(1); // placeholder, turned into a stride after loading
      }
      // Threshold values, 4 bytes each; tbSize becomes the product of all cut counts
      long tbSize = 1;
      for(int i = 0; i < kNumPredictors; ++i) {
        vector<float> vCuts;
        tbSize *= vNumCuts[i];
        for(size_t j = 0; j < vNumCuts[i]; ++j) {
          float tmp;
          memcpy(&tmp, ptr, 4);
          vCuts.push_back(tmp);
          ptr += 4;
          headerSize += 4;
        }
        phred_cuts_.push_back(vCuts);
      }
      // Whatever follows the header must be exactly the table
      if(tbSize != (totalSize - headerSize)) {
        delete [] tbBlock;
        tbBlock = 0;
        ION_ABORT("ERROR: Wrong QV table size");
      }
      phred_table_ = new unsigned char[tbSize];
      memcpy(phred_table_, ptr, tbSize * sizeof(unsigned char));
      delete [] tbBlock;
      tbBlock = 0;
    }
    // Turn the placeholder ones into strides: starting from all-ones entries,
    // offsets_[i] ends up as the product of the cut counts of predictors
    // i+1 .. kNumPredictors-1 and the last entry stays 1.
    for(size_t i = kNumPredictors - 2; i > 0; --i) {
      offsets_[i] *= phred_cuts_[i + 1].size();
      offsets_[i - 1] = offsets_[i];
    }
    offsets_[0] *= phred_cuts_[1].size();
  } else {
    // ---- Text format ----
    ifstream source;
    source.open(phred_table_file.c_str());
    if (!source.is_open())
      ION_ABORT("ERROR: Cannot open file: " + phred_table_file);
    while (!source.eof()) {
      string line;
      getline(source, line);
      if (line.empty()) // the first empty line ends the table
        break;
      if (line[0] == '#')
        continue;
      stringstream strs(line);
      float temp;
      for (int k = 0; k < kNumPredictors; ++k) {
        strs >> temp;
        phred_thresholds_[k].push_back(temp);
      }
      strs >> temp; //skip n-th entry
      strs >> temp;
      phred_quality_.push_back(temp);
    }
    source.close();
    // NOTE(review): max_element on an empty range returns end(); a file
    // without data lines would dereference it - confirm inputs are never empty.
    for (int k = 0; k < kNumPredictors; ++k)
      phred_thresholds_max_[k] = *max_element(phred_thresholds_[k].begin(), phred_thresholds_[k].end());
  }

  // Prepare for predictor dump here
  if (save_predictors_) {
    string predictors_filename = output_directory + "/Predictors.txt";
    cout << endl << "Saving PerBaseQual predictors to file " << predictors_filename << endl << endl;
    predictor_dump_.open(predictors_filename.c_str());
    if (!predictor_dump_.is_open())
      ION_ABORT("ERROR: Cannot open file: " + predictors_filename);
  }
}
// bamrealignment: reads a BAM file, tries to realign every mapped read with
// the Realigner, and writes all reads (realigned or unchanged) to a new BAM file.
//
// Options read: -i/--input, -o/--output, -s/--scores, -c/--clipping,
// -a/--anchors, -b/--bandwidth, -v/--verbose, -d/--debug, -f/--format,
// -t/--threads, -l/--log.
//
// Optional log file: one tab separated line per logged read with
//   name, reverse-strand flag, RefID, original position (zero padded, width 8), status
// where status is one of
//   MISSMD     mapped read without MD tag
//   SWERR      computeSWalignment failed
//   UNCLIPERR  addClippedBasesToTags failed
//   MOD        alignment changed (suffix "-SHIFT" when the position moved)
//   UNMOD      realigned, but position, cigar length and MD tag are unchanged
//   RECRERR    CreateRefFromQueryBases failed with CR_ERR_RECREATE_REF
//   CLIPERR    CreateRefFromQueryBases failed with CR_ERR_CLIP_ANCHOR
// MOD and UNMOD get " NOCLIP" appended when anchor clipping failed.
//
// Returns 0 on completion, 1 when a file cannot be opened or when the user
// quits from the verbose prompt, and the result of PrintHelp() when input or
// output file name is missing.
int main (int argc, const char *argv[])
{
  printf ("------------- bamrealignment --------------\n");

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  vector<int> score_vals(4);

  string input_bam  = opts.GetFirstString ('i', "input", "");
  string output_bam = opts.GetFirstString ('o', "output", "");
  opts.GetOption(score_vals, "4,-6,-5,-2", 's', "scores");
  int clipping = opts.GetFirstInt ('c', "clipping", 2);
  bool anchors = opts.GetFirstBoolean ('a', "anchors", true);
  int bandwidth = opts.GetFirstInt ('b', "bandwidth", 10);
  bool verbose = opts.GetFirstBoolean ('v', "verbose", false);
  bool debug = opts.GetFirstBoolean ('d', "debug", false);
  int format = opts.GetFirstInt ('f', "format", 1);
  int num_threads = opts.GetFirstInt ('t', "threads", 8);
  string log_fname = opts.GetFirstString ('l', "log", "");

  if (input_bam.empty() or output_bam.empty())
    return PrintHelp();

  opts.CheckNoLeftovers();

  // The log file is optional; logf stays closed when no name was given
  std::ofstream logf;
  if (log_fname.size ()) {
    logf.open (log_fname.c_str ());
    if (!logf.is_open ()) {
      fprintf (stderr, "bamrealignment: Failed to open log file %s\n", log_fname.c_str());
      return 1;
    }
  }

  BamReader reader;
  if (!reader.Open(input_bam)) {
    fprintf(stderr, "bamrealignment: Failed to open input file %s\n", input_bam.c_str());
    return 1;
  }

  // The output file reuses header and reference data of the input file
  SamHeader header = reader.GetHeader();
  RefVector refs = reader.GetReferenceData();

  BamWriter writer;
  writer.SetNumThreads(num_threads);
  // format 1 selects uncompressed output, every other value compressed output
  if (format == 1)
    writer.SetCompressionMode(BamWriter::Uncompressed);
  else
    writer.SetCompressionMode(BamWriter::Compressed);

  if (!writer.Open(output_bam, header, refs)) {
    fprintf(stderr, "bamrealignment: Failed to open output file %s\n", output_bam.c_str());
    return 1;
  }

  // The meat starts here ------------------------------------

  if (verbose)
    cout << "Verbose option is activated, each alignment will print to screen." << endl
         << " After a read hit RETURN to continue to the next one," << endl
         << " or press q RETURN to quit the program," << endl
         << " or press s Return to silence verbose," << endl
         << " or press c RETURN to continue printing without further prompt." << endl << endl;

  // Counters for the summary printed at the end
  unsigned int readcounter = 0;
  unsigned int mapped_readcounter = 0;
  unsigned int realigned_readcounter = 0;
  unsigned int modified_alignment_readcounter = 0;
  unsigned int pos_update_readcounter = 0;
  unsigned int failed_clip_realigned_readcount = 0;

  unsigned int already_perfect_readcount = 0;

  unsigned int bad_md_tag_readcount = 0;
  unsigned int error_recreate_ref_readcount = 0;
  unsigned int error_clip_anchor_readcount = 0;
  unsigned int error_sw_readcount = 0;
  unsigned int error_unclip_readcount = 0;

  unsigned int start_position_shift;
  int orig_position;
  int new_position;

  // input holds the last answer typed at the verbose prompt; it starts as "x"
  // so that the first verbose read does prompt.
  string md_tag, new_md_tag, input = "x";
  vector<CigarOp> new_cigar_data;
  vector<MDelement> new_md_data;
  bool position_shift = false;
  time_t start_time = time(NULL);

  Realigner aligner;
  aligner.verbose_ = verbose;
  aligner.debug_ = debug;
  // A rejected score vector is only reported; processing continues
  if (!aligner.SetScores(score_vals))
    cout << "bamrealignment: Four scores need to be provided: match, mismatch, gap open, gap extend score!" << endl;

  aligner.SetAlignmentBandwidth(bandwidth);

  BamAlignment alignment;
  while(reader.GetNextAlignment(alignment)){
    readcounter ++;
    position_shift = false;

    // Progress report every 100000 reads
    if ( (readcounter % 100000) == 0 )
      cout << "Processed " << readcounter << " reads. Elapsed time: " << (time(NULL) - start_time) << endl;

    // Unmapped reads fall straight through to SaveAlignment at the loop end
    if (alignment.IsMapped()) {

      orig_position = alignment.Position;
      mapped_readcounter++;
      aligner.SetClipping(clipping, !alignment.IsReverseStrand());
      if (aligner.verbose_) {
        cout << endl;
        if (alignment.IsReverseStrand())
          cout << "The read is from the reverse strand." << endl;
        else
          cout << "The read is from the forward strand." << endl;
      }

      // Case 1: no MD tag - the read is kept unchanged
      if (!alignment.GetTag("MD", md_tag)) {
        if (aligner.verbose_)
          cout << "Warning: Skipping read " << alignment.Name << ". It is mapped but missing MD tag." << endl;
        if (logf.is_open ())
          logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MISSMD" << '\n';
        bad_md_tag_readcount++;
      }
      // Case 2: the reference could be recreated - attempt the realignment
      else if (aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors)) {
        // CreateRefFromQueryBases can succeed and still report a failed anchor clip
        bool clipfail = false;
        if (Realigner::CR_ERR_CLIP_ANCHOR == aligner.GetCreateRefError ()) {
          clipfail = true;
          failed_clip_realigned_readcount ++;
        }

        if (!aligner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) {
          if (aligner.verbose_)
            cout << "Error in the alignment! Not updating read information." << endl;
          if (logf.is_open ())
            logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "SWERR" << '\n';
          error_sw_readcount++;
          writer.SaveAlignment(alignment);  // Write alignment unchanged
          continue;
        }

        if (!aligner.addClippedBasesToTags(new_cigar_data, new_md_data, alignment.QueryBases.size())) {
          if (aligner.verbose_)
            cout << "Error when adding clipped anchors back to tags! Not updating read information." << endl;
          if (logf.is_open ())
            logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNCLIPERR" << '\n';
          writer.SaveAlignment(alignment);  // Write alignment unchanged
          error_unclip_readcount ++;
          continue;
        }
        new_md_tag = aligner.GetMDstring(new_md_data);
        realigned_readcounter++;

        // adjust start position of read
        if (!aligner.LeftAnchorClipped() and start_position_shift != 0) {
          new_position = aligner.updateReadPosition(alignment.CigarData, (int)start_position_shift, alignment.Position);
          if (new_position != alignment.Position) {
            pos_update_readcounter++;
            position_shift = true;
            alignment.Position = new_position;
          }
        }

        // A read counts as modified when the position moved, the number of
        // cigar operations changed, or the MD string changed.
        if (position_shift || alignment.CigarData.size () != new_cigar_data.size () || md_tag != new_md_tag) {
          if (logf.is_open ()) {
            logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MOD";
            if (position_shift)
              logf << "-SHIFT";
            if (clipfail)
              logf << " NOCLIP";
            logf << '\n';
          }
          modified_alignment_readcounter++;
        }
        else {
          if (logf.is_open ()) {
            logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNMOD";
            if (clipfail)
              logf << " NOCLIP";
            logf << '\n';
          }
        }

        if (aligner.verbose_){
          cout << alignment.Name << endl;
          cout << "------------------------------------------" << endl;
          // Wait for input to continue or quit program
          // NOTE(review): an empty answer (plain RETURN) is replaced by "x"
          // on the next verbose read WITHOUT prompting, so after a plain
          // RETURN the following read is not prompted - confirm this is intended.
          if (input.size() == 0)
            input = 'x';
          else if (input[0] != 'c' and input[0] != 'C')
            getline(cin, input);
          if (input.size()>0){
            // Quitting returns immediately: the current read is not written
            if (input[0] == 'q' or input[0] == 'Q')
              return 1;
            else if (input[0] == 's' or input[0] == 'S')
              aligner.verbose_ = false;
          }
        }

        // Finally update alignment information
        alignment.CigarData = new_cigar_data;
        alignment.EditTag("MD", "Z" , new_md_tag);

      } // end of CreateRef else if
      // Case 3: the reference was not created - classify the reason; the
      // read is written unchanged at the end of the loop body
      else {
        switch (aligner.GetCreateRefError ()) {
          case Realigner::CR_ERR_RECREATE_REF:
            if (logf.is_open ())
              logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "RECRERR" << '\n';
            error_recreate_ref_readcount++;
            break;
          case Realigner::CR_ERR_CLIP_ANCHOR:
            if (logf.is_open ())
              logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "CLIPERR" << '\n';
            error_clip_anchor_readcount++;
            break;
          default:
            // Any other outcome is counted as "already perfect" and is deliberately not logged:
            // On a good run this writes way too many reads to the log file - don't want to create a too large txt file
            // if (logf.is_open ())
            //logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "PERFECT" << '\n';
            already_perfect_readcount++;
            break;
        }

        if (aligner.verbose_) {
          cout << alignment.Name << endl;
          cout << "------------------------------------------" << endl;
          // Wait for input to continue or quit program
          // (same prompt logic as in the realignment branch above)
          if (input.size() == 0)
            input = 'x';
          else if (input[0] != 'c' and input[0] != 'C')
            getline(cin, input);
          if (input.size()>0){
            if (input[0] == 'q' or input[0] == 'Q')
              return 1;
            else if (input[0] == 's' or input[0] == 'S')
              aligner.verbose_ = false;
          }
        }
      }

      // --- Debug output for Rajesh ---
      // In debug mode the invalid-cigar flag is reported per read and then
      // reset. Restoring aligner.verbose_ from `verbose` also undoes an
      // earlier "s" (silence) answer.
      if (debug && aligner.invalid_cigar_in_input) {
        aligner.verbose_ = true;
        cout << "Invalid cigar string / md tag pair in read " << alignment.Name << endl;
        // Rerun reference generation to display error
        aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors);

        aligner.verbose_ = verbose;
        aligner.invalid_cigar_in_input = false;
      }
      // --- --- ---

    } // end of if isMapped

    writer.SaveAlignment(alignment);

  } // end while loop over reads

  // Outside debug mode the flag is never reset in the loop, so it signals
  // that at least one read triggered it.
  if (aligner.invalid_cigar_in_input)
    cerr << "WARNING bamrealignment: There were invalid cigar string / md tag pairs in the input bam file." << endl;

  // ----------------------------------------------------------------
  // program end -- output summary information
  cout << " File: " << input_bam << endl
       << " Total reads: " << readcounter << endl
       << " Mapped reads: " << mapped_readcounter << endl;
  if (bad_md_tag_readcount)
    cout << " Skipped: bad MD tags: " << bad_md_tag_readcount << endl;
  if (error_recreate_ref_readcount)
    cout << " Skipped: unable to recreate ref: " << error_recreate_ref_readcount << endl;
  if (error_clip_anchor_readcount)
    cout << " Skipped: error clipping anchor: " << error_clip_anchor_readcount << endl;
  // "Total reads realigned" = mapped reads minus all reads skipped before the
  // realignment attempt; it still includes the SW and unclip failures listed next.
  cout << " Skipped: already perfect: " << already_perfect_readcount << endl
       << " Total reads realigned: " << mapped_readcounter - already_perfect_readcount - bad_md_tag_readcount - error_recreate_ref_readcount - error_clip_anchor_readcount << endl;
  if (failed_clip_realigned_readcount)
    cout << " (including " << failed_clip_realigned_readcount << " that failed to clip)" << endl;
  if (error_sw_readcount)
    cout << " Failed to complete SW alignment: " << error_sw_readcount << endl;
  if (error_unclip_readcount)
    cout << " Failed to unclip anchor: " << error_unclip_readcount << endl;
  cout << " Succesfully realigned: " << realigned_readcounter << endl
       << " Modified alignments: " << modified_alignment_readcounter << endl
       << " Shifted position: " << pos_update_readcounter << endl;

  cout << "Processing time: " << (time(NULL)-start_time) << " seconds." << endl;
  cout << "INFO: The output BAM file may be unsorted." << endl;
  cout << "------------------------------------------" << endl;
  return 0;
}
int PrepareHotspots(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bed_filename = opts.GetFirstString ('b', "input-bed", ""); string input_vcf_filename = opts.GetFirstString ('v', "input-vcf", ""); string input_real_vcf_filename = opts.GetFirstString ('p', "input-real-vcf", ""); string output_hot_vcf = opts.GetFirstString ('q', "output-fake-hot-vcf", ""); string output_bed_filename = opts.GetFirstString ('d', "output-bed", ""); string output_vcf_filename = opts.GetFirstString ('o', "output-vcf", ""); string reference_filename = opts.GetFirstString ('r', "reference", ""); string unmerged_bed = opts.GetFirstString ('u', "unmerged-bed", ""); bool left_alignment = opts.GetFirstBoolean('a', "left-alignment", false); bool filter_bypass = opts.GetFirstBoolean('f', "filter-bypass", false); bool allow_block_substitutions = opts.GetFirstBoolean('s', "allow-block-substitutions", true); bool strict_check = opts.GetFirstBoolean('S', "strict-check", true); opts.CheckNoLeftovers(); if((input_bed_filename.empty() == (input_vcf_filename.empty() and input_real_vcf_filename.empty())) or (output_bed_filename.empty() and output_vcf_filename.empty()) or reference_filename.empty()) { PrepareHotspotsHelp(); return 1; } if ((not input_real_vcf_filename.empty()) and (output_vcf_filename.empty() or not input_vcf_filename.empty())) { PrepareHotspotsHelp(); return 1; } // Populate chromosome list from reference.fai // Use mmap to fetch the entire reference int ref_handle = open(reference_filename.c_str(),O_RDONLY); struct stat ref_stat; fstat(ref_handle, &ref_stat); char *ref = (char *)mmap(0, ref_stat.st_size, PROT_READ, MAP_SHARED, ref_handle, 0); FILE *fai = fopen((reference_filename+".fai").c_str(), "r"); if (!fai) { fprintf(stderr, "ERROR: Cannot open %s.fai\n", reference_filename.c_str()); return 1; } vector<Reference> ref_index; map<string,int> ref_map; char line[1024], chrom_name[1024]; while (fgets(line, 1024, fai) != NULL) { Reference 
ref_entry; long chr_start; if (5 != sscanf(line, "%1020s\t%ld\t%ld\t%d\t%d", chrom_name, &ref_entry.size, &chr_start, &ref_entry.bases_per_line, &ref_entry.bytes_per_line)) continue; ref_entry.chr = chrom_name; ref_entry.start = ref + chr_start; ref_index.push_back(ref_entry); ref_map[ref_entry.chr] = (int) ref_index.size() - 1; } fclose(fai); junction junc; if (!unmerged_bed.empty()) { FILE *fp = fopen(unmerged_bed.c_str(), "r"); if (!fp) { fprintf(stderr, "ERROR: Cannot open %s\n", unmerged_bed.c_str()); return 1; } char line2[65536]; junc.init(ref_index.size()); bool line_overflow = false; while (fgets(line2, 65536, fp) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } if (line_overflow) { line_overflow = false; continue; } if (strstr(line2, "track")) continue; char chr[100]; int b, e; sscanf(line2, "%s %d %d", chr, &b, &e); junc.add(ref_map[chr], b, e); } fclose(fp); } // Load input BED or load input VCF, group by chromosome deque<LineStatus> line_status; vector<deque<Allele> > alleles(ref_index.size()); if (!input_bed_filename.empty()) { FILE *input = fopen(input_bed_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_bed_filename.c_str()); return 1; } char line2[65536]; int line_number = 0; bool line_overflow = false; while (fgets(line2, 65536, input) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } line_number++; if (line_overflow) { line_overflow = false; line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: line length exceeds 64K"; continue; } if (strncmp(line2, "browser", 7) == 0) continue; if (strncmp(line2, "track", 5) == 0) { if (string::npos != string(line2).find("allowBlockSubstitutions=true")) allow_block_substitutions = true; continue; } // OID= table has special meaning if (string::npos != 
string(line2).find("OID=")) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Bed line contains OID="; continue; } char *current_chr = strtok(line2, "\t\r\n"); char *current_start = strtok(NULL, "\t\r\n"); char *current_end = strtok(NULL, "\t\r\n"); char *current_id = strtok(NULL, "\t\r\n"); char *penultimate = strtok(NULL, "\t\r\n"); char *ultimate = strtok(NULL, "\t\r\n"); for (char *next = strtok(NULL, "\t\r\n"); next; next = strtok(NULL, "\t\r\n")) { penultimate = ultimate; ultimate = next; } if (!current_chr or !current_start or !current_end or !current_id or !penultimate or !ultimate) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: expected at least 6 fields"; continue; } Allele allele; string string_chr(current_chr); if (ref_map.find(string_chr) != ref_map.end()) allele.chr_idx = ref_map[string_chr]; else if (ref_map.find("chr"+string_chr) != ref_map.end()) allele.chr_idx = ref_map["chr"+string_chr]; else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end()) allele.chr_idx = ref_map["chrM"]; else { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Unknown chromosome name: "; line_status.back().filter_message = string_chr; continue; } allele.pos = strtol(current_start,NULL,10); allele.id = current_id; char *current_ref = NULL; char *current_alt = NULL; for (char *next = strtok(penultimate, ";"); next; next = strtok(NULL, ";")) { if (strncmp(next,"REF=",4) == 0) current_ref = next; else if (strncmp(next,"OBS=",4) == 0) current_alt = next; else if (strncmp(next,"ANCHOR=",7) == 0) { // ignore ANCHOR } else { char *value = next; while (*value and *value != '=') ++value; if (*value == '=') *value++ = 0; allele.custom_tags[next] = value; } } if (!current_ref or !current_alt) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: 
REF and OBS fields required in penultimate column"; continue; } for (char *pos = current_ref+4; *pos; ++pos) allele.ref += toupper(*pos); for (char *pos = current_alt+4; *pos; ++pos) allele.alt += toupper(*pos); // here is the place to check the length of the hotspot cover the amplicon junction. ZZ /* if (junc.contain(allele.chr_idx, allele.pos, (unsigned int) allele.ref.size())) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "hotspot BED line contain the complete overlapping region of two amplicon, the variant cannot be detected by tvc"; continue; } if (not junc.contained_in_ampl(allele.chr_idx, allele.pos, (unsigned int) allele.ref.size())) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "hotspot BED line is not contained in any amplicon, the variant cannot be detected by tvc"; continue; } */ allele.filtered = false; line_status.push_back(LineStatus(line_number)); allele.line_status = &line_status.back(); allele.opos = allele.pos; allele.oref = allele.ref; allele.oalt = allele.alt; alleles[allele.chr_idx].push_back(allele); //line_status.back().allele = &alleles[allele.chr_idx].back(); line_status.back().chr_idx = allele.chr_idx; line_status.back().opos = allele.opos; line_status.back().id = allele.id; } fclose(input); } if (!input_vcf_filename.empty() or !input_real_vcf_filename.empty()) { bool real_vcf = false; FILE *input; FILE *out_real = NULL; FILE *out_hot = NULL; int fake_ = 0; int hn = 1; if (!input_real_vcf_filename.empty()) { real_vcf = true; input = fopen(input_real_vcf_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_real_vcf_filename.c_str()); return 1; } out_real = fopen(output_vcf_filename.c_str(), "w"); if (!out_real) { fprintf(stderr,"ERROR: Cannot open %s\n", output_vcf_filename.c_str()); return 1; } if (!output_hot_vcf.empty()) { out_hot = fopen(output_hot_vcf.c_str(), "w"); if (!out_hot) { fprintf(stderr,"ERROR: Cannot 
open %s\n", output_hot_vcf.c_str()); return 1; } } else out_hot = stdout; fprintf(out_hot, "##fileformat=VCFv4.1\n##allowBlockSubstitutions=true\n#CHROM POS ID REF ALT QUAL FILTER INFO\n"); } else { input = fopen(input_vcf_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_vcf_filename.c_str()); return 1; } } char line2[65536]; char line3[65536]; int line_number = 0; bool line_overflow = false; list<one_vcfline> vcflist; char last_chr[1024] = ""; while (fgets(line2, 65536, input) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } line_number++; if (line_overflow) { line_overflow = false; line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot VCF line: line length exceeds 64K"; continue; } if (strncmp(line2, "##allowBlockSubstitutions=true", 30) == 0) { allow_block_substitutions = true; continue; } if (line2[0] == '#') { if (out_real) { fprintf(out_real, "%s", line2);} continue; } if (real_vcf) strcpy(line3, line2); char *current_chr = strtok(line2, "\t\r\n"); char *current_start = strtok(NULL, "\t\r\n"); char *current_id = strtok(NULL, "\t\r\n"); char *current_ref = strtok(NULL, "\t\r\n"); char *current_alt = strtok(NULL, "\t\r\n"); strtok(NULL, "\t\r\n"); // Ignore QUAL strtok(NULL, "\t\r\n"); // Ignore FILTER char *current_info = strtok(NULL, "\t\r\n"); strtok(NULL, "\t\r\n"); char *gt = strtok(NULL, "\t\r\n"); if (!current_chr or !current_start or !current_id or !current_ref or !current_alt) { line_status.push_back(LineStatus(line_number)); if (real_vcf) line_status.back().filter_message_prefix = "Malformed real VCF line: expected at least 5 fields"; else line_status.back().filter_message_prefix = "Malformed hotspot VCF line: expected at least 5 fields"; continue; } string string_chr(current_chr); int chr_idx = 0; if (ref_map.find(string_chr) != ref_map.end()) chr_idx = ref_map[string_chr]; else if 
(ref_map.find("chr"+string_chr) != ref_map.end()) chr_idx = ref_map["chr"+string_chr]; else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end()) chr_idx = ref_map["chrM"]; else { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Unknown chromosome name: "; line_status.back().filter_message = string_chr; continue; } for (char *pos = current_ref; *pos; ++pos) *pos = toupper(*pos); for (char *pos = current_alt; *pos; ++pos) *pos = toupper(*pos); // Process custom tags vector<string> bstrand; vector<string> hp_max_length; string raw_oid; string raw_omapalt; string raw_oalt; string raw_oref; string raw_opos; if (current_info) { string raw_bstrand; string raw_hp_max_length; for (char *next = strtok(current_info, ";"); next; next = strtok(NULL, ";")) { char *value = next; while (*value and *value != '=') ++value; if (*value == '=') *value++ = 0; if (strcmp(next, "TYPE") == 0) continue; if (strcmp(next, "HRUN") == 0) continue; if (strcmp(next, "HBASE") == 0) continue; if (strcmp(next, "FR") == 0) continue; if (strcmp(next, "OPOS") == 0) { raw_opos = value; continue; } if (strcmp(next, "OREF") == 0) { raw_oref = value; continue; } if (strcmp(next, "OALT") == 0) { raw_oalt = value; continue; } if (strcmp(next, "OID") == 0) { raw_oid = value; continue; } if (strcmp(next, "OMAPALT") == 0) { raw_omapalt = value; continue; } if (strcmp(next, "BSTRAND") == 0) { raw_bstrand = value; continue; } if (strcmp(next, "hp_max_length") == 0) { raw_hp_max_length = value; continue; } } if (not raw_bstrand.empty()) split(raw_bstrand, ',', bstrand); if (not raw_hp_max_length.empty()) split(raw_hp_max_length, ',', hp_max_length); } if (real_vcf) { //fprintf(stderr, "%s\n", gt); if (gt == NULL) continue; // get gt int g1 = atoi(gt), g2; gt = strchr(gt, '/'); if (gt) g2 = atoi(gt+1); else {fprintf(stderr, "GT not formatted right\n"); exit(1);} //if (g1 == 0 and g2 == 0) continue; unsigned int cur_pos = atoi(current_start); one_vcfline 
newline(current_ref, current_alt, cur_pos, g1, g2, line3); bool new_chr = false; if (strcmp(current_chr, last_chr) != 0) { new_chr = true; } while (not vcflist.empty()) { if ((not new_chr) and vcflist.front().pos+strlen(vcflist.front().ref) > cur_pos) break; if (vcflist.front().produce_hot_vcf(last_chr, out_real, hn, out_hot)) fake_++; vcflist.pop_front(); } if (new_chr) strcpy(last_chr, current_chr); for (list<one_vcfline>::iterator it = vcflist.begin(); it != vcflist.end(); it++) { it->check_subset(newline); } if (not newline.alts.empty()) vcflist.push_back(newline); continue; } unsigned int allele_idx = 0; for (char *sub_alt = strtok(current_alt,","); sub_alt; sub_alt = strtok(NULL,",")) { Allele allele; allele.chr_idx = chr_idx; allele.ref = current_ref; allele.alt = sub_alt; allele.pos = strtol(current_start,NULL,10)-1; allele.id = current_id; if (allele.id == ".") allele.id = "hotspot"; allele.filtered = false; line_status.push_back(LineStatus(line_number)); allele.line_status = &line_status.back(); allele.opos = allele.pos; allele.oref = allele.ref; allele.oalt = allele.alt; if (allele_idx < bstrand.size()) { if (bstrand[allele_idx] != ".") allele.custom_tags["BSTRAND"] = bstrand[allele_idx]; } if (allele_idx < hp_max_length.size()) { if (hp_max_length[allele_idx] != ".") allele.custom_tags["hp_max_length"] = hp_max_length[allele_idx]; } alleles[allele.chr_idx].push_back(allele); //line_status.back().allele = &alleles[allele.chr_idx].back(); line_status.back().chr_idx = allele.chr_idx; line_status.back().opos = allele.opos; line_status.back().id = allele.id; allele_idx++; } } fclose(input); if (real_vcf) { while (not vcflist.empty()) { if (vcflist.front().produce_hot_vcf(last_chr, out_real, hn, out_hot)) fake_++; vcflist.pop_front(); } fclose(out_real); fclose(out_hot); if (fake_ > 0) return 0; else return 1; } } // Process by chromosome: // - Verify reference allele // - Left align // - Sort // - Filter for block substitutions, write FILE *output_vcf = 
NULL; if (!output_vcf_filename.empty()) { output_vcf = fopen(output_vcf_filename.c_str(), "w"); if (!output_vcf) { fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_vcf_filename.c_str()); return 1; } fprintf(output_vcf, "##fileformat=VCFv4.1\n"); if (allow_block_substitutions) fprintf(output_vcf, "##allowBlockSubstitutions=true\n"); fprintf(output_vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"); } FILE *output_bed = NULL; if (!output_bed_filename.empty()) { output_bed = fopen(output_bed_filename.c_str(), "w"); if (!output_bed) { fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_bed_filename.c_str()); if (output_vcf) fclose(output_vcf); return 1; } if (allow_block_substitutions) fprintf(output_bed, "track name=\"hotspot\" type=bedDetail allowBlockSubstitutions=true\n"); else fprintf(output_bed, "track name=\"hotspot\" type=bedDetail\n"); } for (int chr_idx = 0; chr_idx < (int)ref_index.size(); ++chr_idx) { for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ++A) { // check bed file if (junc.contain(A->chr_idx, A->pos, (unsigned int) A->ref.size())) { A->filtered = true; A->line_status->filter_message_prefix = "hotspot BED line contain the complete overlapping region of two amplicon, the variant cannot be detected by tvc"; continue; } if (not junc.contained_in_ampl(A->chr_idx, A->pos, (unsigned int) A->ref.size())) { A->filtered = true; A->line_status->filter_message_prefix = "hotspot BED line is not contained in any amplicon, the variant cannot be detected by tvc"; continue; } // Invalid characters bool valid = true; for (const char *c = A->ref.c_str(); *c ; ++c) if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T') valid = false; for (const char *c = A->alt.c_str(); *c ; ++c) if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T') valid = false; if (not valid) { A->filtered = true; A->line_status->filter_message_prefix = "REF and/or ALT contain characters other than ACGT: "; 
A->line_status->filter_message = "REF = " + A->ref + " ALT = " + A->alt; continue; } // Filter REF == ALT if (A->ref == A->alt) { A->filtered = true; A->line_status->filter_message_prefix = "REF and ALT alleles equal"; continue; } // Confirm reference allele. string ref_expected; for (int idx = 0; idx < (int) A->ref.size(); ++idx) ref_expected += ref_index[chr_idx].base(A->pos + idx); if (A->ref != ref_expected) { A->filtered = true; A->line_status->filter_message_prefix = "Provided REF allele does not match reference: "; A->line_status->filter_message = "Expected " + ref_expected + ", found " + A->ref; continue; } // Trim int ref_start = 0; int ref_end = A->ref.size(); int alt_end = A->alt.size(); // Option 1: trim all trailing bases; //while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) { // --ref_end; // --alt_end; //} // Option 2: trim all leading basees; //while (ref_start < ref_end and ref_start < alt_end and A->ref[ref_start] == A->alt[ref_start]) // ++ref_start; // Option 3: trim anchor base if vcf if (!input_vcf_filename.empty()) { if (ref_end and alt_end and (ref_end == 1 or alt_end == 1) and A->ref[0] == A->alt[0]) ref_start = 1; } A->pos += ref_start; A->ref = A->ref.substr(ref_start, ref_end-ref_start); A->alt = A->alt.substr(ref_start, alt_end-ref_start); ref_end -= ref_start; alt_end -= ref_start; // Left align if (left_alignment && A->custom_tags.find("BSTRAND") == A->custom_tags.end()) { // black list variant not to be left aligned. 
string trailing; int can_do = 0, need_do = 0; int ref_end_orig= ref_end, alt_end_orig = alt_end; while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) { ref_end--; alt_end--; } if (ref_end == 0 || alt_end == 0) { can_do = need_do = 1; // indel type, ZZ } else { int tmp_start = ref_start; int ref_end_0 = ref_end, alt_end_0 = alt_end; // end after remove trailing match ZZ while (tmp_start < ref_end and tmp_start < alt_end and A->ref[tmp_start] == A->alt[tmp_start]) ++tmp_start; if (tmp_start == ref_end || tmp_start == alt_end) { can_do = 1; need_do = 0; // indel but indel is not at the left. ZZ } else { ref_end--; alt_end--; while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) { ref_end--; alt_end--; } if (ref_end == 0 || alt_end == 0) { // complex with 1 bp MM at right end can_do = need_do = 1; if (ref_end + alt_end == 0) need_do = 0; // SNP } else { int tmp_start0 = tmp_start; // start after removing leading matches tmp_start++; while (tmp_start < ref_end_orig and tmp_start < alt_end_orig and A->ref[tmp_start] == A->alt[tmp_start]) tmp_start++; if (tmp_start >= ref_end_0 || tmp_start >= alt_end_0 || ref_end <= tmp_start0 || alt_end <= tmp_start0) { // 1MM plus indel in middle, by definition cannot move the indel left enough to change A->pos can_do = 1; need_do = 0; } // else real complex } } } if (!can_do or !need_do) { // do nothing // if !can_do need add some more DP ref_end = ref_end_orig; alt_end = alt_end_orig; } else { // left align the indel part, here either ref_end = 0 or alt_end = 0 int opos = A->pos; while (A->pos > 0) { char nuc = ref_index[chr_idx].base(A->pos-1); if (ref_end > 0 and A->ref[ref_end-1] != nuc) break; if (alt_end > 0 and A->alt[alt_end-1] != nuc) break; A->ref = string(1,nuc) + A->ref; A->alt = string(1,nuc) + A->alt; A->pos--; } if (ref_end != ref_end_orig) { // trailing part is aligned, the whole ref and alt need to be kept. 
ZZ ref_end = A->ref.size(); alt_end = A->alt.size(); } if (junc.contain(chr_idx, A->pos, ref_end) or not junc.contained_in_ampl(chr_idx, A->pos, ref_end)) { // after left align the hotspot contain an overlap region, revert to the original ZZ if (opos != A->pos) { A->ref.erase(0, opos-A->pos); A->alt.erase(0, opos-A->pos); A->pos = opos; ref_end = ref_end_orig; alt_end = alt_end_orig; } } } } A->ref.resize(ref_end); A->alt.resize(alt_end); // Filter block substitutions: take 1 if (ref_end > 0 and alt_end > 0 and ref_end != alt_end and not allow_block_substitutions and not filter_bypass) { A->filtered = true; A->line_status->filter_message_prefix = "Block substitutions not supported"; continue; } } if (output_bed) { // Sort - without anchor base stable_sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles); // Write for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) { if (I->filtered) continue; fprintf(output_bed, "%s\t%ld\t%ld\t%s\tREF=%s;OBS=%s", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str()); for (map<string,string>::iterator C = I->custom_tags.begin(); C != I->custom_tags.end(); ++C) fprintf(output_bed, ";%s=%s", C->first.c_str(), C->second.c_str()); fprintf(output_bed, "\tNONE\n"); /* if (I->pos) fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=%c\tNONE\n", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str(), ref_index[chr_idx].base(I->pos-1)); else fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=\tNONE\n", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str()); */ } } if (output_vcf) { // Add anchor base to indels for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) { if (I->filtered) continue; if (not I->ref.empty() and not I->alt.empty()) continue; if 
(I->pos == 0) { I->filtered = true; I->line_status->filter_message_prefix = "INDELs at chromosome start not supported"; continue; } I->pos--; I->ref = string(1,ref_index[chr_idx].base(I->pos)) + I->ref; I->alt = string(1,ref_index[chr_idx].base(I->pos)) + I->alt; } // Sort - with anchor base stable_sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles); // Merge alleles, remove block substitutions, write for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ) { string max_ref; deque<Allele>::iterator B = A; for (; B != alleles[chr_idx].end() and B->pos == A->pos; ++B) if (!B->filtered and max_ref.size() < B->ref.size()) max_ref = B->ref; bool filtered = true; map<string,set<string> > unique_alts_and_ids; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; string new_alt = I->alt + max_ref.substr(I->ref.size()); if (new_alt.size() > 1 and max_ref.size() > 1 and new_alt.size() != max_ref.size() and not allow_block_substitutions and not filter_bypass) { I->filtered = true; I->line_status->filter_message_prefix = "Block substitutions not supported (post-merge)"; continue; } I->ref = max_ref; I->alt = new_alt; // Filter alleles with duplicate ALT + ID pairs map<string,set<string> >::iterator alt_iter = unique_alts_and_ids.find(new_alt); if (alt_iter != unique_alts_and_ids.end()) { if (alt_iter->second.count(I->id) > 0) { I->filtered = true; I->line_status->filter_message_prefix = "Duplicate allele and ID"; continue; } } unique_alts_and_ids[new_alt].insert(I->id); filtered = false; } if (not filtered) { fprintf(output_vcf, "%s\t%ld\t.\t%s\t", ref_index[chr_idx].chr.c_str(), A->pos+1, max_ref.c_str()); bool comma = false; map<string,map<string,string> > unique_alts_and_tags; set<string> unique_tags; set<string> unique_alt_alleles; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; unique_alts_and_tags[I->alt].insert(I->custom_tags.begin(), I->custom_tags.end()); for 
(map<string,string>::iterator S = I->custom_tags.begin(); S != I->custom_tags.end(); ++S) unique_tags.insert(S->first); if (unique_alt_alleles.count(I->alt) > 0) continue; unique_alt_alleles.insert(I->alt); if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->alt.c_str()); } /* for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; map<string,map<string,string> >::iterator Q = unique_alts_and_tags.find(I->alt); if (comma) fprintf(output_vcf, ","); comma = true; if (Q == unique_alts_and_tags.end()) {fprintf(output_vcf, "."); continue;} fprintf(output_vcf, "%s", Q->first.c_str()); } */ fprintf(output_vcf, "\t.\t.\tOID="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->id.c_str()); } fprintf(output_vcf, ";OPOS="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%ld", I->opos+1); } fprintf(output_vcf, ";OREF="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->oref.c_str()); } fprintf(output_vcf, ";OALT="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->oalt.c_str()); } fprintf(output_vcf, ";OMAPALT="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->alt.c_str()); } for (set<string>::iterator S = unique_tags.begin(); S != unique_tags.end(); ++S) { fprintf(output_vcf, ";%s=", S->c_str()); comma=false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; map<string,map<string,string> 
>::iterator Q = unique_alts_and_tags.find(I->alt); if (comma) fprintf(output_vcf, ","); comma = true; if (Q == unique_alts_and_tags.end()) {fprintf(output_vcf, "."); continue;} map<string,string>::iterator W = Q->second.find(*S); if (W == Q->second.end()) fprintf(output_vcf, "."); else fprintf(output_vcf, "%s", W->second.c_str()); } } // fprintf(output_vcf, ";%s=%s", S->first.c_str(), S->second.c_str()); fprintf(output_vcf, "\n"); } A = B; } } } if (output_bed) { fflush(output_bed); fclose(output_bed); } if (output_vcf) { fflush(output_vcf); fclose(output_vcf); } int lines_ignored = 0; for (deque<LineStatus>::iterator L = line_status.begin(); L != line_status.end(); ++L) { if (L->filter_message_prefix) { if (L->chr_idx >= 0) printf("Line %d ignored: [%s:%ld %s] %s%s\n", L->line_number, ref_index[L->chr_idx].chr.c_str(), L->opos+1, L->id.c_str(), L->filter_message_prefix, L->filter_message.c_str()); else printf("Line %d ignored: %s%s\n", L->line_number, L->filter_message_prefix, L->filter_message.c_str()); lines_ignored++; } } printf("Ignored %d out of %d lines\n", lines_ignored, (int)line_status.size()); munmap(ref, ref_stat.st_size); close(ref_handle); if (lines_ignored > 0 and strict_check) return 1; return 0; }
int main (int argc, const char *argv[]) { time_t program_start_time; time(&program_start_time); Json::Value calibration_json(Json::objectValue); DumpStartingStateOfProgram (argc,argv,program_start_time, calibration_json["Calibration"]); // // Step 1. Process command line options // OptArgs opts; opts.ParseCmdLine(argc, argv); // enable floating point exceptions during program execution if (opts.GetFirstBoolean('-', "float-exceptions", true)) { cout << "Calibration: Floating point exceptions enabled." << endl; feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); } //*/ CalibrationContext calib_context; if (not calib_context.InitializeFromOpts(opts)){ PrintHelp_CalModules(); } HistogramCalibration master_histogram(opts, calib_context); calib_context.hist_calibration_master = &master_histogram; LinearCalibrationModel master_linear_model(opts, calib_context); calib_context.linear_model_master = &master_linear_model; opts.CheckNoLeftovers(); // // Step 2. Execute threaded calibration // int calibration_thread_time = 0; if (calib_context.successive_fit) { // first train linear model if (master_linear_model.DoTraining()) { int l_thread_time = 0; for (int i_iteration=0; i_iteration<calib_context.num_train_iterations; i_iteration++) { cout << " -Training Iteration " << i_iteration+1; l_thread_time = ExecuteThreadedCalibrationTraining(calib_context); // Activate master linear model after every round of training master_linear_model.CreateCalibrationModel(false); // make linear model master_linear_model.SetModelGainsAndOffsets(); // expand for use in basecalling calibration_thread_time += l_thread_time; calib_context.bam_reader.Rewind(); // reset all files for another pass cout << " Duration = " << l_thread_time << endl; } } // Then apply it during polish model training if (master_histogram.DoTraining()) { calib_context.local_fit_linear_model = false; calib_context.local_fit_polish_model = true; calibration_thread_time += ExecuteThreadedCalibrationTraining(calib_context); 
} } else { // Single pass in which both models are fit jointly calibration_thread_time=ExecuteThreadedCalibrationTraining(calib_context); } // // Step 3. Create models, write output, and close modules // // Linear Model if (master_linear_model.CreateCalibrationModel()) master_linear_model.ExportModelToJson(calibration_json["LinearModel"], ""); // HP histogram calibration if (master_histogram.CreateCalibrationModel()) master_histogram.ExportModelToJson(calibration_json["HPHistogram"]); // Transfer stuff from calibration context and close bam reader calib_context.Close(calibration_json["Calibration"]); time_t program_end_time; time(&program_end_time); calibration_json["Calibration"]["end_time"] = get_time_iso_string(program_end_time); calibration_json["Calibration"]["total_duration"] = (Json::Int)difftime(program_end_time,program_start_time); calibration_json["Calibration"]["calibration_duration"] = (Json::Int)calibration_thread_time; SaveJson(calibration_json, calib_context.filename_json); return EXIT_SUCCESS; }
int PrepareHotspots(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bed_filename = opts.GetFirstString ('b', "input-bed", ""); string input_vcf_filename = opts.GetFirstString ('v', "input-vcf", ""); string output_bed_filename = opts.GetFirstString ('d', "output-bed", ""); string output_vcf_filename = opts.GetFirstString ('o', "output-vcf", ""); string reference_filename = opts.GetFirstString ('r', "reference", ""); bool left_alignment = opts.GetFirstBoolean('a', "left-alignment", false); bool filter_bypass = opts.GetFirstBoolean('f', "filter-bypass", false); bool allow_block_substitutions = opts.GetFirstBoolean('s', "allow-block-substitutions", false); opts.CheckNoLeftovers(); if((input_bed_filename.empty() == input_vcf_filename.empty()) or (output_bed_filename.empty() and output_vcf_filename.empty()) or reference_filename.empty()) { PrepareHotspotsHelp(); return 1; } // Populate chromosome list from reference.fai // Use mmap to fetch the entire reference int ref_handle = open(reference_filename.c_str(),O_RDONLY); struct stat ref_stat; fstat(ref_handle, &ref_stat); char *ref = (char *)mmap(0, ref_stat.st_size, PROT_READ, MAP_SHARED, ref_handle, 0); FILE *fai = fopen((reference_filename+".fai").c_str(), "r"); if (!fai) { fprintf(stderr, "ERROR: Cannot open %s.fai\n", reference_filename.c_str()); return 1; } vector<Reference> ref_index; map<string,int> ref_map; char line[1024], chrom_name[1024]; while (fgets(line, 1024, fai) != NULL) { Reference ref_entry; long chr_start; if (5 != sscanf(line, "%s\t%ld\t%ld\t%d\t%d", chrom_name, &ref_entry.size, &chr_start, &ref_entry.bases_per_line, &ref_entry.bytes_per_line)) continue; ref_entry.chr = chrom_name; ref_entry.start = ref + chr_start; ref_index.push_back(ref_entry); ref_map[ref_entry.chr] = (int) ref_index.size() - 1; } fclose(fai); // Load input BED or load input VCF, group by chromosome deque<LineStatus> line_status; vector<deque<Allele> > alleles(ref_index.size()); if 
(!input_bed_filename.empty()) { FILE *input = fopen(input_bed_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_bed_filename.c_str()); return 1; } char line2[65536]; int line_number = 0; bool line_overflow = false; while (fgets(line2, 65536, input) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } line_number++; if (line_overflow) { line_overflow = false; line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: line length exceeds 64K"; continue; } if (strncmp(line2, "browser", 7) == 0) continue; if (strncmp(line2, "track", 5) == 0) { if (string::npos != string(line2).find("allowBlockSubstitutions=true")) allow_block_substitutions = true; continue; } char *current_chr = strtok(line2, "\t\r\n"); char *current_start = strtok(NULL, "\t\r\n"); char *current_end = strtok(NULL, "\t\r\n"); char *current_id = strtok(NULL, "\t\r\n"); char *penultimate = strtok(NULL, "\t\r\n"); char *ultimate = strtok(NULL, "\t\r\n"); for (char *next = strtok(NULL, "\t\r\n"); next; next = strtok(NULL, "\t\r\n")) { penultimate = ultimate; ultimate = next; } if (!current_chr or !current_start or !current_end or !current_id or !penultimate or !ultimate) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: expected at least 6 fields"; continue; } Allele allele; string string_chr(current_chr); if (ref_map.find(string_chr) != ref_map.end()) allele.chr_idx = ref_map[string_chr]; else if (ref_map.find("chr"+string_chr) != ref_map.end()) allele.chr_idx = ref_map["chr"+string_chr]; else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end()) allele.chr_idx = ref_map["chrM"]; else { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Unknown chromosome name: "; line_status.back().filter_message = string_chr; continue; 
} allele.pos = strtol(current_start,NULL,10); allele.id = current_id; char *current_ref = NULL; char *current_alt = NULL; for (char *next = strtok(penultimate, ";"); next; next = strtok(NULL, ";")) { if (strncmp(next,"REF=",4) == 0) current_ref = next; else if (strncmp(next,"OBS=",4) == 0) current_alt = next; } if (!current_ref or !current_alt) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: REF and OBS fields required in penultimate column"; continue; } for (char *pos = current_ref+4; *pos; ++pos) allele.ref += toupper(*pos); for (char *pos = current_alt+4; *pos; ++pos) allele.alt += toupper(*pos); allele.filtered = false; line_status.push_back(LineStatus(line_number)); allele.line_status = &line_status.back(); allele.opos = allele.pos; allele.oref = allele.ref; allele.oalt = allele.alt; alleles[allele.chr_idx].push_back(allele); line_status.back().allele = &alleles[allele.chr_idx].back(); } fclose(input); } if (!input_vcf_filename.empty()) { FILE *input = fopen(input_vcf_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_vcf_filename.c_str()); return 1; } char line2[65536]; int line_number = 0; bool line_overflow = false; while (fgets(line2, 65536, input) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } line_number++; if (line_overflow) { line_overflow = false; line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot VCF line: line length exceeds 64K"; continue; } if (strncmp(line2, "##allowBlockSubstitutions=true", 30) == 0) { allow_block_substitutions = true; continue; } if (line2[0] == '#') continue; char *current_chr = strtok(line2, "\t\r\n"); char *current_start = strtok(NULL, "\t\r\n"); char *current_id = strtok(NULL, "\t\r\n"); char *current_ref = strtok(NULL, "\t\r\n"); char *current_alt = strtok(NULL, "\t\r\n"); if 
(!current_chr or !current_start or !current_id or !current_ref or !current_alt) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot VCF line: expected at least 5 fields"; continue; } string string_chr(current_chr); int chr_idx = 0; if (ref_map.find(string_chr) != ref_map.end()) chr_idx = ref_map[string_chr]; else if (ref_map.find("chr"+string_chr) != ref_map.end()) chr_idx = ref_map["chr"+string_chr]; else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end()) chr_idx = ref_map["chrM"]; else { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Unknown chromosome name: "; line_status.back().filter_message = string_chr; continue; } for (char *pos = current_ref; *pos; ++pos) *pos = toupper(*pos); for (char *pos = current_alt; *pos; ++pos) *pos = toupper(*pos); for (char *sub_alt = strtok(current_alt,","); sub_alt; sub_alt = strtok(NULL,",")) { Allele allele; allele.chr_idx = chr_idx; allele.ref = current_ref; allele.alt = sub_alt; allele.pos = strtol(current_start,NULL,10)-1; allele.id = current_id; if (allele.id == ".") allele.id = "hotspot"; allele.filtered = false; line_status.push_back(LineStatus(line_number)); allele.line_status = &line_status.back(); allele.opos = allele.pos; allele.oref = allele.ref; allele.oalt = allele.alt; alleles[allele.chr_idx].push_back(allele); line_status.back().allele = &alleles[allele.chr_idx].back(); } } fclose(input); } // Process by chromosome: // - Verify reference allele // - Left align // - Sort // - Filter for block substitutions, write FILE *output_vcf = NULL; if (!output_vcf_filename.empty()) { output_vcf = fopen(output_vcf_filename.c_str(), "w"); if (!output_vcf) { fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_vcf_filename.c_str()); return 1; } fprintf(output_vcf, "##fileformat=VCFv4.1\n"); if (allow_block_substitutions) fprintf(output_vcf, "##allowBlockSubstitutions=true\n"); fprintf(output_vcf, 
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"); } FILE *output_bed = NULL; if (!output_bed_filename.empty()) { output_bed = fopen(output_bed_filename.c_str(), "w"); if (!output_bed) { fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_bed_filename.c_str()); if (output_vcf) fclose(output_vcf); return 1; } if (allow_block_substitutions) fprintf(output_bed, "track name=\"hotspot\" type=bedDetail allowBlockSubstitutions=true\n"); else fprintf(output_bed, "track name=\"hotspot\" type=bedDetail\n"); } for (int chr_idx = 0; chr_idx < (int)ref_index.size(); ++chr_idx) { for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ++A) { // Invalid characters bool valid = true; for (const char *c = A->ref.c_str(); *c ; ++c) if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T') valid = false; for (const char *c = A->alt.c_str(); *c ; ++c) if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T') valid = false; if (not valid) { A->filtered = true; A->line_status->filter_message_prefix = "REF and/or ALT contain characters other than ACGT: "; A->line_status->filter_message = "REF = " + A->ref + " ALT = " + A->alt; continue; } // Filter REF == ALT if (A->ref == A->alt) { A->filtered = true; A->line_status->filter_message_prefix = "REF and ALT alleles equal"; continue; } // Confirm reference allele. 
string ref_expected; for (int idx = 0; idx < (int) A->ref.size(); ++idx) ref_expected += ref_index[chr_idx].base(A->pos + idx); if (A->ref != ref_expected) { A->filtered = true; A->line_status->filter_message_prefix = "Provided REF allele does not match reference: "; A->line_status->filter_message = "Expected " + ref_expected + ", found " + A->ref; continue; } // Trim int ref_start = 0; int ref_end = A->ref.size(); int alt_end = A->alt.size(); // Option 1: trim all trailing bases //while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) { // --ref_end; // --alt_end; //} // Option 2: trim all leading basees //while (ref_start < ref_end and ref_start < alt_end and A->ref[ref_start] == A->alt[ref_start]) // ++ref_start; // Option 3: trim anchor base if vcf if (!input_vcf_filename.empty()) { if (ref_end and alt_end and (ref_end == 1 or alt_end == 1) and A->ref[0] == A->alt[0]) ref_start = 1; } A->pos += ref_start; A->ref = A->ref.substr(ref_start, ref_end-ref_start); A->alt = A->alt.substr(ref_start, alt_end-ref_start); ref_end -= ref_start; alt_end -= ref_start; // Left align if (left_alignment) { while (A->pos > 0) { char nuc = ref_index[chr_idx].base(A->pos-1); if (ref_end > 0 and A->ref[ref_end-1] != nuc) break; if (alt_end > 0 and A->alt[alt_end-1] != nuc) break; A->ref = string(1,nuc) + A->ref; A->alt = string(1,nuc) + A->alt; A->pos--; } } A->ref.resize(ref_end); A->alt.resize(alt_end); // Filter block substitutions: take 1 if (ref_end > 0 and alt_end > 0 and ref_end != alt_end and not allow_block_substitutions and not filter_bypass) { A->filtered = true; A->line_status->filter_message_prefix = "Block substitutions not supported"; continue; } } if (output_bed) { // Sort - without anchor base sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles); // Write for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) { if (I->filtered) continue; if (I->pos) fprintf(output_bed, 
"%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=%c\tNONE\n", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str(), ref_index[chr_idx].base(I->pos-1)); else fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=\tNONE\n", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str()); } } if (output_vcf) { // Add anchor base to indels for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) { if (I->filtered) continue; if (not I->ref.empty() and not I->alt.empty()) continue; if (I->pos == 0) { I->filtered = true; I->line_status->filter_message_prefix = "INDELs at chromosome start not supported"; continue; } I->pos--; I->ref = string(1,ref_index[chr_idx].base(I->pos)) + I->ref; I->alt = string(1,ref_index[chr_idx].base(I->pos)) + I->alt; } // Sort - with anchor base sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles); // Merge alleles, remove block substitutions, write for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ) { string max_ref; deque<Allele>::iterator B = A; for (; B != alleles[chr_idx].end() and B->pos == A->pos; ++B) if (!B->filtered and max_ref.size() < B->ref.size()) max_ref = B->ref; bool filtered = true; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; string new_alt = I->alt + max_ref.substr(I->ref.size()); if (new_alt.size() > 1 and max_ref.size() > 1 and new_alt.size() != max_ref.size() and not allow_block_substitutions and not filter_bypass) { I->filtered = true; I->line_status->filter_message_prefix = "Block substitutions not supported (post-merge)"; continue; } I->ref = max_ref; I->alt = new_alt; filtered = false; } if (not filtered) { fprintf(output_vcf, "%s\t%ld\t.\t%s\t", ref_index[chr_idx].chr.c_str(), A->pos+1, max_ref.c_str()); bool comma = false; set<string> unique_alt_alleles; for 
(deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (unique_alt_alleles.count(I->alt) > 0) continue; unique_alt_alleles.insert(I->alt); if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->alt.c_str()); } fprintf(output_vcf, "\t.\t.\tOID="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->id.c_str()); } fprintf(output_vcf, ";OPOS="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%ld", I->opos+1); } fprintf(output_vcf, ";OREF="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->oref.c_str()); } fprintf(output_vcf, ";OALT="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->oalt.c_str()); } fprintf(output_vcf, ";OMAPALT="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->alt.c_str()); } fprintf(output_vcf, "\n"); } A = B; } } } if (output_bed) { fflush(output_bed); fclose(output_bed); } if (output_vcf) { fflush(output_vcf); fclose(output_vcf); } int lines_ignored = 0; for (deque<LineStatus>::iterator L = line_status.begin(); L != line_status.end(); ++L) { if (L->filter_message_prefix) { if (L->allele) printf("Line %d ignored: [%s:%ld %s] %s%s\n", L->line_number, ref_index[L->allele->chr_idx].chr.c_str(), L->allele->opos+1, L->allele->id.c_str(), L->filter_message_prefix, L->filter_message.c_str()); else printf("Line %d ignored: %s%s\n", L->line_number, L->filter_message_prefix, L->filter_message.c_str()); 
lines_ignored++; } } printf("Ignored %d out of %d lines\n", lines_ignored, (int)line_status.size()); munmap(ref, ref_stat.st_size); close(ref_handle); return 0; }
int main(int argc, const char* argv[]) { printf ("tvcvalidator %s-%s (%s) - Prototype tvc validation tool\n\n", IonVersion::GetVersion().c_str(), IonVersion::GetRelease().c_str(), IonVersion::GetSvnRev().c_str()); if (argc == 1) { VariantValidatorHelp(); return 1; } OptArgs opts; opts.ParseCmdLine(argc, argv); if (opts.GetFirstBoolean('v', "version", false)) { return 0; } if (opts.GetFirstBoolean('h', "help", false)) { VariantValidatorHelp(); return 0; } string input_vcf_filename = opts.GetFirstString ('i', "input-vcf", ""); string truth_filename = opts.GetFirstString ('t', "truth-file", ""); string truth_dir = opts.GetFirstString ('d', "truth-dir", "/results/plugins/validateVariantCaller/files"); // TODO: reference optional, only used to verify reference allele in input-vcf and truth files //string reference_filename = opts.GetFirstString ('r', "reference", ""); opts.CheckNoLeftovers(); // // Step 1. Load input VCF file into memory // if (input_vcf_filename.empty()) { VariantValidatorHelp(); cerr << "ERROR: Input VCF file not specified " << endl; return 1; } VariantCallerResults results_vcf; results_vcf.load_vcf(input_vcf_filename); printf("Loaded VCF %s with %d variant calls\n", input_vcf_filename.c_str(), (int)results_vcf.variants.size()); // // Step 2. Parse truth files, compare them to the input vcf, and compute match scores // if (not truth_filename.empty()) { ValidatorTruth truth; truth.ReadTruthFile(truth_filename); truth.CompareToCalls(results_vcf); return 0; } truth_dir += "/*.bed"; glob_t glob_result; glob(truth_dir.c_str(), GLOB_TILDE, NULL, &glob_result); for(unsigned int i = 0; i < glob_result.gl_pathc; ++i) { ValidatorTruth truth; truth.ReadTruthFile(string(glob_result.gl_pathv[i])); truth.CompareToCalls(results_vcf); } globfree(&glob_result); return 0; }
// Builds the filter configuration from command line options.
//
//   opts        parsed command line; every filter and trimming option is read here
//   _flowOrder  flow order string, stored as given
//   _numFlows   number of flows, stored as given
//   _keys       key sequences; exactly two entries are expected (asserted)
//   _maskPtr    mask whose H() x W() dimensions size the per-well filter mask
//
// Terminates the process with EXIT_FAILURE when min-read-length or
// cr-filter-max-value is not positive, or when beverly-filter cannot be parsed.
BaseCallerFilters::BaseCallerFilters(OptArgs& opts,
    const string& _flowOrder, int _numFlows, const vector<KeySequence>& _keys, Mask *_maskPtr)
{
  // Plain copies of the constructor arguments
  flowOrder = _flowOrder;
  numFlows  = _numFlows;
  maskPtr   = _maskPtr;

  // Filter switches
  keypassFilter                 = opts.GetFirstBoolean('k', "keypass-filter", true);
  percentPositiveFlowsFilterTFs = opts.GetFirstBoolean('-', "clonal-filter-tf", false);
  clonalFilterTraining          = opts.GetFirstBoolean('-', "clonal-filter-train", false);
  clonalFilterSolving           = opts.GetFirstBoolean('-', "clonal-filter-solve", false);
  minReadLength                 = opts.GetFirstInt    ('-', "min-read-length", 8);
  cafieResFilterCalling         = opts.GetFirstBoolean('-', "cr-filter", false);
  cafieResFilterTFs             = opts.GetFirstBoolean('-', "cr-filter-tf", false);
  generate_bead_summary_        = opts.GetFirstBoolean('-', "bead-summary", false);

  // TODO: get this to work right. May require "unwound" flow order, so incompatible with current wells.FlowOrder()
  //flt_control.cafieResMaxValueByFlowOrder[std::string ("TACG") ] = 0.06;  // regular flow order
  //flt_control.cafieResMaxValueByFlowOrder[std::string ("TACGTACGTCTGAGCATCGATCGATGTACAGC") ] = 0.08;  // xdb flow order

  cafieResMaxValue = opts.GetFirstDouble('-', "cr-filter-max-value", 0.08);

  // SFFTrim options
  trim_adapter         = opts.GetFirstString ('-', "trim-adapter", "ATCACCGACTGCCCATAGAGAGGCTGAGAC");
  trim_adapter_cutoff  = opts.GetFirstDouble ('-', "trim-adapter-cutoff", 0.0);
  trim_adapter_closest = opts.GetFirstBoolean('-', "trim-adapter-pick-closest", false);
  trim_qual_wsize      = opts.GetFirstInt    ('-', "trim-qual-window-size", 30);
  trim_qual_cutoff     = opts.GetFirstDouble ('-', "trim-qual-cutoff", 100.0);
  trim_min_read_len    = opts.GetFirstInt    ('-', "trim-min-read-len", 8);

  // Reject option values that make no sense
  if (not (minReadLength >= 1)) {
    fprintf (stderr, "Option Error: min-read-length must specify a positive value (%d invalid).\n", minReadLength);
    exit (EXIT_FAILURE);
  }
  if (not (cafieResMaxValue > 0)) {
    fprintf (stderr, "Option Error: cr-filter-max-value must specify a positive value (%lf invalid).\n", cafieResMaxValue);
    exit (EXIT_FAILURE);
  }

  keys = _keys;
  numClasses = keys.size();
  assert(numClasses == 2);

  // Per-class switches: slot 0 follows the main option, slot 1 additionally
  // requires the corresponding "-tf" option.
  classFilterPolyclonal.resize(numClasses);
  classFilterPolyclonal[0] = clonalFilterSolving;
  classFilterPolyclonal[1] = clonalFilterSolving and percentPositiveFlowsFilterTFs;

  classFilterHighResidual.resize(numClasses);
  classFilterHighResidual[0] = cafieResFilterCalling;
  classFilterHighResidual[1] = cafieResFilterCalling and cafieResFilterTFs;

  // Beverly filter: either "off" or "filter_ratio,trim_ratio,min_length"
  const string beverly_spec = opts.GetFirstString('-', "beverly-filter", "0.03,0.03,8");
  if (beverly_spec != "off") {
    const int fields_parsed = sscanf (beverly_spec.c_str(), "%f,%f,%d",
        &filter_beverly_filter_ratio_,
        &filter_beverly_trim_ratio_,
        &filter_beverly_min_read_length_);
    if (fields_parsed != 3) {
      fprintf (stderr, "Option Error: beverly-filter %s\n", beverly_spec.c_str());
      fprintf (stderr, "Usage: --beverly-filter=filter_ratio,trim_ratio,min_length\n");
      exit (EXIT_FAILURE);
    }
    filter_beverly_enabled_ = true;
    printf("Beverly filter: enabled, use --beverly-filter=off to disable\n");
    printf("Beverly filter: filter_ratio = %1.5f\n", filter_beverly_filter_ratio_);
    printf("Beverly filter: trim_ratio = %1.5f\n", filter_beverly_trim_ratio_);
    printf("Beverly filter: min_length = %d\n", filter_beverly_min_read_length_);
  }
  else {
    filter_beverly_enabled_ = false;
    printf("Beverly filter: disabled, use --beverly-filter=filter_ratio,trim_ratio,min_length\n");
  }

  // One mask entry per well, all starting out as kUninitialized
  filterMask.assign(maskPtr->H()*maskPtr->W(), kUninitialized);
}
// Configures the phase estimator from command line options and chip geometry.
//
//   opts             parsed command line; all phasing options are read here
//   chip_subset      supplies chip, region and region-count dimensions
//   key_norm_method  key normalization method, stored as given
//
// Precedence of the phasing sources:
//   1. --phase-estimation-file: if it loads, the estimator mode becomes
//      "override" and the function returns.
//   2. --libcf-ie-dr: fixed cf,ie,dr values are set and the function returns.
//   3. Otherwise the estimation settings are validated and printed.
//
// Terminates the process with EXIT_FAILURE when libcf-ie-dr or initcf-ie-dr
// does not hold exactly 3 values, or when the phasing flow window is invalid.
void PhaseEstimator::InitializeFromOptArgs(OptArgs& opts, const ion::ChipSubset & chip_subset, const string & key_norm_method)
{
  // Parse command line options
  phasing_estimator_       = opts.GetFirstString ('-', "phasing-estimator", "spatial-refiner-2");
  vector<double> cf_ie_dr  = opts.GetFirstDoubleVector('-', "libcf-ie-dr", "");
  vector<double> init_cf_ie_dr = opts.GetFirstDoubleVector('-', "initcf-ie-dr", "");
  residual_threshold_      = opts.GetFirstDouble ('-', "phasing-residual-filter", 1.0);
  max_phasing_levels_      = opts.GetFirstInt    ('-', "max-phasing-levels", max_phasing_levels_default_);
  num_fullchip_iterations_ = opts.GetFirstInt    ('-', "phasing-fullchip-iterations", 3);
  num_region_iterations_   = opts.GetFirstInt    ('-', "phasing-region-iterations", 1);
  num_reads_per_region_    = opts.GetFirstInt    ('-', "phasing-num-reads", 5000);
  min_reads_per_region_    = opts.GetFirstInt    ('-', "phasing-min-reads", 1000);
  phase_file_name_         = opts.GetFirstString ('-', "phase-estimation-file", "");
  normalization_string_    = opts.GetFirstString ('-', "phase-normalization", "adaptive");
  key_norm_method_         = key_norm_method;

  // Static member variables
  norm_during_param_eval_  = opts.GetFirstBoolean('-', "phase-norm-during-eval", false);
  windowSize_              = opts.GetFirstInt    ('-', "window-size", DPTreephaser::kWindowSizeDefault_);
  phasing_start_flow_      = opts.GetFirstInt    ('-', "phasing-start-flow", 70);
  phasing_end_flow_        = opts.GetFirstInt    ('-', "phasing-end-flow", 150);
  inclusion_threshold_     = opts.GetFirstDouble ('-', "phasing-signal-cutoff", 1.4);
  maxfrac_negative_flows_  = opts.GetFirstDouble ('-', "phasing-norm-threshold", 0.2);

  // Initialize chip size - needed for loading phase parameters
  chip_size_x_   = chip_subset.GetChipSizeX();
  chip_size_y_   = chip_subset.GetChipSizeY();
  region_size_x_ = chip_subset.GetRegionSizeX();
  region_size_y_ = chip_subset.GetRegionSizeY();
  num_regions_x_ = chip_subset.GetNumRegionsX();
  num_regions_y_ = chip_subset.GetNumRegionsY();
  num_regions_   = chip_subset.NumRegions();

  // Loading existing phase estimates from a file takes precedence over all other options
  if (not phase_file_name_.empty()) {
    have_phase_estimates_ = LoadPhaseEstimationTrainSubset(phase_file_name_);
    if (have_phase_estimates_) {
      phasing_estimator_ = "override";
      printf("Phase estimator settings:\n");
      printf(" phase file name : %s\n", phase_file_name_.c_str());
      printf(" phase estimation mode : %s\n\n", phasing_estimator_.c_str());
      return;
    } else
      // A file that fails to load is reported; the remaining options are then used
      cout << "PhaseEstimator Error loading TrainSubset from file " << phase_file_name_ << endl;
  }

  // Set phase parameters if provided by command line
  if (!cf_ie_dr.empty()) {
    if (cf_ie_dr.size() != 3){
      cerr << "BaseCaller Option Error: libcf-ie-dr needs to be a comma separated vector of 3 values." << endl;
      exit (EXIT_FAILURE);
    }
    SetPhaseParameters(cf_ie_dr.at(0), cf_ie_dr.at(1), cf_ie_dr.at(2));
    return; // --libcf-ie-dr overrides other phasing-related options
  }

  // Set starting values for estimation
  if (!init_cf_ie_dr.empty()) {
    if (init_cf_ie_dr.size() != 3){
      cerr << "BaseCaller Option Error: initcf-ie-dr needs to be a comma separated vector of 3 values." << endl;
      exit (EXIT_FAILURE);
    }
    init_cf_ = init_cf_ie_dr.at(0);
    init_ie_ = init_cf_ie_dr.at(1);
    init_dr_ = init_cf_ie_dr.at(2);
  }

  // The flow window must satisfy 0 <= start < end
  if (phasing_start_flow_ >= phasing_end_flow_ or phasing_start_flow_ < 0) {
    cerr << "BaseCaller Option Error: phasing-start-flow " << phasing_start_flow_
         << " needs to be non-negative and smaller than phasing-end-flow " << phasing_end_flow_ << endl;
    exit (EXIT_FAILURE);
  }

  // Translate the normalization name into its numeric code
  if (normalization_string_ == "adaptive")
    norm_method_ = 1;
  else if (normalization_string_ == "pid")
    norm_method_ = 2;
  else if (normalization_string_ == "variable")
    norm_method_ = 3;
  else if (normalization_string_ == "off")
    norm_method_ = 4;
  else
    norm_method_ = 0; // "gain" and anything else is default

  printf("Phase estimator settings:\n");
  printf(" phase file name : %s\n", phase_file_name_.c_str());
  printf(" phase estimation mode : %s\n", phasing_estimator_.c_str());
  printf(" initial cf,ie,dr values: %f,%f,%f\n", init_cf_,init_ie_,init_dr_);
  printf(" reads per region target: %d-%d\n", min_reads_per_region_, num_reads_per_region_);
  printf(" normalization method : %s\n", normalization_string_.c_str());
  printf(" variable norm threshold: %f\n", maxfrac_negative_flows_);
  printf("\n");
}