int main(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string queryFile, goldFile; double epsilon; bool help = false; bool version = false; int allowedWrong = 0; double maxAbsVal = 0; double minCorrelation = 1; opts.GetOption(queryFile, "", 'q', "query-wells"); opts.GetOption(goldFile, "", 'g', "gold-wells"); opts.GetOption(epsilon, "0.0", 'e', "epsilon"); opts.GetOption(allowedWrong, "0", 'm', "max-mismatch"); opts.GetOption(minCorrelation, "1", 'c', "min-cor"); opts.GetOption(maxAbsVal, "1e3", '-', "max-val"); opts.GetOption(help, "false", 'h', "help"); opts.GetOption(version, "false", 'v', "version"); opts.CheckNoLeftovers(); if (version) { fprintf (stdout, "%s", IonVersion::GetFullVersion("RawWellsEquivalent").c_str()); exit(0); } if (queryFile.empty() || goldFile.empty() || help) { cout << "RawWellsEquivalent - Check to see how similar two wells files are to each other" << endl << "options: " << endl << " -g,--gold-wells trusted wells to compare against." << endl << " -q,--query-wells new wells to check." << endl << " -e,--epsilon maximum allowed difference to be considered equivalent." << endl << " -m,--max-mixmatch maximum number of non-equivalent entries to allow." << endl << " -c,--min-cor minimum correlation allowed to be considered equivalent." << endl << " --max-val maximum absolute value considered (avoid extreme values)." << endl << " -h,--help this message." << endl << "" << endl << "usage: " << endl << " RawWellsEquivalent -e 10 --query-wells query.wells --gold-wells gold.wells " << endl; exit(1); } NumericalComparison<double> compare = CompareWells(queryFile, goldFile, epsilon, maxAbsVal); cout << compare.GetCount() << " total values. " << endl << compare.GetNumSame() << " (" << (100.0 * compare.GetNumSame())/compare.GetCount() << "%) are equivalent. " << endl << compare.GetNumDiff() << " (" << (100.0 * compare.GetNumDiff())/compare.GetCount() << "%) are not equivalent. 
" << endl << "Correlation of: " << compare.GetCorrelation() << endl; if((compare.GetCount() - allowedWrong) >= compare.GetNumSame() || compare.GetCorrelation() < minCorrelation) { cout << "Wells files not equivalent for allowed mismatch: " << allowedWrong << " minimum correlation: " << minCorrelation << endl; return 1; } cout << "Wells files equivalent for allowed mismatch: " << allowedWrong << " minimum correlation: " << minCorrelation << endl; return 0; }
int main(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string queryFile, goldFile; double epsilon; bool help = false; bool version = false; int allowedWrong = 0; double maxAbsVal = 0; double minCorrelation = 1; bool dumpMisMatch = false; opts.GetOption(queryFile, "", 'q', "query-wells"); opts.GetOption(goldFile, "", 'g', "gold-wells"); opts.GetOption(epsilon, "0.0", 'e', "epsilon"); opts.GetOption(allowedWrong, "0", 'm', "max-mismatch"); opts.GetOption(minCorrelation, "1", 'c', "min-cor"); opts.GetOption(maxAbsVal, "1e3", '-', "max-val"); opts.GetOption(help, "false", 'h', "help"); opts.GetOption(version, "false", 'v', "version"); opts.GetOption(dumpMisMatch, "false", 'o', "dump-mismatch"); opts.CheckNoLeftovers(); if (version) { fprintf (stdout, "%s", IonVersion::GetFullVersion("RawWellsEquivalent").c_str()); exit(0); } if (queryFile.empty() || goldFile.empty() || help) { printUsage(); exit(1); } DumpMismatches dump(dumpMisMatch); NumericalComparison<double> compare = CompareWells(queryFile, goldFile, epsilon, maxAbsVal, dump); cout << compare.GetCount() << " total values. " << endl << compare.GetNumSame() << " (" << (100.0 * compare.GetNumSame())/compare.GetCount() << "%) are equivalent. " << endl << compare.GetNumDiff() << " (" << (100.0 * compare.GetNumDiff())/compare.GetCount() << "%) are not equivalent. " << endl << "Correlation of: " << compare.GetCorrelation() << endl; if((compare.GetCount() - allowedWrong) > compare.GetNumSame() || (compare.GetCorrelation() < minCorrelation && compare.GetCount() != compare.GetNumSame())) { cout << "Wells files not equivalent for allowed mismatch: " << allowedWrong << " minimum correlation: " << minCorrelation << endl; return 1; } cout << "Wells files equivalent for allowed mismatch: " << allowedWrong << " minimum correlation: " << minCorrelation << endl; return 0; }
int main (int argc, const char *argv[])
{
  // Calibration tool entry point: parses options, runs a pool of worker
  // threads over the input, then exports the fitted calibration models and
  // timing information to a JSON file.
  time_t program_start_time;
  time(&program_start_time);
  Json::Value calibration_json(Json::objectValue);
  // Logs argv and start time into the "Calibration" section of the JSON.
  DumpStartingStateOfProgram (argc,argv,program_start_time, calibration_json["Calibration"]);

  //
  // Step 1. Process command line options
  //
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);

  CalibrationContext calib_context;
  if (not calib_context.InitializeFromOpts(opts)){
    PrintHelp_CalModules();
  }

  // Master model objects are registered into the shared context so worker
  // threads can aggregate into them.
  HistogramCalibration master_histogram(opts, calib_context);
  calib_context.hist_calibration_master = &master_histogram;

  LinearCalibrationModel master_linear_model(opts, calib_context);
  calib_context.linear_model_master = &master_linear_model;

  opts.CheckNoLeftovers();

  //
  // Step 2. Execute threaded calibration
  //
  time_t calibration_start_time;
  time(&calibration_start_time);

  // Mutexes guard shared read (input) and write (aggregation) access; they
  // must exist before any worker starts.
  pthread_mutex_init(&calib_context.read_mutex,  NULL);
  pthread_mutex_init(&calib_context.write_mutex, NULL);

  // NOTE(review): variable-length array -- a GCC extension, not standard C++;
  // consider std::vector<pthread_t> if this is ever built with another compiler.
  pthread_t worker_id[calib_context.num_threads];

  for (int worker = 0; worker < calib_context.num_threads; worker++)
    if (pthread_create(&worker_id[worker], NULL, CalibrationWorker, &calib_context)) {
      cerr << "Calibration ERROR: Problem starting thread" << endl;
      exit (EXIT_FAILURE);
    }

  for (int worker = 0; worker < calib_context.num_threads; worker++)
    pthread_join(worker_id[worker], NULL);

  pthread_mutex_destroy(&calib_context.read_mutex);
  pthread_mutex_destroy(&calib_context.write_mutex);

  time_t calibration_end_time;
  time(&calibration_end_time);

  //
  // Step 3. Create models, write output, and close modules
  //

  // HP histogram calibration: export only if a model could actually be built.
  if (master_histogram.CreateCalibrationModel())
    master_histogram.ExportModelToJson(calibration_json["HPHistogram"]);

  // Linear Model
  if (master_linear_model.CreateCalibrationModel())
    master_linear_model.ExportModelToJson(calibration_json["LinearModel"], "");

  // Transfer stuff from calibration context and close bam reader
  calib_context.Close(calibration_json["Calibration"]);

  // Record end time and wall-clock durations (whole run vs. threaded phase).
  time_t program_end_time;
  time(&program_end_time);

  calibration_json["Calibration"]["end_time"] = get_time_iso_string(program_end_time);
  calibration_json["Calibration"]["total_duration"] = (Json::Int)difftime(program_end_time,program_start_time);
  calibration_json["Calibration"]["calibration_duration"] = (Json::Int)difftime(calibration_end_time,calibration_start_time);

  SaveJson(calibration_json, calib_context.filename_json);
  return EXIT_SUCCESS;
}
int main (int argc, const char *argv[])
{
  // bamrealignment: reads a BAM file, re-aligns each mapped read with a
  // banded Smith-Waterman pass (reference reconstructed from the read's
  // CIGAR + MD tag), and writes the possibly-updated alignments to a new
  // BAM. Unmapped reads and reads that fail any step pass through unchanged.
  printf ("------------- bamrealignment --------------\n");

  // ---- Command line options -------------------------------------------
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  vector<int> score_vals(4);  // match, mismatch, gap-open, gap-extend

  string input_bam   = opts.GetFirstString ('i', "input", "");
  string output_bam  = opts.GetFirstString ('o', "output", "");
  opts.GetOption(score_vals, "4,-6,-5,-2", 's', "scores");
  int clipping       = opts.GetFirstInt    ('c', "clipping", 2);
  bool anchors       = opts.GetFirstBoolean('a', "anchors", true);
  int bandwidth      = opts.GetFirstInt    ('b', "bandwidth", 10);
  bool verbose       = opts.GetFirstBoolean('v', "verbose", false);
  bool debug         = opts.GetFirstBoolean('d', "debug", false);
  int format         = opts.GetFirstInt    ('f', "format", 1);
  int num_threads    = opts.GetFirstInt    ('t', "threads", 8);
  string log_fname   = opts.GetFirstString ('l', "log", "");

  if (input_bam.empty() or output_bam.empty())
    return PrintHelp();

  opts.CheckNoLeftovers();

  // Optional tab-separated per-read log file.
  std::ofstream logf;
  if (log_fname.size ()) {
    logf.open (log_fname.c_str ());
    if (!logf.is_open ()) {
      fprintf (stderr, "bamrealignment: Failed to open log file %s\n", log_fname.c_str());
      return 1;
    }
  }

  // ---- Open input and output BAMs -------------------------------------
  BamReader reader;
  if (!reader.Open(input_bam)) {
    fprintf(stderr, "bamrealignment: Failed to open input file %s\n", input_bam.c_str());
    return 1;
  }

  SamHeader header = reader.GetHeader();
  RefVector refs   = reader.GetReferenceData();

  BamWriter writer;
  writer.SetNumThreads(num_threads);
  // format == 1 selects uncompressed BAM output; anything else is compressed.
  if (format == 1)
    writer.SetCompressionMode(BamWriter::Uncompressed);
  else
    writer.SetCompressionMode(BamWriter::Compressed);
  if (!writer.Open(output_bam, header, refs)) {
    fprintf(stderr, "bamrealignment: Failed to open output file %s\n", output_bam.c_str());
    return 1;
  }

  // The meat starts here ------------------------------------

  if (verbose)
    cout << "Verbose option is activated, each alignment will print to screen." << endl
         << " After a read hit RETURN to continue to the next one," << endl
         << " or press q RETURN to quit the program," << endl
         << " or press s Return to silence verbose," << endl
         << " or press c RETURN to continue printing without further prompt." << endl << endl;

  // Per-category counters reported in the summary at the end.
  unsigned int readcounter = 0;
  unsigned int mapped_readcounter = 0;
  unsigned int realigned_readcounter = 0;
  unsigned int modified_alignment_readcounter = 0;
  unsigned int pos_update_readcounter = 0;
  unsigned int failed_clip_realigned_readcount = 0;
  unsigned int already_perfect_readcount = 0;
  unsigned int bad_md_tag_readcount = 0;
  unsigned int error_recreate_ref_readcount = 0;
  unsigned int error_clip_anchor_readcount = 0;
  unsigned int error_sw_readcount = 0;
  unsigned int error_unclip_readcount = 0;

  unsigned int start_position_shift;
  int orig_position;
  int new_position;

  // 'input' holds the last interactive keypress in verbose mode; it starts
  // non-empty so the first read prompts.
  string md_tag, new_md_tag, input = "x";
  vector<CigarOp>   new_cigar_data;
  vector<MDelement> new_md_data;
  bool position_shift = false;
  time_t start_time = time(NULL);

  Realigner aligner;
  aligner.verbose_ = verbose;
  aligner.debug_   = debug;
  if (!aligner.SetScores(score_vals))
    cout << "bamrealignment: Four scores need to be provided: match, mismatch, gap open, gap extend score!" << endl;

  aligner.SetAlignmentBandwidth(bandwidth);

  // ---- Main loop over reads -------------------------------------------
  BamAlignment alignment;
  while(reader.GetNextAlignment(alignment)){
    readcounter ++;
    position_shift = false;
    if ( (readcounter % 100000) == 0 )
      cout << "Processed " << readcounter << " reads. Elapsed time: " << (time(NULL) - start_time) << endl;

    if (alignment.IsMapped()) {
      orig_position = alignment.Position;
      mapped_readcounter++;
      // Clipping behavior depends on read orientation.
      aligner.SetClipping(clipping, !alignment.IsReverseStrand());
      if (aligner.verbose_) {
        cout << endl;
        if (alignment.IsReverseStrand())
          cout << "The read is from the reverse strand." << endl;
        else
          cout << "The read is from the forward strand." << endl;
      }

      if (!alignment.GetTag("MD", md_tag)) {
        // Cannot reconstruct the reference without an MD tag -- skip read.
        if (aligner.verbose_)
          cout << "Warning: Skipping read " << alignment.Name << ". It is mapped but missing MD tag." << endl;
        if (logf.is_open ())
          logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MISSMD" << '\n';
        bad_md_tag_readcount++;
      }
      else if (aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors)) {
        // Reference reconstruction succeeded (possibly with a clip-anchor
        // warning); attempt the SW realignment.
        bool clipfail = false;
        if (Realigner::CR_ERR_CLIP_ANCHOR == aligner.GetCreateRefError ()) {
          clipfail = true;
          failed_clip_realigned_readcount ++;
        }

        if (!aligner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) {
          if (aligner.verbose_)
            cout << "Error in the alignment! Not updating read information." << endl;
          if (logf.is_open ())
            logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "SWERR" << '\n';
          error_sw_readcount++;
          writer.SaveAlignment(alignment); // Write alignment unchanged
          continue;
        }

        if (!aligner.addClippedBasesToTags(new_cigar_data, new_md_data, alignment.QueryBases.size())) {
          if (aligner.verbose_)
            cout << "Error when adding clipped anchors back to tags! Not updating read information." << endl;
          if (logf.is_open ())
            logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNCLIPERR" << '\n';
          writer.SaveAlignment(alignment); // Write alignment unchanged
          error_unclip_readcount ++;
          continue;
        }

        new_md_tag = aligner.GetMDstring(new_md_data);
        realigned_readcounter++;

        // adjust start position of read
        if (!aligner.LeftAnchorClipped() and start_position_shift != 0) {
          new_position = aligner.updateReadPosition(alignment.CigarData, (int)start_position_shift, alignment.Position);
          if (new_position != alignment.Position) {
            pos_update_readcounter++;
            position_shift = true;
            alignment.Position = new_position;
          }
        }

        // Log whether the realignment actually changed the record.
        if (position_shift || alignment.CigarData.size () != new_cigar_data.size () || md_tag != new_md_tag) {
          if (logf.is_open ()) {
            logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MOD";
            if (position_shift)
              logf << "-SHIFT";
            if (clipfail)
              logf << " NOCLIP";
            logf << '\n';
          }
          modified_alignment_readcounter++;
        } else {
          if (logf.is_open ()) {
            logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNMOD";
            if (clipfail)
              logf << " NOCLIP";
            logf << '\n';
          }
        }

        if (aligner.verbose_){
          cout << alignment.Name << endl;
          cout << "------------------------------------------" << endl;
          // Wait for input to continue or quit program
          if (input.size() == 0)
            input = 'x';
          else if (input[0] != 'c' and input[0] != 'C')
            getline(cin, input);
          if (input.size()>0){
            if (input[0] == 'q' or input[0] == 'Q')
              return 1;
            else if (input[0] == 's' or input[0] == 'S')
              aligner.verbose_ = false;
          }
        }

        // Finally update alignment information
        alignment.CigarData = new_cigar_data;
        alignment.EditTag("MD", "Z" , new_md_tag);

      } // end of CreateRef else if
      else {
        // Reference reconstruction failed (or read was already perfect).
        switch (aligner.GetCreateRefError ()) {
          case Realigner::CR_ERR_RECREATE_REF:
            if (logf.is_open ())
              logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "RECRERR" << '\n';
            error_recreate_ref_readcount++;
            break;
          case Realigner::CR_ERR_CLIP_ANCHOR:
            if (logf.is_open ())
              logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "CLIPERR" << '\n';
            error_clip_anchor_readcount++;
            break;
          default:
            // On a good run this writes way too many reads to the log file - don't want to create a too large txt file
            // if (logf.is_open ())
            //logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "PERFECT" << '\n';
            already_perfect_readcount++;
            break;
        }

        if (aligner.verbose_) {
          cout << alignment.Name << endl;
          cout << "------------------------------------------" << endl;
          // Wait for input to continue or quit program
          if (input.size() == 0)
            input = 'x';
          else if (input[0] != 'c' and input[0] != 'C')
            getline(cin, input);
          if (input.size()>0){
            if (input[0] == 'q' or input[0] == 'Q')
              return 1;
            else if (input[0] == 's' or input[0] == 'S')
              aligner.verbose_ = false;
          }
        }
      }

      // --- Debug output for Rajesh ---
      if (debug && aligner.invalid_cigar_in_input) {
        aligner.verbose_ = true;
        cout << "Invalid cigar string / md tag pair in read " << alignment.Name << endl;
        // Rerun reference generation to display error
        aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors);
        aligner.verbose_ = verbose;
        aligner.invalid_cigar_in_input = false;
      }
      // --- --- ---

    } // end of if isMapped

    // Every read -- modified or not -- is written to the output BAM.
    writer.SaveAlignment(alignment);

  } // end while loop over reads

  if (aligner.invalid_cigar_in_input)
    cerr << "WARNING bamrealignment: There were invalid cigar string / md tag pairs in the input bam file." << endl;

  // ----------------------------------------------------------------
  // program end -- output summary information

  cout << " File: " << input_bam << endl
       << " Total reads: " << readcounter << endl
       << " Mapped reads: " << mapped_readcounter << endl;
  if (bad_md_tag_readcount)
    cout << " Skipped: bad MD tags: " << bad_md_tag_readcount << endl;
  if (error_recreate_ref_readcount)
    cout << " Skipped: unable to recreate ref: " << error_recreate_ref_readcount << endl;
  if (error_clip_anchor_readcount)
    cout << " Skipped: error clipping anchor: " << error_clip_anchor_readcount << endl;
  cout << " Skipped: already perfect: " << already_perfect_readcount << endl
       << " Total reads realigned: " << mapped_readcounter - already_perfect_readcount - bad_md_tag_readcount - error_recreate_ref_readcount - error_clip_anchor_readcount << endl;
  if (failed_clip_realigned_readcount)
    cout << " (including " << failed_clip_realigned_readcount << " that failed to clip)" << endl;
  if (error_sw_readcount)
    cout << " Failed to complete SW alignment: " << error_sw_readcount << endl;
  if (error_unclip_readcount)
    cout << " Failed to unclip anchor: " << error_unclip_readcount << endl;
  cout << " Succesfully realigned: " << realigned_readcounter << endl
       << " Modified alignments: " << modified_alignment_readcounter << endl
       << " Shifted position: " << pos_update_readcounter << endl;
  cout << "Processing time: " << (time(NULL)-start_time) << " seconds." << endl;
  cout << "INFO: The output BAM file may be unsorted." << endl;
  cout << "------------------------------------------" << endl;

  return 0;
}
int PrepareHotspots(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bed_filename = opts.GetFirstString ('b', "input-bed", ""); string input_vcf_filename = opts.GetFirstString ('v', "input-vcf", ""); string input_real_vcf_filename = opts.GetFirstString ('p', "input-real-vcf", ""); string output_hot_vcf = opts.GetFirstString ('q', "output-fake-hot-vcf", ""); string output_bed_filename = opts.GetFirstString ('d', "output-bed", ""); string output_vcf_filename = opts.GetFirstString ('o', "output-vcf", ""); string reference_filename = opts.GetFirstString ('r', "reference", ""); string unmerged_bed = opts.GetFirstString ('u', "unmerged-bed", ""); bool left_alignment = opts.GetFirstBoolean('a', "left-alignment", false); bool filter_bypass = opts.GetFirstBoolean('f', "filter-bypass", false); bool allow_block_substitutions = opts.GetFirstBoolean('s', "allow-block-substitutions", true); bool strict_check = opts.GetFirstBoolean('S', "strict-check", true); opts.CheckNoLeftovers(); if((input_bed_filename.empty() == (input_vcf_filename.empty() and input_real_vcf_filename.empty())) or (output_bed_filename.empty() and output_vcf_filename.empty()) or reference_filename.empty()) { PrepareHotspotsHelp(); return 1; } if ((not input_real_vcf_filename.empty()) and (output_vcf_filename.empty() or not input_vcf_filename.empty())) { PrepareHotspotsHelp(); return 1; } // Populate chromosome list from reference.fai // Use mmap to fetch the entire reference int ref_handle = open(reference_filename.c_str(),O_RDONLY); struct stat ref_stat; fstat(ref_handle, &ref_stat); char *ref = (char *)mmap(0, ref_stat.st_size, PROT_READ, MAP_SHARED, ref_handle, 0); FILE *fai = fopen((reference_filename+".fai").c_str(), "r"); if (!fai) { fprintf(stderr, "ERROR: Cannot open %s.fai\n", reference_filename.c_str()); return 1; } vector<Reference> ref_index; map<string,int> ref_map; char line[1024], chrom_name[1024]; while (fgets(line, 1024, fai) != NULL) { Reference 
ref_entry; long chr_start; if (5 != sscanf(line, "%1020s\t%ld\t%ld\t%d\t%d", chrom_name, &ref_entry.size, &chr_start, &ref_entry.bases_per_line, &ref_entry.bytes_per_line)) continue; ref_entry.chr = chrom_name; ref_entry.start = ref + chr_start; ref_index.push_back(ref_entry); ref_map[ref_entry.chr] = (int) ref_index.size() - 1; } fclose(fai); junction junc; if (!unmerged_bed.empty()) { FILE *fp = fopen(unmerged_bed.c_str(), "r"); if (!fp) { fprintf(stderr, "ERROR: Cannot open %s\n", unmerged_bed.c_str()); return 1; } char line2[65536]; junc.init(ref_index.size()); bool line_overflow = false; while (fgets(line2, 65536, fp) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } if (line_overflow) { line_overflow = false; continue; } if (strstr(line2, "track")) continue; char chr[100]; int b, e; sscanf(line2, "%s %d %d", chr, &b, &e); junc.add(ref_map[chr], b, e); } fclose(fp); } // Load input BED or load input VCF, group by chromosome deque<LineStatus> line_status; vector<deque<Allele> > alleles(ref_index.size()); if (!input_bed_filename.empty()) { FILE *input = fopen(input_bed_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_bed_filename.c_str()); return 1; } char line2[65536]; int line_number = 0; bool line_overflow = false; while (fgets(line2, 65536, input) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } line_number++; if (line_overflow) { line_overflow = false; line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: line length exceeds 64K"; continue; } if (strncmp(line2, "browser", 7) == 0) continue; if (strncmp(line2, "track", 5) == 0) { if (string::npos != string(line2).find("allowBlockSubstitutions=true")) allow_block_substitutions = true; continue; } // OID= table has special meaning if (string::npos != 
string(line2).find("OID=")) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Bed line contains OID="; continue; } char *current_chr = strtok(line2, "\t\r\n"); char *current_start = strtok(NULL, "\t\r\n"); char *current_end = strtok(NULL, "\t\r\n"); char *current_id = strtok(NULL, "\t\r\n"); char *penultimate = strtok(NULL, "\t\r\n"); char *ultimate = strtok(NULL, "\t\r\n"); for (char *next = strtok(NULL, "\t\r\n"); next; next = strtok(NULL, "\t\r\n")) { penultimate = ultimate; ultimate = next; } if (!current_chr or !current_start or !current_end or !current_id or !penultimate or !ultimate) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: expected at least 6 fields"; continue; } Allele allele; string string_chr(current_chr); if (ref_map.find(string_chr) != ref_map.end()) allele.chr_idx = ref_map[string_chr]; else if (ref_map.find("chr"+string_chr) != ref_map.end()) allele.chr_idx = ref_map["chr"+string_chr]; else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end()) allele.chr_idx = ref_map["chrM"]; else { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Unknown chromosome name: "; line_status.back().filter_message = string_chr; continue; } allele.pos = strtol(current_start,NULL,10); allele.id = current_id; char *current_ref = NULL; char *current_alt = NULL; for (char *next = strtok(penultimate, ";"); next; next = strtok(NULL, ";")) { if (strncmp(next,"REF=",4) == 0) current_ref = next; else if (strncmp(next,"OBS=",4) == 0) current_alt = next; else if (strncmp(next,"ANCHOR=",7) == 0) { // ignore ANCHOR } else { char *value = next; while (*value and *value != '=') ++value; if (*value == '=') *value++ = 0; allele.custom_tags[next] = value; } } if (!current_ref or !current_alt) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot BED line: 
REF and OBS fields required in penultimate column"; continue; } for (char *pos = current_ref+4; *pos; ++pos) allele.ref += toupper(*pos); for (char *pos = current_alt+4; *pos; ++pos) allele.alt += toupper(*pos); // here is the place to check the length of the hotspot cover the amplicon junction. ZZ /* if (junc.contain(allele.chr_idx, allele.pos, (unsigned int) allele.ref.size())) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "hotspot BED line contain the complete overlapping region of two amplicon, the variant cannot be detected by tvc"; continue; } if (not junc.contained_in_ampl(allele.chr_idx, allele.pos, (unsigned int) allele.ref.size())) { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "hotspot BED line is not contained in any amplicon, the variant cannot be detected by tvc"; continue; } */ allele.filtered = false; line_status.push_back(LineStatus(line_number)); allele.line_status = &line_status.back(); allele.opos = allele.pos; allele.oref = allele.ref; allele.oalt = allele.alt; alleles[allele.chr_idx].push_back(allele); //line_status.back().allele = &alleles[allele.chr_idx].back(); line_status.back().chr_idx = allele.chr_idx; line_status.back().opos = allele.opos; line_status.back().id = allele.id; } fclose(input); } if (!input_vcf_filename.empty() or !input_real_vcf_filename.empty()) { bool real_vcf = false; FILE *input; FILE *out_real = NULL; FILE *out_hot = NULL; int fake_ = 0; int hn = 1; if (!input_real_vcf_filename.empty()) { real_vcf = true; input = fopen(input_real_vcf_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_real_vcf_filename.c_str()); return 1; } out_real = fopen(output_vcf_filename.c_str(), "w"); if (!out_real) { fprintf(stderr,"ERROR: Cannot open %s\n", output_vcf_filename.c_str()); return 1; } if (!output_hot_vcf.empty()) { out_hot = fopen(output_hot_vcf.c_str(), "w"); if (!out_hot) { fprintf(stderr,"ERROR: Cannot 
open %s\n", output_hot_vcf.c_str()); return 1; } } else out_hot = stdout; fprintf(out_hot, "##fileformat=VCFv4.1\n##allowBlockSubstitutions=true\n#CHROM POS ID REF ALT QUAL FILTER INFO\n"); } else { input = fopen(input_vcf_filename.c_str(),"r"); if (!input) { fprintf(stderr,"ERROR: Cannot open %s\n", input_vcf_filename.c_str()); return 1; } } char line2[65536]; char line3[65536]; int line_number = 0; bool line_overflow = false; list<one_vcfline> vcflist; char last_chr[1024] = ""; while (fgets(line2, 65536, input) != NULL) { if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) { line_overflow = true; continue; } line_number++; if (line_overflow) { line_overflow = false; line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Malformed hotspot VCF line: line length exceeds 64K"; continue; } if (strncmp(line2, "##allowBlockSubstitutions=true", 30) == 0) { allow_block_substitutions = true; continue; } if (line2[0] == '#') { if (out_real) { fprintf(out_real, "%s", line2);} continue; } if (real_vcf) strcpy(line3, line2); char *current_chr = strtok(line2, "\t\r\n"); char *current_start = strtok(NULL, "\t\r\n"); char *current_id = strtok(NULL, "\t\r\n"); char *current_ref = strtok(NULL, "\t\r\n"); char *current_alt = strtok(NULL, "\t\r\n"); strtok(NULL, "\t\r\n"); // Ignore QUAL strtok(NULL, "\t\r\n"); // Ignore FILTER char *current_info = strtok(NULL, "\t\r\n"); strtok(NULL, "\t\r\n"); char *gt = strtok(NULL, "\t\r\n"); if (!current_chr or !current_start or !current_id or !current_ref or !current_alt) { line_status.push_back(LineStatus(line_number)); if (real_vcf) line_status.back().filter_message_prefix = "Malformed real VCF line: expected at least 5 fields"; else line_status.back().filter_message_prefix = "Malformed hotspot VCF line: expected at least 5 fields"; continue; } string string_chr(current_chr); int chr_idx = 0; if (ref_map.find(string_chr) != ref_map.end()) chr_idx = ref_map[string_chr]; else if 
(ref_map.find("chr"+string_chr) != ref_map.end()) chr_idx = ref_map["chr"+string_chr]; else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end()) chr_idx = ref_map["chrM"]; else { line_status.push_back(LineStatus(line_number)); line_status.back().filter_message_prefix = "Unknown chromosome name: "; line_status.back().filter_message = string_chr; continue; } for (char *pos = current_ref; *pos; ++pos) *pos = toupper(*pos); for (char *pos = current_alt; *pos; ++pos) *pos = toupper(*pos); // Process custom tags vector<string> bstrand; vector<string> hp_max_length; string raw_oid; string raw_omapalt; string raw_oalt; string raw_oref; string raw_opos; if (current_info) { string raw_bstrand; string raw_hp_max_length; for (char *next = strtok(current_info, ";"); next; next = strtok(NULL, ";")) { char *value = next; while (*value and *value != '=') ++value; if (*value == '=') *value++ = 0; if (strcmp(next, "TYPE") == 0) continue; if (strcmp(next, "HRUN") == 0) continue; if (strcmp(next, "HBASE") == 0) continue; if (strcmp(next, "FR") == 0) continue; if (strcmp(next, "OPOS") == 0) { raw_opos = value; continue; } if (strcmp(next, "OREF") == 0) { raw_oref = value; continue; } if (strcmp(next, "OALT") == 0) { raw_oalt = value; continue; } if (strcmp(next, "OID") == 0) { raw_oid = value; continue; } if (strcmp(next, "OMAPALT") == 0) { raw_omapalt = value; continue; } if (strcmp(next, "BSTRAND") == 0) { raw_bstrand = value; continue; } if (strcmp(next, "hp_max_length") == 0) { raw_hp_max_length = value; continue; } } if (not raw_bstrand.empty()) split(raw_bstrand, ',', bstrand); if (not raw_hp_max_length.empty()) split(raw_hp_max_length, ',', hp_max_length); } if (real_vcf) { //fprintf(stderr, "%s\n", gt); if (gt == NULL) continue; // get gt int g1 = atoi(gt), g2; gt = strchr(gt, '/'); if (gt) g2 = atoi(gt+1); else {fprintf(stderr, "GT not formatted right\n"); exit(1);} //if (g1 == 0 and g2 == 0) continue; unsigned int cur_pos = atoi(current_start); one_vcfline 
newline(current_ref, current_alt, cur_pos, g1, g2, line3); bool new_chr = false; if (strcmp(current_chr, last_chr) != 0) { new_chr = true; } while (not vcflist.empty()) { if ((not new_chr) and vcflist.front().pos+strlen(vcflist.front().ref) > cur_pos) break; if (vcflist.front().produce_hot_vcf(last_chr, out_real, hn, out_hot)) fake_++; vcflist.pop_front(); } if (new_chr) strcpy(last_chr, current_chr); for (list<one_vcfline>::iterator it = vcflist.begin(); it != vcflist.end(); it++) { it->check_subset(newline); } if (not newline.alts.empty()) vcflist.push_back(newline); continue; } unsigned int allele_idx = 0; for (char *sub_alt = strtok(current_alt,","); sub_alt; sub_alt = strtok(NULL,",")) { Allele allele; allele.chr_idx = chr_idx; allele.ref = current_ref; allele.alt = sub_alt; allele.pos = strtol(current_start,NULL,10)-1; allele.id = current_id; if (allele.id == ".") allele.id = "hotspot"; allele.filtered = false; line_status.push_back(LineStatus(line_number)); allele.line_status = &line_status.back(); allele.opos = allele.pos; allele.oref = allele.ref; allele.oalt = allele.alt; if (allele_idx < bstrand.size()) { if (bstrand[allele_idx] != ".") allele.custom_tags["BSTRAND"] = bstrand[allele_idx]; } if (allele_idx < hp_max_length.size()) { if (hp_max_length[allele_idx] != ".") allele.custom_tags["hp_max_length"] = hp_max_length[allele_idx]; } alleles[allele.chr_idx].push_back(allele); //line_status.back().allele = &alleles[allele.chr_idx].back(); line_status.back().chr_idx = allele.chr_idx; line_status.back().opos = allele.opos; line_status.back().id = allele.id; allele_idx++; } } fclose(input); if (real_vcf) { while (not vcflist.empty()) { if (vcflist.front().produce_hot_vcf(last_chr, out_real, hn, out_hot)) fake_++; vcflist.pop_front(); } fclose(out_real); fclose(out_hot); if (fake_ > 0) return 0; else return 1; } } // Process by chromosome: // - Verify reference allele // - Left align // - Sort // - Filter for block substitutions, write FILE *output_vcf = 
NULL; if (!output_vcf_filename.empty()) { output_vcf = fopen(output_vcf_filename.c_str(), "w"); if (!output_vcf) { fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_vcf_filename.c_str()); return 1; } fprintf(output_vcf, "##fileformat=VCFv4.1\n"); if (allow_block_substitutions) fprintf(output_vcf, "##allowBlockSubstitutions=true\n"); fprintf(output_vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"); } FILE *output_bed = NULL; if (!output_bed_filename.empty()) { output_bed = fopen(output_bed_filename.c_str(), "w"); if (!output_bed) { fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_bed_filename.c_str()); if (output_vcf) fclose(output_vcf); return 1; } if (allow_block_substitutions) fprintf(output_bed, "track name=\"hotspot\" type=bedDetail allowBlockSubstitutions=true\n"); else fprintf(output_bed, "track name=\"hotspot\" type=bedDetail\n"); } for (int chr_idx = 0; chr_idx < (int)ref_index.size(); ++chr_idx) { for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ++A) { // check bed file if (junc.contain(A->chr_idx, A->pos, (unsigned int) A->ref.size())) { A->filtered = true; A->line_status->filter_message_prefix = "hotspot BED line contain the complete overlapping region of two amplicon, the variant cannot be detected by tvc"; continue; } if (not junc.contained_in_ampl(A->chr_idx, A->pos, (unsigned int) A->ref.size())) { A->filtered = true; A->line_status->filter_message_prefix = "hotspot BED line is not contained in any amplicon, the variant cannot be detected by tvc"; continue; } // Invalid characters bool valid = true; for (const char *c = A->ref.c_str(); *c ; ++c) if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T') valid = false; for (const char *c = A->alt.c_str(); *c ; ++c) if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T') valid = false; if (not valid) { A->filtered = true; A->line_status->filter_message_prefix = "REF and/or ALT contain characters other than ACGT: "; 
A->line_status->filter_message = "REF = " + A->ref + " ALT = " + A->alt; continue; } // Filter REF == ALT if (A->ref == A->alt) { A->filtered = true; A->line_status->filter_message_prefix = "REF and ALT alleles equal"; continue; } // Confirm reference allele. string ref_expected; for (int idx = 0; idx < (int) A->ref.size(); ++idx) ref_expected += ref_index[chr_idx].base(A->pos + idx); if (A->ref != ref_expected) { A->filtered = true; A->line_status->filter_message_prefix = "Provided REF allele does not match reference: "; A->line_status->filter_message = "Expected " + ref_expected + ", found " + A->ref; continue; } // Trim int ref_start = 0; int ref_end = A->ref.size(); int alt_end = A->alt.size(); // Option 1: trim all trailing bases; //while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) { // --ref_end; // --alt_end; //} // Option 2: trim all leading basees; //while (ref_start < ref_end and ref_start < alt_end and A->ref[ref_start] == A->alt[ref_start]) // ++ref_start; // Option 3: trim anchor base if vcf if (!input_vcf_filename.empty()) { if (ref_end and alt_end and (ref_end == 1 or alt_end == 1) and A->ref[0] == A->alt[0]) ref_start = 1; } A->pos += ref_start; A->ref = A->ref.substr(ref_start, ref_end-ref_start); A->alt = A->alt.substr(ref_start, alt_end-ref_start); ref_end -= ref_start; alt_end -= ref_start; // Left align if (left_alignment && A->custom_tags.find("BSTRAND") == A->custom_tags.end()) { // black list variant not to be left aligned. 
string trailing; int can_do = 0, need_do = 0; int ref_end_orig= ref_end, alt_end_orig = alt_end; while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) { ref_end--; alt_end--; } if (ref_end == 0 || alt_end == 0) { can_do = need_do = 1; // indel type, ZZ } else { int tmp_start = ref_start; int ref_end_0 = ref_end, alt_end_0 = alt_end; // end after remove trailing match ZZ while (tmp_start < ref_end and tmp_start < alt_end and A->ref[tmp_start] == A->alt[tmp_start]) ++tmp_start; if (tmp_start == ref_end || tmp_start == alt_end) { can_do = 1; need_do = 0; // indel but indel is not at the left. ZZ } else { ref_end--; alt_end--; while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) { ref_end--; alt_end--; } if (ref_end == 0 || alt_end == 0) { // complex with 1 bp MM at right end can_do = need_do = 1; if (ref_end + alt_end == 0) need_do = 0; // SNP } else { int tmp_start0 = tmp_start; // start after removing leading matches tmp_start++; while (tmp_start < ref_end_orig and tmp_start < alt_end_orig and A->ref[tmp_start] == A->alt[tmp_start]) tmp_start++; if (tmp_start >= ref_end_0 || tmp_start >= alt_end_0 || ref_end <= tmp_start0 || alt_end <= tmp_start0) { // 1MM plus indel in middle, by definition cannot move the indel left enough to change A->pos can_do = 1; need_do = 0; } // else real complex } } } if (!can_do or !need_do) { // do nothing // if !can_do need add some more DP ref_end = ref_end_orig; alt_end = alt_end_orig; } else { // left align the indel part, here either ref_end = 0 or alt_end = 0 int opos = A->pos; while (A->pos > 0) { char nuc = ref_index[chr_idx].base(A->pos-1); if (ref_end > 0 and A->ref[ref_end-1] != nuc) break; if (alt_end > 0 and A->alt[alt_end-1] != nuc) break; A->ref = string(1,nuc) + A->ref; A->alt = string(1,nuc) + A->alt; A->pos--; } if (ref_end != ref_end_orig) { // trailing part is aligned, the whole ref and alt need to be kept. 
ZZ ref_end = A->ref.size(); alt_end = A->alt.size(); } if (junc.contain(chr_idx, A->pos, ref_end) or not junc.contained_in_ampl(chr_idx, A->pos, ref_end)) { // after left align the hotspot contain an overlap region, revert to the original ZZ if (opos != A->pos) { A->ref.erase(0, opos-A->pos); A->alt.erase(0, opos-A->pos); A->pos = opos; ref_end = ref_end_orig; alt_end = alt_end_orig; } } } } A->ref.resize(ref_end); A->alt.resize(alt_end); // Filter block substitutions: take 1 if (ref_end > 0 and alt_end > 0 and ref_end != alt_end and not allow_block_substitutions and not filter_bypass) { A->filtered = true; A->line_status->filter_message_prefix = "Block substitutions not supported"; continue; } } if (output_bed) { // Sort - without anchor base stable_sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles); // Write for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) { if (I->filtered) continue; fprintf(output_bed, "%s\t%ld\t%ld\t%s\tREF=%s;OBS=%s", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str()); for (map<string,string>::iterator C = I->custom_tags.begin(); C != I->custom_tags.end(); ++C) fprintf(output_bed, ";%s=%s", C->first.c_str(), C->second.c_str()); fprintf(output_bed, "\tNONE\n"); /* if (I->pos) fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=%c\tNONE\n", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str(), ref_index[chr_idx].base(I->pos-1)); else fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=\tNONE\n", ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(), I->ref.c_str(), I->alt.c_str()); */ } } if (output_vcf) { // Add anchor base to indels for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) { if (I->filtered) continue; if (not I->ref.empty() and not I->alt.empty()) continue; if 
(I->pos == 0) { I->filtered = true; I->line_status->filter_message_prefix = "INDELs at chromosome start not supported"; continue; } I->pos--; I->ref = string(1,ref_index[chr_idx].base(I->pos)) + I->ref; I->alt = string(1,ref_index[chr_idx].base(I->pos)) + I->alt; } // Sort - with anchor base stable_sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles); // Merge alleles, remove block substitutions, write for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ) { string max_ref; deque<Allele>::iterator B = A; for (; B != alleles[chr_idx].end() and B->pos == A->pos; ++B) if (!B->filtered and max_ref.size() < B->ref.size()) max_ref = B->ref; bool filtered = true; map<string,set<string> > unique_alts_and_ids; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; string new_alt = I->alt + max_ref.substr(I->ref.size()); if (new_alt.size() > 1 and max_ref.size() > 1 and new_alt.size() != max_ref.size() and not allow_block_substitutions and not filter_bypass) { I->filtered = true; I->line_status->filter_message_prefix = "Block substitutions not supported (post-merge)"; continue; } I->ref = max_ref; I->alt = new_alt; // Filter alleles with duplicate ALT + ID pairs map<string,set<string> >::iterator alt_iter = unique_alts_and_ids.find(new_alt); if (alt_iter != unique_alts_and_ids.end()) { if (alt_iter->second.count(I->id) > 0) { I->filtered = true; I->line_status->filter_message_prefix = "Duplicate allele and ID"; continue; } } unique_alts_and_ids[new_alt].insert(I->id); filtered = false; } if (not filtered) { fprintf(output_vcf, "%s\t%ld\t.\t%s\t", ref_index[chr_idx].chr.c_str(), A->pos+1, max_ref.c_str()); bool comma = false; map<string,map<string,string> > unique_alts_and_tags; set<string> unique_tags; set<string> unique_alt_alleles; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; unique_alts_and_tags[I->alt].insert(I->custom_tags.begin(), I->custom_tags.end()); for 
(map<string,string>::iterator S = I->custom_tags.begin(); S != I->custom_tags.end(); ++S) unique_tags.insert(S->first); if (unique_alt_alleles.count(I->alt) > 0) continue; unique_alt_alleles.insert(I->alt); if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->alt.c_str()); } /* for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; map<string,map<string,string> >::iterator Q = unique_alts_and_tags.find(I->alt); if (comma) fprintf(output_vcf, ","); comma = true; if (Q == unique_alts_and_tags.end()) {fprintf(output_vcf, "."); continue;} fprintf(output_vcf, "%s", Q->first.c_str()); } */ fprintf(output_vcf, "\t.\t.\tOID="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->id.c_str()); } fprintf(output_vcf, ";OPOS="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%ld", I->opos+1); } fprintf(output_vcf, ";OREF="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->oref.c_str()); } fprintf(output_vcf, ";OALT="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->oalt.c_str()); } fprintf(output_vcf, ";OMAPALT="); comma = false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; if (comma) fprintf(output_vcf, ","); comma = true; fprintf(output_vcf, "%s", I->alt.c_str()); } for (set<string>::iterator S = unique_tags.begin(); S != unique_tags.end(); ++S) { fprintf(output_vcf, ";%s=", S->c_str()); comma=false; for (deque<Allele>::iterator I = A; I != B; ++I) { if (I->filtered) continue; map<string,map<string,string> 
>::iterator Q = unique_alts_and_tags.find(I->alt); if (comma) fprintf(output_vcf, ","); comma = true; if (Q == unique_alts_and_tags.end()) {fprintf(output_vcf, "."); continue;} map<string,string>::iterator W = Q->second.find(*S); if (W == Q->second.end()) fprintf(output_vcf, "."); else fprintf(output_vcf, "%s", W->second.c_str()); } } // fprintf(output_vcf, ";%s=%s", S->first.c_str(), S->second.c_str()); fprintf(output_vcf, "\n"); } A = B; } } } if (output_bed) { fflush(output_bed); fclose(output_bed); } if (output_vcf) { fflush(output_vcf); fclose(output_vcf); } int lines_ignored = 0; for (deque<LineStatus>::iterator L = line_status.begin(); L != line_status.end(); ++L) { if (L->filter_message_prefix) { if (L->chr_idx >= 0) printf("Line %d ignored: [%s:%ld %s] %s%s\n", L->line_number, ref_index[L->chr_idx].chr.c_str(), L->opos+1, L->id.c_str(), L->filter_message_prefix, L->filter_message.c_str()); else printf("Line %d ignored: %s%s\n", L->line_number, L->filter_message_prefix, L->filter_message.c_str()); lines_ignored++; } } printf("Ignored %d out of %d lines\n", lines_ignored, (int)line_status.size()); munmap(ref, ref_stat.st_size); close(ref_handle); if (lines_ignored > 0 and strict_check) return 1; return 0; }
// Calibration driver: trains a linear calibration model and/or an HP-histogram
// ("polish") calibration model from mapped reads, then exports both models and
// run metadata as JSON.
//
// Inputs come entirely from the command line (parsed via OptArgs into
// CalibrationContext). Output is the JSON file named by
// calib_context.filename_json. Returns EXIT_SUCCESS on completion.
int main (int argc, const char *argv[]) {

  time_t program_start_time;
  time(&program_start_time);

  Json::Value calibration_json(Json::objectValue);
  // Records argv, start time, etc. into the "Calibration" JSON section.
  DumpStartingStateOfProgram (argc,argv,program_start_time, calibration_json["Calibration"]);

  //
  // Step 1. Process command line options
  //

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);

  // enable floating point exceptions during program execution
  // (on by default: divide-by-zero, invalid op, and overflow trap immediately)
  if (opts.GetFirstBoolean('-', "float-exceptions", true)) {
    cout << "Calibration: Floating point exceptions enabled." << endl;
    feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
  } //*/

  CalibrationContext calib_context;
  // NOTE(review): on failed initialization this prints help but execution
  // continues into training below — presumably PrintHelp_CalModules() exits;
  // confirm, otherwise an uninitialized context is used.
  if (not calib_context.InitializeFromOpts(opts)){
    PrintHelp_CalModules();
  }

  // Master model objects are shared with worker threads through raw pointers
  // stored in the context; they must outlive all training calls below.
  HistogramCalibration master_histogram(opts, calib_context);
  calib_context.hist_calibration_master = &master_histogram;

  LinearCalibrationModel master_linear_model(opts, calib_context);
  calib_context.linear_model_master = &master_linear_model;

  opts.CheckNoLeftovers();

  //
  // Step 2. Execute threaded calibration
  //

  int calibration_thread_time = 0;

  if (calib_context.successive_fit) {

    // first train linear model
    if (master_linear_model.DoTraining()) {
      int l_thread_time = 0;
      for (int i_iteration=0; i_iteration<calib_context.num_train_iterations; i_iteration++) {
        cout << " -Training Iteration " << i_iteration+1;

        l_thread_time = ExecuteThreadedCalibrationTraining(calib_context);

        // Activate master linear model after every round of training
        master_linear_model.CreateCalibrationModel(false); // make linear model
        master_linear_model.SetModelGainsAndOffsets();     // expand for use in basecalling

        calibration_thread_time += l_thread_time;
        calib_context.bam_reader.Rewind(); // reset all files for another pass
        cout << " Duration = " << l_thread_time << endl;
      }
    }

    // Then apply it during polish model training
    if (master_histogram.DoTraining()) {
      calib_context.local_fit_linear_model = false;
      calib_context.local_fit_polish_model = true;
      calibration_thread_time += ExecuteThreadedCalibrationTraining(calib_context);
    }

  } else {
    // Single pass in which both models are fit jointly
    calibration_thread_time=ExecuteThreadedCalibrationTraining(calib_context);
  }

  //
  // Step 3. Create models, write output, and close modules
  //

  // Linear Model
  if (master_linear_model.CreateCalibrationModel())
    master_linear_model.ExportModelToJson(calibration_json["LinearModel"], "");

  // HP histogram calibration
  if (master_histogram.CreateCalibrationModel())
    master_histogram.ExportModelToJson(calibration_json["HPHistogram"]);

  // Transfer stuff from calibration context and close bam reader
  calib_context.Close(calibration_json["Calibration"]);

  // Record timing metadata alongside the models.
  time_t program_end_time;
  time(&program_end_time);

  calibration_json["Calibration"]["end_time"] = get_time_iso_string(program_end_time);
  calibration_json["Calibration"]["total_duration"] = (Json::Int)difftime(program_end_time,program_start_time);
  calibration_json["Calibration"]["calibration_duration"] = (Json::Int)calibration_thread_time;

  SaveJson(calibration_json, calib_context.filename_json);
  return EXIT_SUCCESS;
}
int main(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); int hpLength; string statsOut; string alignmentOut; string pairedOut; string flowsOut; string summaryOut; string samFile; string qScoreCol; string wellsFile; string bfmaskFile; string snrFile; string binnedHpSigFile; string flowErrFile; string gcErrFile; int gcWin; string flowOrder; string keySeq; int numFlows; bool help; int qLength; double colCenter; double rowCenter; int colSize; int rowSize; int sampleSize; string wellsToUse; string run1, run2; opts.GetOption(run1, "", '-', "sff1"); opts.GetOption(run2, "", '-', "sff2"); opts.GetOption(wellsToUse, "", '-', "use-wells"); opts.GetOption(samFile, "", '-', "sam-parsed"); opts.GetOption(statsOut, "", '-', "stats-out"); opts.GetOption(flowsOut, "", '-', "flows-out"); opts.GetOption(alignmentOut, "", '-', "align-out"); opts.GetOption(summaryOut, "", '-', "summary-out"); opts.GetOption(pairedOut, "", '-', "paired-out"); opts.GetOption(numFlows, "40", '-', "num-flows"); opts.GetOption(hpLength, "6", '-', "max-hp"); opts.GetOption(qScoreCol, "q7Len", '-', "qscore-col"); opts.GetOption(qLength, "25", '-', "min-qlength"); opts.GetOption(help, "false", 'h', "help"); opts.GetOption(wellsFile, "", '-', "wells-file"); opts.GetOption(bfmaskFile, "", '-', "bfmask-file"); opts.GetOption(snrFile, "", '-', "snr-file"); opts.GetOption(binnedHpSigFile, "", '-', "binned-hp-sig-file"); opts.GetOption(flowErrFile, "", '-', "flow-err-file"); opts.GetOption(gcErrFile, "", '-', "gc-err-file"); opts.GetOption(flowOrder, "", '-', "flow-order"); opts.GetOption(keySeq, "", '-', "key-seq"); opts.GetOption(colCenter, "0.5", '-', "col-center"); opts.GetOption(rowCenter, "0.5", '-', "row-center"); opts.GetOption(colSize, "0", '-', "col-size"); opts.GetOption(rowSize, "0", '-', "row-size"); opts.GetOption(gcErrFile, "", '-', "gc-err-file"); opts.GetOption(gcWin, "40", '-', "gc-win"); opts.GetOption(sampleSize, "100000", '-', "sample-size"); if (help || 
samFile.empty() || statsOut.empty() || summaryOut.empty()) { usage(); } opts.CheckNoLeftovers(); // Some checks to make sure sensible bounds have been set if(colCenter < 0 || colCenter > 1) { cerr << "AnalyzeHPErrs - col-center must be in the range [0,1]" << endl; exit(1); } if(rowCenter < 0 || rowCenter > 1) { cerr << "AnalyzeHPErrs - row-center must be in the range [0,1]" << endl; exit(1); } if(colSize < 0) { cerr << "AnalyzeHPErrs - col-size cannot be negative." << endl; exit(1); } if(rowSize < 0) { cerr << "AnalyzeHPErrs - row-size cannot be negative." << endl; exit(1); } // Determine rows & cols if a bfmask file was supplied int nRow=0; int nCol=0; if(!bfmaskFile.empty()) { if(GetRowColFromBfmask(bfmaskFile, &nRow, &nCol)) { cerr << "AnalyzeHPErrs - problem determining rows & columns from bfmask file " << bfmaskFile << endl; exit(1); } } // Set up fds object FlowDiffStats* fds; if (!run1.empty()) { SffDiffStats* sds = new SffDiffStats(hpLength, nCol, nRow, qScoreCol, run1, run2); if (!pairedOut.empty()) sds->SetPairedOut(pairedOut); fds = dynamic_cast<FlowDiffStats*>(sds); } else { GenomeDiffStats* gds = new GenomeDiffStats(hpLength, nCol, nRow, qScoreCol); if(alignmentOut != "") { gds->SetAlignmentsOut(alignmentOut); } if (!flowsOut.empty()) { gds->SetFlowsOut(flowsOut); } fds = dynamic_cast<FlowDiffStats*>(gds); } if (gcErrFile != "") { fds->SetFlowGCOut(gcErrFile); fds->SetGCWindowSize(gcWin); } if(keySeq != "") { fds->SetKeySeq(keySeq); } if(flowOrder != "") { fds->SetFlowOrder(flowOrder); } fds->SetStatsOut(statsOut); if (!wellsToUse.empty()) { std::vector<int> wells; std::vector<bool> use; ReadSetFromFile(wellsToUse, 0, wells); use.resize(nRow * nCol, false); int count = 0; ReservoirSample<int> wellSample(sampleSize); for (size_t i = 0; i < wells.size(); i++) { wellSample.Add(wells[i]); } wells = wellSample.GetData(); for (size_t i = 0; i < wells.size(); i++) { use[wells[i]] = true; count++; } cout << "Read: " << count << " reads." 
<< endl; fds->SetWellToAnalyze(use); } // Set integer-value row & column bounds int minRow=-1; int maxRow=-1; int minCol=-1; int maxCol=-1; if(colSize > 0 || rowSize > 0) { if(bfmaskFile.empty()) { cerr << "AnalyzeHPErrs - must specify bfmask file when restricting row or column ranges" << endl; exit(1); } if(rowSize > 0) { minRow = floor(nRow * rowCenter - rowSize / 2.0); maxRow = minRow + rowSize; minRow = std::max(0,minRow); maxRow = std::min(nRow,maxRow); } if(colSize > 0) { minCol = floor(nCol * colCenter - colSize / 2.0); maxCol = minCol + colSize; minCol = std::max(0,minCol); maxCol = std::min(nCol,maxCol); } } if (wellsFile != "") { std::vector<int32_t> xSubset, ySubset; fds->FillInSubset(samFile, qLength, minRow, maxRow, minCol, maxCol, xSubset, ySubset); if(bfmaskFile.empty()) { cerr << "AnalyzeHPErrs - must specify bfmask file when specifying wells file" << endl; exit(1); } fds->SetWellsFile(wellsFile, nRow, nCol, numFlows, xSubset, ySubset); } if (snrFile != "") { cout << "Opening snr file: " << snrFile << endl; fds->SetSNROut(snrFile); } if (binnedHpSigFile != "") { cout << "Opening binned HP signal file: " << binnedHpSigFile << endl; fds->SetBinnedHpSigOut(binnedHpSigFile); } if (flowErrFile != "") { cout << "Opening flow err file: " << flowErrFile << endl; fds->SetFlowErrOut(flowErrFile); } ofstream summary; summary.open(summaryOut.c_str()); cout << "Reading and analyzing alignments from: " << samFile << endl; if(minCol > -1 || maxCol > -1) cout << " Restricting to " << (maxCol-minCol) << " cols in the range [" << minCol << "," << maxCol << ")" << endl; if(minRow > -1 || maxRow > -1) cout << " Restricting to " << (maxRow-minRow) << " rows in the range [" << minRow << "," << maxRow << ")" << endl; fds->SetAlignmentInFile(samFile); fds->FilterAndCompare(numFlows, summary, qLength, minRow, maxRow, minCol, maxCol); summary.close(); delete fds; cout << "Done." << endl; return 0; }
// PrepareHotspots: converts a hotspot definition file (BED or VCF, exactly one
// of the two) into validated, normalized hotspot BED and/or VCF output.
// Alleles are checked against the reference (loaded via mmap of the FASTA plus
// its .fai index), optionally left-aligned, sorted, merged per position, and
// block substitutions are filtered unless explicitly allowed.
// Returns 0 on success, 1 on usage/IO error.
int PrepareHotspots(int argc, const char *argv[]) {
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  string input_bed_filename   = opts.GetFirstString ('b', "input-bed", "");
  string input_vcf_filename   = opts.GetFirstString ('v', "input-vcf", "");
  string output_bed_filename  = opts.GetFirstString ('d', "output-bed", "");
  string output_vcf_filename  = opts.GetFirstString ('o', "output-vcf", "");
  string reference_filename   = opts.GetFirstString ('r', "reference", "");
  bool left_alignment = opts.GetFirstBoolean('a', "left-alignment", false);
  bool filter_bypass = opts.GetFirstBoolean('f', "filter-bypass", false);
  bool allow_block_substitutions = opts.GetFirstBoolean('s', "allow-block-substitutions", false);
  opts.CheckNoLeftovers();

  // Usage: exactly one input (BED xor VCF), at least one output, and a reference.
  if((input_bed_filename.empty() == input_vcf_filename.empty()) or
     (output_bed_filename.empty() and output_vcf_filename.empty()) or reference_filename.empty()) {
    PrepareHotspotsHelp();
    return 1;
  }

  // Populate chromosome list from reference.fai
  // Use mmap to fetch the entire reference
  // NOTE(review): open/fstat/mmap results are not checked before use — a
  // missing reference file would fault here rather than error out; confirm
  // upstream guarantees the file exists.
  int ref_handle = open(reference_filename.c_str(),O_RDONLY);
  struct stat ref_stat;
  fstat(ref_handle, &ref_stat);
  char *ref = (char *)mmap(0, ref_stat.st_size, PROT_READ, MAP_SHARED, ref_handle, 0);

  FILE *fai = fopen((reference_filename+".fai").c_str(), "r");
  if (!fai) {
    fprintf(stderr, "ERROR: Cannot open %s.fai\n", reference_filename.c_str());
    return 1;
  }
  // Each .fai line: name, size, offset, bases per line, bytes per line.
  vector<Reference> ref_index;
  map<string,int> ref_map;
  char line[1024], chrom_name[1024];
  while (fgets(line, 1024, fai) != NULL) {
    Reference ref_entry;
    long chr_start;
    if (5 != sscanf(line, "%s\t%ld\t%ld\t%d\t%d", chrom_name, &ref_entry.size,
        &chr_start, &ref_entry.bases_per_line, &ref_entry.bytes_per_line))
      continue;
    ref_entry.chr = chrom_name;
    ref_entry.start = ref + chr_start;   // pointer into the mmap'd FASTA
    ref_index.push_back(ref_entry);
    ref_map[ref_entry.chr] = (int) ref_index.size() - 1;
  }
  fclose(fai);

  // Load input BED or load input VCF, group by chromosome
  deque<LineStatus> line_status;                       // one entry per (sub-)allele or rejected line
  vector<deque<Allele> > alleles(ref_index.size());    // alleles grouped by chromosome index

  if (!input_bed_filename.empty()) {
    FILE *input = fopen(input_bed_filename.c_str(),"r");
    if (!input) {
      fprintf(stderr,"ERROR: Cannot open %s\n", input_bed_filename.c_str());
      return 1;
    }

    char line2[65536];
    int line_number = 0;
    bool line_overflow = false;
    while (fgets(line2, 65536, input) != NULL) {
      // Over-long physical lines are consumed in chunks and rejected once complete.
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
        continue;
      }
      line_number++;
      if (line_overflow) {
        line_overflow = false;
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: line length exceeds 64K";
        continue;
      }

      if (strncmp(line2, "browser", 7) == 0)
        continue;
      if (strncmp(line2, "track", 5) == 0) {
        // The track header may globally enable block substitutions.
        if (string::npos != string(line2).find("allowBlockSubstitutions=true"))
          allow_block_substitutions = true;
        continue;
      }

      // Tokenize; keep the last two fields as (penultimate, ultimate) so
      // trailing extra columns are tolerated.
      char *current_chr = strtok(line2, "\t\r\n");
      char *current_start = strtok(NULL, "\t\r\n");
      char *current_end = strtok(NULL, "\t\r\n");
      char *current_id = strtok(NULL, "\t\r\n");
      char *penultimate = strtok(NULL, "\t\r\n");
      char *ultimate = strtok(NULL, "\t\r\n");
      for (char *next = strtok(NULL, "\t\r\n"); next; next = strtok(NULL, "\t\r\n")) {
        penultimate = ultimate;
        ultimate = next;
      }
      if (!current_chr or !current_start or !current_end or !current_id or !penultimate or !ultimate) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: expected at least 6 fields";
        continue;
      }

      Allele allele;

      // Chromosome lookup tolerates a missing "chr" prefix and MT/chrM aliasing.
      string string_chr(current_chr);
      if (ref_map.find(string_chr) != ref_map.end())
        allele.chr_idx = ref_map[string_chr];
      else if (ref_map.find("chr"+string_chr) != ref_map.end())
        allele.chr_idx = ref_map["chr"+string_chr];
      else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end())
        allele.chr_idx = ref_map["chrM"];
      else {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Unknown chromosome name: ";
        line_status.back().filter_message = string_chr;
        continue;
      }

      allele.pos = strtol(current_start,NULL,10);   // BED start is already 0-based
      allele.id = current_id;

      // REF/OBS are read from the penultimate column's ;-separated key=value list.
      char *current_ref = NULL;
      char *current_alt = NULL;
      for (char *next = strtok(penultimate, ";"); next; next = strtok(NULL, ";")) {
        if (strncmp(next,"REF=",4) == 0)
          current_ref = next;
        else if (strncmp(next,"OBS=",4) == 0)
          current_alt = next;
      }
      if (!current_ref or !current_alt) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot BED line: REF and OBS fields required in penultimate column";
        continue;
      }
      for (char *pos = current_ref+4; *pos; ++pos)
        allele.ref += toupper(*pos);
      for (char *pos = current_alt+4; *pos; ++pos)
        allele.alt += toupper(*pos);

      allele.filtered = false;
      line_status.push_back(LineStatus(line_number));
      allele.line_status = &line_status.back();     // deque: stable across later push_backs
      allele.opos = allele.pos;                     // keep originals for OID/OPOS/OREF/OALT output
      allele.oref = allele.ref;
      allele.oalt = allele.alt;
      alleles[allele.chr_idx].push_back(allele);
      // NOTE(review): this pointer identifies a *slot* in the deque, not the
      // allele itself; the per-chromosome sort() below permutes elements, so
      // after sorting L->allele may refer to a different allele. The reporting
      // loop at the end uses it for chr/pos/id — verify this is acceptable
      // (a later revision of this tool stores copies of those fields instead).
      line_status.back().allele = &alleles[allele.chr_idx].back();
    }

    fclose(input);
  }

  if (!input_vcf_filename.empty()) {
    FILE *input = fopen(input_vcf_filename.c_str(),"r");
    if (!input) {
      fprintf(stderr,"ERROR: Cannot open %s\n", input_vcf_filename.c_str());
      return 1;
    }

    char line2[65536];
    int line_number = 0;
    bool line_overflow = false;
    while (fgets(line2, 65536, input) != NULL) {
      if (line2[0] and line2[strlen(line2)-1] != '\n' and strlen(line2) == 65535) {
        line_overflow = true;
        continue;
      }
      line_number++;
      if (line_overflow) {
        line_overflow = false;
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot VCF line: line length exceeds 64K";
        continue;
      }

      if (strncmp(line2, "##allowBlockSubstitutions=true", 30) == 0) {
        allow_block_substitutions = true;
        continue;
      }
      if (line2[0] == '#')
        continue;

      char *current_chr = strtok(line2, "\t\r\n");
      char *current_start = strtok(NULL, "\t\r\n");
      char *current_id = strtok(NULL, "\t\r\n");
      char *current_ref = strtok(NULL, "\t\r\n");
      char *current_alt = strtok(NULL, "\t\r\n");
      if (!current_chr or !current_start or !current_id or !current_ref or !current_alt) {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Malformed hotspot VCF line: expected at least 5 fields";
        continue;
      }

      string string_chr(current_chr);
      int chr_idx = 0;
      if (ref_map.find(string_chr) != ref_map.end())
        chr_idx = ref_map[string_chr];
      else if (ref_map.find("chr"+string_chr) != ref_map.end())
        chr_idx = ref_map["chr"+string_chr];
      else if (string_chr == "MT" and ref_map.find("chrM") != ref_map.end())
        chr_idx = ref_map["chrM"];
      else {
        line_status.push_back(LineStatus(line_number));
        line_status.back().filter_message_prefix = "Unknown chromosome name: ";
        line_status.back().filter_message = string_chr;
        continue;
      }

      for (char *pos = current_ref; *pos; ++pos)
        *pos = toupper(*pos);
      for (char *pos = current_alt; *pos; ++pos)
        *pos = toupper(*pos);

      // Multi-allelic VCF lines fan out into one Allele per comma-separated ALT.
      for (char *sub_alt = strtok(current_alt,","); sub_alt; sub_alt = strtok(NULL,",")) {
        Allele allele;
        allele.chr_idx = chr_idx;
        allele.ref = current_ref;
        allele.alt = sub_alt;
        allele.pos = strtol(current_start,NULL,10)-1;   // VCF is 1-based; convert to 0-based
        allele.id = current_id;
        if (allele.id == ".")
          allele.id = "hotspot";

        allele.filtered = false;
        line_status.push_back(LineStatus(line_number));
        allele.line_status = &line_status.back();
        allele.opos = allele.pos;
        allele.oref = allele.ref;
        allele.oalt = allele.alt;
        alleles[allele.chr_idx].push_back(allele);
        // NOTE(review): same sort-vs-pointer caveat as in the BED branch above.
        line_status.back().allele = &alleles[allele.chr_idx].back();
      }
    }

    fclose(input);
  }

  // Process by chromosome:
  //   - Verify reference allele
  //   - Left align
  //   - Sort
  //   - Filter for block substitutions, write

  FILE *output_vcf = NULL;
  if (!output_vcf_filename.empty()) {
    output_vcf = fopen(output_vcf_filename.c_str(), "w");
    if (!output_vcf) {
      fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_vcf_filename.c_str());
      return 1;
    }
    fprintf(output_vcf, "##fileformat=VCFv4.1\n");
    if (allow_block_substitutions)
      fprintf(output_vcf, "##allowBlockSubstitutions=true\n");
    fprintf(output_vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n");
  }
  FILE *output_bed = NULL;
  if (!output_bed_filename.empty()) {
    output_bed = fopen(output_bed_filename.c_str(), "w");
    if (!output_bed) {
      fprintf(stderr,"ERROR: Cannot open %s for writing\n", output_bed_filename.c_str());
      if (output_vcf)
        fclose(output_vcf);
      return 1;
    }
    if (allow_block_substitutions)
      fprintf(output_bed, "track name=\"hotspot\" type=bedDetail allowBlockSubstitutions=true\n");
    else
      fprintf(output_bed, "track name=\"hotspot\" type=bedDetail\n");
  }

  for (int chr_idx = 0; chr_idx < (int)ref_index.size(); ++chr_idx) {

    for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ++A) {

      // Invalid characters
      bool valid = true;
      for (const char *c = A->ref.c_str(); *c ; ++c)
        if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T')
          valid = false;
      for (const char *c = A->alt.c_str(); *c ; ++c)
        if (*c != 'A' and *c != 'C' and *c != 'G' and *c != 'T')
          valid = false;
      if (not valid) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "REF and/or ALT contain characters other than ACGT: ";
        A->line_status->filter_message = "REF = " + A->ref + " ALT = " + A->alt;
        continue;
      }

      // Filter REF == ALT
      if (A->ref == A->alt) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "REF and ALT alleles equal";
        continue;
      }

      // Confirm reference allele.
      string ref_expected;
      for (int idx = 0; idx < (int) A->ref.size(); ++idx)
        ref_expected += ref_index[chr_idx].base(A->pos + idx);
      if (A->ref != ref_expected) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "Provided REF allele does not match reference: ";
        A->line_status->filter_message = "Expected " + ref_expected + ", found " + A->ref;
        continue;
      }

      // Trim
      int ref_start = 0;
      int ref_end = A->ref.size();
      int alt_end = A->alt.size();

      // Option 1: trim all trailing bases
      //while(ref_end and alt_end and A->ref[ref_end-1] == A->alt[alt_end-1]) {
      //  --ref_end;
      //  --alt_end;
      //}

      // Option 2: trim all leading bases
      //while (ref_start < ref_end and ref_start < alt_end and A->ref[ref_start] == A->alt[ref_start])
      //  ++ref_start;

      // Option 3: trim anchor base if vcf (strip the shared leading base of a
      // VCF-style indel representation)
      if (!input_vcf_filename.empty()) {
        if (ref_end and alt_end and (ref_end == 1 or alt_end == 1) and A->ref[0] == A->alt[0])
          ref_start = 1;
      }

      A->pos += ref_start;
      A->ref = A->ref.substr(ref_start, ref_end-ref_start);
      A->alt = A->alt.substr(ref_start, alt_end-ref_start);
      ref_end -= ref_start;
      alt_end -= ref_start;

      // Left align: shift the variant left while the preceding reference base
      // matches the last base of both remaining REF and ALT.
      if (left_alignment) {
        while (A->pos > 0) {
          char nuc = ref_index[chr_idx].base(A->pos-1);
          if (ref_end > 0 and A->ref[ref_end-1] != nuc)
            break;
          if (alt_end > 0 and A->alt[alt_end-1] != nuc)
            break;
          A->ref = string(1,nuc) + A->ref;
          A->alt = string(1,nuc) + A->alt;
          A->pos--;
        }
      }

      A->ref.resize(ref_end);
      A->alt.resize(alt_end);

      // Filter block substitutions: take 1
      if (ref_end > 0 and alt_end > 0 and ref_end != alt_end and not allow_block_substitutions and not filter_bypass) {
        A->filtered = true;
        A->line_status->filter_message_prefix = "Block substitutions not supported";
        continue;
      }
    }

    if (output_bed) {
      // Sort - without anchor base
      // NOTE(review): plain sort() is not stable — equal-position alleles may
      // be emitted in a nondeterministic relative order (a later revision uses
      // stable_sort). Also see the line_status allele-pointer caveat above.
      sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles);

      // Write
      for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) {
        if (I->filtered)
          continue;
        if (I->pos)
          fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=%c\tNONE\n",
              ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
              I->ref.c_str(), I->alt.c_str(), ref_index[chr_idx].base(I->pos-1));
        else
          fprintf(output_bed, "%s\t%ld\t%ld\t%s\t0\t+\tREF=%s;OBS=%s;ANCHOR=\tNONE\n",
              ref_index[chr_idx].chr.c_str(), I->pos, I->pos + I->ref.size(), I->id.c_str(),
              I->ref.c_str(), I->alt.c_str());
      }
    }

    if (output_vcf) {

      // Add anchor base to indels (VCF cannot represent an empty REF or ALT)
      for (deque<Allele>::iterator I = alleles[chr_idx].begin(); I != alleles[chr_idx].end(); ++I) {
        if (I->filtered)
          continue;
        if (not I->ref.empty() and not I->alt.empty())
          continue;
        if (I->pos == 0) {
          I->filtered = true;
          I->line_status->filter_message_prefix = "INDELs at chromosome start not supported";
          continue;
        }
        I->pos--;
        I->ref = string(1,ref_index[chr_idx].base(I->pos)) + I->ref;
        I->alt = string(1,ref_index[chr_idx].base(I->pos)) + I->alt;
      }

      // Sort - with anchor base (same stability caveat as above)
      sort(alleles[chr_idx].begin(), alleles[chr_idx].end(), compare_alleles);

      // Merge alleles, remove block substitutions, write.
      // [A,B) spans all alleles sharing a position; max_ref is the longest
      // unfiltered REF at that position and every ALT is padded to match it.
      for (deque<Allele>::iterator A = alleles[chr_idx].begin(); A != alleles[chr_idx].end(); ) {

        string max_ref;
        deque<Allele>::iterator B = A;
        for (; B != alleles[chr_idx].end() and B->pos == A->pos; ++B)
          if (!B->filtered and max_ref.size() < B->ref.size())
            max_ref = B->ref;

        bool filtered = true;
        for (deque<Allele>::iterator I = A; I != B; ++I) {
          if (I->filtered)
            continue;

          string new_alt = I->alt + max_ref.substr(I->ref.size());

          if (new_alt.size() > 1 and max_ref.size() > 1 and new_alt.size() != max_ref.size()
              and not allow_block_substitutions and not filter_bypass) {
            I->filtered = true;
            I->line_status->filter_message_prefix = "Block substitutions not supported (post-merge)";
            continue;
          }

          I->ref = max_ref;
          I->alt = new_alt;
          filtered = false;
        }

        if (not filtered) {
          // One merged VCF record per position; ALT column is deduplicated but
          // the OID/OPOS/OREF/OALT/OMAPALT INFO lists keep one entry per input
          // allele so originals remain traceable.
          fprintf(output_vcf, "%s\t%ld\t.\t%s\t", ref_index[chr_idx].chr.c_str(), A->pos+1, max_ref.c_str());

          bool comma = false;
          set<string> unique_alt_alleles;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (unique_alt_alleles.count(I->alt) > 0)
              continue;
            unique_alt_alleles.insert(I->alt);
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->alt.c_str());
          }

          fprintf(output_vcf, "\t.\t.\tOID=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->id.c_str());
          }

          fprintf(output_vcf, ";OPOS=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%ld", I->opos+1);
          }

          fprintf(output_vcf, ";OREF=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->oref.c_str());
          }

          fprintf(output_vcf, ";OALT=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->oalt.c_str());
          }

          fprintf(output_vcf, ";OMAPALT=");
          comma = false;
          for (deque<Allele>::iterator I = A; I != B; ++I) {
            if (I->filtered)
              continue;
            if (comma)
              fprintf(output_vcf, ",");
            comma = true;
            fprintf(output_vcf, "%s", I->alt.c_str());
          }

          fprintf(output_vcf, "\n");
        }

        A = B;
      }
    }
  }

  if (output_bed) {
    fflush(output_bed);
    fclose(output_bed);
  }
  if (output_vcf) {
    fflush(output_vcf);
    fclose(output_vcf);
  }

  // Report every rejected input line, then overall counts.
  int lines_ignored = 0;
  for (deque<LineStatus>::iterator L = line_status.begin(); L != line_status.end(); ++L) {
    if (L->filter_message_prefix) {
      if (L->allele)
        printf("Line %d ignored: [%s:%ld %s] %s%s\n", L->line_number,
            ref_index[L->allele->chr_idx].chr.c_str(), L->allele->opos+1, L->allele->id.c_str(),
            L->filter_message_prefix, L->filter_message.c_str());
      else
        printf("Line %d ignored: %s%s\n", L->line_number, L->filter_message_prefix, L->filter_message.c_str());
      lines_ignored++;
    }
  }
  printf("Ignored %d out of %d lines\n", lines_ignored, (int)line_status.size());

  munmap(ref, ref_stat.st_size);
  close(ref_handle);

  return 0;
}
int main(int argc, const char* argv[]) { printf ("tvcvalidator %s-%s (%s) - Prototype tvc validation tool\n\n", IonVersion::GetVersion().c_str(), IonVersion::GetRelease().c_str(), IonVersion::GetSvnRev().c_str()); if (argc == 1) { VariantValidatorHelp(); return 1; } OptArgs opts; opts.ParseCmdLine(argc, argv); if (opts.GetFirstBoolean('v', "version", false)) { return 0; } if (opts.GetFirstBoolean('h', "help", false)) { VariantValidatorHelp(); return 0; } string input_vcf_filename = opts.GetFirstString ('i', "input-vcf", ""); string truth_filename = opts.GetFirstString ('t', "truth-file", ""); string truth_dir = opts.GetFirstString ('d', "truth-dir", "/results/plugins/validateVariantCaller/files"); // TODO: reference optional, only used to verify reference allele in input-vcf and truth files //string reference_filename = opts.GetFirstString ('r', "reference", ""); opts.CheckNoLeftovers(); // // Step 1. Load input VCF file into memory // if (input_vcf_filename.empty()) { VariantValidatorHelp(); cerr << "ERROR: Input VCF file not specified " << endl; return 1; } VariantCallerResults results_vcf; results_vcf.load_vcf(input_vcf_filename); printf("Loaded VCF %s with %d variant calls\n", input_vcf_filename.c_str(), (int)results_vcf.variants.size()); // // Step 2. Parse truth files, compare them to the input vcf, and compute match scores // if (not truth_filename.empty()) { ValidatorTruth truth; truth.ReadTruthFile(truth_filename); truth.CompareToCalls(results_vcf); return 0; } truth_dir += "/*.bed"; glob_t glob_result; glob(truth_dir.c_str(), GLOB_TILDE, NULL, &glob_result); for(unsigned int i = 0; i < glob_result.gl_pathc; ++i) { ValidatorTruth truth; truth.ReadTruthFile(string(glob_result.gl_pathv[i])); truth.CompareToCalls(results_vcf); } globfree(&glob_result); return 0; }
int main(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string inFile, outFile; bool help = false; bool version = false; double lower = -5.0; double upper = 28.0; opts.GetOption(inFile, "", 'i', "input-file"); opts.GetOption(outFile, "", 'o', "output-file"); opts.GetOption(lower, "-5.0", '-', "wells-convert-low"); opts.GetOption(upper, "28.0", '-', "wells-convert-high"); opts.GetOption(help, "false", 'h', "help"); opts.GetOption(version, "false", 'v', "version"); opts.CheckNoLeftovers(); if (version) { fprintf (stdout, "%s", IonVersion::GetFullVersion("RawWellsConverter").c_str()); exit(0); } if (inFile.empty() || help) { cout << "RawWellsConverter - Convert unsigned short type wells file to float type wells file, or vice versa." << endl << "options: " << endl << " -i,--input-file input wells file." << endl << " -o,--output-file output wells file." << endl << " ,--wells-convert-low lower bound for converting to unsigned short." << endl << " ,--wells-convert-high upper bound for converting to unsigned short." << endl << " -h,--help this message." << endl << "" << endl << "usage: " << endl << " RawWellsConverter -i input_path/1.wells -o output_path/1.wells " << endl; exit(1); } struct stat sb; if(stat(inFile.c_str(), &sb) != 0) { cerr << "RawWellsConverter ERROR: " << inFile << " does not exist." << endl; exit (1); } if (outFile.empty()) { outFile = inFile; outFile += ".converted"; } string cmd("cp "); cmd += inFile; cmd += " "; cmd += outFile; int ret0 = system(cmd.c_str()); hid_t root = H5Fopen(outFile.c_str(), H5F_ACC_RDWR, H5P_DEFAULT); if(root < 0) { cerr << "RawWellsConverter ERROR: Fail to open " << outFile << endl; exit(1); } H5G_info_t group_info; group_info.nlinks = 0; if(H5Gget_info(root, &group_info) < 0) { H5Fclose(root); cerr << "RawWellsConverter ERROR: Fail H5Gget_info." 
<< endl; exit(1); } char name[10]; string sName; bool bWells = false; bool bCopies = false; for(unsigned int i = 0; i < group_info.nlinks; ++i) { int size = H5Gget_objname_by_idx(root, i, NULL, 0); if(H5Gget_objname_by_idx(root, i, name, size + 1) < 0) { H5Fclose(root); cerr << "RawWellsConverter ERROR: Fail H5Gget_objname_by_idx." << endl; exit(1); } else { sName = name; if(sName == "wells") { bWells = true; } if(sName == "wells_copies") { bCopies = true; } } } if(!bWells) { H5Fclose(root); cerr << "RawWellsConverter ERROR: There is no dataset wells." << endl; exit(1); } hid_t dsWells = H5Dopen2(root, "wells", H5P_DEFAULT); if(dsWells < 0) { H5Fclose(root); cerr << "RawWellsConverter ERROR: Fail H5Dopen2 wells." << endl; exit(1); } bool saveAsUShort = false; if(H5Aexists(dsWells, "convert_low") > 0) { hid_t attrLower = H5Aopen(dsWells, "convert_low", H5T_NATIVE_FLOAT ); H5Aread(attrLower, H5T_NATIVE_FLOAT, &lower); saveAsUShort = true; H5Aclose(attrLower); } if(H5Aexists(dsWells, "convert_high") > 0) { hid_t attrUpper = H5Aopen(dsWells, "convert_high", H5T_NATIVE_FLOAT); H5Aread(attrUpper, H5T_NATIVE_FLOAT, &upper); saveAsUShort = true; H5Aclose(attrUpper); } hid_t dataSpace = H5Dget_space(dsWells); if(dataSpace < 0) { H5Dclose(dsWells); H5Fclose(root); cerr << "RawWellsConverter ERROR: Fail H5Dget_space wells." 
<< endl; exit(1); } hssize_t dsSize = H5Sget_simple_extent_npoints(dataSpace); if(dsSize < 1) { H5Sclose(dataSpace); H5Dclose(dsWells); H5Fclose(root); cerr << "RawWellsConverter ERROR: Wrong size of dataset wells - " << dsSize << endl; exit(1); } int nRows = 0; int nCols = 0; int nFlows = 0; int rank = H5Sget_simple_extent_ndims(dataSpace); if(rank != 3) { bCopies = false; } else { hsize_t dims_out[3]; int status_n = H5Sget_simple_extent_dims(dataSpace, dims_out, NULL); if(status_n < 0) { bCopies = false; } else { nRows = dims_out[0]; nCols = dims_out[1]; nFlows = dims_out[2]; } } float* fPtr = new float[dsSize]; unsigned short* usPtr = new unsigned short[dsSize]; if(fPtr == NULL || usPtr == NULL) { H5Sclose(dataSpace); H5Dclose(dsWells); H5Fclose(root); cerr << "RawWellsConverter ERROR: Fail to allocate fPtr or usPtr." << endl; exit(1); } hid_t dcpl = H5Dget_create_plist(dsWells); if(dcpl < 0) { H5Sclose(dataSpace); H5Dclose(dsWells); H5Fclose(root); cerr << "RawWellsConverter ERROR: Fail H5Dget_create_plist." << endl; exit(1); } hid_t dapl = H5Dget_access_plist(dsWells); if(dapl < 0) { H5Pclose(dcpl); H5Sclose(dataSpace); H5Dclose(dsWells); H5Fclose(root); cerr << "RawWellsConverter ERROR: Fail H5Dget_access_plist." << endl; exit(1); } if(saveAsUShort) { cout << "RawWellsConverter: converting unsigned short wells file - " << inFile << " to float wells file - " << outFile << " with boundary (" << lower << "," << upper << ")" << endl; herr_t ret = H5Dread(dsWells, H5T_NATIVE_USHORT, H5S_ALL, H5S_ALL, H5P_DEFAULT, usPtr); H5Dclose(dsWells); if(ret < 0) { delete [] fPtr; delete [] usPtr; H5Sclose(dataSpace); H5Pclose(dcpl); H5Pclose(dapl); H5Fclose(root); cerr << "RawWellsConverter ERROR: Fail to read dataset wells." 
<< endl; exit(1); } float factor = 65535.0 / (upper - lower); float* fPtr2 = fPtr; unsigned short* usPtr2 = usPtr; for(unsigned int i = 0; i < dsSize; ++i, ++fPtr2, ++usPtr2) { (*fPtr2) = (float)(*usPtr2) / factor + lower; } delete [] usPtr; if(bCopies) { vector<float> copies(nRows * nCols, 1.0); hid_t dsCopies = H5Dopen2(root, "wells_copies", H5P_DEFAULT); if(dsCopies < 0) { cerr << "RawWellsConverter WARNING: 1.wells files does not have wells_copies." << endl; } else { hid_t dataSpace2 = H5Dget_space(dsCopies); if(dataSpace2 < 0) { H5Dclose(dsCopies); cerr << "RawWellsConverter WARNING: fail to H5Dget_space for dataset wells_copies." << endl; } else { hssize_t dsSize2 = H5Sget_simple_extent_npoints(dataSpace2); H5Sclose(dataSpace2); if(dsSize2 != (hssize_t)(nRows * nCols)) { H5Dclose(dsCopies); cerr << "RawWellsConverter WARNING: dataset wells_copies size is " << dsSize2 << ", it is different from nRows * nCols = " << nRows * nCols << endl; } else { herr_t ret = H5Dread(dsCopies, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, &copies[0]); H5Dclose(dsCopies); if(ret < 0) { copies.resize(nRows * nCols, 1.0); cerr << "RawWellsConverter WARNING: failto load dataset wells_copies." << endl; } } } } uint64_t fptrCount = 0; uint64_t copyCount = 0; for(int row = 0; row < nRows; ++row) { for(int col = 0; col < nCols; ++col) { for(int flow = 0; flow < nFlows; ++flow) { if(copies[copyCount] > 0) { fPtr[fptrCount] *= copies[copyCount]; } else { fPtr[fptrCount] = -1.0; } ++fptrCount; } ++copyCount; } } } H5Ldelete(root, "wells", H5P_DEFAULT); hid_t dsWells2 = H5Dcreate2 (root, "wells", H5T_NATIVE_FLOAT, dataSpace, H5P_DEFAULT, dcpl, dapl); if(dsWells2 < 0) { delete [] fPtr; H5Sclose(dataSpace); H5Pclose(dcpl); H5Pclose(dapl); H5Fclose(root); cerr << "RawWellsConverter ERROR: Fail to create dataset wells." 
<< endl; exit(1); } ret = H5Dwrite(dsWells2, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, fPtr); delete [] fPtr; H5Dclose(dsWells2); H5Sclose(dataSpace); H5Pclose(dcpl); H5Pclose(dapl); H5Fclose(root); if(ret < 0) { cerr << "RawWellsConverter ERROR: Fail to write dataset wells." << endl; exit(1); } } else { cout << "RawWellsConverter: converting float wells file - " << inFile << " to unsigned short wells file - " << outFile << " with boundary (" << lower << "," << upper << ")" << endl; herr_t ret = H5Dread(dsWells, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, fPtr); H5Dclose(dsWells); if(ret < 0) { delete [] fPtr; delete [] usPtr; H5Sclose(dataSpace); H5Pclose(dcpl); H5Pclose(dapl); H5Fclose(root); cerr << "RawWellsConverter ERROR: Fail to read dataset wells." << endl; exit(1); } float factor = 65535.0 / (upper - lower); float* fPtr2 = fPtr; unsigned short* usPtr2 = usPtr; for(unsigned int i = 0; i < dsSize; ++i, ++fPtr2, ++usPtr2) { if(*fPtr2 < lower) { (*usPtr2) = 0; } else if(*fPtr2 > upper) { (*usPtr2) = 65535; } else { (*usPtr2) = (unsigned short)((*fPtr2 - lower) * factor); } } delete [] fPtr; H5Ldelete(root, "wells", H5P_DEFAULT); hid_t dsWells2 = H5Dcreate2 (root, "wells", H5T_NATIVE_USHORT, dataSpace, H5P_DEFAULT, dcpl, dapl); if(dsWells2 < 0) { delete [] usPtr; H5Sclose(dataSpace); H5Pclose(dcpl); H5Pclose(dapl); H5Fclose(root); cerr << "RawWellsConverter ERROR: Fail to create dataset wells." << endl; exit(1); } ret = H5Dwrite(dsWells2, H5T_NATIVE_USHORT, H5S_ALL, H5S_ALL, H5P_DEFAULT, usPtr); delete [] usPtr; if(dsWells2 < 0) { H5Dclose(dsWells2); H5Sclose(dataSpace); H5Pclose(dcpl); H5Pclose(dapl); H5Fclose(root); cerr << "RawWellsConverter ERROR: Fail to write dataset wells." 
<< endl; exit(1); } float lower2 = (float)lower; float upper2 = (float)upper; hsize_t dimsa[1]; dimsa[0] = 1; hid_t dataspacea = H5Screate_simple(1, dimsa, NULL); hid_t attrLower = H5Acreate(dsWells2, "convert_low", H5T_NATIVE_FLOAT, dataspacea, H5P_DEFAULT, H5P_DEFAULT ); H5Awrite(attrLower, H5T_NATIVE_FLOAT, &lower2); H5Aclose(attrLower); hid_t attrUpper = H5Acreate(dsWells2, "convert_high", H5T_NATIVE_FLOAT, dataspacea, H5P_DEFAULT, H5P_DEFAULT ); H5Awrite(attrUpper, H5T_NATIVE_FLOAT, &upper2); H5Aclose(attrUpper); H5Sclose(dataspacea); H5Dclose(dsWells2); H5Sclose(dataSpace); H5Pclose(dcpl); H5Pclose(dapl); H5Fclose(root); } return 0; }
int main(int argc, const char *argv[]) { #ifdef _DEBUG atexit(memstatus); dbgmemInit(); #endif /* _DEBUG */ printf ("%s - %s-%s (%s)\n", argv[0], IonVersion::GetVersion().c_str(), IonVersion::GetRelease().c_str(), IonVersion::GetSvnRev().c_str()); string bamInputFilename; string fastaInputFilename; string jsonOutputFilename; bool help; OptArgs opts; opts.ParseCmdLine(argc, argv); opts.GetOption(bamInputFilename, "", '-', "bam"); opts.GetOption(fastaInputFilename, "", '-', "ref"); opts.GetOption(jsonOutputFilename, "TFStats.json", '-', "output-json"); opts.GetOption(help, "false", 'h', "help"); opts.CheckNoLeftovers(); if (help || bamInputFilename.empty() || fastaInputFilename.empty()) return showHelp(); // Parse BAM header BAMReader bamReader(bamInputFilename); bamReader.open(); bam_header_t *header = (bam_header_t *)bamReader.get_header_ptr(); int numFlows = 0; string flowOrder; string key; if (header->l_text >= 3) { if (header->dict == 0) header->dict = sam_header_parse2(header->text); int nEntries = 0; char **tmp = sam_header2list(header->dict, "RG", "FO", &nEntries); if (nEntries) { flowOrder = tmp[0]; numFlows = flowOrder.length(); } if (tmp) free(tmp); nEntries = 0; tmp = sam_header2list(header->dict, "RG", "KS", &nEntries); if (nEntries) { key = tmp[0]; } if (tmp) free(tmp); } if (numFlows <= 0) { fprintf(stderr, "[TFMapper] Could not retrieve flow order from FO BAM tag. SFF-specific tags absent?\n"); exit(1); } if (key.empty()) { fprintf(stderr, "[TFMapper] Could not retrieve key sequence from KS BAM tag. 
SFF-specific tags absent?\n"); exit(1); } //printf("Retrieved flow order from bam: %s (%d)\n", flowOrder.c_str(), numFlows); //printf("Retrieved key from bam: %s\n", key.c_str()); // Retrieve test fragment sequences vector<string> referenceSequences; PopulateReferenceSequences(referenceSequences, fastaInputFilename, header->n_targets, header->target_name, string("")); // Process the BAM reads and generate metrics int numTFs = header->n_targets; vector<int> TFCount(numTFs,0); MetricGeneratorQualityHistograms metricGeneratorQualityHistograms[numTFs]; MetricGeneratorHPAccuracy metricGeneratorHPAccuracy[numTFs]; MetricGeneratorSNR metricGeneratorSNR[numTFs]; MetricGeneratorAvgIonogram metricGeneratorAvgIonogram[numTFs]; for (BAMReader::iterator i = bamReader.get_iterator(); i.good(); i.next()) { BAMRead bamRead = i.get(); int bestTF = bamRead.get_tid(); if (bestTF < 0) continue; BAMUtils bamUtil(bamRead); TFCount[bestTF]++; // Extract flowspace signal from FZ BAM tag uint16_t *bam_flowgram = NULL; uint8_t *fz = bam_aux_get(bamRead.get_bam_ptr(), "FZ"); if (fz != NULL) { if (fz[0] == (uint8_t)'B' && fz[1] == (uint8_t)'S' && *((uint32_t *)(fz+2)) == (uint32_t)numFlows) bam_flowgram = (uint16_t *)(fz+6); } if (bam_flowgram == NULL) { fprintf(stderr, "[TFMapper] Could not retrieve flow signal from FZ BAM tag. 
SFF-specific tags absent?\n"); exit(1); } // Use alignments to generate "synchronized" flowspace reference and read ionograms // TODO: Do proper flowspace alignment string genome = key + bamUtil.get_tdna(); string calls = key + bamUtil.get_qdna(); int numBases = min(genome.length(),calls.length()); vector<int> refIonogram(numFlows, 0); vector<int> readIonogram(numFlows, 0); int numFlowsRead = 0; int numFlowsRef = 0; char gC = flowOrder[0]; int gBC = 0; for (int iBase = 0; (iBase < numBases) && (numFlowsRead < numFlows) && (numFlowsRef < numFlows); iBase++) { // Conversion for reads (independent of reference) if (calls[iBase] != '-') { while ((calls[iBase] != flowOrder[numFlowsRead]) && (numFlowsRead < numFlows)) numFlowsRead++; if (numFlowsRead < numFlows) readIonogram[numFlowsRead]++; } if (genome[iBase] != '-') { if (genome[iBase] != gC) { // Since a new homopolymer begins, need to drop off the old one while ((gC != flowOrder[numFlowsRef]) && (numFlowsRef < numFlows)) { numFlowsRef++; if (numFlowsRef < numFlows) refIonogram[numFlowsRef] = 0; } if (numFlowsRef < numFlows) refIonogram[numFlowsRef] = gBC; gC = genome[iBase]; gBC = 0; } gBC++; if (genome[iBase] == calls[iBase]) numFlowsRef = numFlowsRead; } } int validFlows = min(numFlowsRef, numFlowsRead); metricGeneratorSNR[bestTF].AddElement(bam_flowgram ,key.c_str(), flowOrder); metricGeneratorAvgIonogram[bestTF].AddElement(bam_flowgram, numFlows); metricGeneratorQualityHistograms[bestTF].AddElement(bamUtil.get_phred_len(10),bamUtil.get_phred_len(17)); for (int iFlow = 0; iFlow < validFlows-20; iFlow++) metricGeneratorHPAccuracy[bestTF].AddElement(refIonogram[iFlow],readIonogram[iFlow]); } // Save stats to a json file Json::Value outputJson(Json::objectValue); for(int i = 0; i < numTFs; i++) { if (TFCount[i] < minTFCount) continue; Json::Value currentTFJson(Json::objectValue); currentTFJson["TF Name"] = header->target_name[i]; currentTFJson["TF Seq"] = referenceSequences[i]; currentTFJson["Num"] = TFCount[i]; 
currentTFJson["Top Reads"] = Json::Value(Json::arrayValue); // Obsolete metricGeneratorSNR[i].PrintSNR(currentTFJson); metricGeneratorHPAccuracy[i].PrintHPAccuracy(currentTFJson); metricGeneratorQualityHistograms[i].PrintMetrics(currentTFJson); metricGeneratorAvgIonogram[i].PrintIonograms(currentTFJson); outputJson[header->target_name[i]] = currentTFJson; } bamReader.close(); // Closing invalidates the header pointers if (!jsonOutputFilename.empty()) { ofstream out(jsonOutputFilename.c_str(), ios::out); if (out.good()) out << outputJson.toStyledString(); } return 0; }