// Flushes the remaining regions, closes the BAM writers and stores the final
// statistics in the datasets JSON.
//
// Steps, in order:
//   1. Every region not yet written is passed to PhysicalWriteRegion() and its
//      dropbox is cleared.
//   2. For every dataset an existing BAM writer is closed and deleted; the
//      dataset's "read_count" is stored in its JSON entry.
//   3. For every read group of the dataset, read/base/Q20 counts are stored in
//      the read group JSON. Read groups that have a "barcode_sequence" entry
//      additionally receive the barcode filtering counters, the per-flow
//      average barcode bias and the barcode distance / error histograms.
//   4. All per-read-group statistics are merged into combined_stats_ and the
//      averages are computed.
//
// datasets          Dataset / read group JSON container that is updated in place.
// dataset_nickname  Label used as prefix for console output; if empty, nothing
//                   is printed.
void OrderedDatasetWriter::Close(BarcodeDatasets& datasets, const string& dataset_nickname)
{
  const bool verbose = !dataset_nickname.empty();

  // Write out whatever regions are still pending, strictly in region order.
  for (; num_regions_written_ < num_regions_; num_regions_written_++) {
    PhysicalWriteRegion(num_regions_written_);
    region_dropbox_[num_regions_written_].clear();
  }

  for (int ds = 0; ds < num_datasets_; ++ds) {

    if (bam_writer_[ds]) {
      if (verbose)
        printf("%s: Generated %s with %d reads\n", dataset_nickname.c_str(), bam_filename_[ds].c_str(), num_reads_[ds]);
      bam_writer_[ds]->Close();
      delete bam_writer_[ds];
      // Reset the slot so that the non-null check above can never see a freed
      // pointer again (e.g. if Close() is invoked a second time).
      bam_writer_[ds] = NULL;
    } else {
      if (verbose)
        printf("%s: No reads for %s\n", dataset_nickname.c_str(), bam_filename_[ds].c_str());
    }

    datasets.dataset(ds)["read_count"] = num_reads_[ds];

    Json::Value& dataset_read_groups = datasets.dataset(ds)["read_groups"];
    for (Json::Value::iterator rg = dataset_read_groups.begin(); rg != dataset_read_groups.end(); ++rg) {

      const string read_group_name = (*rg).asString();
      Json::Value& read_group_json = datasets.read_groups()[read_group_name];
      const int rg_index = datasets.read_group_name_to_id(read_group_name);

      read_group_json["read_count"]  = static_cast<Json::UInt64>(read_group_stats_.at(rg_index).num_reads_final_);
      read_group_json["total_bases"] = static_cast<Json::UInt64>(read_group_stats_.at(rg_index).num_bases_final_);
      read_group_json["Q20_bases"]   = static_cast<Json::UInt64>(read_group_num_Q20_bases_[rg_index]);

      // Log barcode statistics only for barcode read groups
      if (!read_group_json.isMember("barcode_sequence"))
        continue;

      read_group_json["barcode_match_filtered"]   = static_cast<Json::UInt64>(read_group_barcode_filt_zero_err_[rg_index]);
      read_group_json["barcode_adapter_filtered"] = static_cast<Json::UInt64>(read_group_barcode_adapter_rejected_[rg_index]);

      // Average the accumulated bias over the final reads; the denominator is
      // clamped to 1 so that a read group without reads does not divide by zero.
      const int64_t reads_for_average = max(read_group_stats_.at(rg_index).num_reads_final_, (int64_t)1);
      for (unsigned int iflow = 0; iflow < read_group_barcode_bias_[rg_index].size(); iflow++) {
        Json::Value av_bias_json(read_group_barcode_bias_[rg_index].at(iflow) / reads_for_average);
        read_group_json["barcode_bias"][iflow] = av_bias_json;
      }

      for (unsigned int ibin = 0; ibin < read_group_barcode_distance_hist_[rg_index].size(); ibin++)
        read_group_json["barcode_distance_hist"][ibin] = static_cast<Json::UInt64>(read_group_barcode_distance_hist_[rg_index].at(ibin));

      for (unsigned int ierr = 0; ierr < read_group_num_barcode_errors_[rg_index].size(); ierr++)
        read_group_json["barcode_errors_hist"][ierr] = static_cast<Json::UInt64>(read_group_num_barcode_errors_[rg_index][ierr]);
    }
  }

  // Aggregate the per-read-group accounting into one summary.
  for (int rg = 0; rg < num_read_groups_; ++rg)
    combined_stats_.MergeFrom(read_group_stats_.at(rg));
  combined_stats_.ComputeAverages();

  if (verbose)
    combined_stats_.PrettyPrint(dataset_nickname);
}
// Flushes the remaining regions, closes the BAM writers and stores the final
// statistics in the datasets JSON.
//
// Every region not yet written is passed to PhysicalWriteRegion() and its
// dropbox is cleared. Then, per dataset, an existing BAM writer is closed and
// deleted, the dataset's "read_count" is stored, and for each of its read
// groups the read/base/Q20 counts and the barcode error histogram are stored
// in the read group JSON. Finally all per-read-group statistics are merged
// into combined_stats_.
//
// datasets          Dataset / read group JSON container that is updated in place.
// dataset_nickname  Label used as prefix for console output; if empty, nothing
//                   is printed.
void OrderedDatasetWriter::Close(BarcodeDatasets& datasets, const string& dataset_nickname)
{
  const bool verbose = !dataset_nickname.empty();

  // Write out whatever regions are still pending, strictly in region order.
  for (; num_regions_written_ < num_regions_; num_regions_written_++) {
    PhysicalWriteRegion(num_regions_written_);
    region_dropbox_[num_regions_written_].clear();
  }

  for (int ds = 0; ds < num_datasets_; ++ds) {

    if (bam_writer_[ds]) {
      bam_writer_[ds]->Close();
      delete bam_writer_[ds];
      // Reset the slot so that the non-null check above can never see a freed
      // pointer again (e.g. if Close() is invoked a second time).
      bam_writer_[ds] = NULL;
    }

    datasets.dataset(ds)["read_count"] = num_reads_[ds];

    Json::Value& dataset_read_groups = datasets.dataset(ds)["read_groups"];
    for (Json::Value::iterator rg = dataset_read_groups.begin(); rg != dataset_read_groups.end(); ++rg) {

      const string read_group_name = (*rg).asString();
      Json::Value& read_group_json = datasets.read_groups()[read_group_name];
      const int rg_index = datasets.read_group_name_to_id(read_group_name);

      read_group_json["read_count"]  = static_cast<Json::UInt64>(read_group_stats_[rg_index].num_reads_final_);
      read_group_json["total_bases"] = static_cast<Json::UInt64>(read_group_stats_[rg_index].num_bases_final_);
      read_group_json["Q20_bases"]   = static_cast<Json::UInt64>(read_group_num_Q20_bases_[rg_index]);

      // Emit one histogram entry per stored bin instead of assuming a fixed
      // number of bins, so the indices always stay within the vector.
      for (unsigned int ierr = 0; ierr < read_group_num_barcode_errors_[rg_index].size(); ierr++)
        read_group_json["barcode_errors_hist"][ierr] = static_cast<Json::UInt64>(read_group_num_barcode_errors_[rg_index][ierr]);
    }

    if (verbose)
      printf("%s: Generated %s with %d reads\n", dataset_nickname.c_str(), bam_filename_[ds].c_str(), num_reads_[ds]);
  }

  // Aggregate the per-read-group accounting into one summary.
  for (int rg = 0; rg < num_read_groups_; ++rg)
    combined_stats_.MergeFrom(read_group_stats_[rg]);

  if (verbose)
    combined_stats_.PrettyPrint(dataset_nickname);
}
void OrderedDatasetWriter::Open(const string& base_directory, BarcodeDatasets& datasets, int num_regions, const ion::FlowOrder& flow_order, const string& key, const string& basecaller_name, const string& basecalller_version, const string& basecaller_command_line, const string& production_date, const string& platform_unit, bool save_filtered_reads) { num_regions_ = num_regions; num_regions_written_ = 0; region_ready_.assign(num_regions_+1,false); region_dropbox_.clear(); region_dropbox_.resize(num_regions_); qv_histogram_.assign(50,0); num_datasets_ = datasets.num_datasets(); num_barcodes_ = datasets.num_barcodes(); num_read_groups_ = datasets.num_read_groups(); num_reads_.resize(num_datasets_,0); bam_filename_.resize(num_datasets_); save_filtered_reads_ = save_filtered_reads; read_group_name_.resize(num_read_groups_); read_group_dataset_.assign(num_read_groups_, -1); read_group_num_Q20_bases_.assign(num_read_groups_,0); read_group_num_barcode_errors_.resize(num_read_groups_); for (int rg = 0; rg < num_read_groups_; ++rg) { read_group_name_[rg] = datasets.read_group_name(rg); read_group_num_barcode_errors_[rg].assign(3,0); } // New filtering and trimming accounting (per read group) read_group_stats_.resize(num_read_groups_); bam_writer_.resize(num_datasets_, NULL); for (int ds = 0; ds < num_datasets_; ++ds) { // Set up BAM header bam_filename_[ds] = base_directory + "/" + datasets.dataset(ds)["basecaller_bam"].asString(); SamHeader sam_header; sam_header.Version = "1.4"; sam_header.SortOrder = "unsorted"; SamProgram sam_program("bc"); sam_program.Name = basecaller_name; sam_program.Version = basecalller_version; sam_program.CommandLine = basecaller_command_line; sam_header.Programs.Add(sam_program); for (Json::Value::iterator rg = datasets.dataset(ds)["read_groups"].begin(); rg != datasets.dataset(ds)["read_groups"].end(); ++rg) { string read_group_name = (*rg).asString(); Json::Value& read_group_json = datasets.read_groups()[read_group_name]; 
read_group_dataset_[datasets.read_group_name_to_id(read_group_name)] = ds; SamReadGroup read_group (read_group_name); read_group.FlowOrder = flow_order.full_nucs(); read_group.KeySequence = key; read_group.KeySequence += read_group_json.get("barcode_sequence","").asString(); read_group.KeySequence += read_group_json.get("barcode_adapter","").asString(); read_group.ProductionDate = production_date; read_group.Sample = read_group_json.get("sample","").asString(); read_group.Library = read_group_json.get("library","").asString(); read_group.Description = read_group_json.get("description","").asString(); read_group.PlatformUnit = read_group_json.get("platform_unit","").asString(); read_group.SequencingCenter = datasets.json().get("sequencing_center","").asString(); read_group.SequencingTechnology = "IONTORRENT"; sam_header.ReadGroups.Add(read_group); } // Open Bam for writing RefVector empty_reference_vector; bam_writer_[ds] = new BamWriter(); bam_writer_[ds]->SetCompressionMode(BamWriter::Compressed); //bam_writer_[ds]->SetCompressionMode(BamWriter::Uncompressed); bam_writer_[ds]->Open(bam_filename_[ds], sam_header, empty_reference_vector); } }
void OrderedDatasetWriter::Open(const string& base_directory, BarcodeDatasets& datasets, int read_class_idx, int num_regions, const ion::FlowOrder& flow_order, const string& key, const vector<string> & bead_adapters, int num_bamwriter_threads, const Json::Value & basecaller_json, vector<string>& comments, MolecularTagTrimmer& tag_trimmer, bool trim_barcodes) { num_regions_ = num_regions; num_regions_written_ = 0; region_ready_.assign(num_regions_+1,false); region_dropbox_.clear(); region_dropbox_.resize(num_regions_); qv_histogram_.assign(50,0); num_datasets_ = datasets.num_datasets(); num_barcodes_ = datasets.num_barcodes(); num_read_groups_ = datasets.num_read_groups(); num_reads_.resize(num_datasets_,0); bam_filename_.resize(num_datasets_); // A negative read group index indicates untrimmed/unfiltered bam files (w. library key) and we save all reads if (read_class_idx < 0) { save_filtered_reads_ = true; read_class_idx = 0; } else save_filtered_reads_ = false; read_group_name_.resize(num_read_groups_); read_group_dataset_.assign(num_read_groups_, -1); read_group_num_Q20_bases_.assign(num_read_groups_,0); read_group_barcode_filt_zero_err_.assign(num_read_groups_, 0); read_group_barcode_adapter_rejected_.assign(num_read_groups_, 0); read_group_num_barcode_errors_.resize(num_read_groups_); read_group_barcode_distance_hist_.resize(num_read_groups_); read_group_barcode_bias_.resize(num_read_groups_); for (int rg = 0; rg < num_read_groups_; ++rg) { read_group_name_[rg] = datasets.read_group_name(rg); read_group_num_barcode_errors_[rg].assign(3,0); read_group_barcode_bias_[rg].assign(datasets.GetBCmaxFlows(),0.0); read_group_barcode_distance_hist_[rg].assign(5,0); } // New filtering and trimming accounting (per read group) read_group_stats_.resize(num_read_groups_); for (int rg=0; rg<num_read_groups_; rg++) read_group_stats_[rg].SetBeadAdapters(bead_adapters); combined_stats_.SetBeadAdapters(bead_adapters); bam_writer_.resize(num_datasets_, NULL); 
sam_header_.resize(num_datasets_); num_bamwriter_threads_ = num_bamwriter_threads; for (int ds = 0; ds < num_datasets_; ++ds) { // Set up BAM header bam_filename_[ds] = base_directory + "/" + datasets.dataset(ds)["basecaller_bam"].asString(); SamHeader& sam_header = sam_header_[ds]; sam_header.Version = "1.4"; sam_header.SortOrder = "unsorted"; SamProgram sam_program("bc"); sam_program.Name = "BaseCaller"; sam_program.Version = basecaller_json["BaseCaller"]["version"].asString() + "/" + basecaller_json["BaseCaller"]["git_hash"].asString(); sam_program.CommandLine = basecaller_json["BaseCaller"]["command_line"].asString(); sam_header.Programs.Add(sam_program); for (Json::Value::iterator rg = datasets.dataset(ds)["read_groups"].begin(); rg != datasets.dataset(ds)["read_groups"].end(); ++rg) { string read_group_name = (*rg).asString(); Json::Value& read_group_json = datasets.read_groups()[read_group_name]; read_group_dataset_[datasets.read_group_name_to_id(read_group_name)] = ds; SamReadGroup read_group (read_group_name); read_group.FlowOrder = flow_order.full_nucs(); read_group.KeySequence = key; if (trim_barcodes){ // We only add the barcode info to the key sequence if we hard clipped it read_group.KeySequence += read_group_json.get("barcode_sequence","").asString(); read_group.KeySequence += read_group_json.get("barcode_adapter","").asString(); } read_group.ProductionDate = basecaller_json["BaseCaller"]["start_time"].asString(); read_group.Sample = read_group_json.get("sample","").asString(); read_group.Library = read_group_json.get("library","").asString(); read_group.Description = read_group_json.get("description","").asString(); read_group.PlatformUnit = read_group_json.get("platform_unit","").asString(); read_group.SequencingCenter = datasets.json().get("sequencing_center","").asString(); read_group.SequencingTechnology = "IONTORRENT"; // Add custom tags: Structure of tags per read group XXX if (datasets.IsLibraryDataset()) { MolTag my_tags = 
tag_trimmer.GetReadGroupTags(read_group_name); AddCustomReadGroupTag(read_group, "zt", my_tags.prefix_mol_tag); AddCustomReadGroupTag(read_group, "yt", my_tags.suffix_mol_tag); } sam_header.ReadGroups.Add(read_group); } for(size_t i = 0; i < comments.size(); ++i) sam_header.Comments.push_back(comments[i]); } }