void OrderedDatasetWriter::Open(const string& base_directory, BarcodeDatasets& datasets, int num_regions, const ion::FlowOrder& flow_order, const string& key, const string& basecaller_name, const string& basecalller_version, const string& basecaller_command_line, const string& production_date, const string& platform_unit, bool save_filtered_reads) { num_regions_ = num_regions; num_regions_written_ = 0; region_ready_.assign(num_regions_+1,false); region_dropbox_.clear(); region_dropbox_.resize(num_regions_); qv_histogram_.assign(50,0); num_datasets_ = datasets.num_datasets(); num_barcodes_ = datasets.num_barcodes(); num_read_groups_ = datasets.num_read_groups(); num_reads_.resize(num_datasets_,0); bam_filename_.resize(num_datasets_); save_filtered_reads_ = save_filtered_reads; read_group_name_.resize(num_read_groups_); read_group_dataset_.assign(num_read_groups_, -1); read_group_num_Q20_bases_.assign(num_read_groups_,0); read_group_num_barcode_errors_.resize(num_read_groups_); for (int rg = 0; rg < num_read_groups_; ++rg) { read_group_name_[rg] = datasets.read_group_name(rg); read_group_num_barcode_errors_[rg].assign(3,0); } // New filtering and trimming accounting (per read group) read_group_stats_.resize(num_read_groups_); bam_writer_.resize(num_datasets_, NULL); for (int ds = 0; ds < num_datasets_; ++ds) { // Set up BAM header bam_filename_[ds] = base_directory + "/" + datasets.dataset(ds)["basecaller_bam"].asString(); SamHeader sam_header; sam_header.Version = "1.4"; sam_header.SortOrder = "unsorted"; SamProgram sam_program("bc"); sam_program.Name = basecaller_name; sam_program.Version = basecalller_version; sam_program.CommandLine = basecaller_command_line; sam_header.Programs.Add(sam_program); for (Json::Value::iterator rg = datasets.dataset(ds)["read_groups"].begin(); rg != datasets.dataset(ds)["read_groups"].end(); ++rg) { string read_group_name = (*rg).asString(); Json::Value& read_group_json = datasets.read_groups()[read_group_name]; read_group_dataset_[datasets.read_group_name_to_id(read_group_name)] = ds; SamReadGroup read_group (read_group_name); read_group.FlowOrder = flow_order.full_nucs(); read_group.KeySequence = key; read_group.KeySequence += read_group_json.get("barcode_sequence","").asString(); read_group.KeySequence += read_group_json.get("barcode_adapter","").asString(); read_group.ProductionDate = production_date; read_group.Sample = read_group_json.get("sample","").asString(); read_group.Library = read_group_json.get("library","").asString(); read_group.Description = read_group_json.get("description","").asString(); read_group.PlatformUnit = read_group_json.get("platform_unit","").asString(); read_group.SequencingCenter = datasets.json().get("sequencing_center","").asString(); read_group.SequencingTechnology = "IONTORRENT"; sam_header.ReadGroups.Add(read_group); } // Open Bam for writing RefVector empty_reference_vector; bam_writer_[ds] = new BamWriter(); bam_writer_[ds]->SetCompressionMode(BamWriter::Compressed); //bam_writer_[ds]->SetCompressionMode(BamWriter::Uncompressed); bam_writer_[ds]->Open(bam_filename_[ds], sam_header, empty_reference_vector); } }
void OrderedDatasetWriter::Open(const string& base_directory, BarcodeDatasets& datasets, int read_class_idx, int num_regions, const ion::FlowOrder& flow_order, const string& key, const vector<string> & bead_adapters, int num_bamwriter_threads, const Json::Value & basecaller_json, vector<string>& comments, MolecularTagTrimmer& tag_trimmer, bool trim_barcodes) { num_regions_ = num_regions; num_regions_written_ = 0; region_ready_.assign(num_regions_+1,false); region_dropbox_.clear(); region_dropbox_.resize(num_regions_); qv_histogram_.assign(50,0); num_datasets_ = datasets.num_datasets(); num_barcodes_ = datasets.num_barcodes(); num_read_groups_ = datasets.num_read_groups(); num_reads_.resize(num_datasets_,0); bam_filename_.resize(num_datasets_); // A negative read group index indicates untrimmed/unfiltered bam files (w. library key) and we save all reads if (read_class_idx < 0) { save_filtered_reads_ = true; read_class_idx = 0; } else save_filtered_reads_ = false; read_group_name_.resize(num_read_groups_); read_group_dataset_.assign(num_read_groups_, -1); read_group_num_Q20_bases_.assign(num_read_groups_,0); read_group_barcode_filt_zero_err_.assign(num_read_groups_, 0); read_group_barcode_adapter_rejected_.assign(num_read_groups_, 0); read_group_num_barcode_errors_.resize(num_read_groups_); read_group_barcode_distance_hist_.resize(num_read_groups_); read_group_barcode_bias_.resize(num_read_groups_); for (int rg = 0; rg < num_read_groups_; ++rg) { read_group_name_[rg] = datasets.read_group_name(rg); read_group_num_barcode_errors_[rg].assign(3,0); read_group_barcode_bias_[rg].assign(datasets.GetBCmaxFlows(),0.0); read_group_barcode_distance_hist_[rg].assign(5,0); } // New filtering and trimming accounting (per read group) read_group_stats_.resize(num_read_groups_); for (int rg=0; rg<num_read_groups_; rg++) read_group_stats_[rg].SetBeadAdapters(bead_adapters); combined_stats_.SetBeadAdapters(bead_adapters); bam_writer_.resize(num_datasets_, NULL); sam_header_.resize(num_datasets_); num_bamwriter_threads_ = num_bamwriter_threads; for (int ds = 0; ds < num_datasets_; ++ds) { // Set up BAM header bam_filename_[ds] = base_directory + "/" + datasets.dataset(ds)["basecaller_bam"].asString(); SamHeader& sam_header = sam_header_[ds]; sam_header.Version = "1.4"; sam_header.SortOrder = "unsorted"; SamProgram sam_program("bc"); sam_program.Name = "BaseCaller"; sam_program.Version = basecaller_json["BaseCaller"]["version"].asString() + "/" + basecaller_json["BaseCaller"]["git_hash"].asString(); sam_program.CommandLine = basecaller_json["BaseCaller"]["command_line"].asString(); sam_header.Programs.Add(sam_program); for (Json::Value::iterator rg = datasets.dataset(ds)["read_groups"].begin(); rg != datasets.dataset(ds)["read_groups"].end(); ++rg) { string read_group_name = (*rg).asString(); Json::Value& read_group_json = datasets.read_groups()[read_group_name]; read_group_dataset_[datasets.read_group_name_to_id(read_group_name)] = ds; SamReadGroup read_group (read_group_name); read_group.FlowOrder = flow_order.full_nucs(); read_group.KeySequence = key; if (trim_barcodes){ // We only add the barcode info to the key sequence if we hard clipped it read_group.KeySequence += read_group_json.get("barcode_sequence","").asString(); read_group.KeySequence += read_group_json.get("barcode_adapter","").asString(); } read_group.ProductionDate = basecaller_json["BaseCaller"]["start_time"].asString(); read_group.Sample = read_group_json.get("sample","").asString(); read_group.Library = read_group_json.get("library","").asString(); read_group.Description = read_group_json.get("description","").asString(); read_group.PlatformUnit = read_group_json.get("platform_unit","").asString(); read_group.SequencingCenter = datasets.json().get("sequencing_center","").asString(); read_group.SequencingTechnology = "IONTORRENT"; // Add custom tags: Structure of tags per read group XXX if (datasets.IsLibraryDataset()) { MolTag my_tags = tag_trimmer.GetReadGroupTags(read_group_name); AddCustomReadGroupTag(read_group, "zt", my_tags.prefix_mol_tag); AddCustomReadGroupTag(read_group, "yt", my_tags.suffix_mol_tag); } sam_header.ReadGroups.Add(read_group); } for(size_t i = 0; i < comments.size(); ++i) sam_header.Comments.push_back(comments[i]); } }