void TargetsManager::Initialize(const ReferenceReader& ref_reader, const string& _targets, bool _trim_ampliseq_primers /*const ExtendParameters& parameters*/) { // // Step 1. Retrieve raw target definitions // list<UnmergedTarget> raw_targets; if (not _targets.empty()) { LoadRawTargets(ref_reader, _targets, raw_targets); } else { for (int chr = 0; chr < ref_reader.chr_count(); ++chr) { raw_targets.push_back(UnmergedTarget()); UnmergedTarget& target = raw_targets.back(); target.begin = 0; target.end = ref_reader.chr_size(chr); target.chr = chr; } } // // Step 2. Sort raw targets and transfer to the vector // int num_unmerged = raw_targets.size(); vector<UnmergedTarget*> raw_sort; raw_sort.reserve(num_unmerged); for (list<UnmergedTarget>::iterator I = raw_targets.begin(); I != raw_targets.end(); ++I) raw_sort.push_back(&(*I)); sort(raw_sort.begin(), raw_sort.end(), CompareTargets); unmerged.reserve(num_unmerged); bool already_sorted = true; list<UnmergedTarget>::iterator I = raw_targets.begin(); for (int idx = 0; idx < num_unmerged; ++idx, ++I) { if (raw_sort[idx] != &(*I) and already_sorted) { already_sorted = false; cerr << "TargetsManager: BED not sorted at position " << idx; cerr << " replaced " << I->name << ":" << I->chr << ":" << I->begin << "-" << I->end; cerr << " with " << raw_sort[idx]->name << ":" << raw_sort[idx]->chr << ":" << raw_sort[idx]->begin << "-" << raw_sort[idx]->end << endl; } unmerged.push_back(*raw_sort[idx]); } // // Step 3. Merge targets and link merged/unmerged entries // merged.reserve(num_unmerged); bool already_merged = true; for (int idx = 0; idx < num_unmerged; ++idx) { if (idx and merged.back().chr == unmerged[idx].chr and merged.back().end >= unmerged[idx].begin) { merged.back().end = max(merged.back().end, unmerged[idx].end); already_merged = false; } else { merged.push_back(MergedTarget()); merged.back().chr = unmerged[idx].chr; merged.back().begin = unmerged[idx].begin; merged.back().end = unmerged[idx].end; merged.back().first_unmerged = idx; } unmerged[idx].merged = merged.size(); } if (_targets.empty()) { cout << "TargetsManager: No targets file specified, processing entire reference" << endl; } else { cout << "TargetsManager: Loaded targets file " << _targets << endl; cout << "TargetsManager: " << num_unmerged << " target(s)"; if (not already_merged) cout << " (" << merged.size() << " after merging)"; cout << endl; if (not already_sorted) cout << "TargetsManager: Targets required sorting" << endl; trim_ampliseq_primers = _trim_ampliseq_primers; if (trim_ampliseq_primers) cout << "TargetsManager: Trimming of AmpliSeq primers is enabled" << endl; } }
// open BAM input file void BAMWalkerEngine::InitializeBAMs(const ReferenceReader& ref_reader, const vector<string>& bam_filenames) { if (not bam_reader_.SetExplicitMergeOrder(BamMultiReader::MergeByCoordinate)) { cerr << "ERROR: Could not set merge order to BamMultiReader::MergeByCoordinate" << endl; exit(1); } if (not bam_reader_.Open(bam_filenames)) { cerr << "ERROR: Could not open input BAM file(s) : " << bam_reader_.GetErrorString() << endl; exit(1); } if (not bam_reader_.LocateIndexes()) { cerr << "ERROR: Could not open BAM index file(s) : " << bam_reader_.GetErrorString() << endl; exit(1); } // BAM multi reader combines the read group information of the different BAMs but does not merge comment sections bam_header_ = bam_reader_.GetHeader(); if (!bam_header_.HasReadGroups()) { cerr << "ERROR: there is no read group in BAM files specified" << endl; exit(1); } // Manually merge comment sections of BAM files if we have more than one BAM file if (bam_filenames.size() > 1) { unsigned int num_duplicates = 0; unsigned int num_merged = 0; for (unsigned int bam_idx = 0; bam_idx < bam_filenames.size(); bam_idx++) { BamReader reader; if (not reader.Open(bam_filenames.at(bam_idx))) { cerr << "TVC ERROR: Failed to open input BAM file " << reader.GetErrorString() << endl; exit(1); } SamHeader header = reader.GetHeader(); for (unsigned int i_co = 0; i_co < header.Comments.size(); i_co++) { // Step 1: Check if this comment is already part of the merged header unsigned int m_co = 0; while (m_co < bam_header_.Comments.size() and bam_header_.Comments.at(m_co) != header.Comments.at(i_co)) m_co++; if (m_co < bam_header_.Comments.size()){ num_duplicates++; continue; } // Add comment line to merged header if it is a new one num_merged++; bam_header_.Comments.push_back(header.Comments.at(i_co)); } } // Verbose what we did cout << "Merged " << num_merged << " unique comment lines into combined BAM header. Encountered " << num_duplicates << " duplicate comments." << endl; } // // Reference sequences in the bam file must match that in the fasta file // vector<RefData> referenceSequences = bam_reader_.GetReferenceData(); if ((int)referenceSequences.size() != ref_reader.chr_count()) { cerr << "ERROR: Reference in BAM file does not match fasta file" << endl << " BAM has " << referenceSequences.size() << " chromosomes while fasta has " << ref_reader.chr_count() << endl; exit(1); } for (int chr_idx = 0; chr_idx < ref_reader.chr_count(); ++chr_idx) { if (referenceSequences[chr_idx].RefName != ref_reader.chr_str(chr_idx)) { cerr << "ERROR: Reference in BAM file does not match fasta file" << endl << " Chromosome #" << (chr_idx+1) << "in BAM is " << referenceSequences[chr_idx].RefName << " while fasta has " << ref_reader.chr_str(chr_idx) << endl; exit(1); } if (referenceSequences[chr_idx].RefLength != ref_reader.chr_size(chr_idx)) { cerr << "ERROR: Reference in BAM file does not match fasta file" << endl << " Chromosome " << referenceSequences[chr_idx].RefName << "in BAM has length " << referenceSequences[chr_idx].RefLength << " while fasta has " << ref_reader.chr_size(chr_idx) << endl; exit(1); } } // // Retrieve BaseCaller and TMAP version strings from BAM header // set<string> basecaller_versions; set<string> tmap_versions; for (SamProgramIterator I = bam_header_.Programs.Begin(); I != bam_header_.Programs.End(); ++I) { if (I->ID.substr(0,2) == "bc") basecaller_versions.insert(I->Version); if (I->ID.substr(0,4) == "tmap") tmap_versions.insert(I->Version); } basecaller_version_ = ""; for (set<string>::const_iterator I = basecaller_versions.begin(); I != basecaller_versions.end(); ++I) { if (not basecaller_version_.empty()) basecaller_version_ += ", "; basecaller_version_ += *I; } tmap_version_ = ""; for (set<string>::const_iterator I = tmap_versions.begin(); I != tmap_versions.end(); ++I) { if (not tmap_version_.empty()) tmap_version_ += ", "; tmap_version_ += *I; } }
// open BAM input file void BAMWalkerEngine::InitializeBAMs(const ReferenceReader& ref_reader, const vector<string>& bam_filenames) { if (not bam_reader_.SetExplicitMergeOrder(BamMultiReader::MergeByCoordinate)) { cerr << "ERROR: Could not set merge order to BamMultiReader::MergeByCoordinate" << endl; exit(1); } if (not bam_reader_.Open(bam_filenames)) { cerr << "ERROR: Could not open input BAM file(s) : " << bam_reader_.GetErrorString(); exit(1); } if (not bam_reader_.LocateIndexes()) { cerr << "ERROR: Could not open BAM index file(s) : " << bam_reader_.GetErrorString(); exit(1); } bam_header_ = bam_reader_.GetHeader(); if (!bam_header_.HasReadGroups()) { cerr << "ERROR: there is no read group in BAM files specified" << endl; exit(1); } // // Reference sequences in the bam file must match that in the fasta file // vector<RefData> referenceSequences = bam_reader_.GetReferenceData(); if ((int)referenceSequences.size() != ref_reader.chr_count()) { cerr << "ERROR: Reference in BAM file does not match fasta file" << endl << " BAM has " << referenceSequences.size() << " chromosomes while fasta has " << ref_reader.chr_count() << endl; exit(1); } for (int chr_idx = 0; chr_idx < ref_reader.chr_count(); ++chr_idx) { if (referenceSequences[chr_idx].RefName != ref_reader.chr_str(chr_idx)) { cerr << "ERROR: Reference in BAM file does not match fasta file" << endl << " Chromosome #" << (chr_idx+1) << "in BAM is " << referenceSequences[chr_idx].RefName << " while fasta has " << ref_reader.chr_str(chr_idx) << endl; exit(1); } if (referenceSequences[chr_idx].RefLength != ref_reader.chr_size(chr_idx)) { cerr << "ERROR: Reference in BAM file does not match fasta file" << endl << " Chromosome " << referenceSequences[chr_idx].RefName << "in BAM has length " << referenceSequences[chr_idx].RefLength << " while fasta has " << ref_reader.chr_size(chr_idx) << endl; exit(1); } } // // Retrieve BaseCaller and TMAP version strings from BAM header // set<string> basecaller_versions; set<string> tmap_versions; for (SamProgramIterator I = bam_header_.Programs.Begin(); I != bam_header_.Programs.End(); ++I) { if (I->ID.substr(0,2) == "bc") basecaller_versions.insert(I->Version); if (I->ID.substr(0,4) == "tmap") tmap_versions.insert(I->Version); } basecaller_version_ = ""; for (set<string>::const_iterator I = basecaller_versions.begin(); I != basecaller_versions.end(); ++I) { if (not basecaller_version_.empty()) basecaller_version_ += ", "; basecaller_version_ += *I; } tmap_version_ = ""; for (set<string>::const_iterator I = tmap_versions.begin(); I != tmap_versions.end(); ++I) { if (not tmap_version_.empty()) tmap_version_ += ", "; tmap_version_ += *I; } }