// ValidateReaders checks that all the readers point to BAM files representing // alignments against the same set of reference sequences, and that the // sequences are identically ordered. If these checks fail the operation of // the multireader is undefined, so we force program exit. void BamMultiReader::ValidateReaders(void) const { int firstRefCount = readers.front().first->GetReferenceCount(); BamTools::RefVector firstRefData = readers.front().first->GetReferenceData(); for (vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it) { BamReader* reader = it->first; BamTools::RefVector currentRefData = reader->GetReferenceData(); BamTools::RefVector::const_iterator f = firstRefData.begin(); BamTools::RefVector::const_iterator c = currentRefData.begin(); if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) { cerr << "ERROR: mismatched number of references in " << reader->GetFilename() << " expected " << firstRefCount << " reference sequences but only found " << reader->GetReferenceCount() << endl; exit(1); } // this will be ok; we just checked above that we have identically-sized sets of references // here we simply check if they are all, in fact, equal in content while (f != firstRefData.end()) { if (f->RefName != c->RefName || f->RefLength != c->RefLength) { cerr << "ERROR: mismatched references found in " << reader->GetFilename() << " expected: " << endl; for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a) cerr << a->RefName << " " << a->RefLength << endl; cerr << "but found: " << endl; for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a) cerr << a->RefName << " " << a->RefLength << endl; exit(1); } ++f; ++c; } } }
// Takes the first entry out of the alignment cache, copies its alignment into
// 'al', and then asks SaveNextAlignment() to reload that entry's reader.
//
//   al           - destination; receives a copy of the cached alignment
//   needCharData - when true, BuildCharData() is called on the cached
//                  alignment and its Filename is set from the reader before
//                  the copy is made
//
// Returns false when there is no cache, the cache is empty, or the entry taken
// from the cache has a null reader or alignment pointer; true otherwise.
bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData) {

    // no cache, or nothing in it => nothing to hand out
    const bool cacheHasData = (m_alignmentCache != 0) && !m_alignmentCache->IsEmpty();
    if (!cacheHasData)
        return false;

    // take the front merge item and unpack it
    MergeItem nextItem = m_alignmentCache->TakeFirst();
    BamReader* const sourceReader = nextItem.Reader;
    BamAlignment* const cachedAlignment = nextItem.Alignment;
    if (sourceReader == 0 || cachedAlignment == 0)
        return false;

    // populate character data on the cached alignment only when asked to
    if (needCharData) {
        cachedAlignment->BuildCharData();
        cachedAlignment->Filename = sourceReader->GetFilename();
    }

    // hand the alignment to the caller by copy
    al = *cachedAlignment;

    // pull the next alignment from the same reader back into the cache
    SaveNextAlignment(sourceReader, cachedAlignment);
    return true;
}
// jumps to specified region(refID, leftBound) in BAM files, returns success/fail bool BamMultiReader::Jump(int refID, int position) { //if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) { CurrentRefID = refID; CurrentLeft = position; bool result = true; for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { BamReader* reader = it->first; result &= reader->Jump(refID, position); if (!result) { cerr << "ERROR: could not jump " << reader->GetFilename() << " to " << refID << ":" << position << endl; exit(1); } } if (result) UpdateAlignments(); return result; }
// makes a virtual, unified header for all the bam files in the multireader const string BamMultiReader::GetHeaderText(void) const { string mergedHeader = ""; map<string, bool> readGroups; // foreach extraction entry (each BAM file) for (vector<pair<BamReader*, BamAlignment*> >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) { BamReader* reader = rs->first; string headerText = reader->GetHeaderText(); if ( headerText.empty() ) continue; map<string, bool> currentFileReadGroups; stringstream header(headerText); vector<string> lines; string item; while (getline(header, item)) lines.push_back(item); for (vector<string>::const_iterator it = lines.begin(); it != lines.end(); ++it) { // get next line from header, skip if empty string headerLine = *it; if ( headerLine.empty() ) { continue; } // if first file, save HD & SQ entries if ( rs == readers.begin() ) { if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) { mergedHeader.append(headerLine.c_str()); mergedHeader.append(1, '\n'); } } // (for all files) append RG entries if they are unique if ( headerLine.find("@RG") == 0 ) { stringstream headerLineSs(headerLine); string part, readGroupPart, readGroup; while(std::getline(headerLineSs, part, '\t')) { stringstream partSs(part); string subtag; std::getline(partSs, subtag, ':'); if (subtag == "ID") { std::getline(partSs, readGroup, ':'); break; } } if (readGroups.find(readGroup) == readGroups.end()) { // prevents duplicate @RG entries mergedHeader.append(headerLine.c_str() ); mergedHeader.append(1, '\n'); readGroups[readGroup] = true; currentFileReadGroups[readGroup] = true; } else { // warn iff we are reading one file and discover duplicated @RG tags in the header // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) { cerr << "WARNING: duplicate @RG tag " << readGroup << " entry in header of " << reader->GetFilename() << 
endl; } } } } } // return merged header text return mergedHeader; }
void BamMultiReader::PrintFilenames(void) { for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { BamReader* reader = it->first; cout << reader->GetFilename() << endl; } }
// ValidateReaders checks that all the readers point to BAM files representing // alignments against the same set of reference sequences, and that the // sequences are identically ordered. If these checks fail the operation of // the multireader is undefined, so we force program exit. bool BamMultiReaderPrivate::ValidateReaders() const { m_errorString.clear(); // skip if 0 or 1 readers opened if (m_readers.empty() || (m_readers.size() == 1)) return true; // retrieve first reader const MergeItem& firstItem = m_readers.front(); const BamReader* firstReader = firstItem.Reader; if (firstReader == 0) return false; // retrieve first reader's header data const SamHeader& firstReaderHeader = firstReader->GetHeader(); const std::string& firstReaderSortOrder = firstReaderHeader.SortOrder; // retrieve first reader's reference data const RefVector& firstReaderRefData = firstReader->GetReferenceData(); const int firstReaderRefCount = firstReader->GetReferenceCount(); const int firstReaderRefSize = firstReaderRefData.size(); // iterate over all readers std::vector<MergeItem>::const_iterator readerIter = m_readers.begin(); std::vector<MergeItem>::const_iterator readerEnd = m_readers.end(); for (; readerIter != readerEnd; ++readerIter) { const MergeItem& item = (*readerIter); BamReader* reader = item.Reader; if (reader == 0) continue; // get current reader's header data const SamHeader& currentReaderHeader = reader->GetHeader(); const std::string& currentReaderSortOrder = currentReaderHeader.SortOrder; // check compatible sort order if (currentReaderSortOrder != firstReaderSortOrder) { const std::string message = std::string("mismatched sort order in ") + reader->GetFilename() + ", expected " + firstReaderSortOrder + ", but found " + currentReaderSortOrder; SetErrorString("BamMultiReader::ValidateReaders", message); return false; } // get current reader's reference data const RefVector currentReaderRefData = reader->GetReferenceData(); const int currentReaderRefCount = 
reader->GetReferenceCount(); const int currentReaderRefSize = currentReaderRefData.size(); // init reference data iterators RefVector::const_iterator firstRefIter = firstReaderRefData.begin(); RefVector::const_iterator firstRefEnd = firstReaderRefData.end(); RefVector::const_iterator currentRefIter = currentReaderRefData.begin(); // compare reference counts from BamReader ( & container size, in case of BR error) if ((currentReaderRefCount != firstReaderRefCount) || (firstReaderRefSize != currentReaderRefSize)) { std::stringstream s; s << "mismatched reference count in " << reader->GetFilename() << ", expected " << firstReaderRefCount << ", but found " << currentReaderRefCount; SetErrorString("BamMultiReader::ValidateReaders", s.str()); return false; } // this will be ok; we just checked above that we have identically-sized sets of references // here we simply check if they are all, in fact, equal in content while (firstRefIter != firstRefEnd) { const RefData& firstRef = (*firstRefIter); const RefData& currentRef = (*currentRefIter); // compare reference name & length if ((firstRef.RefName != currentRef.RefName) || (firstRef.RefLength != currentRef.RefLength)) { std::stringstream s; s << "mismatched references found in" << reader->GetFilename() << "expected: " << std::endl; // print first reader's reference data RefVector::const_iterator refIter = firstReaderRefData.begin(); RefVector::const_iterator refEnd = firstReaderRefData.end(); for (; refIter != refEnd; ++refIter) { const RefData& entry = (*refIter); std::stringstream s; s << entry.RefName << ' ' << std::endl; } s << "but found: " << std::endl; // print current reader's reference data refIter = currentReaderRefData.begin(); refEnd = currentReaderRefData.end(); for (; refIter != refEnd; ++refIter) { const RefData& entry = (*refIter); s << entry.RefName << ' ' << entry.RefLength << std::endl; } SetErrorString("BamMultiReader::ValidateReaders", s.str()); return false; } // update iterators ++firstRefIter; 
++currentRefIter; } } // if we get here, everything checks out return true; }
// close requested BAM files bool BamMultiReaderPrivate::CloseFiles(const std::vector<std::string>& filenames) { bool errorsEncountered = false; m_errorString.clear(); // iterate over filenames std::vector<std::string>::const_iterator filesIter = filenames.begin(); std::vector<std::string>::const_iterator filesEnd = filenames.end(); for (; filesIter != filesEnd; ++filesIter) { const std::string& filename = (*filesIter); if (filename.empty()) continue; // iterate over readers std::vector<MergeItem>::iterator readerIter = m_readers.begin(); std::vector<MergeItem>::iterator readerEnd = m_readers.end(); for (; readerIter != readerEnd; ++readerIter) { MergeItem& item = (*readerIter); BamReader* reader = item.Reader; if (reader == 0) continue; // if reader matches requested filename if (reader->GetFilename() == filename) { // remove reader's entry from alignment cache m_alignmentCache->Remove(reader); // clean up reader & its alignment if (!reader->Close()) { m_errorString.append(1, '\t'); m_errorString.append(reader->GetErrorString()); m_errorString.append(1, '\n'); errorsEncountered = true; } delete reader; reader = 0; // delete reader's alignment entry BamAlignment* alignment = item.Alignment; delete alignment; alignment = 0; // remove reader from reader list m_readers.erase(readerIter); // on match, just go on to next filename // (no need to keep looking and item iterator is invalid now anyway) break; } } } // make sure we clean up properly if all readers were closed if (m_readers.empty()) { // clean up merger if (m_alignmentCache) { m_alignmentCache->Clear(); delete m_alignmentCache; m_alignmentCache = 0; } // reset merge flags m_hasUserMergeOrder = false; m_mergeOrder = BamMultiReader::RoundRobinMerge; } // return whether all readers closed OK return !errorsEncountered; }