Beispiel #1
0
// ValidateReaders checks that all the readers point to BAM files representing
// alignments against the same set of reference sequences, and that the
// sequences are identically ordered.  If these checks fail the operation of
// the multireader is undefined, so we force program exit.
void BamMultiReader::ValidateReaders(void) const {
    int firstRefCount = readers.front().first->GetReferenceCount();
    BamTools::RefVector firstRefData = readers.front().first->GetReferenceData();
    for (vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it) {
        BamReader* reader = it->first;
        BamTools::RefVector currentRefData = reader->GetReferenceData();
        BamTools::RefVector::const_iterator f = firstRefData.begin();
        BamTools::RefVector::const_iterator c = currentRefData.begin();
        if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) {
            cerr << "ERROR: mismatched number of references in " << reader->GetFilename()
                      << " expected " << firstRefCount 
                      << " reference sequences but only found " << reader->GetReferenceCount() << endl;
            exit(1);
        }
        // this will be ok; we just checked above that we have identically-sized sets of references
        // here we simply check if they are all, in fact, equal in content
        while (f != firstRefData.end()) {
            if (f->RefName != c->RefName || f->RefLength != c->RefLength) {
                cerr << "ERROR: mismatched references found in " << reader->GetFilename()
                          << " expected: " << endl;
                for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a)
                    cerr << a->RefName << " " << a->RefLength << endl;
                cerr << "but found: " << endl;
                for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a)
                    cerr << a->RefName << " " << a->RefLength << endl;
                exit(1);
            }
            ++f; ++c;
        }
    }
}
Beispiel #2
0
// Pops the next alignment from the alignment cache into 'al' and asks the
// originating reader to refill the cache.
//
// al           - destination; receives a copy of the cached alignment
// needCharData - when true, the alignment's character data is built and its
//                Filename is set from the reader before the copy is made
//
// Returns false if the cache is missing or empty, or if the popped entry has
// a null reader or alignment; returns true otherwise.
bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData)
{
    // bail out when there is no cache or it holds no entries
    const bool cacheHasEntries = (m_alignmentCache != 0) && !m_alignmentCache->IsEmpty();
    if (!cacheHasEntries)
        return false;

    // take the front merge item out of the cache
    MergeItem front = m_alignmentCache->TakeFirst();
    BamReader* const sourceReader = front.Reader;
    BamAlignment* const cached = front.Alignment;
    if (!sourceReader || !cached)
        return false;

    // populate character data & originating filename only on request
    if (needCharData) {
        cached->BuildCharData();
        cached->Filename = sourceReader->GetFilename();
    }

    // hand the alignment to the caller by copy
    al = *cached;

    // fetch the reader's next alignment and put it back into the cache
    SaveNextAlignment(sourceReader, cached);
    return true;
}
Beispiel #3
0
// jumps to specified region(refID, leftBound) in BAM files, returns success/fail
bool BamMultiReader::Jump(int refID, int position) {

    //if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {
    CurrentRefID = refID;
    CurrentLeft  = position;

    bool result = true;
    for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
        BamReader* reader = it->first;
        result &= reader->Jump(refID, position);
        if (!result) {
            cerr << "ERROR: could not jump " << reader->GetFilename() << " to " << refID << ":" << position << endl;
            exit(1);
        }
    }
    if (result) UpdateAlignments();
    return result;
}
Beispiel #4
0
// makes a virtual, unified header for all the bam files in the multireader
//
// The merged text is built as follows:
//   - @HD and @SQ lines are taken from the first reader only (if that
//     reader's header text is empty, no @HD/@SQ lines are emitted);
//   - @RG lines are taken from every reader, but each read group ID is
//     emitted only once, the first time it is encountered;
//   - all other header lines are dropped.
// A warning is written to cerr when the same @RG ID occurs more than once
// within a single file's header.  Identical IDs in different files are
// merged silently.
const string BamMultiReader::GetHeaderText(void) const {

    string mergedHeader = "";
    map<string, bool> readGroups;   // read group IDs already written to mergedHeader

    // foreach extraction entry (each BAM file)
    for (vector<pair<BamReader*, BamAlignment*> >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) {

        BamReader* reader = rs->first;
        const string headerText = reader->GetHeaderText();
        if ( headerText.empty() ) continue;

        const bool isFirstFile = ( rs == readers.begin() );
        map<string, bool> currentFileReadGroups;   // read group IDs seen in this file

        stringstream header(headerText);
        string headerLine;
        while ( std::getline(header, headerLine) ) {

            // skip empty lines
            if ( headerLine.empty() ) continue;

            // if first file, save HD & SQ entries
            if ( isFirstFile &&
                 ( headerLine.compare(0, 3, "@HD") == 0 || headerLine.compare(0, 3, "@SQ") == 0 ) ) {
                mergedHeader.append(headerLine);
                mergedHeader.append(1, '\n');
            }

            // everything below only concerns RG entries
            if ( headerLine.compare(0, 3, "@RG") != 0 ) continue;

            // extract the read group ID: the full value of the tab-separated
            // "ID:" field.  The value is NOT cut at a further ':' so that IDs
            // containing colons stay distinct from one another.
            string readGroup;
            stringstream headerLineSs(headerLine);
            string part;
            while ( std::getline(headerLineSs, part, '\t') ) {
                if ( part.compare(0, 3, "ID:") == 0 ) {
                    readGroup = part.substr(3);
                    break;
                }
            }

            // warn iff this very file repeats an @RG ID; we emit no warning across
            // files, as we might be merging multiple BAM files with identical @RG tags
            if ( currentFileReadGroups.find(readGroup) != currentFileReadGroups.end() ) {
                cerr << "WARNING: duplicate @RG tag " << readGroup 
                    << " entry in header of " << reader->GetFilename() << endl;
                continue;
            }
            currentFileReadGroups[readGroup] = true;

            // (for all files) append RG entries if they are unique
            if ( readGroups.find(readGroup) == readGroups.end() ) {
                mergedHeader.append(headerLine);
                mergedHeader.append(1, '\n');
                readGroups[readGroup] = true;
            }
        }
    }

    // return merged header text
    return mergedHeader;
}
Beispiel #5
0
void BamMultiReader::PrintFilenames(void) {
    for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
        BamReader* reader = it->first;
        cout << reader->GetFilename() << endl;
    }
}
Beispiel #6
0
// ValidateReaders checks that all the readers point to BAM files representing
// alignments against the same set of reference sequences, and that the
// sequences are identically ordered.  If these checks fail the operation of
// the multireader is undefined, so we force program exit.
bool BamMultiReaderPrivate::ValidateReaders() const
{

    m_errorString.clear();

    // skip if 0 or 1 readers opened
    if (m_readers.empty() || (m_readers.size() == 1)) return true;

    // retrieve first reader
    const MergeItem& firstItem = m_readers.front();
    const BamReader* firstReader = firstItem.Reader;
    if (firstReader == 0) return false;

    // retrieve first reader's header data
    const SamHeader& firstReaderHeader = firstReader->GetHeader();
    const std::string& firstReaderSortOrder = firstReaderHeader.SortOrder;

    // retrieve first reader's reference data
    const RefVector& firstReaderRefData = firstReader->GetReferenceData();
    const int firstReaderRefCount = firstReader->GetReferenceCount();
    const int firstReaderRefSize = firstReaderRefData.size();

    // iterate over all readers
    std::vector<MergeItem>::const_iterator readerIter = m_readers.begin();
    std::vector<MergeItem>::const_iterator readerEnd = m_readers.end();
    for (; readerIter != readerEnd; ++readerIter) {
        const MergeItem& item = (*readerIter);
        BamReader* reader = item.Reader;
        if (reader == 0) continue;

        // get current reader's header data
        const SamHeader& currentReaderHeader = reader->GetHeader();
        const std::string& currentReaderSortOrder = currentReaderHeader.SortOrder;

        // check compatible sort order
        if (currentReaderSortOrder != firstReaderSortOrder) {
            const std::string message =
                std::string("mismatched sort order in ") + reader->GetFilename() + ", expected " +
                firstReaderSortOrder + ", but found " + currentReaderSortOrder;
            SetErrorString("BamMultiReader::ValidateReaders", message);
            return false;
        }

        // get current reader's reference data
        const RefVector currentReaderRefData = reader->GetReferenceData();
        const int currentReaderRefCount = reader->GetReferenceCount();
        const int currentReaderRefSize = currentReaderRefData.size();

        // init reference data iterators
        RefVector::const_iterator firstRefIter = firstReaderRefData.begin();
        RefVector::const_iterator firstRefEnd = firstReaderRefData.end();
        RefVector::const_iterator currentRefIter = currentReaderRefData.begin();

        // compare reference counts from BamReader ( & container size, in case of BR error)
        if ((currentReaderRefCount != firstReaderRefCount) ||
            (firstReaderRefSize != currentReaderRefSize)) {
            std::stringstream s;
            s << "mismatched reference count in " << reader->GetFilename() << ", expected "
              << firstReaderRefCount << ", but found " << currentReaderRefCount;
            SetErrorString("BamMultiReader::ValidateReaders", s.str());
            return false;
        }

        // this will be ok; we just checked above that we have identically-sized sets of references
        // here we simply check if they are all, in fact, equal in content
        while (firstRefIter != firstRefEnd) {
            const RefData& firstRef = (*firstRefIter);
            const RefData& currentRef = (*currentRefIter);

            // compare reference name & length
            if ((firstRef.RefName != currentRef.RefName) ||
                (firstRef.RefLength != currentRef.RefLength)) {
                std::stringstream s;
                s << "mismatched references found in" << reader->GetFilename()
                  << "expected: " << std::endl;

                // print first reader's reference data
                RefVector::const_iterator refIter = firstReaderRefData.begin();
                RefVector::const_iterator refEnd = firstReaderRefData.end();
                for (; refIter != refEnd; ++refIter) {
                    const RefData& entry = (*refIter);
                    std::stringstream s;
                    s << entry.RefName << ' ' << std::endl;
                }

                s << "but found: " << std::endl;

                // print current reader's reference data
                refIter = currentReaderRefData.begin();
                refEnd = currentReaderRefData.end();
                for (; refIter != refEnd; ++refIter) {
                    const RefData& entry = (*refIter);
                    s << entry.RefName << ' ' << entry.RefLength << std::endl;
                }

                SetErrorString("BamMultiReader::ValidateReaders", s.str());
                return false;
            }

            // update iterators
            ++firstRefIter;
            ++currentRefIter;
        }
    }

    // if we get here, everything checks out
    return true;
}
Beispiel #7
0
// close requested BAM files
//
// For each non-empty name in 'filenames', the first reader whose filename
// matches is removed from the alignment cache, closed, deleted together with
// its alignment, and erased from the reader list.  Names that match no reader
// are ignored.  If no readers remain afterwards, the alignment cache is
// destroyed and the merge settings are reset.
//
// Returns true if every Close() call succeeded.  Otherwise returns false and
// m_errorString holds one tab-indented, newline-terminated line per failure.
bool BamMultiReaderPrivate::CloseFiles(const std::vector<std::string>& filenames)
{

    bool errorsEncountered = false;
    m_errorString.clear();

    // iterate over filenames
    std::vector<std::string>::const_iterator filesIter = filenames.begin();
    std::vector<std::string>::const_iterator filesEnd = filenames.end();
    for (; filesIter != filesEnd; ++filesIter) {
        const std::string& filename = (*filesIter);
        if (filename.empty()) continue;

        // iterate over readers
        std::vector<MergeItem>::iterator readerIter = m_readers.begin();
        std::vector<MergeItem>::iterator readerEnd = m_readers.end();
        for (; readerIter != readerEnd; ++readerIter) {
            MergeItem& item = (*readerIter);
            BamReader* reader = item.Reader;
            if (reader == 0) continue;

            // only act on the reader that matches the requested filename
            if (reader->GetFilename() != filename) continue;

            // remove reader's entry from alignment cache; the cache is
            // null-checked here just as in the cleanup step at the end
            if (m_alignmentCache) m_alignmentCache->Remove(reader);

            // close the reader, recording (not aborting on) any failure
            if (!reader->Close()) {
                m_errorString.append(1, '\t');
                m_errorString.append(reader->GetErrorString());
                m_errorString.append(1, '\n');
                errorsEncountered = true;
            }

            // release the reader & its alignment entry
            delete reader;
            delete item.Alignment;

            // remove reader from reader list
            m_readers.erase(readerIter);

            // on match, just go on to next filename
            // (no need to keep looking and item iterator is invalid now anyway)
            break;
        }
    }

    // make sure we clean up properly if all readers were closed
    if (m_readers.empty()) {

        // clean up merger
        if (m_alignmentCache) {
            m_alignmentCache->Clear();
            delete m_alignmentCache;
            m_alignmentCache = 0;
        }

        // reset merge flags
        m_hasUserMergeOrder = false;
        m_mergeOrder = BamMultiReader::RoundRobinMerge;
    }

    // return whether all readers closed OK
    return !errorsEncountered;
}