Example #1
0
bool SubreadConverter::ConvertFile(HDFBasReader* reader,
                                   PacBio::BAM::BamWriter* writer,
                                   PacBio::BAM::BamWriter* scrapsWriter) 
{
    assert(reader);

    // initialize with default values (shared across all unmapped subreads)
    BamRecordImpl bamRecord;

    // read region table info
    std::unique_ptr<HDFRegionTableReader> const regionTableReader(new HDFRegionTableReader);
    RegionTable regionTable;
    string fn = filenameForReader_[reader];
    assert(!fn.empty());
    if (regionTableReader->Initialize(fn) == 0) {
        AddErrorMessage("could not read region table on "+fn);
        return false;
    }
    regionTable.Reset();
    regionTableReader->ReadTable(regionTable);
    regionTableReader->Close();

    // initialize read scores
    InitReadScores(reader);

    // fetch records from HDF5 file
    SMRTSequence smrtRecord;
    while (reader->GetNext(smrtRecord)) {

        // compute subread & adapter intervals
        SubreadInterval hqInterval;
        deque<SubreadInterval> subreadIntervals;
        deque<SubreadInterval> adapterIntervals;
        try {
            hqInterval = ComputeSubreadIntervals(&subreadIntervals,
                                                 &adapterIntervals,
                                                 regionTable,
                                                 smrtRecord.zmwData.holeNumber,
                                                 smrtRecord.length);
        } catch (runtime_error& e) {
            AddErrorMessage(string(e.what()));
            smrtRecord.Free();
            return false;
        }

        // sequencing ZMW
        if (IsSequencingZmw(smrtRecord))
        {
            // write subreads to main BAM file
            for (const SubreadInterval& interval : subreadIntervals)
            {
                // skip invalid or 0-sized intervals
                if (interval.End <= interval.Start)
                    continue;

                if (!WriteSubreadRecord(smrtRecord,
                                        interval.Start,
                                        interval.End,
                                        ReadGroupId(),
                                        static_cast<uint8_t>(interval.LocalContextFlags),
                                        writer))
                {
                    smrtRecord.Free();
                    return false;
                }
            }

            // if scraps BAM file present
            if (scrapsWriter)
            {
                // write 5-end LQ sequence
                if (hqInterval.Start > 0)
                {
                    if (!WriteLowQualityRecord(smrtRecord,
                                               0,
                                               hqInterval.Start,
                                               ScrapsReadGroupId(),
                                               scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }

                // write adapters
                for (const SubreadInterval& interval : adapterIntervals) {

                    // skip invalid or 0-sized adapters
                    if (interval.End <= interval.Start)
                        continue;

                    if (!WriteAdapterRecord(smrtRecord,
                                            interval.Start,
                                            interval.End,
                                            ScrapsReadGroupId(),
                                            scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }

                // write 3'-end LQ sequence
                if (hqInterval.End < smrtRecord.length)
                {
                    if (!WriteLowQualityRecord(smrtRecord,
                                               hqInterval.End,
                                               smrtRecord.length,
                                               ScrapsReadGroupId(),
                                               scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }
            }
        } // sequencing ZMW

        // non-sequencing ZMW
        else
        {
            assert(!IsSequencingZmw(smrtRecord));

            // only write these if scraps BAM present & we are in 'internal mode'
            if (settings_.isInternal && scrapsWriter)
            {
                // write 5-end LQ sequence to scraps BAM
                if (hqInterval.Start > 0)
                {
                    if (!WriteLowQualityRecord(smrtRecord,
                                               0,
                                               hqInterval.Start,
                                               ScrapsReadGroupId(),
                                               scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }

                // write subreads & adapters to scraps BAM, sorted by query start
                while (!subreadIntervals.empty() && !adapterIntervals.empty()) {

                    const SubreadInterval& subread = subreadIntervals.front();
                    const SubreadInterval& adapter = adapterIntervals.front();
                    assert(subread.Start != adapter.Start);

                    if (subread.Start < adapter.Start)
                    {
                        if (!WriteFilteredRecord(smrtRecord,
                                                 subread.Start,
                                                 subread.End,
                                                 ScrapsReadGroupId(),
                                                 static_cast<uint8_t>(subread.LocalContextFlags),
                                                 scrapsWriter))
                        {
                            smrtRecord.Free();
                            return false;
                        }

                        subreadIntervals.pop_front();
                    }
                    else
                    {
                        if (!WriteAdapterRecord(smrtRecord,
                                                adapter.Start,
                                                adapter.End,
                                                ScrapsReadGroupId(),
                                                scrapsWriter))
                        {
                            smrtRecord.Free();
                            return false;
                        }
                        adapterIntervals.pop_front();
                    }
                }

                // flush any traling subread intervals
                while (!subreadIntervals.empty())
                {
                    assert(adapterIntervals.empty());
                    const SubreadInterval& subread = subreadIntervals.front();
                    if (!WriteFilteredRecord(smrtRecord,
                                             subread.Start,
                                             subread.End,
                                             ScrapsReadGroupId(),
                                             static_cast<uint8_t>(subread.LocalContextFlags),
                                             scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }

                    subreadIntervals.pop_front();
                }

                // flush any remaining adapter intervals
                while (!adapterIntervals.empty())
                {
                    assert(subreadIntervals.empty());
                    const SubreadInterval& adapter = adapterIntervals.front();
                    if (!WriteAdapterRecord(smrtRecord,
                                            adapter.Start,
                                            adapter.End,
                                            ScrapsReadGroupId(),
                                            scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                    adapterIntervals.pop_front();
                }

                // write 3'-end LQ sequence to scraps BAM
                if (hqInterval.End < smrtRecord.length)
                {
                    if (!WriteLowQualityRecord(smrtRecord,
                                               hqInterval.End,
                                               smrtRecord.length,
                                               ScrapsReadGroupId(),
                                               scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }
            }
        } // non-sequencing ZMW

        smrtRecord.Free();
    }

    // if we get here, all OK
    return true; 
} 
Example #2
0
TEST(SubreadsTest, EndToEnd_Multiple)
{
    // setup
    const string movieName = "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0";

    vector<string> baxFilenames;
    baxFilenames.push_back(tests::Data_Dir + "/" + movieName + ".1.bax.h5");

    const string generatedBam = movieName + ".subreads.bam";
    const string scrapBam = movieName + ".scraps.bam";

    // run conversion
    const int result = RunBax2Bam(baxFilenames, "--subread");
    EXPECT_EQ(0, result);

    // open BAX reader on original data
    HDFBasReader baxReader;
    baxReader.IncludeField("Basecall");
    baxReader.IncludeField("DeletionQV");
    baxReader.IncludeField("DeletionTag");
    baxReader.IncludeField("InsertionQV");
    baxReader.IncludeField("PreBaseFrames");
    baxReader.IncludeField("MergeQV");
    baxReader.IncludeField("SubstitutionQV");
    baxReader.IncludeField("HQRegionSNR");
    // not using SubTag or PulseWidth here

    string baxBasecallerVersion;
    string baxBindingKit;
    string baxSequencingKit;

    const int initOk = baxReader.Initialize(baxFilenames.front());
    EXPECT_EQ(1, initOk);
    if (initOk == 1) {

        if (baxReader.scanDataReader.fileHasScanData && baxReader.scanDataReader.initializedRunInfoGroup) {

            if (baxReader.scanDataReader.runInfoGroup.ContainsAttribute("BindingKit")) {
                HDFAtom<std::string> bkAtom;
                if (bkAtom.Initialize(baxReader.scanDataReader.runInfoGroup, "BindingKit")) {
                    bkAtom.Read(baxBindingKit);
                    bkAtom.dataspace.close();
                }
            }

            if (baxReader.scanDataReader.runInfoGroup.ContainsAttribute("SequencingKit")) {
                HDFAtom<std::string> skAtom;
                if (skAtom.Initialize(baxReader.scanDataReader.runInfoGroup, "SequencingKit")) {
                    skAtom.Read(baxSequencingKit);
                    skAtom.dataspace.close();
                }
            }
        }

        baxReader.GetChangeListID(baxBasecallerVersion);
    }

    // read region table info
    boost::scoped_ptr<HDFRegionTableReader> regionTableReader(new HDFRegionTableReader);
    RegionTable regionTable;
    std::string fn = baxFilenames.front();
    EXPECT_TRUE(regionTableReader->Initialize(fn) != 0);
    regionTable.Reset();
    regionTableReader->ReadTable(regionTable);
    regionTableReader->Close();

    EXPECT_NO_THROW(
    {
        // open BAM file
        BamFile bamFile(generatedBam);

        // check BAM header information
        const BamHeader& header = bamFile.Header();
        EXPECT_EQ(string("1.5"),     header.Version());
        EXPECT_EQ(string("unknown"), header.SortOrder());
        EXPECT_EQ(string("3.0.1"),   header.PacBioBamVersion());
        EXPECT_TRUE(header.Sequences().empty());
        EXPECT_TRUE(header.Comments().empty());
        ASSERT_FALSE(header.Programs().empty());

        const vector<string> readGroupIds = header.ReadGroupIds();
        ASSERT_FALSE(readGroupIds.empty());
        const ReadGroupInfo& rg = header.ReadGroup(readGroupIds.front());

        string rawId = movieName + "//SUBREAD";
        string md5Id;
        MakeMD5(rawId, md5Id, 8);
        EXPECT_EQ(md5Id, rg.Id());

        EXPECT_EQ(string("PACBIO"), rg.Platform());
        EXPECT_EQ(movieName, rg.MovieName());

        EXPECT_TRUE(rg.SequencingCenter().empty());
        EXPECT_TRUE(rg.Date().empty());
        EXPECT_TRUE(rg.FlowOrder().empty());
        EXPECT_TRUE(rg.KeySequence().empty());
        EXPECT_TRUE(rg.Library().empty());
        EXPECT_TRUE(rg.Programs().empty());
        EXPECT_TRUE(rg.PredictedInsertSize().empty());
        EXPECT_TRUE(rg.Sample().empty());

        EXPECT_EQ("SUBREAD", rg.ReadType());
        EXPECT_EQ(baxBasecallerVersion, rg.BasecallerVersion());
        EXPECT_EQ(baxBindingKit, rg.BindingKit());
        EXPECT_EQ(baxSequencingKit, rg.SequencingKit());
        EXPECT_EQ(75, std::stod(rg.FrameRateHz()));
        EXPECT_EQ("dq", rg.BaseFeatureTag(BaseFeature::DELETION_QV));
        EXPECT_EQ("dt", rg.BaseFeatureTag(BaseFeature::DELETION_TAG));
        EXPECT_EQ("iq", rg.BaseFeatureTag(BaseFeature::INSERTION_QV));
        EXPECT_EQ("ip", rg.BaseFeatureTag(BaseFeature::IPD));
        EXPECT_EQ("mq", rg.BaseFeatureTag(BaseFeature::MERGE_QV));
        EXPECT_EQ("sq", rg.BaseFeatureTag(BaseFeature::SUBSTITUTION_QV));
        EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::SUBSTITUTION_TAG));
        EXPECT_EQ(FrameCodec::V1, rg.IpdCodec());

        // compare 1st record from each file
        SMRTSequence baxRecord;
        UInt holeNumber = 0;
        vector<float> hqSnr;

        size_t intervalIdx = 0;
        vector<SubreadInterval> subreadIntervals;

        size_t numTested = 0;
        EntireFileQuery entireFile(bamFile);
        for (BamRecord& bamRecord : entireFile) {
            if (intervalIdx >= subreadIntervals.size())
            {
                while (baxReader.GetNext(baxRecord))
                {
                    holeNumber  = baxRecord.zmwData.holeNumber;

                    ComputeSubreadIntervals(&subreadIntervals, regionTable, holeNumber);

                    /* this is for debugging subread interval problems
                    int hqStart = 0;
                    int hqEnd = 0;
                    int hqScore = 0;
                    LookupHQRegion(holeNumber,
                                   regionTable,
                                   hqStart,
                                   hqEnd,
                                   hqScore);

                    vector<ReadInterval> subreadIntervals_;
                    CollectSubreadIntervals(baxRecord, &regionTable, subreadIntervals_);

                    for (int i = subreadIntervals_.size() - 1; i >= 0; --i)
                    {
                        auto& in = subreadIntervals_[i];
                        int inStart = max(hqStart, in.start);
                        int inEnd   = min(hqEnd,   in.end);
                        if (inEnd <= inStart)
                            subreadIntervals_.erase(subreadIntervals_.begin() + i);
                    }

                    cerr << "hqRegion: " << hqStart << ", " << hqEnd << endl;
                    cerr << "subreadRegions:" << endl;
                    for (const auto& in : subreadIntervals_)
                        cerr << "  l, r: " << in.start << ", " << in.end << endl;

                    cerr << "adapterDerived:" << endl;
                    for (const auto& in : subreadIntervals)
                        cerr << "  l, r: " << in.Start << ", " << in.End << endl;

                    cerr << endl;
                    // */

                    if (subreadIntervals.empty())
                        continue;

                    intervalIdx = 0;

                    hqSnr.clear();
                    hqSnr.push_back(baxRecord.HQRegionSnr('A'));
                    hqSnr.push_back(baxRecord.HQRegionSnr('C'));
                    hqSnr.push_back(baxRecord.HQRegionSnr('G'));
                    hqSnr.push_back(baxRecord.HQRegionSnr('T'));

                    EXPECT_GT(hqSnr[0], 0);
                    EXPECT_GT(hqSnr[1], 0);
                    EXPECT_GT(hqSnr[2], 0);
                    EXPECT_GT(hqSnr[3], 0);

                    goto compare;
                }

                goto cleanup;
            }

compare:
            const BamRecordImpl& bamRecordImpl = bamRecord.Impl();
            EXPECT_EQ(4680,bamRecordImpl.Bin());
            EXPECT_EQ(0,   bamRecordImpl.InsertSize());
            EXPECT_EQ(255, bamRecordImpl.MapQuality());
            EXPECT_EQ(-1,  bamRecordImpl.MatePosition());
            EXPECT_EQ(-1,  bamRecordImpl.MateReferenceId());
            EXPECT_EQ(-1,  bamRecordImpl.Position());
            EXPECT_EQ(-1,  bamRecordImpl.ReferenceId());
            EXPECT_FALSE(bamRecordImpl.IsMapped());

            const int subreadStart = subreadIntervals[intervalIdx].Start;
            const int subreadEnd   = subreadIntervals[intervalIdx].End;

            const string expectedName = movieName + "/" +
                    to_string(holeNumber)   + "/" +
                    to_string(subreadStart) + "_" +
                    to_string(subreadEnd);
            EXPECT_EQ(expectedName, bamRecordImpl.Name());

            using PacBio::BAM::QualityValue;
            using PacBio::BAM::QualityValues;

            const DNALength length = subreadEnd - subreadStart;

            string expectedSequence;
            expectedSequence.assign((const char*)baxRecord.seq + subreadStart, length);

            const string bamSequence = bamRecord.Sequence();
            const QualityValues bamQualities = bamRecord.Qualities();
            EXPECT_EQ(expectedSequence, bamSequence);
            EXPECT_TRUE(bamQualities.empty());

            const QualityValues bamDeletionQVs = bamRecord.DeletionQV();
            const QualityValues bamInsertionQVs = bamRecord.InsertionQV();
            const QualityValues bamMergeQVs = bamRecord.MergeQV();
            const QualityValues bamSubstitutionQVs = bamRecord.SubstitutionQV();

            for (size_t i = 0; i < length; ++i) {
                const size_t pos = subreadStart + i;

                EXPECT_EQ((QualityValue)baxRecord.GetDeletionQV(pos),     bamDeletionQVs.at(i));
                EXPECT_EQ((QualityValue)baxRecord.GetInsertionQV(pos),    bamInsertionQVs.at(i));
                EXPECT_EQ((QualityValue)baxRecord.GetMergeQV(pos),        bamMergeQVs.at(i));
                EXPECT_EQ((QualityValue)baxRecord.GetSubstitutionQV(pos), bamSubstitutionQVs.at(i));
            }

            if (baxRecord.deletionTag)
            {
                string expectedDeletionTags;
                expectedDeletionTags.assign((char*)baxRecord.deletionTag + subreadStart,
                                            (char*)baxRecord.deletionTag + subreadStart + length);
                const string& bamDeletionTags = bamRecord.DeletionTag();
                EXPECT_EQ(expectedDeletionTags, bamDeletionTags);
            }

            if (baxRecord.substitutionTag)
            {
                string expectedSubstitutionTags;
                expectedSubstitutionTags.assign((char*)baxRecord.substitutionTag + subreadStart,
                                            (char*)baxRecord.substitutionTag + subreadStart + length);
                const string& bamSubstitutionTags = bamRecord.SubstitutionTag();
                EXPECT_EQ(expectedSubstitutionTags, bamSubstitutionTags);
            }

            // TODO: IPDs
            const LocalContextFlags ctxFlags = subreadIntervals[intervalIdx].LocalContextFlags;

            EXPECT_EQ(md5Id,        bamRecord.ReadGroupId());
            EXPECT_EQ(movieName,    bamRecord.MovieName());
            EXPECT_EQ(1,            bamRecord.NumPasses());
            EXPECT_EQ(holeNumber,   bamRecord.HoleNumber());
            EXPECT_EQ(subreadStart, bamRecord.QueryStart());
            EXPECT_EQ(subreadEnd,   bamRecord.QueryEnd());
            EXPECT_EQ(hqSnr,        bamRecord.SignalToNoise());
            EXPECT_EQ(ctxFlags,     bamRecord.LocalContextFlags());

            numTested++;
            intervalIdx++;
        }

cleanup:
        EXPECT_GT(numTested, 1);

        // cleanup
        baxReader.Close();
        RemoveFile(generatedBam);
        RemoveFile(scrapBam);

    }); // EXPECT_NO_THROW
Example #3
0
bool HqRegionConverter::ConvertFile(HDFBasReader* reader,
                                    PacBio::BAM::BamWriter* writer,
                                    PacBio::BAM::BamWriter* scrapsWriter) 
{
    assert(reader);

    // read region table info
    std::unique_ptr<HDFRegionTableReader> const regionTableReader(new HDFRegionTableReader);
    RegionTable regionTable;
    std::string fn = filenameForReader_[reader];
    assert(!fn.empty());
    if (regionTableReader->Initialize(fn) == 0) {
        AddErrorMessage("could not read region table on "+fn);
        return false;
    }
    regionTable.Reset();
    regionTableReader->ReadTable(regionTable);
    regionTableReader->Close();

    // initialize read scores
    InitReadScores(reader);

    // fetch records from HDF5 file
    SMRTSequence smrtRecord;
    int hqStart, hqEnd, score;
    while (reader->GetNext(smrtRecord)) {

        // attempt get high quality region
        if (!LookupHQRegion(smrtRecord.zmwData.holeNumber,
                            regionTable,
                            hqStart,
                            hqEnd,
                            score))
        {
            stringstream s;
            s << "could not find HQ region for hole number: " << smrtRecord.zmwData.holeNumber;
            AddErrorMessage(s.str());
            smrtRecord.Free();
            return false;
        }

        // Catch and repair 1-off errors in the HQ region
        hqEnd = (hqEnd == static_cast<int>(smrtRecord.length)-1) ? smrtRecord.length
                                                                 : hqEnd;

        // sequencing ZMW
        if (IsSequencingZmw(smrtRecord))
        {
            // write HQRegion to main BAM file
            if (hqStart < hqEnd)
            {
                if (!WriteRecord(smrtRecord,
                                 hqStart,
                                 hqEnd,
                                 ReadGroupId(),
                                 writer))
                {
                    smrtRecord.Free();
                    return false;
                }
            }

            // if scraps BAM file present
            if (scrapsWriter)
            {
                // write 5'-end LQ sequence
                if (hqStart > 0)
                {
                    if (!WriteLowQualityRecord(smrtRecord,
                                               0,
                                               hqStart,
                                               ScrapsReadGroupId(),
                                               scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }

                // write 3'-end LQ sequence
                if (static_cast<size_t>(hqEnd) < smrtRecord.length)
                {
                    if (!WriteLowQualityRecord(smrtRecord,
                                               hqEnd,
                                               smrtRecord.length,
                                               ScrapsReadGroupId(),
                                               scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }
            }
        }

        // non-sequencing ZMW
        else
        {
            assert(!IsSequencingZmw(smrtRecord));

            // only write these if scraps BAM present & we are in 'internal mode'
            if (settings_.isInternal && scrapsWriter)
            {
                // write 5'-end LQ sequence
                if (hqStart > 0)
                {
                    if (!WriteLowQualityRecord(smrtRecord,
                                               0,
                                               hqStart,
                                               ScrapsReadGroupId(),
                                               scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }

                // write HQRegion to scraps BAM file
                if (hqStart < hqEnd)
                {
                    if (!WriteFilteredRecord(smrtRecord,
                                             hqStart,
                                             hqEnd,
                                             ScrapsReadGroupId(),
                                             scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }

                // write 3'-end LQ sequence
                if (static_cast<size_t>(hqEnd) < smrtRecord.length)
                {
                    if (!WriteLowQualityRecord(smrtRecord,
                                               hqEnd,
                                               smrtRecord.length,
                                               ScrapsReadGroupId(),
                                               scrapsWriter))
                    {
                        smrtRecord.Free();
                        return false;
                    }
                }
            }
        }

        smrtRecord.Free();
    }

    // if we get here, all OK
    return true;
}