Example #1
0
// Assembles the header for the output BAM.
//
// The header receives one @PG entry describing this invocation (program name,
// version, description and the command line rebuilt from argv[1..argc)) and
// one "CCS" read group for every read group found in the headers of `files`.
// Any input read group whose read type is not "SUBREAD" is reported through
// parser.error().
BamHeader PrepareHeader(const OptionParser& parser, int argc, char** argv,
                        const vector<string>& files)
{
    using boost::algorithm::join;

    // everything after the executable name, for the @PG command-line field
    const vector<string> cliArgs(argv + 1, argv + argc);

    ProgramInfo program(parser.prog() + "-" + CCS_VERSION);
    program.Name(parser.prog());
    program.CommandLine(parser.prog() + " " + join(cliArgs, " "));
    program.Description(DESCRIPTION);
    program.Version(CCS_VERSION);

    BamHeader header;
    header.PacBioBamVersion("3.0.1");
    header.SortOrder("unknown");
    header.Version("1.5");
    header.AddProgram(program);

    for (const auto& path : files) {
        BamFile input(path);
        const auto sourceReadGroups = input.Header().ReadGroups();

        for (const auto& sourceRg : sourceReadGroups) {
            if (sourceRg.ReadType() != "SUBREAD")
                parser.error("invalid input file, READTYPE must be SUBREAD");

            // carry the chemistry/basecaller details over to the CCS read group
            ReadGroupInfo ccsRg(sourceRg.MovieName(), "CCS");
            ccsRg.BindingKit(sourceRg.BindingKit());
            ccsRg.SequencingKit(sourceRg.SequencingKit());
            ccsRg.BasecallerVersion(sourceRg.BasecallerVersion());
            ccsRg.FrameRateHz(sourceRg.FrameRateHz());

            header.AddReadGroup(ccsRg);
        }
    }

    return header;
}
Example #2
0
// Builds an index skeleton for the given header: `metadata` is sized to the
// number of sequences in the header and one heap-allocated BamIndexSequence
// is appended to `sequences` per sequence record, in header order.
// NOTE(review): the pointers stored in `sequences` are created with `new`;
// where they are released is not visible from this constructor.
BamIndex::BamIndex(const BamHeader & h)
: metadata(h.getSequences().size())
, num_coordless_reads(0)
{
    const BamSequenceRecords records = h.getSequences();

    BamSequenceRecords::const_iterator record = records.begin();
    const BamSequenceRecords::const_iterator stop = records.end();
    while (record != stop) {
        sequences.push_back(new BamIndexSequence(*record));
        ++record;
    }
}
// Builds a BamHeader through the fluent API, converts its SAM text into a raw
// htslib bam_hdr_t (same steps as the library's raw-header conversion) and
// checks that the text stored in the raw header matches the expected SAM text.
TEST(BamHeaderTest, ConvertToRawDataOk)
{
    ReadGroupInfo rg1("rg1");
    rg1.Sample("control");
    ReadGroupInfo rg2("rg2");
    rg2.Sample("condition1");
    ReadGroupInfo rg3("rg3");
    rg3.Sample("condition1");

    SequenceInfo seq1("chr1");
    seq1.Length("2038").Species("chocobo");
    SequenceInfo seq2("chr2");
    seq2.Length("3042").Species("chocobo");

    ProgramInfo prog1("_foo_");
    prog1.Name("ide");

    BamHeader header;
    header.Version("1.1")
          .SortOrder("queryname")
          .PacBioBamVersion("3.0.1")
          .AddReadGroup(rg1)
          .AddReadGroup(rg2)
          .AddReadGroup(rg3)
          .AddSequence(seq1)
          .AddSequence(seq2)
          .AddProgram(prog1)
          .AddComment("ipsum and so on")
          .AddComment("citation needed");

    const string& expectedText = "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
                                 "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
                                 "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
                                 "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL\n"
                                 "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
                                 "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
                                 "@PG\tID:_foo_\tPN:ide\n"
                                 "@CO\tipsum and so on\n"
                                 "@CO\tcitation needed\n";


    const string& text = header.ToSam();
    PBBAM_SHARED_PTR<bam_hdr_t> rawData(sam_hdr_parse(text.size(), text.c_str()), tests::BamHdrDeleter());

    // sam_hdr_parse() yields NULL on failure; fail the test here instead of
    // dereferencing a null header below
    ASSERT_TRUE(rawData.get() != NULL);

    rawData->ignore_sam_err = 0;
    rawData->cigar_tab = NULL;
    rawData->l_text = text.size();
    rawData->text = static_cast<char*>(calloc(rawData->l_text + 1, 1));

    // calloc() may fail; memcpy() into NULL would crash the whole test binary
    ASSERT_TRUE(rawData->text != NULL);
    memcpy(rawData->text, text.c_str(), rawData->l_text);

    const string& rawText = string(rawData->text, rawData->l_text);
    EXPECT_EQ(expectedText, rawText);
}
Example #4
0
  // Returns the chromosome name for this region.
  // With an empty header the name comes from chrToString(chr); otherwise it is
  // looked up in the header via IDtoName(chr).
  // Throws std::invalid_argument when chr is not below h.NumSequences().
  std::string GenomicRegion::ChrName(const BamHeader& h) const {

    // no header information available: use the built-in conversion
    if (h.isEmpty())
      return chrToString(chr);

    if (chr >= h.NumSequences())
      throw std::invalid_argument( "GenomicRegion::ChrName - not enough targets in BamHeader to cover ref id");

    return h.IDtoName(chr);
  }
    // Points htsIterator_ at the records overlapping `interval`.
    // The previous iterator is always discarded first. A new one is requested
    // from htslib only if the header knows the interval's sequence name and
    // the resulting id lies in [0, header.NumSequences()).
    // Throws std::runtime_error if no iterator could be created.
    void Interval(const BamHeader& header,
                  const GenomicInterval& interval)
    {
        htsIterator_.reset(nullptr);

        const auto& contig = interval.Name();
        if (header.HasSequence(contig)) {
            const auto refId = header.SequenceId(contig);
            const bool idInRange =
                (refId >= 0) && (static_cast<size_t>(refId) < header.NumSequences());

            if (idInRange) {
                htsIterator_.reset(bam_itr_queryi(htsIndex_.get(),
                                                  refId,
                                                  interval.Start(),
                                                  interval.Stop()));
            }
        }

        // unknown name, out-of-range id, or htslib returned no iterator
        if (!htsIterator_)
            throw std::runtime_error("could not create iterator for requested region");
    }
// Merging two headers built from the same text must yield that same text:
// the shared @RG ID may appear only once in the merged output.
TEST(BamHeaderTest, MergeHandlesDuplicateReadGroups)
{
    // header text used for both operands of the merge
    const string samText(
        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
        "@RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
            "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
            "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\t"
            "PU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0\tPM:SEQUEL\n"
        "@PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0\n"
        "@PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\n"
    );

    const BamHeader lhs(samText);
    const BamHeader rhs(samText);

    // identical inputs: output equals either input, nothing duplicated
    const BamHeader combined = lhs + rhs;
    EXPECT_EQ(samText, combined.ToSam());
}
// Populates a BamHeader through its setters and checks the SAM text that
// ToSam() produces for it.
TEST(BamHeaderTest, EncodeTest)
{
    // three read groups, two of them sharing a sample
    ReadGroupInfo controlRg("rg1");
    controlRg.Sample("control");
    ReadGroupInfo conditionRgA("rg2");
    conditionRgA.Sample("condition1");
    ReadGroupInfo conditionRgB("rg3");
    conditionRgB.Sample("condition1");

    // two reference sequences
    SequenceInfo chr1("chr1");
    chr1.Length("2038");
    chr1.Species("chocobo");
    SequenceInfo chr2("chr2");
    chr2.Length("3042");
    chr2.Species("chocobo");

    ProgramInfo program("_foo_");
    program.Name("ide");

    BamHeader header;
    header.Version("1.1");
    header.SortOrder("queryname");
    header.PacBioBamVersion("3.0.1");
    header.AddReadGroup(controlRg);
    header.AddReadGroup(conditionRgA);
    header.AddReadGroup(conditionRgB);
    header.AddSequence(chr1);
    header.AddSequence(chr2);
    header.AddProgram(program);
    header.AddComment("ipsum and so on");
    header.AddComment("citation needed");

    const string expectedSam = "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
                               "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
                               "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
                               "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL\n"
                               "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
                               "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
                               "@PG\tID:_foo_\tPN:ide\n"
                               "@CO\tipsum and so on\n"
                               "@CO\tcitation needed\n";

    const string observedSam = header.ToSam();
    EXPECT_EQ(expectedSam, observedSam);
}
Example #8
0
// Converts a BamHeader into a raw htslib header.
//
// The header is serialized with ToSam(), parsed by htslib, and a
// NUL-terminated copy of the SAM text is attached to the raw header
// (text / l_text). The returned pointer releases the raw header through
// internal::HtslibHeaderDeleter.
//
// Throws std::runtime_error if htslib cannot parse the SAM text or if the
// buffer for the text copy cannot be allocated.
PBBAM_SHARED_PTR<bam_hdr_t> BamHeaderMemory::MakeRawHeader(const BamHeader& header)
{
    const string& text = header.ToSam();
    PBBAM_SHARED_PTR<bam_hdr_t> rawData(sam_hdr_parse(text.size(), text.c_str()), internal::HtslibHeaderDeleter());

    // sam_hdr_parse() returns NULL on failure; do not dereference it
    if (!rawData)
        throw std::runtime_error("could not parse SAM header text");

    rawData->ignore_sam_err = 0;
    rawData->cigar_tab = NULL;
    rawData->l_text = text.size();

    // +1 and zero-fill so the copied text is always NUL-terminated
    rawData->text = static_cast<char*>(calloc(rawData->l_text + 1, 1));
    if (rawData->text == NULL)
        throw std::runtime_error("could not allocate memory for raw header text");

    memcpy(rawData->text, text.c_str(), rawData->l_text);
    return rawData;
}
Example #9
0
// Builds the output BAM header.
//
// The header contains one @PG entry for this tool (including `cmdLine`) and a
// single SUBREAD read group. That read group is a copy of the one input read
// group, given a randomly generated movie name of the form
// m<5 digits>_<6 digits>_<6 digits>, platform model SEQUEL, and V1 codecs for
// IPD and pulse width.
//
// Exactly one input read group is required. With none or more than one, an
// error is written to stderr and the process exits with EXIT_FAILURE.
static BamHeader PrepareHeader(const std::string& cmdLine,
                               const std::vector<ReadGroupInfo>& inputReadgroups)
{
    // Validate first: front() on an empty vector is undefined behaviour.
    if (inputReadgroups.empty()) {
        std::cerr << APPNAME << " requires at least one readgroup in the input bam\n";
        exit(EXIT_FAILURE);
    }

    if (inputReadgroups.size() > 1) {
        std::cerr << APPNAME << " can currently only handle one readgroup per input bam\n";
        exit(EXIT_FAILURE);
    }

    ProgramInfo program{APPNAME + "-" + PacBio::UnanimityVersion()};
    program.Name(APPNAME)
        .CommandLine(APPNAME + " " + cmdLine)
        .Description(DESCRIPTION)
        .Version(PacBio::UnanimityVersion());

    std::random_device rd;
    std::default_random_engine rng{rd()};

    // random movie name: 'm' + 5 digits + '_' + 6 digits + '_' + 6 digits
    std::ostringstream movieName;
    movieName << 'm' << std::uniform_int_distribution<uint32_t>{10000, 99999}(rng) << '_'
              << std::uniform_int_distribution<uint32_t>{100000, 999999}(rng) << '_'
              << std::uniform_int_distribution<uint32_t>{100000, 999999}(rng);

    // reuse input readgroups for platform information
    ReadGroupInfo newRg{inputReadgroups.front()};
    newRg.MovieName(movieName.str())
        .ReadType("SUBREAD")
        .PlatformModel(PlatformModelType::SEQUEL)
        .IpdCodec(FrameCodec::V1)
        .PulseWidthCodec(FrameCodec::V1)
        .Id(movieName.str(), "SUBREAD");

    BamHeader header;
    header.PacBioBamVersion("3.0.1")
        .SortOrder("unknown")
        .Version("1.5")
        .AddProgram(program)
        .ReadGroups(std::vector<ReadGroupInfo>{newRg});

    return header;
}
Example #10
0
  // Constructs a region from a samtools-style string ("chr", "chr:start" or
  // "chr:start-end"), resolving the contig name against `hdr`.
  //
  // The string is split by htslib's hts_parse_reg(). The contig id is looked
  // up with hdr.Name2ID(). When no end coordinate is given (htslib reports
  // INT_MAX), the end is set to the contig length from the header. The
  // strand is set to '*'.
  //
  // Throws std::invalid_argument if `hdr` is empty, if the string cannot be
  // parsed, or if the contig name is not present in the header.
  GenomicRegion::GenomicRegion(const std::string& reg, const BamHeader& hdr) {

    if (hdr.isEmpty())
      throw std::invalid_argument("GenomicRegion constructor - supplied empty BamHeader");

    // use htslib region parsing code
    int beg, end;
    const char * q = hts_parse_reg(reg.c_str(), &beg, &end);
    if (!q)
      throw std::invalid_argument("GenomicRegion constructor: Failed to set region for " + reg);

    // The contig name is everything before the position htslib stopped at.
    // It is held in a std::string rather than an alloca() buffer, so a very
    // long region string cannot overflow the stack.
    const std::string name(reg.c_str(), q);

    const int tid = hdr.Name2ID(name);
    if (tid < 0)
      throw std::invalid_argument("GenomicRegion constructor: Failed to set region for " + reg);

    // Open-ended region ("chr" or "chr:start"): run to the end of the contig.
    // The length lookup must use the contig name, not the full region string,
    // or "chr:start" resolves to no contig at all. The parsed start is kept
    // (htslib already reports 0 for a bare contig name).
    if (end == INT_MAX)
      end = hdr.GetSequenceLength(name);

    chr = tid;
    pos1 = beg+1;
    pos2 = end;
    strand = '*';

  }
// Copies the records of the test BAM into a new BAM while feeding each record
// and its virtual offset to a PbiBuilder, then checks the recorded offsets,
// seeks to them in both files, and compares the generated PBI contents with
// the expected index.
TEST(PacBioIndexTest, CreateOnTheFly)
{
    // write into the generated-data directory, so we can ensure write access
    const string outputDir   = tests::GeneratedData_Dir + "/";
    const string outputBamFn = outputDir + "temp.bam";
    const string outputPbiFn = outputBamFn + ".pbi";

    // NOTE: the new file differs in size from the existing one (different write
    //       parameters may yield different file sizes, even with the same content)
    const vector<int64_t> expectedNewOffsets = { 33816576, 236126208, 391315456, 469106688, 537067520, 587792384, 867303424, 1182793728, 1449787392, 1582628864 };
    vector<int64_t> observedOffsets;

    // Build the PBI while the new BAM is written. The enclosing scope ends the
    // lifetime of writer & builder before the files are read back below.
    {
        BamFile sourceBam(test2BamFn);
        BamHeader sourceHeader = sourceBam.Header();

        BamWriter bamWriter(outputBamFn, sourceHeader); // default compression, default thread count
        PbiBuilder pbiBuilder(outputPbiFn, sourceHeader.Sequences().size());

        int64_t virtualOffset = 0;
        EntireFileQuery allRecords(sourceBam);
        for (const BamRecord& rec : allRecords) {
            bamWriter.Write(rec, &virtualOffset);
            pbiBuilder.AddRecord(rec, virtualOffset);
            observedOffsets.push_back(virtualOffset);
        }
    }

    EXPECT_EQ(expectedNewOffsets, observedOffsets);

    // sanity check on original file
    {
        const vector<int64_t> originalFileOffsets = { 33816576, 33825163, 33831333, 33834264, 33836542, 33838065, 33849818, 33863499, 33874621, 1392836608 };
        BamRecord scratch;
        BamReader originalReader(test2BamFn);
        const int numOriginal = static_cast<int>(originalFileOffsets.size());
        for (int idx = 0; idx < numOriginal; ++idx) {
            originalReader.VirtualSeek(originalFileOffsets.at(idx));
            EXPECT_TRUE(CanRead(originalReader, scratch, idx));
        }
    }

    // seek in the new file, first by expected and then by observed offsets
    {
        BamRecord scratch;
        BamReader newReader(outputBamFn);

        const int numExpected = static_cast<int>(expectedNewOffsets.size());
        for (int idx = 0; idx < numExpected; ++idx) {
            newReader.VirtualSeek(expectedNewOffsets.at(idx));
            EXPECT_TRUE(CanRead(newReader, scratch, idx));
        }

        const int numObserved = static_cast<int>(observedOffsets.size());
        for (int idx = 0; idx < numObserved; ++idx) {
            newReader.VirtualSeek(observedOffsets.at(idx));
            EXPECT_TRUE(CanRead(newReader, scratch, idx));
        }
    }

    // contents of the freshly built PBI must match the expected index data
    const PbiRawData& expectedIndex = tests::Test2Bam_NewIndex();
    const PbiRawData& builtIndex = PbiRawData(outputPbiFn);
    tests::ExpectRawIndicesEqual(expectedIndex, builtIndex);

    // straight diff of newly-generated PBI file to existing PBI
    // TODO: Come back to this once pbindexump is in place.
    //       A plain diff is not possible here, since file offsets may differ
    //       between 2 BAMs of differing compression levels. Some sort of BAM
    //       checksum based on contents, not just size, should be added for this.
//    const string pbiDiffCmd = string("diff -q ") + test2BamFn + ".pbi " + tempPbiFn;
//    EXPECT_EQ(0, system(pbiDiffCmd.c_str()));

    // remove the generated file(s)
    remove(outputBamFn.c_str());
    remove(outputPbiFn.c_str());
}
// Parses SAM header text into a BamHeader and checks every decoded section:
// @HD fields, read groups, sequences, programs and comments.
TEST(BamHeaderTest, DecodeTest)
{
    const string samText = "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
                           "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
                           "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
                           "@RG\tID:rg1\tSM:control\n"
                           "@RG\tID:rg2\tSM:condition1\n"
                           "@RG\tID:rg3\tSM:condition1\n"
                           "@PG\tID:_foo_\tPN:ide\n"
                           "@CO\tipsum and so on\n"
                           "@CO\tcitation needed\n";

    BamHeader decoded(samText);

    // @HD
    EXPECT_EQ(string("1.1"),       decoded.Version());
    EXPECT_EQ(string("queryname"), decoded.SortOrder());
    EXPECT_EQ(string("3.0.1"),     decoded.PacBioBamVersion());

    // @RG
    EXPECT_EQ(3, decoded.ReadGroups().size());
    EXPECT_TRUE(decoded.HasReadGroup("rg1"));
    EXPECT_TRUE(decoded.HasReadGroup("rg2"));
    EXPECT_TRUE(decoded.HasReadGroup("rg3"));

    EXPECT_EQ(string("control"),    decoded.ReadGroup("rg1").Sample());
    EXPECT_EQ(string("condition1"), decoded.ReadGroup("rg2").Sample());
    EXPECT_EQ(string("condition1"), decoded.ReadGroup("rg3").Sample());

    // @SQ
    EXPECT_EQ(2, decoded.Sequences().size());
    EXPECT_TRUE(decoded.HasSequence("chr1"));
    EXPECT_TRUE(decoded.HasSequence("chr2"));
    EXPECT_EQ(string("chocobo"), decoded.Sequence("chr1").Species());
    EXPECT_EQ(string("chocobo"), decoded.Sequence("chr2").Species());
    EXPECT_EQ(string("2038"), decoded.Sequence("chr1").Length());
    EXPECT_EQ(string("3042"), decoded.Sequence("chr2").Length());

    // @PG
    EXPECT_EQ(1, decoded.Programs().size());
    EXPECT_TRUE(decoded.HasProgram("_foo_"));
    EXPECT_EQ(string("ide"), decoded.Program("_foo_").Name());

    // @CO
    EXPECT_EQ(2, decoded.Comments().size());
    EXPECT_EQ(string("ipsum and so on"), decoded.Comments().at(0));
    EXPECT_EQ(string("citation needed"), decoded.Comments().at(1));
}
// A default-constructed header has blank version and sort order and no read
// groups, sequences, programs or comments; lookups on it throw.
TEST(BamHeaderTest, DefaultConstruction)
{
    BamHeader emptyHeader;

    // scalar fields start out blank
    EXPECT_TRUE(emptyHeader.Version().empty());
    EXPECT_TRUE(emptyHeader.SortOrder().empty()); // default to unknown ?

    // all collections start out empty
    EXPECT_TRUE(emptyHeader.ReadGroups().empty());
    EXPECT_TRUE(emptyHeader.Sequences().empty());
    EXPECT_TRUE(emptyHeader.Programs().empty());
    EXPECT_TRUE(emptyHeader.Comments().empty());

    // lookups by ID/name/index have nothing to find
    EXPECT_THROW(emptyHeader.Program("foo"),     std::exception);
    EXPECT_THROW(emptyHeader.ReadGroup("foo"),   std::exception);
    EXPECT_THROW(emptyHeader.SequenceId("foo"),  std::exception);
    EXPECT_THROW(emptyHeader.SequenceLength(42), std::exception);
    EXPECT_THROW(emptyHeader.SequenceName(42),   std::exception);
}
// Merges two headers with distinct read groups, programs and comments and
// checks the combined SAM text, for both operator+ and operator+=.
TEST(BamHeaderTest, MergeOk)
{
    // first input: one SUBREAD read group, two programs, one comment
    const string firstText(
        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
        "@RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
            "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
            "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\t"
            "PU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0\t"
            "PM:SEQUEL\n"
        "@PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0\n"
        "@PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\n"
        "@CO\tcomment1\n"
    );

    // second input: one SCRAP read group, three programs, one comment
    const string secondText(
        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
        "@RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;"
            "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;"
            "PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;"
            "PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;"
            "BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;"
            "FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\t"
            "PM:SEQUEL\n"
        "@PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0\n"
        "@PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0\n"
        "@PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0\n"
        "@CO\tcomment2\n"
    );

    // expected result of merging the first input with the second
    const string expectedMergedText(
        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
        "@RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
            "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
            "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\t"
            "PU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0\t"
            "PM:SEQUEL\n"
        "@RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;"
            "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;"
            "PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;"
            "PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;"
            "BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;"
            "FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\t"
            "PM:SEQUEL\n"
        "@PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0\n"
        "@PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\n"
        "@PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0\n"
        "@PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0\n"
        "@PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0\n"
        "@CO\tcomment1\n"
        "@CO\tcomment2\n"
    );

    // operator+ : produces a new header, operands stay untouched
    {
        const BamHeader lhs(firstText);
        const BamHeader rhs(secondText);
        const BamHeader combined = lhs + rhs;
        EXPECT_EQ(expectedMergedText, combined.ToSam());

        // also make sure inputs not changed
        EXPECT_EQ(firstText, lhs.ToSam());
        EXPECT_EQ(secondText, rhs.ToSam());
    }

    // operator+= : merges into the left-hand header in place
    {
        BamHeader accumulated(firstText);
        accumulated += BamHeader(secondText);
        EXPECT_EQ(expectedMergedText, accumulated.ToSam());
    }
}