Ejemplo n.º 1
// Restore a record's CIGAR and position from its "original value" tags.
//
// Looks up the SamTags::ORIG_CIGAR_TAG string tag and the
// SamTags::ORIG_POS_TAG integer tag on samRecord.  For each tag that is
// found, the stored value is written back with setCigar() /
// set1BasedPosition(), and an rmTag() call for that tag follows.
// The result of every one of those calls is AND-ed into `status`.
//
// samRecord: record that is read and modified in place.
//
// NOTE(review): this excerpt shows no braces and no return statement.  The
// deeper indentation of the rmTag() calls suggests an enclosing condition
// that is not visible here, and the function presumably returns `status` --
// confirm both against the complete source.
bool Revert::updateCigar(SamRecord& samRecord)
    // Get the OC tag, which is a string.
    const String* oldCigar = samRecord.getStringTag(SamTags::ORIG_CIGAR_TAG);
    // Get the OP tag, which is an integer.
    int* oldPos = samRecord.getIntegerTag(SamTags::ORIG_POS_TAG);

    // Starts out true; each call result below is AND-ed into it.
    bool status = true;
    if(oldCigar != NULL)
        // The old cigar was found, so set it in the record.
        status &= samRecord.setCigar((*oldCigar).c_str());

            // Remove the tag.
            status &= samRecord.rmTag(SamTags::ORIG_CIGAR_TAG, SamTags::ORIG_CIGAR_TAG_TYPE);
    if(oldPos != NULL)
        // The old position was found, so set it in the record.
        // The stored integer is passed on as a 1-based position.
        status &= samRecord.set1BasedPosition(*oldPos);

            // Remove the tag.
            status &= samRecord.rmTag(SamTags::ORIG_POS_TAG, SamTags::ORIG_POS_TAG_TYPE);

Ejemplo n.º 2
// Restore a record's quality string from its "original quality" tag.
//
// Looks up the SamTags::ORIG_QUAL_TAG string tag on samRecord.  When it is
// found, its value is written back with setQuality() (result AND-ed into
// `status`), and an rmTag() call for the tag follows.
//
// samRecord: record that is read and modified in place.
//
// NOTE(review): this excerpt shows no braces and no return statement.  The
// deeper indentation of the rmTag() call suggests an enclosing condition
// that is not visible here, and the function presumably returns `status` --
// confirm against the complete source.
bool Revert::updateQual(SamRecord& samRecord)
        // Get the OQ tag, which is a string.
    const String* oldQual = samRecord.getStringTag(SamTags::ORIG_QUAL_TAG);

    // Starts out true; only the setQuality() result is AND-ed into it.
    bool status = true;
    if(oldQual != NULL)
        // The old quality was found, so set it in the record.
        status &= samRecord.setQuality((*oldQual).c_str());

            // Remove the tag.
            // NOTE(review): the rmTag() result is discarded here, whereas
            // Revert::updateCigar folds it into `status` -- confirm whether
            // ignoring a failed removal is intentional.
            samRecord.rmTag(SamTags::ORIG_QUAL_TAG, SamTags::ORIG_QUAL_TAG_TYPE);
Ejemplo n.º 3
// Write one record to a FASTQ output as a four-line entry
// ("@name", sequence, "+" line, quality).
//
// samRec:      record supplying the "RG" string, flag, read name, sequence
//              and quality.
// filePtr:     output file handle; in the code below it is reassigned from
//              a newly opened file or from the myOutFastqs map.
// fileNameExt: extension appended to the read group to form the map key and
//              the output file name.
// readNameExt: text printed directly after the read name.
//
// Throws std::runtime_error when filePtr is NULL at the point of writing.
//
// NOTE(review): this excerpt has lost its braces, its else branches and
// several guarding conditions (the indentation at the "rg = ..." line
// suggests the per-read-group lookup is conditional).  One ifprintf() call is
// also cut off.  Treat the comments below as a description of the visible
// statements only and confirm control flow against the complete source.
void Bam2FastQ::writeFastQ(SamRecord& samRec, IFILE filePtr,
                           const std::string& fileNameExt, const char* readNameExt)
    // Function-local statics: the same objects are reused on every call.
    static int16_t flag;
    static std::string sequence;
    static String quality;
    static std::string rg;
    static std::string rgFastqExt;
    static std::string rgListStr;
    static std::string fileName;
    static std::string fq2;
        // Map key for the output: the record's read group followed by the
        // file name extension.
        rg = samRec.getString("RG").c_str();
        rgFastqExt = rg + fileNameExt;

        OutFastqMap::iterator it;
        it = myOutFastqs.find(rgFastqExt);
        if(it == myOutFastqs.end())
            // New file.
            // File name is myOutBase, a '.' separator when there is a read
            // group, then the key built above.
            fileName = myOutBase.c_str();
            if(rg != "")
                fileName += '.';
                // NOTE(review): presumably the else branch of the test above
                // (empty read group is represented as ".") -- confirm.
                rg = ".";
            fileName += rgFastqExt;
            // Open the output for writing and remember it under the key.
            filePtr = ifopen(fileName.c_str(), "w", myCompression);
            myOutFastqs[rgFastqExt] = filePtr;

            if(fileNameExt != mySecondFileNameExt)
                // first end.
                // Sample name comes from the header's SM value for this read
                // group; an empty value falls back to myOutBase.
                const char* sm = mySamHeader.getRGTagValue("SM", rg.c_str());
                if(strcmp(sm, "") == 0){sm = myOutBase.c_str();}

                SamHeaderRG* rgPtr = mySamHeader.getRG(rg.c_str());
                if((rgPtr == NULL) || (!rgPtr->appendString(rgListStr)))
                    // No RG info for this record.
                    rgListStr = ".\n";
                // Name of the matching second-end file; stays "." unless
                // this is the first-end extension.
                fq2 = ".";
                if(fileNameExt == myFirstFileNameExt)
                    fq2 = myOutBase.c_str();
                    if(rg != ".")
                        fq2 += '.';
                        fq2 += rg;
                    fq2 += mySecondFileNameExt;
                // Tab-separated line written to myFqList.
                // NOTE(review): the argument list of this call is truncated
                // in the excerpt (four %s conversions, three visible
                // arguments, no closing parenthesis).
                ifprintf(myFqList, "%s\t%s\t%s\t%s",
                         sm, fileName.c_str(), fq2.c_str(),
            // NOTE(review): presumably the else branch of the map lookup
            // (key already present, reuse its file) -- confirm.
            filePtr = it->second;
    if(filePtr == NULL)
        throw(std::runtime_error("Programming ERROR/EXITING: Bam2FastQ filePtr not set."));

    flag = samRec.getFlag();
    const char* readName = samRec.getReadName();
    sequence = samRec.getSequence();
        // NOTE(review): the condition choosing between the quality field
        // and the myQField tag is not visible in this excerpt.
        // Read the quality from the quality field
        quality = samRec.getQuality();
        // Read Quality from the specified tag
        const String* qTagPtr = samRec.getStringTag(myQField.c_str());
        // The tag value is only accepted when its length equals the
        // sequence length.
        if((qTagPtr != NULL) && (qTagPtr->Length() == (int)sequence.length()))
            // Use the tag value for quality
            quality = qTagPtr->c_str();
            // Tag was not found, so use the quality field.
            // The message is printed when the error counter equals 1;
            // the counter update itself is not visible here.
            if(myNumQualTagErrors == 1)
                std::cerr << "Bam2FastQ: " << myQField.c_str() 
                          << " tag was not found/invalid, so using the quality field in records without the tag\n";
            quality = samRec.getQuality();
    if(SamFlag::isReverse(flag) && myReverseComp)
        // It is reverse, so reverse compliment the sequence
        // Reverse the quality.
        // NOTE(review): the statements performing the reverse complement
        // and the quality reversal are not visible in this excerpt.
        // Ensure it is all capitalized.
        int seqLen = sequence.size();
        for (int i = 0; i < seqLen; i++)
            sequence[i] = (char)toupper(sequence[i]);

        // Two output layouts follow: the first repeats the read name and
        // its extension on the '+' line, the second leaves the '+' line
        // bare.  NOTE(review): the condition selecting between them is not
        // visible in this excerpt.
        ifprintf(filePtr, "@%s%s\n%s\n+%s%s\n%s\n", readName, readNameExt,
                 sequence.c_str(), readName, readNameExt, quality.c_str());
        ifprintf(filePtr, "@%s%s\n%s\n+\n%s\n", readName, readNameExt,
                 sequence.c_str(), quality.c_str());
    // Release the record.
Ejemplo n.º 4
// Apply the recalibration model to one record's base qualities.
//
// For each cycle of the read a BaseData covariate set (read group id, read
// number, cycle, previous base, current base, quality) is filled in, a new
// quality is obtained from hasherrormodel.getQemp() and
// mySqueeze.getQualCharFromQemp(), capped at myMaxBaseQualChar, and stored in
// myQualityStrings.newq at the same sequence position.
//
// samRecord: record whose qualities are read; an addTag() call stores the
//            old quality string on it.
// Returns true at the end of the visible code.
//
// NOTE(review): this excerpt has lost its braces, else branches, several
// conditions, early returns/continues and the right-hand side of some
// assignments.  The comments describe only the visible statements; confirm
// control flow against the complete source.
bool Recab::processReadApplyTable(SamRecord& samRecord)
    // Function-local statics: the same objects are reused on every call.
    static BaseData data;
    static std::string readGroup;
    static std::string aligTypes;

    int seqLen = samRecord.getReadLength();

    uint16_t  flag = samRecord.getFlag();

    // Check if the flag contains an exclude.
    if((flag & myIntApplyExcludeFlags) != 0)
        // Do not apply the recalibration table to this read.
        // NOTE(review): the statement that skips the read is not visible.
    readGroup = samRecord.getString("RG").c_str();

    // Look for the read group in the map.
    // TODO - extra string constructor??
    // insert() leaves an existing entry untouched; the 0 is a placeholder
    // that is overwritten below when the key is new.
    RgInsertReturn insertRet = 
        myRg2Id.insert(std::pair<std::string, uint16_t>(readGroup, 0));
    if(insertRet.second == true)
        // New element inserted.
        // The new id is the current size of myId2Rg.
        insertRet.first->second = myId2Rg.size();

    data.rgid = insertRet.first->second;

        // Check if there is an old quality.
        // NOTE(review): the initializer of oldQPtr is cut off in this
        // excerpt.
        const String* oldQPtr =
        if((oldQPtr != NULL) && (oldQPtr->Length() == seqLen))
            // There is an old quality, so use that.
            myQualityStrings.oldq = oldQPtr->c_str();
            // Otherwise (presumably an else branch) use the quality field.
            myQualityStrings.oldq = samRecord.getQuality();
        myQualityStrings.oldq = samRecord.getQuality();

    // The old quality string must have one entry per base.
    if(myQualityStrings.oldq.length() != (unsigned int)seqLen)
        Logger::gLogger->warning("Quality is not the correct length, so skipping recalibration on that record.");


    ////// iterate sequence
    int32_t seqPos = 0;
    int seqIncr = 1;

    // NOTE(review): the condition separating the reverse and forward
    // settings below is not visible.  In the reverse setting the sequence is
    // walked from its last position backwards.
    bool reverse;
        reverse = true;
        seqPos = seqLen - 1;
        seqIncr = -1;
        reverse = false;

    // Check which read - this will be the same for all positions, so 
    // do this outside of the smaller loop.
    if(!SamFlag::isPaired(flag) || SamFlag::isFirstFragment(flag))
        // Mark as first if it is not paired or if it is the
        // first in the pair.
        data.read = 0;
        // Otherwise (presumably an else branch) mark as second.
        data.read = 1;

    // Set unsetbase for curBase.
    // This will be used for the prebase of cycle 0.
    data.curBase = 'K';

    // data.cycle counts upward while seqPos moves by seqIncr.
    for (data.cycle = 0; data.cycle < seqLen; data.cycle++, seqPos += seqIncr)
        // Set the preBase to the previous cycle's current base.
        // For cycle 0, curBase was set to a default value.
        data.preBase = data.curBase;

        // Get the current base.
        data.curBase = samRecord.getSequence(seqPos);

            // Complement the current base.
            // NOTE(review): the guarding condition is not visible here.
            data.curBase =
                BaseAsciiMap::base2complement[(unsigned int)(data.curBase)];

        // Get quality
        // NOTE(review): the right-hand side of this assignment is cut off
        // in the excerpt.
        data.qual = 

        // skip bases with quality below the minimum set.
        // Such bases keep their old quality character.
        if(data.qual < myMinBaseQual)
            myQualityStrings.newq[seqPos] = myQualityStrings.oldq[seqPos];

        // Update quality score
        uint8_t qemp = hasherrormodel.getQemp(data);
        qemp = mySqueeze.getQualCharFromQemp(qemp);
        // Cap at the maximum allowed quality character.
        if(qemp > myMaxBaseQualChar)
            qemp = myMaxBaseQualChar;
        myQualityStrings.newq[seqPos] = qemp;

        // Store the old quality string in the myStoreQualTag tag (type 'Z').
        // NOTE(review): the guarding condition and the statement that sets
        // the new quality on the record are not visible in this excerpt.
        samRecord.addTag(myStoreQualTag, 'Z', myQualityStrings.oldq.c_str());

    return true;
Ejemplo n.º 5
// Feed one record into the recalibration table.
//
// For each cycle of the read a BaseData covariate set (read group id, read
// number, cycle, previous base, current base, quality) is filled in, the
// matching reference base is looked up through the CIGAR reference offset,
// and hasherrormodel.setCell(data, refBase) is called.
//
// samRecord: record to examine.
// Returns false after the "INVALID_GENOME_INDEX" warning and true at the end
// of the visible code.
// Throws std::runtime_error when myReferenceGenome is NULL.
//
// NOTE(review): this excerpt has lost its braces, else branches, most
// skip conditions with their return/continue statements, and the right-hand
// side of several assignments.  The comments describe only the visible
// statements; confirm control flow against the complete source.
bool Recab::processReadBuildTable(SamRecord& samRecord)
    // Function-local statics: the same objects are reused on every call.
    static BaseData data;
    static std::string chromosomeName;
    static std::string readGroup;
    static std::string aligTypes;

    int seqLen = samRecord.getReadLength();
    // Check if the parameters have been processed.
        // This throws an exception if the reference cannot be setup.

    uint16_t  flag = samRecord.getFlag();

        // Unmapped, skip processing
        // This read is mapped.

        // Secondary read

    // Check if the flag contains an exclude.
    if((flag & myIntBuildExcludeFlags) != 0)
        // Do not use this read for building the recalibration table.

    // NOTE(review): the statements that skip the read for the two mapping
    // quality checks below are not visible.
    if(samRecord.getMapQuality() == 0)
        // 0 mapping quality, so skip processing.
    if(samRecord.getMapQuality() == 255)
        // 255 mapping quality, so skip processing.
    chromosomeName = samRecord.getReferenceName();
    readGroup = samRecord.getString("RG").c_str();

    // Look for the read group in the map.
    // TODO - extra string constructor??
    // insert() leaves an existing entry untouched; the 0 is a placeholder
    // that is overwritten below when the key is new.
    RgInsertReturn insertRet = 
        myRg2Id.insert(std::pair<std::string, uint16_t>(readGroup, 0));
    if(insertRet.second == true)
        // New element inserted.
        // The new id is the current size of myId2Rg.
        insertRet.first->second = myId2Rg.size();

    data.rgid = insertRet.first->second;

    // NOTE(review): the condition separating these two assignments is not
    // visible in this excerpt.
    bool reverse;
        reverse = true;
        reverse = false;

    if(myReferenceGenome == NULL)
        throw std::runtime_error("Failed to setup Reference File.\n");

    // NOTE(review): the initializer of mapPos and the validity test that
    // guards the warning below are cut off in this excerpt.
    genomeIndex_t mapPos = 

    	Logger::gLogger->warning("INVALID_GENOME_INDEX (chrom:pos %s:%ld) and record skipped... Reference in BAM is different from the ref used here!", chromosomeName.c_str(), samRecord.get1BasedPosition());

        return false;

        // Check if there is an old quality.
        // NOTE(review): the initializer of oldQPtr is cut off in this
        // excerpt.
        const String* oldQPtr = 
        if((oldQPtr != NULL) && (oldQPtr->Length() == seqLen))
            // There is an old quality, so use that.
            myQualityStrings.oldq = oldQPtr->c_str();
            // Tag was not found, so use the current quality.
            // The warning is issued when the error counter equals 1; the
            // counter update itself is not visible here.
            if(myNumQualTagErrors == 1)
                Logger::gLogger->warning("Recab: %s tag was not found/invalid, so using the quality field in records without the tag", myQField.c_str());
            myQualityStrings.oldq = samRecord.getQuality();
        myQualityStrings.oldq = samRecord.getQuality();

    // The old quality string must have one entry per base.
    if(myQualityStrings.oldq.length() != (unsigned int)seqLen)
        Logger::gLogger->warning("Quality is not the correct length, so skipping recalibration on that record.");

    aligTypes = "";
    Cigar* cigarPtr = samRecord.getCigarInfo();

    if(cigarPtr == NULL)
        Logger::gLogger->warning("Failed to get the cigar");

    // This read will be used for building the recab table.

    ////// iterate sequence
    genomeIndex_t refPos = 0;
    int32_t refOffset = 0;
    int32_t prevRefOffset = Cigar::INDEX_NA;
    int32_t seqPos = 0;
    int seqIncr = 1;
        // NOTE(review): presumably applied only to reverse reads (the
        // condition is not visible): walk the sequence from its last
        // position backwards.
        seqPos = seqLen - 1;
        seqIncr = -1;

    // read
    if(!SamFlag::isPaired(flag) || SamFlag::isFirstFragment(flag))
        // Mark as first if it is not paired or if it is the
        // first in the pair.
        data.read = 0;
        // Otherwise (presumably an else branch) mark as second.
        data.read = 1;

    // Set unsetbase for curBase.
    // This will be used for the prebase of cycle 0.
    data.curBase = 'K';

    // data.cycle counts upward while seqPos moves by seqIncr.
    for (data.cycle = 0; data.cycle < seqLen; data.cycle++, seqPos += seqIncr)
        // Store the previous current base in preBase.
        data.preBase = data.curBase;

        // Get the current base before checking if we are going to
        // process this position so it will be set for the next position.
        data.curBase = samRecord.getSequence(seqPos);
            // Complement the current base.
            // The prebase is already complemented.
            data.curBase = 
                BaseAsciiMap::base2complement[(unsigned int)(data.curBase)];
        // Get the reference offset.
        refOffset = cigarPtr->getRefOffset(seqPos);
        if(refOffset == Cigar::INDEX_NA)
            // Not a match/mismatch, so continue to the next one which will
            // not have a previous match/mismatch.
            // Set previous ref offset to a negative so
            // the next one won't be kept.
            prevRefOffset = -2;

        // This one is a match.
        // Reference index of this base: mapped start plus the offset.
        refPos = mapPos + refOffset;

        // Check to see if we should process this position.
        // Do not process if it is cycle 0 and:
        //   1) current base is in dbsnp
        // NOTE(review): in each skip case below only the prevRefOffset
        // update is visible; the statement that moves on to the next cycle
        // is not.
        if(data.cycle == 0)
            if(!(myDbsnpFile.IsEmpty()) && myDbSNP[refPos])
                // Save the previous reference offset.
                prevRefOffset = refOffset;
            // Do not process if it is not cycle 0 and:
            //   1) previous reference position not adjacent 
            //      (not a match/mismatch)
            //   2) previous base is in dbsnp
            //   3) current base is in dbsnp
            // 'K' is the placeholder assigned to curBase before the loop.
            if((!myKeepPrevNonAdjacent && (refOffset != (prevRefOffset + seqIncr))) ||
               (data.preBase == 'K'))
                // Save the previous reference offset.
                prevRefOffset = refOffset;
            if(!(myDbsnpFile.IsEmpty()) && 
               (myDbSNP[refPos] ||
                (!myKeepPrevDbsnp && myDbSNP[refPos - seqIncr])))
                // Save the previous reference offset.
                prevRefOffset = refOffset;
        // Save the previous reference offset.
        prevRefOffset = refOffset;

        // Set the reference & read bases in the Covariates
        char refBase = (*myReferenceGenome)[refPos];

            // N reference, so skip it when building the table.

            // Complement the reference base.
            // NOTE(review): the guarding condition is not visible here.
            refBase = BaseAsciiMap::base2complement[(unsigned int)(refBase)];

        // Get quality char
        // NOTE(review): the right-hand side of this assignment is cut off
        // in the excerpt.
        data.qual = 

        // skip bases with quality below the minimum set.
        if(data.qual < myMinBaseQual)

        // Read base equals the reference base and maps to an index below 4
        // in base2int.  NOTE(review): the body of this test is not visible.
        if(BaseUtilities::areEqual(refBase, data.curBase)
           && (BaseAsciiMap::base2int[(unsigned int)(data.curBase)] < 4))

        // Record this observation in the error model.
        hasherrormodel.setCell(data, refBase);
    return true;
Ejemplo n.º 6
// Re-align the current record (rec_) and update its CIGAR / position when
// the new alignment differs from the stored one.
//
// Visible steps:
//   1. Bail out (returning true) when the CIGAR is empty, the MD tag is
//      missing, or the clipped read / reference length exceeds MAX_SEQ_LEN.
//   2. Rebuild the reference segment with recreate_ref() from the sequence,
//      CIGAR and MD tag.
//   3. Align the clipped read against that segment with the aligner chosen
//      by p_->algo () and convert the result with roll_cigar().
//   4. When roll_cigar() reports a shift or the CIGAR differs, replace the
//      CIGAR, adjust the position and optionally log both alignments.
//
// Returns true on every path visible in this excerpt.
//
// NOTE(review): this excerpt has lost its braces, else branches, the break
// statements of the switch and the closing parentheses of several calls.
// The comments describe only the visible statements; confirm control flow
// against the complete source.
bool BamProcessor::processRecord ()
    trclog << "\nProcessing record " << read_cnt_ << " - " << rec_.getReadName () << ", " << rec_.get0BasedUnclippedEnd () << "->" << rec_.getReadLength () << ", ref " << rec_.getReferenceName () << std::endl;
    const char* seq = rec_.getSequence ();
    // NOTE(review): position, new_position and reverse_match are not used
    // in the visible part of this function.
    unsigned position = rec_.get0BasedPosition ();
    unsigned new_position = position;
    // NOTE(review): 0x10 is presumably the SAM reverse-strand flag bit --
    // confirm.
    bool reverse_match = (rec_.getFlag () & 0x10);

    Cigar* cigar_p = rec_.getCigarInfo ();
    if (!cigar_p->size ())  // can not recreate reference is cigar is missing. Keep record unaligned.
    {                       // TODO: allow to specify and load external reference
        ++ unaligned_cnt_;
        return true;

    // NOTE(review): this assertion comes after cigar_p has already been
    // dereferenced above -- confirm whether the order is intended.
    myassert (cigar_p);

    const String *mdval = rec_.getStringTag ("MD");
    if (!mdval) // can not recreate reference is MD tag is missing. Keep record as is.
        warn << "No MD Tag for record " << proc_cnt_ << ". Skipping record." << std::endl;
        return true; // record will be kept as-is.
    std::string md_tag = mdval->c_str ();

    // find the non-clipped region
    // clip_seq() fills clean_len and clips and returns the start of the
    // clipped read.
    uint32_t clean_len;
    EndClips clips;
    const char* clean_read = clip_seq (seq, *cigar_p, clean_len, clips);

    // find length needed for the reference
    // this reserves space enough for entire refference, including softclipped ends.
    unsigned ref_len = cigar_p->getExpectedReferenceBaseCount ();
    // Grow the reference buffer in multiples of REF_BUF_INCR.
    if (ref_buffer_sz_ < ref_len)
        ref_buffer_sz_ = (1 + ref_len / REF_BUF_INCR) * REF_BUF_INCR;
        ref_buffer_.reset (ref_buffer_sz_);
    // Sequences longer than MAX_SEQ_LEN are counted and left untouched.
    if (clean_len > MAX_SEQ_LEN || ref_len > MAX_SEQ_LEN)
        ++ toolongs_;
        return true;

    // recreate reference by Query, Cigar, and MD tag. Do not include softclipped ends in the recreated sequence (use default last parameter)
    recreate_ref (seq, rec_.getReadLength (), cigar_p, md_tag.c_str (), ref_buffer_, ref_buffer_sz_);

    unsigned qry_ins; // extra bases in query     == width_left
    unsigned ref_ins; // extra bases in reference == width_right
    band_width (*cigar_p, qry_ins, ref_ins);

    if (log_matr_ || log_base_)
        logfile_ << "Record " << read_cnt_ << ": " << rec_.getReadName () << "\n"
                 << "   sequence (" << rec_.getReadLength () << " bases)\n";

    CigarRoller roller;
    int ref_shift = 0;  // shift of the new alignment position on refereance relative the original
    unsigned qry_off, ref_off; // offsets on the query and reference of the first non-clipped aligned bases
    double new_score = 0;

    // One aligner per algorithm; each case ends by converting its result
    // into `roller` through roll_cigar().
    // NOTE(review): no break statements are visible between the cases.
    switch (p_->algo ())
        case ContalignParams::TEMPL:
            // call aligner
            new_score = taligner_.eval (clean_read, clean_len, ref_buffer_, ref_len, 0, band_width_);
            // read traceback
            // TODO: convert directly to cigar
            genstr::Alignment* al = taligner_.trace ();
            // convert alignment to cigar
            ref_shift = roll_cigar (roller, *al, clean_len, clips, qry_off, ref_off);
        case ContalignParams::PLAIN:
            // NOTE(review): the closing parentheses of align_band() and
            // backtrace() are missing in this excerpt.
            new_score = aligner_.align_band (
                clean_read,                     // xseq
                clean_len,                      // xlen
                ref_buffer_,                    // yseq
                ref_len,                        // ylen
                0,                              // xpos
                0,                              // ypos
                std::max (clean_len, ref_len),  // segment length
                qry_ins + band_width_,          // width_left
                false,                          // unpack
                ref_ins + band_width_,          // width_right - forces to width_left
                true,                           // to_beg
                true                            // to_end
            unsigned bno = aligner_.backtrace (
                    batches_,      // BATCH buffer
                    max_batch_no_, // size of BATCH buffer
                    false,         // fill the BATCH array in reverse direction
                    ref_ins + band_width_ // width
            // convert alignment to cigar
            ref_shift = roll_cigar (roller, batches_, bno, clean_len, clips, qry_off, ref_off);
        case ContalignParams::POLY:
            // Same call sequence as the PLAIN case, using contalign_.
            new_score = contalign_.align_band (
                clean_read,                     // xseq
                clean_len,                      // xlen
                ref_buffer_,                    // yseq
                ref_len,                        // ylen
                0,                              // xpos
                0,                              // ypos
                std::max (clean_len, ref_len),  // segment length
                qry_ins + band_width_,          // width_left
                false,                          // unpack
                ref_ins + band_width_,          // width_right - forces to width_left
                true,                           // to_beg
                true                            // to_end
            unsigned bno = contalign_.backtrace (
                    batches_,      // BATCH buffer
                    max_batch_no_, // size of BATCH buffer
                    false,         // fill the BATCH array in reverse direction
                    ref_ins + band_width_ // width
            // convert alignment to cigar
            ref_shift = roll_cigar (roller, batches_, bno, clean_len, clips, qry_off, ref_off);
    // compare original and new cigar (and location)
    if (ref_shift || !(*cigar_p == roller))
        // save original cigar and position for reporting
        std::string orig_cigar_str;
        rec_.getCigarInfo ()->getCigarString (orig_cigar_str);
        int32_t prior_pos = rec_.get0BasedPosition ();

        // replace cigar
        rec_.setCigar (roller);
        ++ modified_cnt_;
        // update pos_adjusted_cnt if position changed
        if (ref_shift != 0)
            // The shifted position must not become negative.
            myassert (prior_pos + ref_shift >= 0);
            rec_.set0BasedPosition (prior_pos + ref_shift);
            ++ pos_adjusted_cnt_;
        // Detailed report: both alignments are printed and re-scored.
        if (log_diff_)
            const unsigned MAX_BATCH_PRINTED = 100;
            BATCH batches [MAX_BATCH_PRINTED];
            std::string new_cigar_str;
            unsigned bno;
            int swscore;

            rec_.getCigarInfo ()->getCigarString (new_cigar_str);
            // Print the record header only if it was not already logged
            // above.
            if (!log_base_ && !log_matr_)
                logfile_ << "Record " << read_cnt_ << ": " << rec_.getReadName () << " (" << rec_.getReadLength () << " bases)\n";

            // Original alignment; the position is printed 1-based.
            logfile_ << "   ORIG ALIGNMENT:" << std::right << std::setw (9) << prior_pos+1 << "->" <<  orig_cigar_str << "\n";
            bno = cigar_to_batches (orig_cigar_str, batches, MAX_BATCH_PRINTED);
            swscore = align_score (batches, bno, clean_read, ref_buffer_, p_->gip (), p_->gep (), p_->mat (), p_->mis ());
            print_batches (clean_read, clean_len, false, ref_buffer_, ref_len, false, batches, bno, logfile_, false, prior_pos + clips.soft_beg_, clips.soft_beg_, 0, 160);
            logfile_ << "\n     'classic' SW score is " << swscore << "\n";

            // New alignment; sequences are offset by qry_off / ref_off as
            // set by roll_cigar().
            logfile_ << "   NEW ALIGNMENT:" << std::right << std::setw (9) << rec_.get1BasedPosition () << "->" <<  new_cigar_str << std::endl;
            bno = cigar_to_batches (new_cigar_str, batches, MAX_BATCH_PRINTED);
            swscore = align_score (batches, bno, clean_read + qry_off, ref_buffer_ + ref_off, p_->gip (), p_->gep (), p_->mat (), p_->mis ());
            print_batches (clean_read + qry_off, clean_len - qry_off, false, ref_buffer_ + ref_off, ref_len - ref_off, false, batches, bno, logfile_, false, prior_pos + clips.soft_beg_ + ref_off, clips.soft_beg_ + qry_off, 0, 160);
            logfile_ << "\n      'classic' SW score is " << swscore;
            logfile_ << "\n      alternate (context-aware) score is " << new_score << ", used bandwidth left: " << qry_ins + band_width_ << ", right: " << ref_ins + band_width_ << "\n" << std::endl;
        // Short report: old and new CIGAR strings only.
        else if (log_base_)
            logfile_ << "Recomputed alignment differs from original:\n";
            logfile_ << "   ORIG ALIGNMENT:" << std::right << std::setw (9) << prior_pos+1 << "->" <<  orig_cigar_str << "\n";
            std::string new_cigar_str;
            rec_.getCigarInfo ()->getCigarString (new_cigar_str);
            logfile_ << "    NEW ALIGNMENT:" << std::right << std::setw (9) << rec_.get1BasedPosition () << "->" <<  new_cigar_str << "\n" << std::endl;
        // NOTE(review): presumably the else branch of the "alignment
        // changed" test above (unchanged alignment) -- confirm.
        if (log_base_)
            logfile_ << "Recomputed alignment matches the original:\n";
            std::string orig_cigar_str;
            rec_.getCigarInfo ()->getCigarString (orig_cigar_str);
            int32_t prior_pos = rec_.get0BasedPosition ();
            logfile_ << "   " << std::right << std::setw (9) << prior_pos+1 << "->" <<  orig_cigar_str << "\n" << std::endl;
    return true;