예제 #1
0
void SAMOutput::SetAlignedSequence(T_AlignmentCandidate &alignment, T_Sequence &read,
        T_Sequence &alignedSeq,
        Clipping clipping) {
    //
    // Choose the window [windowStart, windowStart + windowLength) of
    // 'read' that will be emitted, depending on the clipping mode:
    //   none / hard : only the aligned part of the query,
    //   soft        : the read minus its low quality prefix and suffix,
    //   subread     : the subread interval of the read.
    //
    DNALength windowStart  = 0;
    DNALength windowLength = 0;

    const bool alignedPartOnly = (clipping == none) || (clipping == hard);
    if (alignedPartOnly) {
        const DNALength alignBegin = alignment.QAlignStart();
        const DNALength alignEnd   = alignment.QAlignEnd();
        windowStart  = alignBegin;
        windowLength = alignEnd - alignBegin;
    }
    else if (clipping == soft) {
        windowStart  = read.lowQualityPrefix;
        windowLength = read.length - read.lowQualityPrefix - read.lowQualitySuffix;
    }
    else if (clipping == subread) {
        windowStart  = read.subreadStart;
        windowLength = read.subreadEnd - read.subreadStart;
    }
    else {
        // Unknown clipping mode: report it and trip the assertion.  When
        // assertions are compiled out the window stays at (0, 0).
        std::cout <<" ERROR! The clipping must be none, hard, subread, or soft when setting the aligned sequence." << std::endl;
        assert(0);
    }

    //
    // Alignments to the other strand (tStrand != 0) are passed through
    // MakeRC, which fills alignedSeq from a temporary view of the window.
    // The assertion checks that alignedSeq is flagged deleteOnExit
    // afterwards.
    //
    if (alignment.tStrand != 0) {
        T_Sequence forwardWindow;
        forwardWindow.ReferenceSubstring(read, windowStart, windowLength);
        forwardWindow.MakeRC(alignedSeq);
        assert(alignedSeq.deleteOnExit);
        return;
    }

    // Strand 0: alignedSeq simply references the window inside 'read'.
    alignedSeq.ReferenceSubstring(read, windowStart, windowLength);
}
DNALength CompressedSequence<T_Sequence>::FourBitDecompressHomopolymers(int start, int end, 
        T_Sequence &decompSeq) {
    //
    // Each element of seq in [start, end) packs a run: the high four
    // bits hold the repeat count and the low four bits hold the index
    // into FourBitToAscii of the character to repeat.  The expanded
    // runs are written to freshly allocated storage in decompSeq, and
    // the expanded length is returned.
    //

    // Release whatever decompSeq held before.
    decompSeq.Free();

    const unsigned char codeMask = 0xf;
    DNALength pos;

    //
    // Pass 1: add up the run lengths to size the output buffer.
    //
    decompSeq.length = 0;
    for (pos = start; pos < end; ++pos) {
        const unsigned char runLength = static_cast<unsigned char>(seq[pos]) >> 4;
        decompSeq.length += runLength;
    }
    decompSeq.seq = ProtectedNew<Nucleotide>(decompSeq.length);

    //
    // Pass 2: write each run out character by character.
    //
    int writePos = 0;
    for (pos = start; pos < end; ++pos) {
        int remaining = static_cast<unsigned char>(seq[pos]) >> 4;
        while (remaining > 0) {
            decompSeq.seq[writePos] = FourBitToAscii[(seq[pos] & codeMask)];
            ++writePos;
            --remaining;
        }
    }

    decompSeq.bitsPerNuc   = 4;
    decompSeq.deleteOnExit = true;
    return decompSeq.length;
}
예제 #3
0
	//
	// Read the next CCS record, together with its unrolled read, into
	// ccsSequence.
	//
	// ccsSequence is freed first.  The number of passes for the current
	// read is loaded from numPassesArray:
	//   - numPasses > 0: the ccs bases are read through ccsBasReader and
	//     every per-pass array enabled in includedFields is read for the
	//     range [curPassPos, curPassPos + numPasses); curPassPos is then
	//     advanced by numPasses.
	//   - numPasses == 0: only ccsBasReader.curRead is advanced.
	// In both cases the unrolled read is then read through the
	// T_HDFBasReader<SMRTSequence> GetNext, its zmwData is copied, and
	// the ccs title is set to the unrolled read's title plus "/ccs".
	//
	// Returns 0 when curRead has reached ccsBasReader.nReads, when
	// curBasePos has reached ccsBasReader.nBases, or when either
	// underlying GetNext call returns 0; returns 1 otherwise.
	//
	// An H5::DataSetIException raised while reading is reported on cout
	// and terminates the process with exit(1).
	//
	int GetNext(T_Sequence &ccsSequence) {
		ccsSequence.Free();
		int retVal = 0;
		if (this->curRead == ccsBasReader.nReads) {
			return 0;
		}
		if (this->curBasePos == ccsBasReader.nBases) {
			return 0;
		}
		try {
			UInt numPasses;
			numPassesArray.Read(this->curRead, this->curRead+1, &numPasses);
			if (numPasses > 0) {
				// Read in the ccs bases
				if ((retVal = ccsBasReader.GetNext((SMRTSequence&)ccsSequence)) == 0)
					return 0;

				ccsSequence.numPasses = numPasses;

				//
				// Read in all ccs pass data that was requested.
				//
				if (this->includedFields["AdapterHitAfter"]) {
					ccsSequence.adapterHitAfter.resize(ccsSequence.numPasses);
					adapterHitAfterArray.Read(curPassPos,  curPassPos + ccsSequence.numPasses, &ccsSequence.adapterHitAfter[0]);
				}
				if (this->includedFields["AdapterHitBefore"]) {
					ccsSequence.adapterHitBefore.resize(ccsSequence.numPasses);
					adapterHitBeforeArray.Read(curPassPos, curPassPos + ccsSequence.numPasses, &ccsSequence.adapterHitBefore[0]);
				}
				if (this->includedFields["PassDirection"]) {
					ccsSequence.passDirection.resize(ccsSequence.numPasses);
					passDirectionArray.Read(curPassPos,    curPassPos + ccsSequence.numPasses, &ccsSequence.passDirection[0]);
				}
				if (this->includedFields["PassNumBases"]) {
					ccsSequence.passNumBases.resize(ccsSequence.numPasses);
					passNumBasesArray.Read(curPassPos,     curPassPos + ccsSequence.numPasses, &ccsSequence.passNumBases[0]);
				}
				if (this->includedFields["PassStartBase"]) {
					ccsSequence.passStartBase.resize(ccsSequence.numPasses);
					passStartBaseArray.Read(curPassPos,    curPassPos + ccsSequence.numPasses, &ccsSequence.passStartBase[0]);
				}
				if (this->includedFields["PassStartPulse"]) {
					ccsSequence.passStartPulse.resize(ccsSequence.numPasses);
					passStartPulseArray.Read(curPassPos,   curPassPos + ccsSequence.numPasses, &ccsSequence.passStartPulse[0]);
				}
				if (this->includedFields["PassNumPulses"]) {
					ccsSequence.passNumPulses.resize(ccsSequence.numPasses);
					passNumPulsesArray.Read(curPassPos,    curPassPos + ccsSequence.numPasses, &ccsSequence.passNumPulses[0]);
				}
				curPassPos += ccsSequence.numPasses;
			}
			else {
				// advance a read in the ccs sequence without advancing positions.
				ccsBasReader.curRead++;
			}
			//
			// Regardless whether or not a ccs read was called, read the next
			// unrolled read, since an unrolled read is called for each zmw.
			//
			retVal = ((T_HDFBasReader<SMRTSequence>*)this)->GetNext(ccsSequence.unrolledRead);
			ccsSequence.zmwData = ccsSequence.unrolledRead.zmwData;
			ccsSequence.CopyTitle(ccsSequence.unrolledRead.title);
			string newTitle = string(ccsSequence.title) + string("/ccs");
			ccsSequence.CopyTitle(newTitle.c_str());
		} catch (const H5::DataSetIException &) {
			// Caught by const reference: no copy of the exception object is
			// made, and the handler does not need to inspect it.
			// NOTE(review): if the exception is raised before the unrolled
			// read has been loaded, unrolledRead.title may not be set yet --
			// confirm that it is always safe to print here.
			cout << "ERROR, could not read ccs data for CCS Sequence " 
			     << ccsSequence.unrolledRead.title << endl; 
			exit(1);
		}
		// Normalise the status of the unrolled-read GetNext to 0 / 1.
		return (retVal == 0) ? 0 : 1;
	}
예제 #4
0
void SAMOutput::PrintAlignment(T_AlignmentCandidate &alignment,
        T_Sequence &read,
        std::ostream &samFile,
        AlignmentContext &context,
        SupplementalQVList & qvList,
        Clipping clipping,
        bool cigarUseSeqMatch) {
    //
    // Writes one tab separated SAM record for 'alignment' to samFile:
    // the eleven leading columns, the optional tags, any optional
    // quality value fields selected in qvList, and a final std::endl.
    //
    std::string cigar;
    uint16_t samFlag;
    T_Sequence outputSeq;
    DNALength softClipFront = 0, softClipBack = 0;
    DNALength hardClipFront = 0, hardClipBack = 0;

    CreateCIGARString(alignment, read, cigar, clipping, softClipFront, softClipBack, hardClipFront, hardClipBack, cigarUseSeqMatch);
    SetAlignedSequence(alignment, read, outputSeq, clipping);
    BuildFlag(alignment, context, samFlag);

    samFile << alignment.qName << "\t";  // QNAME
    samFile << samFlag << "\t";          // FLAG
    samFile << alignment.tName << "\t";  // RNAME

    // POS, 1-based.
    if (alignment.tStrand != 0) {
        // includes - 1 for rev-comp,  +1 for one-based
        samFile << alignment.tLength - (alignment.TAlignStart() + alignment.TEnd()) + 1 << "\t";
    }
    else {
        // add 1 to get 1 based coordinate system
        samFile << alignment.TAlignStart() + 1 << "\t";
    }

    samFile << static_cast<int>(alignment.mapQV) << "\t";  // MAPQ
    samFile << cigar << "\t";                              // CIGAR

    //
    // Mate information is not reported: RNEXT is always "*" and PNEXT
    // is always 0 (context.rNext / context.nextSubreadPos are not used).
    //
    const std::string mateReference("*");
    samFile << mateReference << "\t";  // RNEXT

    const DNALength matePosition = 0;
    samFile << matePosition << "\t";   // PNEXT

    // SAM v1.5, tLen is set as 0 for single-segment template
    samFile << 0 << "\t";  // TLEN

    // SEQ is printed on one line (line length argument 0).
    static_cast<DNASequence&>(outputSeq).PrintSeq(samFile, 0);
    samFile << "\t";

    // QUAL: printed only when quality data exists and qvList.useqv is 0,
    // otherwise "*".
    const bool printQual = (outputSeq.qual.data != NULL) && (qvList.useqv == 0);
    if (!printQual) {
        samFile <<"*";
    }
    else {
        outputSeq.PrintAsciiQual(samFile, 0);
    }
    samFile << "\t";

    //
    // Optional fields:
    // "RG" read group Id
    // "AS" alignment score
    // "XS" read alignment start position without counting previous soft clips (1 based)
    // "XE" read alignment end position without counting previous soft clips (1 based)
    // "XL" aligned read length
    // "XQ" query sequence length
    // "XT" # of continues reads, always 1 for blasr
    // "NM" edit distance
    // "FI" read alignment start position (1 based)
    //
    samFile << "RG:Z:" << context.readGroupId << "\t"
            << "AS:i:" << alignment.score << "\t";

    const DNALength queryAlignBegin = alignment.QAlignStart();
    const DNALength queryAlignEnd   = alignment.QAlignEnd();

    if (clipping == none) {
        samFile << "XS:i:" << queryAlignBegin + 1 << "\t";
        samFile << "XE:i:" << queryAlignEnd + 1 << "\t";
    }
    else if (clipping == hard || clipping == soft || clipping == subread) {
        // On strand 1 the prefix and suffix hard clips trade places.
        const bool onStrandOne = (alignment.tStrand == 1);
        const DNALength xs = onStrandOne ? hardClipBack : hardClipFront;
        const DNALength xe = read.length - (onStrandOne ? hardClipFront : hardClipBack);

        samFile << "XS:i:" << xs + 1 << "\t"; // add 1 for 1-based indexing in sam
        assert(read.length - hardClipBack == hardClipFront + outputSeq.length);
        samFile << "XE:i:" << xe + 1 << "\t";
    }

    // XT is always 1: reads are always continuous reads, not reference
    // based circular consensus, when output by blasr.
    samFile << "YS:i:" << read.subreadStart << "\t"
            << "YE:i:" << read.subreadEnd << "\t"
            << "ZM:i:" << read.zmwData.holeNumber << "\t"
            << "XL:i:" << alignment.qAlignedSeq.length << "\t"
            << "XT:i:1\t"
            << "NM:i:" << context.editDist << "\t"
            << "FI:i:" << alignment.qAlignedSeqPos + 1
            << "\t" << "XQ:i:" << alignment.qLength;

    //
    // Optional quality values.  If qvList has no qv's signaled to
    // print, this is a no-op.  Characters that are too large are first
    // transformed to printable ones.
    //
    qvList.FormatQVOptionalFields(outputSeq);
    qvList.PrintQVOptionalFields(outputSeq, samFile);

    samFile << std::endl;
}
//
// For each searched start position p inside the read's subread, query the
// suffix array with the read starting at p and record the resulting
// half-open suffix-array interval and match length.
//
// Parameters:
//   reference, sa         sequence and suffix array handed to
//                         sa.StoreLCPBounds; sa.index maps suffix-array
//                         slots to reference positions.
//   read                  query; only [SubreadStart(), SubreadEnd()) is
//                         searched.
//   minPrefixMatchLength  reads whose SubreadLength() is below this are not
//                         searched; it is also added to the chosen search
//                         depth to form the reported match length.
//   matchLow, matchHigh,  outputs, resized to the number of searched
//   matchLength           positions and indexed by p - SubreadStart().
//                         Entries are 0 where nothing was found; the
//                         interval is [matchLow, matchHigh).
//   params                search options (lookup table, max LCP length,
//                         expand, advanceExactMatches, debug stream).
//
// Returns 0 when the subread is shorter than minPrefixMatchLength (the
// output vectors are then left untouched), 1 otherwise.
//
// NOTE(review): with minPrefixMatchLength == 0 the last searched position
// equals SubreadEnd() -- confirm that read.seq is valid there.
//
int LocateAnchorBoundsInSuffixArray(T_RefSequence &reference,
	T_SuffixArray &sa, T_Sequence &read, unsigned int minPrefixMatchLength,
	std::vector<DNALength> &matchLow, std::vector<DNALength> &matchHigh,
	std::vector<DNALength> &matchLength, AnchorParameters &params) {

    //
    // Make sure there is enough of this read to map.  Since searches
    // are keyed off of 'minPrefixMatchLength' matches, don't search
    // anything shorter than that.
    //
    if (minPrefixMatchLength > 0 and 
        read.SubreadLength() < minPrefixMatchLength) {
        return 0;
    }

    DNALength p, m;
    DNALength matchEnd = read.SubreadEnd() - minPrefixMatchLength + 1;
    DNALength numSearchedPositions = matchEnd - read.SubreadStart();

    // resize() keeps existing elements, so clear them explicitly.
    matchLength.resize(numSearchedPositions);
    matchLow.resize(numSearchedPositions);
    matchHigh.resize(numSearchedPositions);

    std::fill(matchLength.begin(), matchLength.end(), 0);
    std::fill(matchLow.begin(), matchLow.end(), 0);
    std::fill(matchHigh.begin(), matchHigh.end(), 0);
    vector<SAIndex> lowMatchBound, highMatchBound;	

    for (m = 0, p = read.SubreadStart(); p < matchEnd; p++, m++) {
        lowMatchBound.clear(); highMatchBound.clear();
        DNALength lcpLength = sa.StoreLCPBounds(reference.seq, reference.length, 
            &read.seq[p], matchEnd - p,
            params.useLookupTable,
            params.maxLCPLength,
            //
            // Store the positions in the SA
            // that are searched.
            //
            lowMatchBound, highMatchBound, 
            params.stopMappingOnceUnique);

        //
        // Possibly print the lcp bounds for debugging
        //
        if (params.lcpBoundsOutPtr != NULL) {
            for (size_t i = 0; i < lowMatchBound.size(); i++) {
                *params.lcpBoundsOutPtr << 
                    (highMatchBound[i] - lowMatchBound[i]);
                if (i < lowMatchBound.size() - 1) {
                    *params.lcpBoundsOutPtr << " ";
                }  
            }
            *params.lcpBoundsOutPtr << endl;
        }

        //
        // Default to no match.  Every path below that finds nothing
        // leaves these values in place.
        //
        matchLow[m] = matchHigh[m] = matchLength[m] = 0;

        //
        // If anything was found in the suffix array:
        //
        if (lowMatchBound.size() > 0) {
            //
            // Back the search depth off while the interval at that depth
            // is empty (low == high), i.e. until at least one match is
            // found.
            //
            int lcpSearchLength = lowMatchBound.size();
            while (lcpSearchLength > 0 and 
                    lowMatchBound[lcpSearchLength - 1] == 
                    highMatchBound[lcpSearchLength - 1]) {
                lcpSearchLength--;
                lcpLength--;
            }

            //
            // If every stored interval was empty there is no depth left
            // to index (lcpSearchLength - 1 would be -1), so keep the
            // "no match" defaults for this position.
            //
            if (lcpSearchLength > 0) {
                matchLow[m]  = lowMatchBound[lcpSearchLength - 1];
                matchHigh[m] = highMatchBound[lcpSearchLength - 1];
                matchLength[m] = minPrefixMatchLength + lcpSearchLength;

                //
                // Next, apply some heuristics to the anchor generation.
                //
                // 1.1 If the suffix array match is unique, try and extend that
                // match as long as possible to ease global chaining later on.  
                //
                // 1.2 If the suffix array match is unique, but cannot be
                // extended, it probably ends in an error.  Back the search up
                // by 1.
                //
                // 2.1 If the suffix array match is not unique, return the
                // default matches, or expand the search to include more
                // matches. 
                //

                //
                // Check to see if the match was unique.
                //
                if (matchLow[m] + 1 == matchHigh[m]) {
                    //
                    // If the match is unique, extend for as long as possible.
                    //
                    lcpLength = minPrefixMatchLength + lcpSearchLength;
                    long refPos    = sa.index[matchLow[m]] + lcpLength;
                    long queryPos  = p + lcpLength;
                    bool extensionWasPossible = false;

                    while (refPos + 1 < reference.length and
                           queryPos + 1 < read.length and
                           reference.seq[refPos + 1] == read.seq[queryPos + 1] and 
                           (params.maxLCPLength == 0 or 
                            lcpLength < static_cast<DNALength>(params.maxLCPLength))) {
                        refPos++;
                        queryPos++;
                        lcpLength++;
                        extensionWasPossible = true;
                    }

                    if (extensionWasPossible) {
                        //
                        // Was able to extend match far into the genome, store that.
                        //
                        matchLength[m] = lcpLength;
                    }
                    else {
                        //
                        // No extension was possible, indicating that this match
                        // ends at an error.  To be safe, expand search by up to
                        // 1.
                        //
                        if (lcpSearchLength > 1) {
                            lcpSearchLength = lcpSearchLength - 1;
                        }
                        matchLow[m]  = lowMatchBound[lcpSearchLength-1];
                        matchHigh[m] = highMatchBound[lcpSearchLength-1];
                        matchLength[m] = minPrefixMatchLength + lcpSearchLength;
                    }
                }
                else {
                    //
                    // The match is not unique.  Store a possibly expanded search.
                    // 
                    if (lcpSearchLength > params.expand) {
                        lcpSearchLength -= params.expand;
                    }
                    else {
                        assert(lowMatchBound.size() > 0);
                        lcpSearchLength = 1;
                    }

                    //
                    // There are multiple matches for this position.
                    //
                    matchLow[m]    = lowMatchBound[lcpSearchLength - 1];
                    matchHigh[m]   = highMatchBound[lcpSearchLength - 1];
                    matchLength[m] = minPrefixMatchLength + lcpSearchLength;
                }
            }
        }

        //
        // Possibly advance a bunch of steps.  The loop increment still
        // adds one more step on top of 'advance'.
        //
        if (params.advanceExactMatches) {
            int tmp = (int)lcpLength - (int)params.expand
                      - params.advanceExactMatches;
            int advance = MAX(tmp, 0);
            p += advance;
            m += advance;
        }
    }
    return 1;
}
//
// Collect suffix-array anchors for the subread of 'read' and append them
// to matchPosList.
//
// Parameters:
//   reference, sa         reference sequence and its suffix array.
//   read                  query; positions in [SubreadStart(),
//                         SubreadEnd() - trim) are turned into anchors,
//                         where trim is the larger of minMatchLength + 1
//                         and sa.lookupPrefixLength + 1.
//   minPrefixMatchLength  forwarded to LocateAnchorBoundsInSuffixArray.
//   matchPosList          output; one ChainedMatchPos is appended per
//                         suffix-array hit.  It is cleared only when the
//                         subread is shorter than
//                         anchorParameters.minMatchLength.
//   anchorParameters      minMatchLength, removeEncompassedMatches,
//                         maxAnchorsPerPosition and the search options
//                         used by LocateAnchorBoundsInSuffixArray.
//
// Returns 0 when the subread is shorter than minMatchLength, otherwise
// matchPosList.size() after appending.
//
int MapReadToGenome(T_RefSequence &reference,
    T_SuffixArray &sa, T_Sequence &read, 
    unsigned int minPrefixMatchLength,
    vector<T_MatchPos> &matchPosList,
    AnchorParameters &anchorParameters) {

    vector<DNALength> matchLow, matchHigh, matchLength;

    DNALength minMatchLen = anchorParameters.minMatchLength;
    if (read.SubreadLength() < minMatchLen) {
        matchPosList.clear();
        return 0;
    }

    //
    // A zero return means the subread was too short to be searched and
    // the bound vectors were left empty; there is nothing to index, so
    // no anchors can be added.
    //
    if (LocateAnchorBoundsInSuffixArray(reference, sa, read, 
            minPrefixMatchLength, matchLow, matchHigh, matchLength,
            anchorParameters) == 0) {
        return matchPosList.size();
    }

    DNALength pos;
    assert(matchLow.size() == matchHigh.size());

    //
    // Do some filtering on the matches looking for overlapping matches
    // if there are any.  A position whose match is exactly one shorter
    // than the match at the previous position is dropped.
    //
    if (anchorParameters.removeEncompassedMatches) {
        //
        // The match vectors hold one entry per searched position, which
        // can be fewer than read.length, so iterate over their own size.
        //
        const size_t numPositions = matchLength.size();
        vector<bool> removed(numPositions, false);
        size_t i;
        //
        // Mark first and clear afterwards: clearing while scanning would
        // change the lengths that later comparisons look at.
        //
        for (i = 0; i + 1 < numPositions; i++) {
            if (matchLength[i] == matchLength[i+1]+1) {
                removed[i+1] = true;
            }
        }
        for (i = 1; i < numPositions; i++) {
            if (removed[i]) {
                matchLength[i] = matchLow[i] = matchHigh[i] = 0;
            }
        }
    }

    //
    // Now add the anchors, stopping 'trim' positions before the end of
    // the subread.
    // 
    DNALength endOfMapping;
    DNALength trim = MAX(minMatchLen + 1, sa.lookupPrefixLength + 1);
    if (read.SubreadEnd() < trim) {
        endOfMapping = 0;
    }
    else {
        endOfMapping = read.SubreadEnd() - trim;
    }

    for (pos = read.SubreadStart(); pos < endOfMapping; pos++) {
        size_t matchIndex = pos - read.SubreadStart();
        assert(matchIndex < matchHigh.size());
        // Skip positions with more hits than maxAnchorsPerPosition.
        if (matchHigh[matchIndex] - matchLow[matchIndex] <= 
            anchorParameters.maxAnchorsPerPosition) {
            DNALength mp;
            for (mp = matchLow[matchIndex]; mp < matchHigh[matchIndex]; mp++) {
                //
                // Checked on every iteration because the trimming below
                // may shorten the match for the remaining hits.
                //
                if (matchLength[matchIndex] < minMatchLen) {
                    continue;
                }

                //
                // By default, add all anchors.
                //
                if (matchLength[matchIndex] + pos > read.length) {
                    //
                    // When doing branching, it's possible that a deletion
                    // branch finds an anchor that goes past the end of a
                    // read.  When that is the case, trim back the anchor
                    // match since this confuses downstream assertions.
                    //
                    matchLength[matchIndex] = read.length - pos;
                }
                assert(sa.index[mp] + matchLength[matchIndex] 
                    <= reference.length);

                matchPosList.push_back(ChainedMatchPos(sa.index[mp], pos,
                    matchLength[matchIndex], 
                    matchHigh[matchIndex] - matchLow[matchIndex]));
            }
        }
    }

    return matchPosList.size();
}
예제 #7
0
void SAMOutput::CreateCIGARString(T_AlignmentCandidate &alignment,
        T_Sequence &read,
        std::string &cigarString,
        Clipping clipping,
        DNALength & prefixSoftClip, DNALength & suffixSoftClip, 
        DNALength & prefixHardClip, DNALength & suffixHardClip,
        bool cigarUseSeqMatch, const bool allowAdjacentIndels) {
    //
    // Builds the CIGAR string for 'alignment' and reports the clip
    // lengths that were applied through the four reference parameters.
    // With any clipping mode other than hard, soft or subread the clip
    // parameters are left untouched and only the core operations are
    // emitted.
    //
    cigarString = "";

    // Every mode starts from the operations of the unclipped alignment.
    std::vector<int>  lengths;
    std::vector<char> codes;
    CreateNoClippingCigarOps(alignment, lengths, codes, cigarUseSeqMatch, allowAdjacentIndels);

    if (clipping == hard) {
        // Hard clipping only: wrap the core operations in H operations.
        SetHardClip(alignment, read, prefixHardClip, suffixHardClip);
        if (prefixHardClip > 0) {
            lengths.insert(lengths.begin(), prefixHardClip);
            codes.insert(codes.begin(), 'H');
        }
        if (suffixHardClip > 0) {
            lengths.push_back(suffixHardClip);
            codes.push_back('H');
        }
        prefixSoftClip = 0;
        suffixSoftClip = 0;
    }
    else if (clipping == soft || clipping == subread) {
        //
        // Even if clipping is soft, the hard clipping removes the low
        // quality regions.  For subread clipping the hard clip is the
        // larger of the low quality region and the part of the read
        // outside the subread.
        //
        const bool softOnly = (clipping == soft);
        if (softOnly) {
            prefixHardClip = read.lowQualityPrefix;
            suffixHardClip = read.lowQualitySuffix;
        }
        else {
            prefixHardClip = std::max(static_cast<DNALength>(read.SubreadStart()), read.lowQualityPrefix);
            suffixHardClip = std::max(static_cast<DNALength>(read.length - read.SubreadEnd()), read.lowQualitySuffix);
        }

        SetSoftClip(alignment, read, prefixHardClip, suffixHardClip, prefixSoftClip, suffixSoftClip);

        // On strand 1 the prefix and suffix clips trade places.
        if (alignment.tStrand == 1) {
            std::swap(prefixHardClip, suffixHardClip);
            std::swap(prefixSoftClip, suffixSoftClip);
        }

        //
        // Trailing clips are appended in the order S then H.
        //
        if (suffixSoftClip > 0) {
            lengths.push_back(suffixSoftClip);
            codes.push_back('S');
        }
        if (suffixHardClip > 0) {
            lengths.push_back(suffixHardClip);
            codes.push_back('H');
        }

        //
        // Leading clips are pushed onto the front, S first and then H,
        // so that they read H then S when both exist.
        //
        if (prefixSoftClip > 0) {
            lengths.insert(lengths.begin(), prefixSoftClip);
            codes.insert(codes.begin(), 'S');
        }
        if (prefixHardClip > 0) {
            lengths.insert(lengths.begin(), prefixHardClip);
            codes.insert(codes.begin(), 'H');
        }
    }

    CigarOpsToString(lengths, codes, cigarString);
}