int LocateAnchorBoundsInSuffixArray(T_RefSequence &reference, T_SuffixArray &sa, T_Sequence &read, unsigned int minPrefixMatchLength, std::vector<DNALength> &matchLow, std::vector<DNALength> &matchHigh, std::vector<DNALength> &matchLength, AnchorParameters ¶ms) { // // Make sure there is enough of this read to map. Since searches // are keyed off of 'minPrefixMatchLength' matches, don't search // anything shorter than that. // if (minPrefixMatchLength > 0 and read.SubreadLength() < minPrefixMatchLength) { return 0; } DNALength p, m; DNALength matchEnd = read.SubreadEnd() - minPrefixMatchLength + 1; DNALength numSearchedPositions = matchEnd - read.SubreadStart(); matchLength.resize(numSearchedPositions); matchLow.resize(numSearchedPositions); matchHigh.resize(numSearchedPositions); std::fill(matchLength.begin(), matchLength.end(), 0); std::fill(matchLow.begin(), matchLow.end(), 0); std::fill(matchHigh.begin(), matchHigh.end(), 0); vector<SAIndex> lowMatchBound, highMatchBound; for (m = 0, p = read.SubreadStart(); p < matchEnd; p++, m++) { lowMatchBound.clear(); highMatchBound.clear(); DNALength lcpLength = sa.StoreLCPBounds(reference.seq, reference.length, &read.seq[p], matchEnd - p, params.useLookupTable, params.maxLCPLength, // // Store the positions in the SA // that are searched. // lowMatchBound, highMatchBound, params.stopMappingOnceUnique); // // Possibly print the lcp bounds for debugging // if (params.lcpBoundsOutPtr != NULL) { for (size_t i = 0; i < lowMatchBound.size(); i++) { *params.lcpBoundsOutPtr << (highMatchBound[i] - lowMatchBound[i]); if (i < lowMatchBound.size() - 1) { *params.lcpBoundsOutPtr << " "; } } *params.lcpBoundsOutPtr << endl; } // // Default to no match. // matchLow[m] = matchHigh[m] = matchLength[m] = 0; // // If anything was found in the suffix array: // if (lowMatchBound.size() > 0) { // // First expand the search bounds until at least // one match is found. // int lcpSearchLength = lowMatchBound.size(); while (lcpSearchLength > 0 and lowMatchBound[lcpSearchLength - 1] == highMatchBound[lcpSearchLength - 1]) { lcpSearchLength--; lcpLength--; } matchLow[m] = lowMatchBound[lcpSearchLength - 1]; matchHigh[m] = highMatchBound[lcpSearchLength - 1]; matchLength[m] = minPrefixMatchLength + lcpSearchLength; // // Next, apply some heuristics to the anchor generation. // // 1.1 If the suffix array match is unique, try and extend that // match as long as possible to ease global chaining later on. // // 1.2 If the suffix array match is unique, but cannot be // extended, it probably ends in an error. Back the search up // by 1. // // 2.1 If the suffix array match is not unique, return the // default matches, or expand the search to include more // matches. // // // Check to see if the match was unique. // if (matchLow[m] + 1 == matchHigh[m]) { // // If the match is unique, extend for as long as possible. // lcpLength = minPrefixMatchLength + lcpSearchLength; long refPos = sa.index[matchLow[m]] + lcpLength; long queryPos = p + lcpLength; bool extensionWasPossible = false; while (refPos + 1 < reference.length and queryPos + 1 < read.length and reference.seq[refPos + 1] == read.seq[queryPos + 1] and (params.maxLCPLength == 0 or lcpLength < static_cast<DNALength>(params.maxLCPLength))) { refPos++; queryPos++; lcpLength++; extensionWasPossible = true; } if (extensionWasPossible) { // // Was able to extend match far into the genome, store that. // matchLength[m] = lcpLength; } else if (extensionWasPossible == false) { // // No extension was possible, indicating that this match // ends at an error. To be safe, expand search by up to // 1. // if (lcpSearchLength > 1) { lcpSearchLength = lcpSearchLength - 1; } matchLow[m] = lowMatchBound[lcpSearchLength-1]; matchHigh[m] = highMatchBound[lcpSearchLength-1]; matchLength[m] = minPrefixMatchLength + lcpSearchLength; } } else { // // The match is not unique. Store a possibly expanded search. // if (lcpSearchLength > params.expand) { lcpSearchLength -= params.expand; } else { assert(lowMatchBound.size() > 0); lcpSearchLength = 1; } // // There are multiple matches for this position. // matchLow[m] = lowMatchBound[lcpSearchLength - 1]; matchHigh[m] = highMatchBound[lcpSearchLength - 1]; matchLength[m] = minPrefixMatchLength + lcpSearchLength; } } else { // // The match is shorter than what the search is supposed to // expand to. In order to avoid expanding to before the end // of the match list, do not set any match. // matchLow[m] = 0; matchHigh[m] = 0; matchLength[m] = 0; } // // Possibly advance a bunch of steps. // if (params.advanceExactMatches) { int tmp = (int)lcpLength - (int)params.expand - params.advanceExactMatches; int advance = MAX(tmp, 0); p += advance; m += advance; } } return 1; }
void SAMOutput::CreateCIGARString(T_AlignmentCandidate &alignment, T_Sequence &read, std::string &cigarString, Clipping clipping, DNALength & prefixSoftClip, DNALength & suffixSoftClip, DNALength & prefixHardClip, DNALength & suffixHardClip, bool cigarUseSeqMatch, const bool allowAdjacentIndels) { cigarString = ""; // All cigarString use the no clipping core std::vector<int> opSize; std::vector<char> opChar; CreateNoClippingCigarOps(alignment, opSize, opChar, cigarUseSeqMatch, allowAdjacentIndels); // Clipping needs to be added if (clipping == hard) { SetHardClip(alignment, read, prefixHardClip, suffixHardClip); if (prefixHardClip > 0) { opSize.insert(opSize.begin(), prefixHardClip); opChar.insert(opChar.begin(), 'H'); } if (suffixHardClip > 0) { opSize.push_back(suffixHardClip); opChar.push_back('H'); } prefixSoftClip = 0; suffixSoftClip = 0; } if (clipping == soft or clipping == subread) { // // Even if clipping is soft, the hard clipping removes the // low quality regions // if (clipping == soft) { prefixHardClip = read.lowQualityPrefix; suffixHardClip = read.lowQualitySuffix; } else if (clipping == subread) { prefixHardClip = std::max((DNALength) read.SubreadStart(), read.lowQualityPrefix); suffixHardClip = std::max((DNALength)(read.length - read.SubreadEnd()), read.lowQualitySuffix); } SetSoftClip(alignment, read, prefixHardClip, suffixHardClip, prefixSoftClip, suffixSoftClip); if (alignment.tStrand == 1) { std::swap(prefixHardClip, suffixHardClip); std::swap(prefixSoftClip, suffixSoftClip); } // // Insert the hard and soft clipping so that they are in the // order H then S if both exist. // if (prefixSoftClip > 0) { opSize.insert(opSize.begin(), prefixSoftClip); opChar.insert(opChar.begin(), 'S'); } if (prefixHardClip > 0) { opSize.insert(opSize.begin(), prefixHardClip); opChar.insert(opChar.begin(), 'H'); } // // Append the hard and soft clipping so they are in the order S // then H. // if (suffixSoftClip > 0) { opSize.push_back(suffixSoftClip); opChar.push_back('S'); } if (suffixHardClip > 0) { opSize.push_back(suffixHardClip); opChar.push_back('H'); } } CigarOpsToString(opSize, opChar, cigarString); }
int MapReadToGenome(T_RefSequence &reference, T_SuffixArray &sa, T_Sequence &read, unsigned int minPrefixMatchLength, vector<T_MatchPos> &matchPosList, AnchorParameters &anchorParameters) { vector<DNALength> matchLow, matchHigh, matchLength; DNALength minMatchLen = anchorParameters.minMatchLength; if (read.SubreadLength() < minMatchLen) { matchPosList.clear(); return 0; } LocateAnchorBoundsInSuffixArray(reference, sa, read, minPrefixMatchLength, matchLow, matchHigh, matchLength, anchorParameters); // // Try evaluating some contexts. // DNALength pos; assert(matchLow.size() == matchHigh.size()); DNASequence evalQrySeq, evalRefSeq; vector<Arrow> pathMat; vector<int> scoreMat; Alignment alignment; // // Do some filtering on the matches looking for overlapping matches // if there are any. // if (anchorParameters.removeEncompassedMatches) { vector<bool> removed; removed.resize(read.length); std::fill(removed.begin(), removed.end(), false); size_t i; for (i = 0; i < read.length-1; i++) { if (matchLength[i] == matchLength[i+1]+1) { removed[i+1] = true; } } for (i = 1; i < matchLength.size(); i++) { if (removed[i]) { matchLength[i] = matchLow[i] = matchHigh[i] = 0; } } } // // Now add // DNALength endOfMapping; DNALength trim = MAX(minMatchLen + 1, sa.lookupPrefixLength + 1); if (read.SubreadEnd() < trim) { endOfMapping = 0; } else { endOfMapping = read.SubreadEnd() - trim; } for (pos = read.SubreadStart(); pos < endOfMapping; pos++) { size_t matchIndex = pos - read.SubreadStart(); assert(matchIndex < matchHigh.size()); if (matchHigh[matchIndex] - matchLow[matchIndex] <= anchorParameters.maxAnchorsPerPosition) { DNALength mp; for (mp = matchLow[matchIndex]; mp < matchHigh[matchIndex]; mp++) { if (matchLength[matchIndex] < minMatchLen) { continue; } // // By default, add all anchors. // if (matchLength[matchIndex] + pos > read.length) { // // When doing branching, it's possible that a deletion // branch finds an anchor that goes past the end of a // read. When that is the case, trim back the anchor // match since this confuses downstream assertions. // matchLength[matchIndex] = read.length - pos; } assert(sa.index[mp] + matchLength[matchIndex] <= reference.length); matchPosList.push_back(ChainedMatchPos(sa.index[mp], pos, matchLength[matchIndex], matchHigh[matchIndex] - matchLow[matchIndex])); } } } return matchPosList.size(); }
void SAMOutput::PrintAlignment(T_AlignmentCandidate &alignment, T_Sequence &read, std::ostream &samFile, AlignmentContext &context, SupplementalQVList & qvList, Clipping clipping, bool cigarUseSeqMatch, const bool allowAdjacentIndels) { std::string cigarString; uint16_t flag; T_Sequence alignedSequence; DNALength prefixSoftClip = 0, suffixSoftClip = 0; DNALength prefixHardClip = 0, suffixHardClip = 0; CreateCIGARString(alignment, read, cigarString, clipping, prefixSoftClip, suffixSoftClip, prefixHardClip, suffixHardClip, cigarUseSeqMatch, allowAdjacentIndels); SetAlignedSequence(alignment, read, alignedSequence, clipping); BuildFlag(alignment, context, flag); samFile << alignment.qName << "\t" << flag << "\t" << alignment.tName << "\t"; // RNAME if (alignment.tStrand == 0) { samFile << alignment.TAlignStart() + 1 << "\t"; // POS, add 1 to get 1 based coordinate system } else { samFile << alignment.tLength - (alignment.TAlignStart() + alignment.TEnd()) + 1 << "\t"; // includes - 1 for rev-comp, +1 for one-based } samFile << (int) alignment.mapQV << "\t"// MAPQ << cigarString << "\t"; // CIGAR // // Determine RNEXT std::string rNext; rNext = "*"; /* if (context.hasNextSubreadPos == false) { rNext = "*"; } else { if (context.rNext == alignment.tName) { rNext = "="; } else { rNext = context.rNext; } } */ samFile << rNext << "\t"; // RNEXT DNALength nextSubreadPos = 0; /* if (context.hasNextSubreadPos) { nextSubreadPos = context.nextSubreadPos + 1; }*/ samFile << nextSubreadPos << "\t"; // RNEXT, add 1 for 1 based // indexing //DNALength tLen = alignment.GenomicTEnd() - alignment.GenomicTBegin(); //SAM v1.5, tLen is set as 0 for single-segment template samFile << 0 << "\t"; // TLEN // Print the sequence on one line, and suppress printing the // newline (by setting the line length to alignedSequence.length (static_cast<DNASequence*>(&alignedSequence))->PrintSeq(samFile, 0); // SEQ samFile << "\t"; if (alignedSequence.qual.data != NULL && qvList.useqv == 0) { alignedSequence.PrintAsciiQual(samFile, 0); // QUAL } else { samFile <<"*"; } samFile << "\t"; // // Add optional fields // samFile << "RG:Z:" << context.readGroupId << "\t"; samFile << "AS:i:" << alignment.score << "\t"; // // "RG" read group Id // "AS" alignment score // "XS" read alignment start position without counting previous soft clips (1 based) // "XE" read alignment end position without counting previous soft clips (1 based) // "XL" aligned read length // "XQ" query sequence length // "XT" # of continues reads, always 1 for blasr // "NM" edit distance // "FI" read alignment start position (1 based) // DNALength qAlignStart = alignment.QAlignStart(); DNALength qAlignEnd = alignment.QAlignEnd(); if (clipping == none) { samFile << "XS:i:" << qAlignStart + 1 << "\t"; samFile << "XE:i:" << qAlignEnd + 1 << "\t"; } else if (clipping == hard or clipping == soft or clipping == subread) { DNALength xs = prefixHardClip; DNALength xe = read.length - suffixHardClip; if (alignment.tStrand == 1) { xs = suffixHardClip; xe = read.length - prefixHardClip; } samFile << "XS:i:" << xs + 1 << "\t"; // add 1 for 1-based indexing in sam assert(read.length - suffixHardClip == prefixHardClip + alignedSequence.length); samFile << "XE:i:" << xe + 1 << "\t"; } samFile << "YS:i:" << read.SubreadStart() << "\t"; samFile << "YE:i:" << read.SubreadEnd() << "\t"; samFile << "ZM:i:" << read.HoleNumber() << "\t"; samFile << "XL:i:" << alignment.qAlignedSeq.length << "\t"; samFile << "XT:i:1\t"; // reads are allways continuous reads, not // referenced based circular consensus when // output by blasr. samFile << "NM:i:" << context.editDist << "\t"; samFile << "FI:i:" << alignment.qAlignedSeqPos + 1; // Add query sequence length samFile << "\t" << "XQ:i:" << alignment.qLength; // // Write out optional quality values. If qvlist does not // have any qv's signaled to print, this is a no-op. // // First transform characters that are too large to printable ones. qvList.FormatQVOptionalFields(alignedSequence); qvList.PrintQVOptionalFields(alignedSequence, samFile); samFile << std::endl; }