void SAMOutput::SetAlignedSequence(T_AlignmentCandidate &alignment, T_Sequence &read, T_Sequence &alignedSeq, Clipping clipping) { // // In both no, and hard clipping, the dna sequence that is output // solely corresponds to the aligned sequence. // DNALength clippedReadLength = 0; DNALength clippedStartPos = 0; if (clipping == none or clipping == hard) { DNALength qStart = alignment.QAlignStart(); DNALength qEnd = alignment.QAlignEnd(); clippedReadLength = qEnd - qStart; clippedStartPos = qStart; } else if (clipping == soft) { clippedReadLength = read.length - read.lowQualityPrefix - read.lowQualitySuffix; clippedStartPos = read.lowQualityPrefix; } else if (clipping == subread) { clippedReadLength = read.SubreadLength(); clippedStartPos = read.SubreadStart(); } else { std::cout <<" ERROR! The clipping must be none, hard, subread, or soft when setting the aligned sequence." << std::endl; assert(0); } // // Set the aligned sequence according to the clipping boundaries. // if (alignment.tStrand == 0) { alignedSeq.ReferenceSubstring(read, clippedStartPos, clippedReadLength); } else { T_Sequence subSeq; subSeq.ReferenceSubstring(read, clippedStartPos, clippedReadLength); subSeq.MakeRC(alignedSeq); assert(alignedSeq.deleteOnExit); } }
int LocateAnchorBoundsInSuffixArray(T_RefSequence &reference, T_SuffixArray &sa, T_Sequence &read, unsigned int minPrefixMatchLength, std::vector<DNALength> &matchLow, std::vector<DNALength> &matchHigh, std::vector<DNALength> &matchLength, AnchorParameters ¶ms) { // // Make sure there is enough of this read to map. Since searches // are keyed off of 'minPrefixMatchLength' matches, don't search // anything shorter than that. // if (minPrefixMatchLength > 0 and read.SubreadLength() < minPrefixMatchLength) { return 0; } DNALength p, m; DNALength matchEnd = read.SubreadEnd() - minPrefixMatchLength + 1; DNALength numSearchedPositions = matchEnd - read.SubreadStart(); matchLength.resize(numSearchedPositions); matchLow.resize(numSearchedPositions); matchHigh.resize(numSearchedPositions); std::fill(matchLength.begin(), matchLength.end(), 0); std::fill(matchLow.begin(), matchLow.end(), 0); std::fill(matchHigh.begin(), matchHigh.end(), 0); vector<SAIndex> lowMatchBound, highMatchBound; for (m = 0, p = read.SubreadStart(); p < matchEnd; p++, m++) { lowMatchBound.clear(); highMatchBound.clear(); DNALength lcpLength = sa.StoreLCPBounds(reference.seq, reference.length, &read.seq[p], matchEnd - p, params.useLookupTable, params.maxLCPLength, // // Store the positions in the SA // that are searched. // lowMatchBound, highMatchBound, params.stopMappingOnceUnique); // // Possibly print the lcp bounds for debugging // if (params.lcpBoundsOutPtr != NULL) { for (size_t i = 0; i < lowMatchBound.size(); i++) { *params.lcpBoundsOutPtr << (highMatchBound[i] - lowMatchBound[i]); if (i < lowMatchBound.size() - 1) { *params.lcpBoundsOutPtr << " "; } } *params.lcpBoundsOutPtr << endl; } // // Default to no match. // matchLow[m] = matchHigh[m] = matchLength[m] = 0; // // If anything was found in the suffix array: // if (lowMatchBound.size() > 0) { // // First expand the search bounds until at least // one match is found. // int lcpSearchLength = lowMatchBound.size(); while (lcpSearchLength > 0 and lowMatchBound[lcpSearchLength - 1] == highMatchBound[lcpSearchLength - 1]) { lcpSearchLength--; lcpLength--; } matchLow[m] = lowMatchBound[lcpSearchLength - 1]; matchHigh[m] = highMatchBound[lcpSearchLength - 1]; matchLength[m] = minPrefixMatchLength + lcpSearchLength; // // Next, apply some heuristics to the anchor generation. // // 1.1 If the suffix array match is unique, try and extend that // match as long as possible to ease global chaining later on. // // 1.2 If the suffix array match is unique, but cannot be // extended, it probably ends in an error. Back the search up // by 1. // // 2.1 If the suffix array match is not unique, return the // default matches, or expand the search to include more // matches. // // // Check to see if the match was unique. // if (matchLow[m] + 1 == matchHigh[m]) { // // If the match is unique, extend for as long as possible. // lcpLength = minPrefixMatchLength + lcpSearchLength; long refPos = sa.index[matchLow[m]] + lcpLength; long queryPos = p + lcpLength; bool extensionWasPossible = false; while (refPos + 1 < reference.length and queryPos + 1 < read.length and reference.seq[refPos + 1] == read.seq[queryPos + 1] and (params.maxLCPLength == 0 or lcpLength < static_cast<DNALength>(params.maxLCPLength))) { refPos++; queryPos++; lcpLength++; extensionWasPossible = true; } if (extensionWasPossible) { // // Was able to extend match far into the genome, store that. // matchLength[m] = lcpLength; } else if (extensionWasPossible == false) { // // No extension was possible, indicating that this match // ends at an error. To be safe, expand search by up to // 1. // if (lcpSearchLength > 1) { lcpSearchLength = lcpSearchLength - 1; } matchLow[m] = lowMatchBound[lcpSearchLength-1]; matchHigh[m] = highMatchBound[lcpSearchLength-1]; matchLength[m] = minPrefixMatchLength + lcpSearchLength; } } else { // // The match is not unique. Store a possibly expanded search. // if (lcpSearchLength > params.expand) { lcpSearchLength -= params.expand; } else { assert(lowMatchBound.size() > 0); lcpSearchLength = 1; } // // There are multiple matches for this position. // matchLow[m] = lowMatchBound[lcpSearchLength - 1]; matchHigh[m] = highMatchBound[lcpSearchLength - 1]; matchLength[m] = minPrefixMatchLength + lcpSearchLength; } } else { // // The match is shorter than what the search is supposed to // expand to. In order to avoid expanding to before the end // of the match list, do not set any match. // matchLow[m] = 0; matchHigh[m] = 0; matchLength[m] = 0; } // // Possibly advance a bunch of steps. // if (params.advanceExactMatches) { int tmp = (int)lcpLength - (int)params.expand - params.advanceExactMatches; int advance = MAX(tmp, 0); p += advance; m += advance; } } return 1; }
int MapReadToGenome(T_RefSequence &reference, T_SuffixArray &sa, T_Sequence &read, unsigned int minPrefixMatchLength, vector<T_MatchPos> &matchPosList, AnchorParameters &anchorParameters) { vector<DNALength> matchLow, matchHigh, matchLength; DNALength minMatchLen = anchorParameters.minMatchLength; if (read.SubreadLength() < minMatchLen) { matchPosList.clear(); return 0; } LocateAnchorBoundsInSuffixArray(reference, sa, read, minPrefixMatchLength, matchLow, matchHigh, matchLength, anchorParameters); // // Try evaluating some contexts. // DNALength pos; assert(matchLow.size() == matchHigh.size()); DNASequence evalQrySeq, evalRefSeq; vector<Arrow> pathMat; vector<int> scoreMat; Alignment alignment; // // Do some filtering on the matches looking for overlapping matches // if there are any. // if (anchorParameters.removeEncompassedMatches) { vector<bool> removed; removed.resize(read.length); std::fill(removed.begin(), removed.end(), false); size_t i; for (i = 0; i < read.length-1; i++) { if (matchLength[i] == matchLength[i+1]+1) { removed[i+1] = true; } } for (i = 1; i < matchLength.size(); i++) { if (removed[i]) { matchLength[i] = matchLow[i] = matchHigh[i] = 0; } } } // // Now add // DNALength endOfMapping; DNALength trim = MAX(minMatchLen + 1, sa.lookupPrefixLength + 1); if (read.SubreadEnd() < trim) { endOfMapping = 0; } else { endOfMapping = read.SubreadEnd() - trim; } for (pos = read.SubreadStart(); pos < endOfMapping; pos++) { size_t matchIndex = pos - read.SubreadStart(); assert(matchIndex < matchHigh.size()); if (matchHigh[matchIndex] - matchLow[matchIndex] <= anchorParameters.maxAnchorsPerPosition) { DNALength mp; for (mp = matchLow[matchIndex]; mp < matchHigh[matchIndex]; mp++) { if (matchLength[matchIndex] < minMatchLen) { continue; } // // By default, add all anchors. // if (matchLength[matchIndex] + pos > read.length) { // // When doing branching, it's possible that a deletion // branch finds an anchor that goes past the end of a // read. When that is the case, trim back the anchor // match since this confuses downstream assertions. // matchLength[matchIndex] = read.length - pos; } assert(sa.index[mp] + matchLength[matchIndex] <= reference.length); matchPosList.push_back(ChainedMatchPos(sa.index[mp], pos, matchLength[matchIndex], matchHigh[matchIndex] - matchLow[matchIndex])); } } } return matchPosList.size(); }