// Reconstruct the base sequence of read `readID` from the BWT held in
// idx2BWT->bwt and store it in `seq` as a NUL-terminated string.
//
// The walk starts at row cumulativeFreq[4] + readID and repeatedly maps the
// current row to cumulativeFreq[c] + BWTOccValue(bwt, row, c), where c is the
// BWT symbol at that row, until symbol 4 is encountered. Symbols 0..3 are
// emitted as 'A','C','G','T'. Bases are collected in visiting order and then
// reversed in place before returning.
//
// Parameters:
//   idx2BWT - index providing `bwt` and `readIDtable`.
//   readID  - offset added to cumulativeFreq[4] to pick the starting row.
//   seq     - output buffer; must hold the read plus the terminating NUL.
//             No bounds checking is performed here.
//   qual    - not used by this function.
//
// Returns the number of bases written to `seq` (terminator excluded).
//
// Side effect: prints a "ReadID verify" debug line to stderr on every call.
int extractRead(Idx2BWT * idx2BWT, unsigned long long readID, char* seq, char* qual) {
    static char dnaChars[] = {'A','C','G','T'};

    BWT *bwt = idx2BWT->bwt;
    ULL l = bwt->cumulativeFreq[4] + readID;
    int readLen = 0;
    unsigned int c;

    // Collect bases until the terminating symbol (4) shows up.
    while ((c = _getBWTvalue(bwt, l)) != 4) {
        l = bwt->cumulativeFreq[c] + BWTOccValue(bwt, l, c);
        seq[readLen++] = dnaChars[c];
    }

    // The bases were gathered in reverse order; flip them in place.
    for (int lo = 0, hi = readLen - 1; lo < hi; ++lo, --hi) {
        const char tmp = seq[lo];
        seq[lo] = seq[hi];
        seq[hi] = tmp;
    }
    seq[readLen] = 0;

    // One more step over the terminator; the resulting row is used only to
    // look up the read id printed by the debug message below.
    l = bwt->cumulativeFreq[c] + BWTOccValue(bwt, l, c);
    fprintf(stderr, "ReadID verify: %d. Just for debug, comment this at file %s, line: %d.\n", idx2BWT->readIDtable[l - bwt->cumulativeFreq[4]], __FILE__, __LINE__);

    return readLen;
}
//给出saIndex, foreward 算出 saIndex void BWTSARangeForeward(Idx2BWT * idx2BWT, const unsigned char c, unsigned int *saIndexLeft, unsigned int *saIndexRight) { BWT * rev_bwt = idx2BWT->rev_bwt; BWT *bwt = idx2BWT->bwt; unsigned int l = (*saIndexLeft); unsigned int r = (*saIndexRight); (*saIndexLeft) = bwt->cumulativeFreq[c] + BWTOccValue(rev_bwt, l, c) + 1; (*saIndexRight) = bwt->cumulativeFreq[c] + BWTOccValue(rev_bwt, r + 1, c); }
int _dfsExtractReadInf(Idx2BWT * idx2BWT, ULL saL, ULL saR, ReadInf* &ri, int outputLimit, int strand, int _pos) { ULL l, r; unsigned int c; ULL numSAs = 0; int totalCount = 0; l = saL, r = saR; if (saL == saR) { // only one route, backward search to the end do { c = _getBWTvalue(idx2BWT->bwt, saL); saL = idx2BWT->bwt->cumulativeFreq[c] + BWTOccValue(idx2BWT->bwt, saL, c); ++_pos; } while (c != 4); ri->read_id = idx2BWT->readIDtable[saL - idx2BWT->bwt->cumulativeFreq[4]]; ri->strand = strand; ri->pos = _pos - 1; ++ri; return 1; } for (int i = 0; i < 4; ++i) { // try A C G T BWTSARangeBackward(idx2BWT, i ,&saL, &saR); if (saL <= saR && saR-saL <= r-l) { int outputCount = _dfsExtractReadInf(idx2BWT, saL, saR, ri, outputLimit, strand, _pos + 1); outputLimit -= outputCount; totalCount += outputCount; if (outputLimit <= 0) return totalCount; numSAs += saR - saL + 1; if (numSAs == r - l + 1) { // no branch at all return totalCount; } } saL = l; saR = r; // reset ranges } // try add $ BWTSARangeBackward(idx2BWT, 4 ,&saL, &saR); // 4 is $ if (saL <= saR) { int limit = saR - saL + 1; if (limit > outputLimit) { limit = outputLimit; } for (int i = 0; i < limit; ++i) { ri->read_id = idx2BWT->readIDtable[saL + i - idx2BWT->bwt->cumulativeFreq[4]]; ri->strand = strand; ri->pos = _pos; ++ri; } totalCount += limit; } return totalCount; }