// Calculate skip for the suffix array int calcSkip(ESA esa) { int size = esa->size; int *latest = malloc(sizeof(*latest) * (MAXPSSMSIZE+1)); if(latest == NULL) { setError("Couldn't allocate temporary memory for skip calculation."); return 0; } int i, j, minS, curLcp; // latest points to a table of where we last encountered the various possible // lcp values - we start by initializing it to -1 (means we never encountered // any of the lcp's) for(i = 0; i < MAXPSSMSIZE+1; i++) latest[i] = -1; // Last suffix has skip = size curLcp = getLcp(esa, size-1); latest[curLcp] = size-1; setSkip(esa, size-1,size); // Start from the second-last row and calculate skip's upwards for(i = size - 2; i >= 0; i--){ // Get lcp for this row curLcp = getLcp(esa, i); // Register it in table latest[curLcp] = i; // Find the row with the lowest index that is greater than ours and that has a // strictly lower lcp minS = -1; for(j = curLcp - 1; j >= 0; j--){ if(minS == -1) { minS = latest[j]; } else { break; } } // then the smallest skip value is found for(; j >= 0; j--) { if(minS > latest[j]&&(latest[j]> -1)){ minS = latest[j]; } } // If a value was found this value is registrered if(minS != -1) { setSkip(esa, i, minS); } // Else skip is set to the size of the text else { setSkip(esa, i, size); } } free(latest); return 1; }
/* getLcpTreeShulens: compute intervals for each query;
 * This is the only entry point to the functions in this file.
 *
 * Steps, in order:
 *   1. derive the left border and the forward-strand border of every sequence
 *      from seqUnion->seqBorders;
 *   2. build the suffix array and the lcp array of seqUnion->seqUnion;
 *   3. compute, for each query, the maximal shulen threshold (maxShulens) and
 *      the window threshold sum (minSumWin);
 *   4. traverse the lcp tree to collect the query intervals;
 *   5. if BSEARCH is set and fpout is not NULL, print each query's header and
 *      its interval tree to fpout;
 *   6. run the windows analysis and release everything allocated here.
 *
 * fpout    - interval output; may be NULL, which suppresses the interval
 *            listing. It is also handed to windowAnalysis().
 * a        - program arguments; the fields D, s, t, P, M, w and m are read here.
 * seqUnion - queries and subjects; in the per-sequence arrays the queries
 *            occupy the first numOfQueries entries, the subjects follow.
 * fwout    - handed to windowAnalysis().
 *
 * Side effects: assigns f1 and headers1 and may set onlyStrongSignal (all
 * declared outside this function); prints timings to stdout when a->t is set.
 */
void getLcpTreeShulens(FILE *fpout, Args *a, SequenceUnion *seqUnion, FILE *fwout) {

  Int64 *sa = NULL, *lcpTab = NULL; // **sl = NULL;
  Int64 i, j, ns;
  Int64 maxDepth;
  Int64 *leftBorders = NULL, lb;
  Int64 *strandBorders = NULL;
  Int64 *maxShulens = NULL, maxs = 0, lS0 = 0;
  Int64 *minSumWin = NULL; // minimal sum (threshold) for which winner-sequences are considered to have strong signal
  //time_t end, start, end2, end3;
  clock_t end, start, end2, end3;
  double elapsed_time1, elapsed_time2, elapsed_time3;

  queryInterval **listQueryIntervalsFwd = NULL; // lists of query intervals, there are |Q| lists
  queryInterval **listQueryIntervalsRev = NULL; // lists of query intervals, there are |Q| lists
  queryInterval ***fastSearch = NULL; /* matrix of pointers for fast searching */
  Int64 *lastIndex = NULL; /* array of last indices of each query - goes together with fastSearch */

  qNode **root = NULL; // binary tree root; initially is NULL
  qNode ***l = NULL; // list of lists of binary tree nodes

  // publish the output stream and the headers through the outside variables
  f1 = fpout;
  headers1 = seqUnion->seqUnion->headers;

  maxDepth = a->D;
  if (a->s) {
    onlyStrongSignal = 1;
  }

  // array of left borders of each sequence
  leftBorders = emalloc(sizeof(Int64) * (seqUnion->numOfSubjects + seqUnion->numOfQueries));
  // array of fwd strand borders of each sequence
  strandBorders = emalloc(sizeof(Int64) * (seqUnion->numOfSubjects + seqUnion->numOfQueries));
  lb = 0;
  for (i = 0; i < seqUnion->numOfSubjects + seqUnion->numOfQueries; i++) {
    leftBorders[i] = lb;
    // the next sequence starts right after the border of this one
    lb = seqUnion->seqBorders[i] + 1;
    // the strand border is the midpoint between the left border and seqBorders[i]
    strandBorders[i] = leftBorders[i] + (seqUnion->seqBorders[i] - leftBorders[i]) / 2;
  }

  /* for each query form an array of pointers; each pointer points to the query interval
   * whose right border is closest to the upper bound in terms of args->q,
   * e.g. when qi.rb = 978, then an element [qi][0] points to qi, that is 978 is closest to 999=upper bound for [qi][0] */
  /* fastSearch matrix, m x p, m=number of queries; p-variable for each Qi, p=|Qi|/args->q - 1, *end points to the last interval of Qi */
  // The fast-search path is disabled; fastSearch and lastIndex stay NULL.
  //if (FASTSEARCH) {
  //  lastIndex = /*e*/malloc(sizeof(Int64) * seqUnion->numOfQueries);
  //  fastSearch = getFastSearch(seqUnion, leftBorders, seqUnion->numOfQueries, a, lastIndex);
  //  sizeMiniList = a->q;
  //}

  // compute suffix array
  start = clock();
  sa = getSuffixArray(seqUnion->seqUnion);
  end = clock();
  elapsed_time1 = (double)(end - start) / CLOCKS_PER_SEC;
  if (!sa) {
    eprintf("sa: out of memory!\n");
  }

  // compute lcp array
  lcpTab = getLcp(seqUnion->seqUnion, sa);
  if (!lcpTab) {
    eprintf("lcp: out of memory!\n");
  }
  end2 = clock();
  elapsed_time2 = (double)(end2 - end) / CLOCKS_PER_SEC;

  // print sa, lcp
#if DEBUG
  printSA_LCP(sa, lcpTab, seqUnion->len);
#endif

  // print run-time
  if (a->t) {
    printf("\nSA calculation: %.2f seconds.\n", elapsed_time1);
    printf("\nLCP calculation: %.2f seconds.\n", elapsed_time2);
  }

  /* calculate max shulens expected only by chance for each query */
  /* using both subject's and query's gc-content */
  maxShulens = emalloc(seqUnion->numOfQueries * sizeof(Int64));
  minSumWin = /*e*/malloc(seqUnion->numOfQueries * sizeof(Int64));
  // minSumWin is written in the loop below, so a failed allocation must not
  // go unnoticed. A NULL result for a zero-sized request is not an error.
  // NOTE(review): eprintf() is presumably fatal, as the sa/lcp checks above
  // rely on - confirm.
  if (!minSumWin && seqUnion->numOfQueries > 0) {
    eprintf("minSumWin: out of memory!\n");
  }

  lS0 = seqUnion->seqBorders[seqUnion->numOfQueries] - leftBorders[seqUnion->numOfQueries] + 1; // length of subject = S0
  for (i = 0; i < seqUnion->numOfQueries; i++) {
    //arguments: args->P, lS, gcQ, gcS for query=Qi and subject=S0
    maxShulens[i] = maxShulenNew(a->P, lS0, seqUnion->gc[i], seqUnion->gc[seqUnion->numOfQueries]);
    // keep the greatest value over all remaining subjects
    for (j = 1; j < seqUnion->numOfSubjects; j++) {
      maxs = maxShulenNew(a->P,
                          seqUnion->seqBorders[j + seqUnion->numOfQueries] - leftBorders[j + seqUnion->numOfQueries] + 1,
                          seqUnion->gc[i], seqUnion->gc[seqUnion->numOfQueries + j]);
      if (maxs > maxShulens[i]) {
        /* when smallest or greatest of all max shulens is used, then there is no effect;
         * for hiv max shulen is 8 in most of combinations */
        maxShulens[i] = maxs;
      }
    }
    // threshold sum for a window; below this value, the "winners" are not
    // considered to have strong signal over a window
    if (a->M == 0) {
      minSumWin[i] = 0;
    } else {
      minSumWin[i] = maxShulens[i] * a->w;
    }
    // scale only after minSumWin has been derived from the unscaled value
    maxShulens[i] = (Int64)(a->m * maxShulens[i]);
  }

  // compute lists of query intervals
  traverseLcpTree(lcpTab, sa, seqUnion->seqUnion, seqUnion->numOfSubjects, seqUnion->numOfQueries,
                  seqUnion->seqBorders, leftBorders, strandBorders,
                  &listQueryIntervalsFwd, &listQueryIntervalsRev, maxDepth, maxShulens,
                  fastSearch, lastIndex, &root);
  end3 = clock();
  elapsed_time3 = (double)(end3 - end2) / CLOCKS_PER_SEC;

  // print run-time
  if (a->t) {
    printf("\nLCP-tree traversal calculation: %.2f seconds.\n", elapsed_time3);
  }

  // these arrays are not used after the traversal
  free(sa);
  free(lcpTab);
  free(maxShulens);

  // print lists of intervals for each query
  ns = seqUnion->numOfSubjects;
  if (BSEARCH) {
    for (i = 0; i < seqUnion->numOfQueries; i++) {
      correctBT(root[i], -1, strandBorders[i] - leftBorders[i]);
      //fprintf(fpout, "Query: %d %s\n", i + 1, &headers[i][1]);
      if (fpout) { // suppress printing of interval analysis on stdout as default action
        fprintf(fpout, "%s\n", seqUnion->seqUnion->headers[i]);
        binTreeTraverse(root[i], seqUnion->seqUnion->headers, strandBorders[i] - leftBorders[i],
                        ns, i, seqUnion->numOfQueries, fpout);
        fprintf(fpout, "\n");
      }
    }
  }
  //else { // list search
  //  for (i = 0; i < seqUnion->numOfQueries; i++) {
  //    printListsQueries(ns, seqUnion->numOfQueries, fpout, listQueryIntervalsFwd, seqUnion->seqUnion->headers, strandBorders[i] - leftBorders[i], i);
  //  }
  //}

  /* windows analysis */
  //printf("Windows analysis\n");
  l = windowAnalysis(a, seqUnion, fwout, listQueryIntervalsFwd, strandBorders, leftBorders,
                     root, BSEARCH, minSumWin, fpout);

  if (BSEARCH) {
    freeBTQueryIntervals(root, seqUnion->numOfQueries);
    if (l) { // windows analysis
      for (i = 0; i < seqUnion->numOfQueries; i++) {
        free(l[i]);
      }
      free(l);
    }
  }
  //else {
  //  freeListQueryIntervals(listQueryIntervalsFwd, listQueryIntervalsRev, seqUnion->numOfQueries);
  //}

  //if (FASTSEARCH) {
  //  freeFastSearch(fastSearch, seqUnion->numOfQueries);
  //  free(lastIndex);
  //}

  free(leftBorders);
  free(strandBorders);
  free(minSumWin);
}