Example #1
0
/*
 * calcSkip: calculate the skip value of every row of the suffix array.
 *
 * The skip of row i is the lowest row index greater than i whose lcp is
 * strictly lower than the lcp of row i. When no such row exists (this is
 * always the case for the last row) the skip is the size of the array.
 *
 * Returns 1 on success. Returns 0, after reporting through setError(), when
 * the temporary lookup table cannot be allocated.
 *
 * NOTE(review): the lookup table has MAXPSSMSIZE+1 entries and is indexed
 * directly by lcp, so this assumes getLcp() only yields values in
 * [0, MAXPSSMSIZE] - TODO confirm.
 */
int calcSkip(ESA esa)
{
  int size = esa->size;
  int *latest;
  int i, j, curLcp;

  // Nothing to compute for an empty array. Without this guard the code below
  // would query and update row -1.
  if(size <= 0) {
    return 1;
  }

  latest = malloc(sizeof(*latest) * (MAXPSSMSIZE+1));
  if(latest == NULL)
  {
    setError("Couldn't allocate temporary memory for skip calculation.");
    return 0;
  }

  // latest[v] holds the row where lcp value v was most recently encountered.
  // Rows are visited bottom-up, so this is the lowest such row index seen so
  // far; -1 means the value has not been encountered yet.
  for(i = 0; i < MAXPSSMSIZE+1; i++) {
    latest[i] = -1;
  }

  // Walk from the last row upwards. For the last row the table is still
  // empty, so it automatically receives skip = size.
  for(i = size - 1; i >= 0; i--){
    int minS = -1;

    // Get lcp for this row and register it in the table
    curLcp = getLcp(esa, i);
    latest[curLcp] = i;

    // Among all strictly lower lcp values, pick the lowest row index that
    // has been registered.
    for(j = curLcp - 1; j >= 0; j--){
      if(latest[j] > -1 && (minS == -1 || latest[j] < minS)) {
        minS = latest[j];
      }
    }

    // Fall back to the size of the text when no row was found
    setSkip(esa, i, minS != -1 ? minS : size);
  }

  free(latest);
  return 1;
}
Example #2
0
/* getLcpTreeShulens: compute intervals for each query;
 * This is the only entry point to the functions in this file.
 *
 * Steps, in order:
 *   1. compute the left border and forward-strand border of every sequence;
 *   2. build the suffix array and the lcp array of the sequence union;
 *   3. compute, for every query, the maximal shulen expected by chance and
 *      the window threshold sum;
 *   4. traverse the lcp tree to obtain the query intervals;
 *   5. if fpout is not NULL, print the intervals of each query to it;
 *   6. run the windows analysis and release all memory allocated here.
 *
 * Parameters:
 *   fpout    - stream for the interval listing; NULL suppresses the listing
 *   a        - program arguments (fields D, s, t, P, M, w, m are read here)
 *   seqUnion - union of query and subject sequences; queries occupy the
 *              indices [0, numOfQueries) and subjects follow them
 *   fwout    - passed through to windowAnalysis()
 *
 * Side effects: assigns f1, headers1 and (when a->s is set) onlyStrongSignal,
 * which are not declared in this function; prints run times to stdout when
 * a->t is set.
 */
void getLcpTreeShulens(FILE *fpout, Args *a, SequenceUnion *seqUnion, FILE *fwout) {

	Int64 *sa = NULL, *lcpTab = NULL;
	Int64 i, j, ns;
	Int64 maxDepth;
	Int64 *leftBorders = NULL, lb;
	Int64 *strandBorders = NULL;
	Int64 *maxShulens = NULL, maxs = 0, lS0 = 0;
	Int64 *minSumWin = NULL; // minimal sum (threshold) for which winner-sequences are considered to have strong signal

	clock_t end, start, end2, end3;
	double elapsed_time1, elapsed_time2, elapsed_time3;
	queryInterval **listQueryIntervalsFwd = NULL; // lists of query intervals, there are |Q| lists
	queryInterval **listQueryIntervalsRev = NULL; // lists of query intervals, there are |Q| lists

	// Both stay NULL in this function and are handed to traverseLcpTree() as such.
	queryInterval ***fastSearch = NULL; /* matrix of pointers for fast searching */
	Int64 *lastIndex = NULL; /* array of last indices of each query - goes together with fastSearch */

	qNode **root = NULL; // binary tree root; initially is NULL
	qNode ***l = NULL;  // list of lists of binary tree nodes

	// Publish the output stream and the headers through variables declared
	// outside this function.
	f1 = fpout;
	headers1 = seqUnion->seqUnion->headers;

	maxDepth = a->D;
	if (a->s) {
		onlyStrongSignal = 1;
	}

	// array of left borders of each sequence
	leftBorders = emalloc(sizeof(Int64) * (seqUnion->numOfSubjects + seqUnion->numOfQueries));
	// array of fwd strand borders of each sequence
	strandBorders = emalloc(sizeof(Int64) * (seqUnion->numOfSubjects + seqUnion->numOfQueries));
	lb = 0;
	for (i = 0; i < seqUnion->numOfSubjects + seqUnion->numOfQueries; i++) {
		leftBorders[i] = lb;
		// the next sequence starts right after this sequence's border
		lb = seqUnion->seqBorders[i] + 1;
		// midpoint between the left border and the sequence border
		strandBorders[i] = leftBorders[i] + (seqUnion->seqBorders[i] - leftBorders[i]) / 2;
	}

	// compute suffix array
	start = clock();
	sa = getSuffixArray(seqUnion->seqUnion);
	end = clock();
	elapsed_time1 = (double)(end - start) / CLOCKS_PER_SEC;
	if (!sa) {
		eprintf("sa: out of memory!\n");
	}

	// compute lcp array
	lcpTab = getLcp(seqUnion->seqUnion, sa);
	if (!lcpTab) {
		eprintf("lcp: out of memory!\n");
	}
	end2 = clock();
	elapsed_time2 = (double)(end2 - end) / CLOCKS_PER_SEC;
	// print sa, lcp
#if DEBUG
	printSA_LCP(sa, lcpTab, seqUnion->len);
#endif

	// print run-time
	if (a->t) {
		printf( "\nSA calculation: %.2f seconds.\n", elapsed_time1);
		printf( "\nLCP calculation: %.2f seconds.\n", elapsed_time2);
	}

	/* calculate max shulens expected only by chance for each query */
	/* using both subject's and query's gc-content */
	maxShulens = emalloc(seqUnion->numOfQueries * sizeof(Int64));
	minSumWin = malloc(seqUnion->numOfQueries * sizeof(Int64));
	// The array is written in the loop below, so a failed allocation must not
	// go unnoticed. Same reporting style as the sa / lcp checks above; a NULL
	// result for an empty request is not treated as an error.
	if (!minSumWin && seqUnion->numOfQueries > 0) {
		eprintf("minSumWin: out of memory!\n");
	}
	lS0 = seqUnion->seqBorders[seqUnion->numOfQueries] - leftBorders[seqUnion->numOfQueries] + 1; // length of subject = S0
	for (i = 0; i < seqUnion->numOfQueries; i++) {
		//arguments: args->P, lS, gcQ, gcS for query=Qi and subject=S0
		maxShulens[i] = maxShulenNew(a->P, lS0, seqUnion->gc[i], seqUnion->gc[seqUnion->numOfQueries]);
		// keep the largest value over all remaining subjects
		for (j = 1; j < seqUnion->numOfSubjects; j++) {
			maxs = maxShulenNew(a->P, seqUnion->seqBorders[j + seqUnion->numOfQueries] - leftBorders[j + seqUnion->numOfQueries] + 1
															, seqUnion->gc[i], seqUnion->gc[seqUnion->numOfQueries + j]);
			if (maxs > maxShulens[i]) {
				/* when smallest or greatest of all max shulens is used, then there is no effect; for hiv max shulen is 8 in most of combinations */
				maxShulens[i] = maxs;
			}
		}

		// threshold sum for a window; below this value, the "winners" are not
		// considered to have strong signal over a window
		if (a->M == 0) {
			minSumWin[i] = 0;
		}
		else {
			minSumWin[i] = maxShulens[i] * a->w;
		}
		// the threshold above uses the unscaled value; scale by a->m afterwards
		maxShulens[i] = (Int64)(a->m * maxShulens[i]);
	}

	// compute lists of query intervals
	traverseLcpTree(lcpTab, sa, seqUnion->seqUnion, seqUnion->numOfSubjects, seqUnion->numOfQueries, seqUnion->seqBorders, leftBorders, strandBorders
		, &listQueryIntervalsFwd, &listQueryIntervalsRev, maxDepth, maxShulens, fastSearch, lastIndex, &root);

	end3 = clock();
	elapsed_time3 = (double)(end3 - end2) / CLOCKS_PER_SEC;

	// print run-time
	if (a->t) {
		printf( "\nLCP-tree traversal calculation: %.2f seconds.\n", elapsed_time3);
	}

	// these arrays are not used past this point
	free(sa);
	free(lcpTab);
	free(maxShulens);

	// print lists of intervals for each query
	ns = seqUnion->numOfSubjects;

	if (BSEARCH) {
		for (i = 0; i < seqUnion->numOfQueries; i++) {
			correctBT(root[i], -1, strandBorders[i] - leftBorders[i]);
			if (fpout) { // suppress printing of interval analysis on stdout as default action
				fprintf(fpout, "%s\n", seqUnion->seqUnion->headers[i]);
				binTreeTraverse(root[i], seqUnion->seqUnion->headers, strandBorders[i] - leftBorders[i], ns, i, seqUnion->numOfQueries, fpout);
				fprintf(fpout, "\n");
			}
		}
	}

	/* windows analysis */
	l = windowAnalysis(a, seqUnion, fwout, listQueryIntervalsFwd, strandBorders, leftBorders, root, BSEARCH, minSumWin, fpout);

	if (BSEARCH) {
		freeBTQueryIntervals(root, seqUnion->numOfQueries);
		if (l) { // windows analysis returned one list per query
			for (i = 0; i < seqUnion->numOfQueries; i++) {
				free(l[i]);
			}
			free(l);
		}
	}

	free(leftBorders);
	free(strandBorders);
	free(minSumWin);
}