Ejemplo n.º 1
0
/*
 * Naive k-mismatch search: slide the pattern over every alignment of the
 * text and count differing characters directly.
 * @param text A character array containing the text
 * @param pattern A character array containing the pattern
 * @param n the length of the text
 * @param m the length of the pattern
 * @param k The threshold. An alignment is reported when its hamming distance
 * is <= k. A negative k is replaced by m+1, so (like any k >= m) every
 * alignment is reported with its full hamming distance.
 * @param numMatches location that receives the number of alignments
 * reported. It is reset to 0 on entry. When every alignment is reported and
 * n >= m this is n-m+1.
 * @param listOfMatches if non-NULL, each reported alignment is handed to
 * sp_km_addToListOfMatches together with its hamming distance. If NULL, the
 * result is printed to stdout instead.
 * @param flags if SP_KM_FIRST_MATCH_ONLY is set, the search stops right
 * after the first reported alignment.
 */
void sp_km_naive_kmismatch(char *text, char *pattern, int n, int m,int k,
		int *numMatches, struct SP_KM_MATCHING_POSITIONS *listOfMatches, unsigned int flags){
	const int lastAlignment = n - m;
	const int threshold = (k < 0) ? m + 1 : k;
	int start;

	*numMatches = 0;
	for(start = 0; start <= lastAlignment; start++){
		int mismatches = 0;
		int pos = 0;

		//Stop comparing as soon as the count has gone past the threshold:
		//the alignment is already rejected at that point
		while(pos < m && mismatches <= threshold){
			if(text[start + pos] != pattern[pos]){
				mismatches++;
			}
			pos++;
		}

		if(mismatches > threshold){
			continue;
		}

		(*numMatches)++;
		if(listOfMatches == NULL){
			printf("Hamming distance at text[%d]: %d\n", start, mismatches);
		}else{
			sp_km_addToListOfMatches(listOfMatches, start, mismatches);
		}
		if(flags & SP_KM_FIRST_MATCH_ONLY){
			return;
		}
	}
}
/**
 * Find the mismatches without filtering. This is the complete method in the case where
 * the complexity is unbounded by k, but only applies in one case in the bounded method.
 *
 * A per-alignment match counter is built up in two ways: characters marked
 * FREQUENT_CHAR in positionLookup are handled once each through
 * maskTextAndPattern + computeNumMatchesWithFFT, and characters marked
 * INFREQUENT_CHAR are counted directly by walking their occurrences in
 * sortedPattern. The hamming distance at alignment i is then m - matches[i].
 *
 * @TODO: Move this elsewhere; it doesn't apply just to unbounded. Perhaps the same for the above
 * @param text A character array containing the text
 * @param pattern A character array containing the pattern
 * @param n the length of the text
 * @param m the length of the pattern
 * @param k The threshold. An alignment is reported when its hamming distance
 * is <= k. NOTE(review): k is compared as-is here; a negative k is not
 * remapped in this function, so callers presumably normalise it first.
 * @param numMatches location that is incremented once per reported alignment.
 * NOTE(review): it is not reset here (unlike sp_km_naive_kmismatch), so the
 * caller is expected to initialise it -- confirm against callers.
 * @param listOfMatches if non-NULL, each reported alignment is handed to
 * sp_km_addToListOfMatches together with its hamming distance. If NULL, the
 * result is printed to stdout instead.
 * @param flags if SP_KM_FIRST_MATCH_ONLY is set, reporting stops after the
 * first reported alignment.
 * @param sortedPattern. An array of charAndPosition structs, that is sorted but contain
 * indices of original location in the pattern
 * @param positionLookup. An int array of indexes into the sortedPattern saying where to find
 * the start of infrequent characters, or NOT_IN_PATTERN or FREQUENT_CHAR.
 * This table is MODIFIED: once a frequent character has been processed its
 * entry is set to NOT_IN_PATTERN.
 */
void findMismatchesWithoutFiltering(char *text, char *pattern, int n, int m,int k,
		int *numMatches,struct SP_KM_MATCHING_POSITIONS *listOfMatches,unsigned flags,
		struct charAndPosition *sortedPattern,struct position *positionLookup){

	int i;
	int hamDistance;
	int positionInPattern;
	short charType;
	int overflowed;
	double *maskedPattern = NULL;
	double *maskedText = NULL;
	int *matches = NULL;
	//We will re-use plans across multiple masked patterns, so define these here
	fftw_plan forward = NULL;
	fftw_plan inverse = NULL;

	int transformSize = 2*m;
	if(transformSize < 2048 && n > 4096){
		transformSize = 2048;
	}

	//Use double as we will compute Fourier Transform of this
	maskedPattern = (double *)fftw_malloc(sizeof(double)*transformSize);
	//The text buffer is padded so that chunks may run past n
	overflowed = n + (transformSize - m);
	maskedText = (double *)fftw_malloc(sizeof(double)*overflowed);
	//Create an array that we will use to add matches for each character at
	//each alignment
	matches = (int *) calloc(n-m+1,sizeof(int));

	if(maskedPattern == NULL || maskedText == NULL || matches == NULL){
		fprintf(stderr,"Failed to allocate working buffers!\n");
		goto cleanup;
	}

	//Zero the top "half" of the masked pattern -- this will never be touched
	for(i=m;i<transformSize;i++){
		maskedPattern[i] = 0.0;
	}
	//Repeat for text: mask potential "overflow" of chunks past n
	for(i=n;i<overflowed;i++){
		maskedText[i] = 0.0;
	}

	if(!fftw_import_system_wisdom()){
		printf("Failed to read system wisdom!\n");
	}

	for(i=0;i<n;i++){
		//NOTE(review): where char is signed, bytes >= 0x80 give a negative
		//index here; kept as-is because it must agree with however
		//positionLookup was populated -- confirm against the builder.
		charType = positionLookup[(int)text[i]].charType;
		positionInPattern = positionLookup[(int)text[i]].index;
		if(charType==FREQUENT_CHAR){
			maskTextAndPattern(text,pattern,n,m,maskedText,maskedPattern,text[i]);
			printf("Frequent char %c\n",text[i]);
			computeNumMatchesWithFFT(maskedText,maskedPattern,n,m,transformSize,matches,&forward,&inverse);
			//This will compute matches for ALL of the same character in the text, so now mark this as not occuring in the pattern
			positionLookup[(int)text[i]].charType=NOT_IN_PATTERN;
		}else if(charType ==INFREQUENT_CHAR){
			//Loop though all of the up to O(threshold) characters that are infrequent.
			//An occurrence at pattern index p lines up with text[i] at alignment i-p;
			//entries with p > i would give a negative alignment and end the walk.
			while(positionInPattern < m && sortedPattern[positionInPattern].index <= i && sortedPattern[positionInPattern].c == text[i]){
				//Only alignments 0..n-m exist in matches[]
				if(i-sortedPattern[positionInPattern].index < n-m+1){
					matches[i-sortedPattern[positionInPattern].index]++;
				}
				positionInPattern++;
			}
		}
	}

	//We now have matches, so we can compute (and output if necessary) mis-matches
	for(i=0;i<n-m+1;i++){
		hamDistance = m - matches[i];
		if(hamDistance <= k){
			(*numMatches)++;
			if(listOfMatches!=NULL){
				sp_km_addToListOfMatches(listOfMatches,i,hamDistance);
			}else{
				printf("Hamming distance at text[%d]: %d\n",i,hamDistance);
			}
			if(flags & SP_KM_FIRST_MATCH_ONLY){
				//Stop reporting, but still release everything below
				//(a plain return here used to leak all buffers and plans)
				goto cleanup;
			}
		}
	}

	//NOTE(review): done() is defined elsewhere. It was only ever reached when
	//the loop above ran to completion, and that is preserved.
	done();

cleanup:
	free(matches);
	if(maskedPattern != NULL){
		fftw_free(maskedPattern);
	}
	if(maskedText != NULL){
		fftw_free(maskedText);
	}
	//Plans stay NULL when no frequent character was processed
	if(forward != NULL){
		fftw_destroy_plan(forward);
	}
	if(inverse != NULL){
		fftw_destroy_plan(inverse);
	}
	fftw_forget_wisdom();
	fftw_cleanup();

}