Exemple #1
0
/*
 * Adds kmer to the splay tree rooted at *T and updates *T to the new root.
 *
 * An empty tree simply receives a single node. Otherwise the tree is first
 * splayed on kmer; the new node is then put on top of the splayed root,
 * taking over the root's left or right subtree depending on which side of
 * the root kmer sorts. When neither comparison reports "less than", no node
 * is allocated and *T is left pointing at the splayed root.
 */
void insertIntoTree(Kmer * kmer, SplayTree ** T)
{
	SplayTree *root;
	SplayNode *fresh;

	// Empty tree: the new node becomes the whole tree
	if (*T == NULL) {
		fresh = allocateSplayNode();
		copyKmers(&(fresh->kmer), kmer);
		fresh->left = NULL;
		fresh->right = NULL;
		*T = fresh;
		return;
	}

	root = Splay(kmer, *T);
	*T = root;

	// kmer sorts before the splayed root: root becomes the right child
	if (compareKmers(kmer, &(root->kmer)) < 0) {
		fresh = allocateSplayNode();
		copyKmers(&(fresh->kmer), kmer);
		fresh->right = root;
		fresh->left = root->left;
		root->left = NULL;
		*T = fresh;
		return;
	}

	// kmer sorts after the splayed root: root becomes the left child
	if (compareKmers(&(root->kmer), kmer) < 0) {
		fresh = allocateSplayNode();
		copyKmers(&(fresh->kmer), kmer);
		fresh->left = root;
		fresh->right = root->right;
		root->right = NULL;
		*T = fresh;
	}
}
Exemple #2
0
/*
 * Looks kmer up in the splay tree rooted at *T, inserting it when absent.
 *
 * Parameters:
 *   kmer     - key to look up or insert.
 *   seqID    - in: sequence ID stored with a newly inserted node;
 *              out: overwritten with the stored ID when kmer is found.
 *   position - in: position stored with a newly inserted node;
 *              out: overwritten with the stored position when kmer is found.
 *   T        - address of the tree root; updated to the new root.
 *
 * Returns true when kmer was already in the tree (and *seqID / *position
 * were overwritten), false when a new node was created.
 *
 * Every path except the empty-tree insertion prints a numbered trace line
 * with the sequence ID on stdout.
 */
boolean
findOrInsertOccurenceInSplayTree(Kmer * kmer, IDnum * seqID,
				 Coordinate * position, SplayTree ** T)
{
	SplayNode *newNode;
	int comparison;

	if (*T == NULL) {
		newNode = allocateSplayNode();
		copyKmers(&(newNode->kmer), kmer);
		newNode->seqID = *seqID;
		newNode->position = *position;

		newNode->left = newNode->right = NULL;

		*T = newNode;

		return false;
	}

	*T = Splay(kmer, *T);

	// Both branches below test the same comparison, so evaluate it once
	comparison = compareKmers(kmer, &((*T)->kmer));

	if (comparison < 0) {
		// kmer sorts before the splayed root: root becomes the right child
		newNode = allocateSplayNode();
		copyKmers(&(newNode->kmer), kmer);
		newNode->seqID = *seqID;
		newNode->position = *position;

		newNode->left = (*T)->left;
		newNode->right = *T;
		(*T)->left = NULL;

		*T = newNode;

		// IDnum is a typedef of unknown width here: cast so that the
		// argument always matches the conversion specifier
		printf("1: sequenceID = %lld\n", (long long) *seqID);

		return false;
	} else if (comparison > 0) {
		// kmer sorts after the splayed root: root becomes the left child
		newNode = allocateSplayNode();
		copyKmers(&(newNode->kmer), kmer);
		newNode->seqID = *seqID;
		newNode->position = *position;

		newNode->right = (*T)->right;
		newNode->left = *T;
		(*T)->right = NULL;

		*T = newNode;

		printf("2: sequenceID = %lld\n", (long long) *seqID);

		return false;
	} else {
		// Already present: report the stored occurrence to the caller
		*seqID = (*T)->seqID;
		*position = (*T)->position;

		printf("3: sequenceID = %lld\n", (long long) *seqID);

		return true;
	}
}
/*
 * Stores one k-mer occurrence (kmer, nodeID, position) in the next free
 * slot of table.
 *
 * Reserving the slot advances table->kmerOccurencePtr and
 * table->kmerOccurenceIndex; when OPENMP is defined that step runs inside
 * an omp critical section. The slot itself is filled in afterwards,
 * outside the critical section.
 */
void recordKmerOccurence(Kmer * kmer, IDnum nodeID, Coordinate position, KmerOccurenceTable * table) {
	KmerOccurence *slot;

#ifdef OPENMP
	#pragma omp critical
#endif
	{
		// Claim the current slot, then move the table cursor past it
		slot = table->kmerOccurencePtr;
		table->kmerOccurencePtr = slot + 1;
		table->kmerOccurenceIndex += 1;
	}

	slot->nodeID = nodeID;
	slot->position = position;
	copyKmers(&(slot->kmer), kmer);
}
/*
 * Computes the acceleration-table key of kmer for the given table.
 *
 * Works on a private copy of kmer, so the caller's k-mer is not modified.
 * One nucleotide is popped and discarded for every 2 bits of
 * table->accelerationShift; then one nucleotide is popped for every 2 bits
 * of table->accelerationBits and folded into the key.
 */
static inline KmerKey keyInAccelerationTable(Kmer * kmer,
					  KmerOccurenceTable * table)
{
	Kmer scratch;
	KmerKey result = 0;
	int bit;

	copyKmers(&scratch, kmer);

	// Drop the nucleotides that do not take part in the key
	bit = 0;
	while (bit < table->accelerationShift) {
		popNucleotide(&scratch);
		bit += 2;
	}

	// Each popped nucleotide enters just above the key's top bit and the
	// whole key then slides down by one nucleotide (2 bits)
	bit = 0;
	while (bit < table->accelerationBits) {
		KmerKey popped = (KmerKey) popNucleotide(&scratch);

		result += popped << table->accelerationBits;
		result >>= 2;
		bit += 2;
	}

	return result;
}
static KmerOccurenceTable *referenceGraphKmers(char *preGraphFilename,
					       short int accelerationBits, Graph * graph, boolean double_strand)
{
	FILE *file = fopen(preGraphFilename, "r");
	const int maxline = MAXLINE;
	char line[MAXLINE];
	char c;
	int wordLength;
	Coordinate lineLength, kmerCount;
	Kmer word;
	Kmer antiWord;
	KmerOccurenceTable *kmerTable = NULL;
	KmerOccurence *kmerOccurences, *kmerOccurencePtr;
	Coordinate kmerOccurenceIndex;
	IDnum index;
	IDnum nodeID = 0;
	IDnum *accelPtr = NULL;
	KmerKey lastHeader = 0;
	KmerKey header;
	Nucleotide nucleotide;

	if (file == NULL)
		exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename);

	// Count kmers
	printf("Scanning pre-graph file %s for k-mers\n",
	       preGraphFilename);

	// First  line
	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
	sscanf(line, "%*i\t%*i\t%i\n", &wordLength);

	// Initialize kmer occurence table:
	kmerTable = mallocOrExit(1, KmerOccurenceTable);
	if (accelerationBits > 2 * wordLength)
		accelerationBits = 2 * wordLength;

	if (accelerationBits > 32)
		accelerationBits = 32;

	if (accelerationBits > 0) {
		kmerTable->accelerationBits = accelerationBits;
		kmerTable->accelerationTable =
		    callocOrExit((((size_t) 1) << accelerationBits) + 1,
			   IDnum);
		accelPtr = kmerTable->accelerationTable;
		kmerTable->accelerationShift =
		    (short int) 2 *wordLength - accelerationBits;
	} else {
		kmerTable->accelerationBits = 0;
		kmerTable->accelerationTable = NULL;
		kmerTable->accelerationShift = 0;
	}

	// Read nodes
	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
	kmerCount = 0;
	while (line[0] == 'N') {
		lineLength = 0;
		while ((c = getc(file)) != EOF && c != '\n')
			lineLength++;
		kmerCount += lineLength - wordLength + 1;
		if (fgets(line, maxline, file) == NULL)
			break;
	}
	fclose(file);

	// Create table
	printf("%li kmers found\n", (long) kmerCount);
	kmerOccurences = callocOrExit(kmerCount, KmerOccurence);
	kmerOccurencePtr = kmerOccurences;
	kmerOccurenceIndex = 0;
	kmerTable->kmerTable = kmerOccurences;
	kmerTable->kmerTableSize = kmerCount;

	// Fill table
	file = fopen(preGraphFilename, "r");
	if (file == NULL)
		exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename);

	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");

	// Read nodes
	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
	while (line[0] == 'N') {
		nodeID++;

		// Fill in the initial word : 
		clearKmer(&word);
		clearKmer(&antiWord);

		for (index = 0; index < wordLength - 1; index++) {
			c = getc(file);
			if (c == 'A')
				nucleotide = ADENINE;
			else if (c == 'C')
				nucleotide = CYTOSINE;
			else if (c == 'G')
				nucleotide = GUANINE;
			else if (c == 'T')
				nucleotide = THYMINE;
			else if (c == '\n')
				exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
			else
				nucleotide = ADENINE;
				

			pushNucleotide(&word, nucleotide);
			if (double_strand) {
#ifdef COLOR
				reversePushNucleotide(&antiWord, nucleotide);
#else
				reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
			}
		}

		// Scan through node
		index = 0;
		while((c = getc(file)) != '\n' && c != EOF) {
			if (c == 'A')
				nucleotide = ADENINE;
			else if (c == 'C')
				nucleotide = CYTOSINE;
			else if (c == 'G')
				nucleotide = GUANINE;
			else if (c == 'T')
				nucleotide = THYMINE;
			else
				nucleotide = ADENINE;

			pushNucleotide(&word, nucleotide);
			if (double_strand) {
#ifdef COLOR
				reversePushNucleotide(&antiWord, nucleotide);
#else
				reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
			}

			if (!double_strand || compareKmers(&word, &antiWord) <= 0) {
				copyKmers(&kmerOccurencePtr->kmer, &word);
				kmerOccurencePtr->nodeID = nodeID;
				kmerOccurencePtr->position =
				    index;
			} else {
				copyKmers(&kmerOccurencePtr->kmer, &antiWord);
				kmerOccurencePtr->nodeID = -nodeID;
				kmerOccurencePtr->position =
				    getNodeLength(getNodeInGraph(graph, nodeID)) - 1 - index;
			}

			kmerOccurencePtr++;
			kmerOccurenceIndex++;
			index++;
		}

		if (fgets(line, maxline, file) == NULL)
			break;
	}

	fclose(file);

	// Sort table
	qsort(kmerOccurences, kmerCount, sizeof(KmerOccurence),
	      compareKmerOccurences);

	// Fill up acceleration table
	if (kmerTable->accelerationTable != NULL) {
		*accelPtr = (IDnum) 0;
		for (kmerOccurenceIndex = 0;
		     kmerOccurenceIndex < kmerCount;
		     kmerOccurenceIndex++) {
			header =
			    keyInAccelerationTable(&kmerOccurences
						   [kmerOccurenceIndex].
						   kmer, kmerTable);
			while (lastHeader < header) {
				lastHeader++;
				accelPtr++;
				*accelPtr = kmerOccurenceIndex;
			}
		}

		while (lastHeader < (KmerKey) 1 << accelerationBits) {
			lastHeader++;
			accelPtr++;
			*accelPtr = kmerCount;
		}
	}

	return kmerTable;
}