Example #1
0
/*
 * Binary search for `kmer` in the sorted occurrence array of `table`.
 *
 * When the table carries an acceleration table, the search window is
 * [accelerationTable[key], accelerationTable[key + 1]) with `key` obtained
 * from keyInAccelerationTable(); otherwise the whole array
 * [0, kmerTableSize) is searched.
 *
 * Returns a pointer to an entry whose k-mer compares equal to `kmer`,
 * or NULL when no such entry is found in the window.
 */
static KmerOccurence *findKmerOccurenceInSortedTable(Kmer * kmer,
						     KmerOccurenceTable *
						     table)
{
	KmerOccurence *array = table->kmerTable;
	KmerKey key = keyInAccelerationTable(kmer, table);
	Coordinate leftIndex, rightIndex, middleIndex;
	int diff;

	if (table->accelerationTable != NULL) {
		leftIndex = table->accelerationTable[key];
		rightIndex = table->accelerationTable[key + 1];
	} else {
		leftIndex = 0;
		rightIndex = table->kmerTableSize;
	}

	while (leftIndex < rightIndex) {
		// Written as left + half-width so the sum of two large
		// indices can never overflow
		middleIndex = leftIndex + (rightIndex - leftIndex) / 2;

		// One comparison per probe; its sign drives every branch below
		diff = compareKmers(&(array[middleIndex].kmer), kmer);

		if (diff == 0)
			return &(array[middleIndex]);

		// Window reduced to a single, non-matching entry
		if (leftIndex == middleIndex)
			return NULL;

		if (diff > 0)
			rightIndex = middleIndex;
		else
			leftIndex = middleIndex;
	}

	// Empty window
	return NULL;
}
Example #2
0
/*
 * Inserts `kmer` into the splay tree rooted at *T and updates *T to the
 * new root.
 *
 * The tree is first splayed around `kmer`. If the resulting root compares
 * different from `kmer`, a new node holding a copy of `kmer` becomes the
 * root; if it compares equal, the tree is left with the splayed root and
 * nothing is allocated. Only the kmer, left and right fields of the new
 * node are assigned here.
 */
void insertIntoTree(Kmer * kmer, SplayTree ** T)
{
	SplayNode *fresh;
	SplayTree *root = *T;

	/* Empty tree: the new node is the whole tree */
	if (root == NULL) {
		fresh = allocateSplayNode();
		copyKmers(&(fresh->kmer), kmer);
		fresh->right = NULL;
		fresh->left = NULL;
		*T = fresh;
		return;
	}

	root = Splay(kmer, root);
	*T = root;

	/* kmer sorts before the splayed root: root goes to the right */
	if (compareKmers(kmer, &(root->kmer)) < 0) {
		fresh = allocateSplayNode();
		copyKmers(&(fresh->kmer), kmer);
		fresh->left = root->left;
		fresh->right = root;
		root->left = NULL;
		*T = fresh;
		return;
	}

	/* kmer sorts after the splayed root: root goes to the left */
	if (compareKmers(&(root->kmer), kmer) < 0) {
		fresh = allocateSplayNode();
		copyKmers(&(fresh->kmer), kmer);
		fresh->right = root->right;
		fresh->left = root;
		root->right = NULL;
		*T = fresh;
	}
}
Example #3
0
/*
 * Looks `kmer` up in the splay tree rooted at *T, inserting it if absent.
 *
 * - If the k-mer is already present, *seqID and *position are overwritten
 *   with the values stored in the tree and true is returned.
 * - Otherwise a new node is created from kmer, *seqID and *position, it
 *   becomes the new root (*T is updated) and false is returned.
 *
 * A trace line is printed on stdout for every call that reaches a
 * non-empty tree.
 * NOTE(review): these trace lines look like leftover debugging output -
 * confirm they are still wanted.
 */
boolean
findOrInsertOccurenceInSplayTree(Kmer * kmer, IDnum * seqID,
				 Coordinate * position, SplayTree ** T)
{
	SplayNode *newNode;
	int diff;

	if (*T == NULL) {
		newNode = allocateSplayNode();
		copyKmers(&(newNode->kmer), kmer);
		newNode->seqID = *seqID;
		newNode->position = *position;

		newNode->left = newNode->right = NULL;

		*T = newNode;

		return false;
	}

	*T = Splay(kmer, *T);

	// Compare against the splayed root once; the sign selects the branch
	diff = compareKmers(kmer, &((*T)->kmer));

	if (diff < 0) {
		newNode = allocateSplayNode();
		copyKmers(&(newNode->kmer), kmer);
		newNode->seqID = *seqID;
		newNode->position = *position;

		newNode->left = (*T)->left;
		newNode->right = *T;
		(*T)->left = NULL;

		*T = newNode;

		// IDnum's width is not fixed here: widen explicitly so that
		// the argument always matches the conversion specifier
		printf("1: sequenceID = %lli\n", (long long) *seqID);

		return false;
	} else if (diff > 0) {
		newNode = allocateSplayNode();
		copyKmers(&(newNode->kmer), kmer);
		newNode->seqID = *seqID;
		newNode->position = *position;

		newNode->right = (*T)->right;
		newNode->left = *T;
		(*T)->right = NULL;

		*T = newNode;

		printf("2: sequenceID = %lli\n", (long long) *seqID);

		return false;
	} else {
		// Already known: report the stored occurrence to the caller
		*seqID = (*T)->seqID;
		*position = (*T)->position;

		printf("3: sequenceID = %lli\n", (long long) *seqID);

		return true;
	}
}
Example #4
0
/*
 * Comparator with the qsort() signature: orders two KmerOccurence
 * entries by their k-mer, using compareKmers().
 *
 * Returns -1, 0 or 1 when A's k-mer compares respectively below, equal
 * to, or above B's.
 */
int compareKmerOccurences(void const *A, void const *B)
{
	KmerOccurence *a = (KmerOccurence *) A;
	KmerOccurence *b = (KmerOccurence *) B;
	// Compare once: this runs for every pair the sort examines
	int diff = compareKmers(&(a->kmer), &(b->kmer));

	if (diff < 0)
		return -1;
	if (diff > 0)
		return 1;
	return 0;
}
Example #5
0
/*
 * Returns the entry stored right after `current` when it carries the same
 * k-mer as `current`, NULL otherwise.
 *
 * The following entry is always read: a nodeID of 0 in it is treated as
 * "no further occurrence", and the k-mers are not compared in that case.
 */
KmerOccurence * getNextKmerOccurence(KmerOccurence * current) {
	KmerOccurence *candidate = &current[1];

	if (candidate->nodeID != 0
	    && compareKmers(&current->kmer, &candidate->kmer) == 0)
		return candidate;

	return NULL;
}
Example #6
0
/*
 * Top-down splay of the tree rooted at T around `kmer`.
 *
 * Walks down from the root towards `kmer`, rotating where the key lies
 * further down on the same side, and hanging the nodes it steps over
 * onto two side chains anchored in a local dummy node. The walk stops
 * on the node equal to `kmer`, or on the last node before a missing
 * child. The side chains are then re-attached under that node, which is
 * returned as the new root. Returns NULL for an empty tree.
 */
static SplayTree *Splay(Kmer * kmer, SplayTree * T)
{
	SplayNode anchor;
	SplayNode *leftTail;	/* last node hung on the "smaller" chain */
	SplayNode *rightTail;	/* last node hung on the "larger" chain */

	if (T == NULL)
		return NULL;

	anchor.left = NULL;
	anchor.right = NULL;
	leftTail = &anchor;
	rightTail = &anchor;

	for (;;) {
		if (compareKmers(kmer, &(T->kmer)) == 0)
			break;

		if (compareKmers(kmer, &(T->kmer)) < 0) {
			/* Key is on the left side */
			if (T->left == NULL)
				break;
			if (compareKmers(kmer, &(T->left->kmer)) < 0) {
				T = SingleRotateWithLeft(T);
				if (T->left == NULL)
					break;
			}
			/* Hang T on the right chain and step left */
			rightTail->left = T;
			rightTail = T;
			T = T->left;
		} else {
			/* Key is on the right side */
			if (T->right == NULL)
				break;
			if (compareKmers(kmer, &(T->right->kmer)) > 0) {
				T = SingleRotateWithRight(T);
				if (T->right == NULL)
					break;
			}
			/* Hang T on the left chain and step right */
			leftTail->right = T;
			leftTail = T;
			T = T->right;
		}
	}

	/* Put the pieces back together under the final node */
	leftTail->right = T->left;
	rightTail->left = T->right;
	T->left = anchor.right;
	T->right = anchor.left;

	return T;
}
Example #7
0
/*
 * Binary search for `kmer` in the occurrence array of `table`.
 *
 * The search window comes from the acceleration table when there is one
 * (entries key and key + 1, key being given by keyInAccelerationTable()),
 * and spans the whole array otherwise.
 *
 * On a hit, the result is moved back over any preceding entries that also
 * compare equal, so the first entry of a run of identical k-mers is
 * returned. Returns NULL when the k-mer is not found.
 */
KmerOccurence *findKmerInKmerOccurenceTable(Kmer * kmer,
						     KmerOccurenceTable *
						     table)
{
	KmerOccurence *entries = table->kmerTable;
	KmerKey bucket = keyInAccelerationTable(kmer, table);
	Coordinate low, high, probe;
	int order;

	if (table->accelerationTable == NULL) {
		low = 0;
		high = table->kmerTableSize;
	} else {
		low = table->accelerationTable[bucket];
		high = table->accelerationTable[bucket + 1];
	}

	while (low < high) {
		probe = (high + low) / 2;
		order = compareKmers(&(entries[probe].kmer), kmer);

		if (order == 0) {
			/* Rewind to the first entry of the run */
			while (probe > 0
			       && compareKmers(&(entries[probe - 1].kmer), kmer) == 0)
				probe--;
			return &(entries[probe]);
		}

		/* Single entry left and it did not match */
		if (low == probe)
			return NULL;

		if (order > 0)
			high = probe;
		else
			low = probe;
	}

	return NULL;
}
Example #8
0
int compareKmerOccurences(void const *A, void const *B)
{
	KmerOccurence *a = (KmerOccurence *) A;
	KmerOccurence *b = (KmerOccurence *) B;
	return compareKmers(&(a->kmer), &(b->kmer));
}
Example #9
0
/*
 * Threads one sequence through the graph, k-mer by k-mer.
 *
 * For each k-mer of tString (word length taken from the graph) a node and
 * a coordinate inside that node are looked up from one of three sources:
 *   - for category REFERENCE, findReferenceMapping() on (seqID, index),
 *     provided referenceMappings is non-NULL;
 *   - when the current annotation covers the position and its sequence ID
 *     lies within [-refCount, refCount], findReferenceMapping() on the
 *     annotated reference coordinate;
 *   - otherwise the k-mer occurrence table.
 * Whenever a node is found the graph is updated: for categories with
 * category / 2 >= CATEGORIES passage markers are created or extended,
 * for the others virtual coverage is incremented and, if readTracking is
 * set, read starts are recorded. An arc is created each time the walk
 * does not simply continue at the next coordinate of the same node.
 *
 * The walk stops at the first k-mer without a node once a node has been
 * found earlier. Sequences shorter than the word length are ignored.
 */
static void threadSequenceThroughGraph(TightString * tString,
				       KmerOccurenceTable * kmerTable,
				       Graph * graph,
				       IDnum seqID, Category category,
				       boolean readTracking,
				       boolean double_strand,
				       ReferenceMapping * referenceMappings,
				       Coordinate referenceMappingCount,
				       IDnum refCount,
				       Annotation * annotations,
				       IDnum annotationCount,
				       boolean second_in_pair)
{
	Kmer word;
	Kmer antiWord;
	Coordinate readNucleotideIndex;
	Coordinate kmerIndex;
	KmerOccurence *kmerOccurence;
	int wordLength = getWordLength(graph);

	PassageMarkerI marker = NULL_IDX;
	PassageMarkerI previousMarker = NULL_IDX;
	Node *node = NULL;
	Node *previousNode = NULL;
	Coordinate coord = 0;
	Coordinate previousCoord = 0;
	Nucleotide nucleotide;
	boolean reversed;

	IDnum refID;
	Coordinate refCoord = 0;
	ReferenceMapping * refMap;
	Annotation * annotation = annotations;
	Coordinate index = 0;
	Coordinate uniqueIndex = 0;
	Coordinate annotIndex = 0;
	IDnum annotCount = 0;
	SmallNodeList * nodePile = NULL;

	// Neglect any string shorter than WORDLENGTH :
	if (getLength(tString) < wordLength)
		return;

	clearKmer(&word);
	clearKmer(&antiWord);

	// Fill in the initial word (first wordLength - 1 nucleotides);
	// antiWord is only maintained when it can be looked up later
	for (readNucleotideIndex = 0;
	     readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
		nucleotide = getNucleotide(readNucleotideIndex, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand || second_in_pair) {
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}
	}

	// Go through sequence: each iteration completes one k-mer
	while (readNucleotideIndex < getLength(tString)) {
		nucleotide = getNucleotide(readNucleotideIndex++, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand || second_in_pair) {
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}

		// Update annotation if necessary: move on once the current
		// annotation has been consumed over its whole length
		if (annotCount < annotationCount && annotIndex == getAnnotationLength(annotation)) {
			annotation = getNextAnnotation(annotation);
			annotCount++;
			annotIndex = 0;
		}

		// Search for reference mapping
		if (category == REFERENCE) {
			if (referenceMappings) 
				refMap = findReferenceMapping(seqID, index, referenceMappings, referenceMappingCount);
			else 
				refMap = NULL;

			if (refMap) {
				node = getNodeInGraph(graph, refMap->nodeID);
				// The coordinate formula depends on the sign of the mapped node ID
				if (refMap->nodeID > 0) {
					coord = refMap->nodeStart + (index - refMap->referenceStart);
				} else {
					coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (index - refMap->referenceStart);
				}
			} else  {
				node = NULL;
				// Lookup failed after an earlier hit: stop threading
				if (previousNode)
					break;
			}
		}
		// Search for reference-based mapping
		else if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation) && getAnnotSequenceID(annotation) <= refCount && getAnnotSequenceID(annotation) >= -refCount) {
			refID = getAnnotSequenceID(annotation);
			// A negative refID walks the reference coordinate backwards
			if (refID > 0)
				refCoord = getStart(annotation) + annotIndex; 
			else
				refCoord = getStart(annotation) - annotIndex; 
			
			refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount);
			// If success
			if (refMap) {
				if (refID > 0) {
					node = getNodeInGraph(graph, refMap->nodeID);
					if (refMap->nodeID > 0) {
						coord = refMap->nodeStart + (refCoord - refMap->referenceStart);
					} else {
						coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (refCoord - refMap->referenceStart);
					}
				} else {
					// Negative refID: use the node with the opposite ID
					node = getNodeInGraph(graph, -refMap->nodeID);
					if (refMap->nodeID > 0) {
						coord =  getNodeLength(node) - refMap->nodeStart - (refCoord - refMap->referenceStart) - 1;
					} else {
						coord = refMap->nodeStart + refMap->length - (refCoord - refMap->referenceStart) - 1;
					}
				}
			} else  {
				node = NULL;
				// Lookup failed after an earlier hit: stop threading
				if (previousNode)
					break;
			}
		}		
		// Search in table
		else {
			reversed = false;
			if (double_strand) {
				// Look up whichever of word / antiWord compares lower
				if (compareKmers(&word, &antiWord) <= 0) {
					kmerOccurence =
					findKmerInKmerOccurenceTable(&word,
								       kmerTable);
				} else { 
					kmerOccurence =
					       findKmerInKmerOccurenceTable(&antiWord,
						kmerTable);
					reversed = true;
				}
			} else {
				// Single strand: second reads of a pair are looked up through antiWord
				if (!second_in_pair) {
					kmerOccurence =
					findKmerInKmerOccurenceTable(&word,
								       kmerTable);
				} else { 
					kmerOccurence =
					       findKmerInKmerOccurenceTable(&antiWord,
						kmerTable);
					reversed = true;
				}
			}
			
			if (kmerOccurence) {
				if (!reversed) {
					node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence));
					coord = getKmerOccurencePosition(kmerOccurence);
				} else {
					// Found through antiWord: opposite node ID, coordinate counted from the other end
					node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence));
					coord = getNodeLength(node) - getKmerOccurencePosition(kmerOccurence) - 1;
				}
			} else {
				node = NULL;
				// Lookup failed after an earlier hit: stop threading
				if (previousNode) 
					break;
			}
		}

		// Increment positions: inside an annotation advance annotIndex,
		// outside of it advance uniqueIndex
		if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation)) 
			annotIndex++;
		else
			uniqueIndex++;

		// Fill in graph
		if (node)
		{
#ifdef OPENMP
			lockNode(node);
#endif
			// readNucleotideIndex was already incremented past the
			// k-mer's last nucleotide, so this is the k-mer's start
			kmerIndex = readNucleotideIndex - wordLength;

			// Same node, next coordinate: the walk simply continues
			if (previousNode == node
			    && previousCoord == coord - 1) {
				if (category / 2 >= CATEGORIES) {
					// Extend the current passage marker
					setPassageMarkerFinish(marker,
							       kmerIndex +
							       1);
					setFinishOffset(marker,
							getNodeLength(node)
							- coord - 1);
				} else {
#ifndef SINGLE_COV_CAT
					incrementVirtualCoverage(node, category / 2, 1);
					incrementOriginalVirtualCoverage(node, category / 2, 1);
#else
					incrementVirtualCoverage(node, 1);
#endif
				}
#ifdef OPENMP
				unLockNode(node);
#endif
			} else {
				// New node, or a jump inside the same node
				if (category / 2 >= CATEGORIES) {
					// Start a new passage marker and chain it to the previous one
					marker =
					    newPassageMarker(seqID,
							     kmerIndex,
							     kmerIndex + 1,
							     coord,
							     getNodeLength
							     (node) -
							     coord - 1);
					transposePassageMarker(marker,
							       node);
					connectPassageMarkers
					    (previousMarker, marker,
					     graph);
					previousMarker = marker;
				} else {
					if (readTracking) {
						// Only the first visit of a node by this sequence adds a read start
						if (!isNodeMemorized(node, nodePile)) {
							addReadStart(node,
								     seqID,
								     coord,
								     graph,
								     kmerIndex);
							memorizeNode(node, &nodePile);
						} else {
							blurLastShortReadMarker
							    (node, graph);
						}
					}

#ifndef SINGLE_COV_CAT
					incrementVirtualCoverage(node, category / 2, 1);
					incrementOriginalVirtualCoverage(node, category / 2, 1);
#else
					incrementVirtualCoverage(node, 1);
#endif
				}
				// NOTE(review): node was locked by lockNode() above and is not
				// released with unLockNode() on this path - presumably
				// lockTwoNodes()/unLockTwoNodes() account for that; confirm.
#ifdef OPENMP
				lockTwoNodes(node, previousNode);
#endif
				createArc(previousNode, node, graph);
#ifdef OPENMP
				unLockTwoNodes(node, previousNode);
#endif
			}

			previousNode = node;
			previousCoord = coord;
		}
		index++;
	}

	// The node pile is only filled on the read tracking path above
	if (readTracking && category / 2 < CATEGORIES)
		unMemorizeNodes(&nodePile);
}
Example #10
0
/*
 * Walks one short read through the graph and, for every distinct node its
 * k-mers land on, calls incrementReadStartCount() once.
 *
 * Nodes are looked up either through an annotation-based reference mapping
 * (when the current annotation covers the position and its sequence ID
 * lies within [-refCount, refCount]) or through the k-mer occurrence
 * table. The walk stops at the first k-mer without a node once a node has
 * been found earlier.
 *
 * Nothing is done when:
 *   - read tracking is off and the category is even, or
 *     category / 2 >= CATEGORIES;
 *   - the read is shorter than the graph's word length.
 * The program exits with status 1 when the read is longer than USHRT_MAX.
 */
static void ghostThreadSequenceThroughGraph(TightString * tString,
					    KmerOccurenceTable *
					    kmerTable, Graph * graph,
					    IDnum seqID, Category category,
					    boolean readTracking,
					    boolean double_strand,
					    ReferenceMapping * referenceMappings,
					    Coordinate referenceMappingCount,
					    IDnum refCount,
					    Annotation * annotations,
					    IDnum annotationCount,
					    boolean second_in_pair)
{
	Kmer word;
	Kmer antiWord;
	Coordinate readNucleotideIndex;
	KmerOccurence *kmerOccurence;
	int wordLength = getWordLength(graph);
	Nucleotide nucleotide;
	IDnum refID;
	Coordinate refCoord;
	ReferenceMapping * refMap = NULL;
	Coordinate uniqueIndex = 0;
	Coordinate annotIndex = 0;
	IDnum annotCount = 0;
	boolean reversed;
	SmallNodeList * nodePile = NULL;
	Annotation * annotation = annotations;

	Node *node;
	Node *previousNode = NULL;

	// Neglect any read which will not be short paired
	if ((!readTracking && category % 2 == 0)
	    || category / 2 >= CATEGORIES)
		return;

	// Neglect any string shorter than WORDLENGTH :
	if (getLength(tString) < wordLength)
		return;

	// Verify that all short reads are reasonably short.
	// The limit reported is the one actually tested (USHRT_MAX).
	if (getLength(tString) > USHRT_MAX) {
		velvetLog("Short read of length %lli, longer than limit %i\n",
			  (long long) getLength(tString), (int) USHRT_MAX);
		velvetLog("You should better declare this sequence as long, because it genuinely is!\n");
		exit(1);
	}

	clearKmer(&word);
	clearKmer(&antiWord);

	// Fill in the initial word (first wordLength - 1 nucleotides);
	// antiWord is only maintained when it can be looked up later
	for (readNucleotideIndex = 0;
	     readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
		nucleotide = getNucleotide(readNucleotideIndex, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand || second_in_pair) {
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}
	}

	// Go through sequence: each iteration completes one k-mer
	while (readNucleotideIndex < getLength(tString)) {
		// Shift word:
		nucleotide = getNucleotide(readNucleotideIndex++, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand || second_in_pair) {
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}

		// Update annotation if necessary: move on once the current
		// annotation has been consumed over its whole length
		if (annotCount < annotationCount && annotIndex == getAnnotationLength(annotation)) {
			annotation = getNextAnnotation(annotation);
			annotCount++;
			annotIndex = 0;
		}

		// Search for reference mapping
		if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation) && getAnnotSequenceID(annotation) <= refCount && getAnnotSequenceID(annotation) >= -refCount) {
			refID = getAnnotSequenceID(annotation);
			// A negative refID walks the reference coordinate backwards
			if (refID > 0)
				refCoord = getStart(annotation) + annotIndex;
			else
				refCoord = getStart(annotation) - annotIndex;

			refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount);
			// If success
			if (refMap) {
				if (refID > 0)
					node = getNodeInGraph(graph, refMap->nodeID);
				else
					node = getNodeInGraph(graph, -refMap->nodeID);
			} else  {
				node = NULL;
				// Lookup failed after an earlier hit: stop
				if (previousNode)
					break;
			}
		}
		// if not.. look in table
		else {
			reversed = false;
			if (double_strand) {
				// Look up whichever of word / antiWord compares lower
				if (compareKmers(&word, &antiWord) <= 0) {
					kmerOccurence =
					    findKmerInKmerOccurenceTable(&word,
									 kmerTable);
				} else {
					kmerOccurence =
					    findKmerInKmerOccurenceTable(&antiWord,
									 kmerTable);
					reversed = true;
				}
			} else {
				// Single strand: second reads of a pair are
				// looked up through antiWord
				if (!second_in_pair) {
					kmerOccurence =
					    findKmerInKmerOccurenceTable(&word,
									 kmerTable);
				} else {
					kmerOccurence =
					    findKmerInKmerOccurenceTable(&antiWord,
									 kmerTable);
					reversed = true;
				}
			}

			if (kmerOccurence) {
				if (!reversed)
					node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence));
				else
					node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence));
			} else {
				node = NULL;
				// Lookup failed after an earlier hit: stop
				if (previousNode)
					break;
			}

		}

		// Inside an annotation advance annotIndex, outside of it
		// advance uniqueIndex
		if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation))
			annotIndex++;
		else
			uniqueIndex++;

		previousNode = node;

		// Fill in graph: count each node at most once per read
		if (node && !isNodeMemorized(node, nodePile))
		{
#ifdef OPENMP
			lockNode(node);
#endif
			incrementReadStartCount(node, graph);
#ifdef OPENMP
			unLockNode(node);
#endif
			memorizeNode(node, &nodePile);
		}
	}

	unMemorizeNodes(&nodePile);
}
Example #11
0
/*
 * Builds the sorted k-mer occurrence table of a pre-graph file.
 *
 * The file is read twice:
 *   1. to read the word length from the header line and count the k-mers
 *      of all node lines (the k-mers covered by the node masks are then
 *      subtracted from that count);
 *   2. to record, for every unmasked k-mer of every node, the k-mer with
 *      its node ID and position. With double_strand set, whichever of the
 *      k-mer and its reverse (antiWord) compares lower is recorded, the
 *      latter with a negated node ID and a position counted from the
 *      other end of the node.
 * Characters other than A, C, G and T are recorded as ADENINE.
 *
 * The table is sorted before being returned. Exits through exitErrorf()
 * when the file cannot be opened, is incomplete, or has a header from
 * which the word length cannot be read.
 */
static KmerOccurenceTable *referenceGraphKmers(char *preGraphFilename,
					       short int accelerationBits, Graph * graph, boolean double_strand, NodeMask * nodeMasks, Coordinate nodeMaskCount)
{
	FILE *file = fopen(preGraphFilename, "r");
	const int maxline = MAXLINE;
	char line[MAXLINE];
	// getc() returns an int: a char cannot tell EOF apart from a valid
	// byte, and can never equal EOF where char is unsigned
	int c;
	int wordLength;
	Coordinate lineLength, kmerCount;
	Kmer word;
	Kmer antiWord;
	KmerOccurenceTable *kmerTable;
	IDnum index;
	IDnum nodeID = 0;
	Nucleotide nucleotide;
	NodeMask * nodeMask = nodeMasks;
	Coordinate nodeMaskIndex = 0;

	if (file == NULL)
		exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename);

	// Count kmers
	velvetLog("Scanning pre-graph file %s for k-mers\n",
		  preGraphFilename);

	// First  line
	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
	// Without this check a malformed header would leave wordLength
	// uninitialised
	if (sscanf(line, "%*i\t%*i\t%i\n", &wordLength) != 1)
		exitErrorf(EXIT_FAILURE, true, "Could not read word length from PreGraph file header");

	kmerTable = newKmerOccurenceTable(accelerationBits, wordLength);

	// Read nodes
	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
	kmerCount = 0;
	while (line[0] == 'N') {
		// The sequence line may be longer than the line buffer,
		// so it is measured character by character
		lineLength = 0;
		while ((c = getc(file)) != EOF && c != '\n')
			lineLength++;
		kmerCount += lineLength - wordLength + 1;
		if (fgets(line, maxline, file) == NULL)
			break;
	}

	velvetLog("%li kmers found\n", (long) kmerCount);

	// Masked k-mers are not recorded, so they need no room in the table
	for (nodeMaskIndex = 0; nodeMaskIndex < nodeMaskCount; nodeMaskIndex++) {
		kmerCount -= nodeMasks[nodeMaskIndex].finish -
		    nodeMasks[nodeMaskIndex].start;
	}

	nodeMaskIndex = 0;

	fclose(file);

	// Create table
	allocateKmerOccurences(kmerCount, kmerTable);

	// Fill table
	file = fopen(preGraphFilename, "r");
	if (file == NULL)
		exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename);

	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");

	// Read nodes
	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
	while (line[0] == 'N') {
		nodeID++;

		// Fill in the initial word :
		clearKmer(&word);
		clearKmer(&antiWord);

		for (index = 0; index < wordLength - 1; index++) {
			c = getc(file);
			if (c == 'A')
				nucleotide = ADENINE;
			else if (c == 'C')
				nucleotide = CYTOSINE;
			else if (c == 'G')
				nucleotide = GUANINE;
			else if (c == 'T')
				nucleotide = THYMINE;
			else if (c == '\n' || c == EOF)
				// Sequence shorter than a k-mer, or file cut short
				exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
			else
				nucleotide = ADENINE;

			pushNucleotide(&word, nucleotide);
			if (double_strand) {
#ifdef COLOR
				reversePushNucleotide(&antiWord, nucleotide);
#else
				reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
			}
		}

		// Scan through node: each character completes one k-mer
		index = 0;
		while ((c = getc(file)) != '\n' && c != EOF) {
			if (c == 'A')
				nucleotide = ADENINE;
			else if (c == 'C')
				nucleotide = CYTOSINE;
			else if (c == 'G')
				nucleotide = GUANINE;
			else if (c == 'T')
				nucleotide = THYMINE;
			else
				nucleotide = ADENINE;

			pushNucleotide(&word, nucleotide);
			if (double_strand) {
#ifdef COLOR
				reversePushNucleotide(&antiWord, nucleotide);
#else
				reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
			}

			// Update mask if necessary: step to the next mask once
			// the current one lies entirely behind this position
			if (nodeMask) {
				if (nodeMask->nodeID < nodeID || (nodeMask->nodeID == nodeID && index >= nodeMask->finish)) {
					if (++nodeMaskIndex == nodeMaskCount)
						nodeMask = NULL;
					else
						nodeMask++;
				}
			}

			// Check if not masked!
			if (nodeMask) {
				if (nodeMask->nodeID == nodeID && index >= nodeMask->start && index < nodeMask->finish) {
					index++;
					continue;
				}
			}

			if (!double_strand || compareKmers(&word, &antiWord) <= 0)
				recordKmerOccurence(&word, nodeID, index, kmerTable);
			else
				recordKmerOccurence(&antiWord, -nodeID, getNodeLength(getNodeInGraph(graph, nodeID)) - 1 - index, kmerTable);

			index++;
		}

		if (fgets(line, maxline, file) == NULL)
			break;
	}

	fclose(file);

	// Sort table
	sortKmerOccurenceTable(kmerTable);

	return kmerTable;
}
Example #12
0
/*
 * Threads one sequence through the graph, k-mer by k-mer.
 *
 * Each k-mer of tString (word length taken from the graph) is looked up
 * in the sorted k-mer occurrence table to obtain a node and a coordinate
 * inside that node. Whenever a node is found the graph is updated: for
 * categories with category / 2 >= CATEGORIES passage markers are created
 * or extended, for the others virtual coverage is incremented and, if
 * readTracking is set, read starts are recorded. An arc is created each
 * time the walk does not simply continue at the next coordinate of the
 * same node.
 *
 * The walk stops at the first k-mer without a node once a node has been
 * found earlier. Sequences shorter than the word length are ignored.
 */
static void threadSequenceThroughGraph(TightString * tString,
				       KmerOccurenceTable * kmerOccurences,
				       Graph * graph,
				       IDnum seqID, Category category,
				       boolean readTracking,
				       boolean double_strand)
{
	Kmer word;
	Kmer antiWord;
	Coordinate readNucleotideIndex;
	Coordinate kmerIndex;
	KmerOccurence *kmerOccurence;
	int wordLength = getWordLength(graph);

	PassageMarker *marker = NULL;
	PassageMarker *previousMarker = NULL;
	Node *node;
	Node *previousNode = NULL;
	Coordinate coord;
	Coordinate previousCoord = 0;
	Nucleotide nucleotide;

	clearKmer(&word);
	clearKmer(&antiWord);

	// Neglect any string shorter than WORDLENGTH :
	if (getLength(tString) < wordLength)
		return;

	// Fill in the initial word (first wordLength - 1 nucleotides);
	// antiWord is only maintained in double strand mode
	for (readNucleotideIndex = 0;
	     readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
		nucleotide = getNucleotide(readNucleotideIndex, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand) {
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}
	}

	// Go through sequence: each iteration completes one k-mer
	while (readNucleotideIndex < getLength(tString)) {
		nucleotide = getNucleotide(readNucleotideIndex++, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand) {
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}

		// Search in table. The lookup result is assigned inside the
		// condition: word is searched when single stranded or when it
		// compares lower than or equal to antiWord, antiWord otherwise.
		if ((!double_strand || compareKmers(&word, &antiWord) <= 0)
		    && (kmerOccurence =
			findKmerOccurenceInSortedTable(&word,
						       kmerOccurences))) {
			node =
			    getNodeInGraph(graph, kmerOccurence->nodeID);
			coord = kmerOccurence->position;
		} else if ((double_strand && compareKmers(&word, &antiWord) > 0)
			   && (kmerOccurence =
			       findKmerOccurenceInSortedTable(&antiWord,
							      kmerOccurences)))
		{
			// Found through antiWord: opposite node ID, coordinate
			// counted from the other end of the node
			node =
			    getNodeInGraph(graph, -kmerOccurence->nodeID);
			coord =
			    getNodeLength(node) - kmerOccurence->position -
			    1;
		} else {
			node = NULL;
			// Lookup failed after an earlier hit: stop threading
			if (previousNode) {
				break;
			}
		}

		// Fill in graph
		if (node) {
			// readNucleotideIndex was already incremented past the
			// k-mer's last nucleotide, so this is the k-mer's start
			kmerIndex = readNucleotideIndex - wordLength;

			// Same node, next coordinate: the walk simply continues
			if (previousNode == node
			    && previousCoord == coord - 1) {
				if (category / 2 >= CATEGORIES) {
					// Extend the current passage marker
					setPassageMarkerFinish(marker,
							       kmerIndex +
							       1);
					setFinishOffset(marker,
							getNodeLength(node)
							- coord - 1);
				} else {
					incrementVirtualCoverage(node,
								 category /
								 2, 1);
					incrementOriginalVirtualCoverage
					    (node, category / 2, 1);
				}

			} else {
				// New node, or a jump inside the same node
				if (category / 2 >= CATEGORIES) {
					// Start a new passage marker and chain it to the previous one
					marker =
					    newPassageMarker(seqID,
							     kmerIndex,
							     kmerIndex + 1,
							     coord,
							     getNodeLength
							     (node) -
							     coord - 1);
					transposePassageMarker(marker,
							       node);
					connectPassageMarkers
					    (previousMarker, marker,
					     graph);
					previousMarker = marker;
				} else {
					if (readTracking) {
						// The node status flags nodes already visited by this
						// sequence: only the first visit adds a read start
						if (!getNodeStatus(node)) {
							addReadStart(node,
								     seqID,
								     coord,
								     graph,
								     kmerIndex);
							setSingleNodeStatus
							    (node, true);
							memorizeNode(node);
						} else {
							blurLastShortReadMarker
							    (node, graph);
						}
					}

					incrementVirtualCoverage(node,
								 category /
								 2, 1);
					incrementOriginalVirtualCoverage
					    (node, category / 2, 1);
				}

				createArc(previousNode, node, graph);
			}

			previousNode = node;
			previousCoord = coord;
		}
	}

	// NOTE(review): presumably resets the status of the nodes passed to
	// memorizeNode() above - confirm against its definition
	unlockMemorizedNodes();
}
Example #13
0
/*
 * Walks one short read through the graph and, for every node its k-mers
 * land on whose status is not yet set, calls incrementReadStartCount()
 * once, sets the node status and memorizes the node.
 *
 * Nodes are looked up in the sorted k-mer occurrence table. The walk
 * stops at the first k-mer without a node once a node has been found
 * earlier.
 *
 * Nothing is done when:
 *   - read tracking is off and the category is even, or
 *     category / 2 >= CATEGORIES;
 *   - the read is shorter than the graph's word length.
 * The program exits with status 1 when the read is longer than USHRT_MAX.
 */
static void ghostThreadSequenceThroughGraph(TightString * tString,
					    KmerOccurenceTable *
					    kmerOccurences, Graph * graph,
					    IDnum seqID, Category category,
					    boolean readTracking,
					    boolean double_strand)
{
	Kmer word;
	Kmer antiWord;
	Coordinate readNucleotideIndex;
	KmerOccurence *kmerOccurence;
	int wordLength = getWordLength(graph);
	Nucleotide nucleotide;

	Node *node;
	Node *previousNode = NULL;

	clearKmer(&word);
	clearKmer(&antiWord);

	// Neglect any read which will not be short paired
	if ((!readTracking && category % 2 == 0)
	    || category / 2 >= CATEGORIES)
		return;

	// Neglect any string shorter than WORDLENGTH :
	if (getLength(tString) < wordLength)
		return;

	// Verify that all short reads are reasonably short.
	// The limit reported is the one actually tested (USHRT_MAX).
	if (getLength(tString) > USHRT_MAX) {
		printf("Short read of length %lli, longer than limit %i\n",
		       (long long) getLength(tString), (int) USHRT_MAX);
		puts("You should better declare this sequence as long, because it genuinely is!");
		exit(1);
	}
	// Allocate memory for the read pairs
	if (!readStartsAreActivated(graph))
		activateReadStarts(graph);

	// Fill in the initial word (first wordLength - 1 nucleotides);
	// antiWord is only maintained in double strand mode
	for (readNucleotideIndex = 0;
	     readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
		nucleotide = getNucleotide(readNucleotideIndex, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand) {
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}
	}

	// Go through sequence: each iteration completes one k-mer
	while (readNucleotideIndex < getLength(tString)) {
		// Shift word:
		nucleotide = getNucleotide(readNucleotideIndex++, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand) {
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}

		// Search in table. The lookup result is assigned inside the
		// condition: word is searched when single stranded or when it
		// compares lower than or equal to antiWord, antiWord otherwise.
		if ((!double_strand || compareKmers(&word, &antiWord) <= 0)
		    && (kmerOccurence =
			findKmerOccurenceInSortedTable(&word,
						       kmerOccurences))) {
			node =
			    getNodeInGraph(graph, kmerOccurence->nodeID);
		} else if ((double_strand && compareKmers(&word, &antiWord) > 0)
			   && (kmerOccurence =
			       findKmerOccurenceInSortedTable(&antiWord,
							      kmerOccurences)))
		{
			// Found through antiWord: use the opposite node ID
			node =
			    getNodeInGraph(graph, -kmerOccurence->nodeID);
		} else {
			node = NULL;
			// Lookup failed after an earlier hit: stop
			if (previousNode)
				break;
		}

		previousNode = node;

		// Fill in graph: the node status keeps a node from being
		// counted twice for the same read
		if (node && !getNodeStatus(node)) {
			incrementReadStartCount(node, graph);
			setSingleNodeStatus(node, true);
			memorizeNode(node);
		}
	}

	unlockMemorizedNodes();
}
Example #14
0
/*
 * Builds the sorted k-mer occurrence table of a pre-graph file, together
 * with its optional acceleration table.
 *
 * The file is read twice:
 *   1. to read the word length from the header line and count the k-mers
 *      of all node lines, which gives the size of the table;
 *   2. to store, for every k-mer of every node, the k-mer with its node
 *      ID and position. With double_strand set, whichever of the k-mer
 *      and its reverse (antiWord) compares lower is stored, the latter
 *      with a negated node ID and a position counted from the other end
 *      of the node.
 * Characters other than A, C, G and T are stored as ADENINE.
 *
 * accelerationBits is capped at 2 * wordLength and at 32. When it is
 * positive, an acceleration table with 2^accelerationBits + 1 entries is
 * filled after sorting: entry h holds the index of the first occurrence
 * whose key (keyInAccelerationTable) is at least h, the trailing entries
 * hold the table size.
 *
 * Exits through exitErrorf() when the file cannot be opened, is
 * incomplete, or has a header from which the word length cannot be read.
 */
static KmerOccurenceTable *referenceGraphKmers(char *preGraphFilename,
					       short int accelerationBits, Graph * graph, boolean double_strand)
{
	FILE *file = fopen(preGraphFilename, "r");
	const int maxline = MAXLINE;
	char line[MAXLINE];
	// getc() returns an int: a char cannot tell EOF apart from a valid
	// byte, and can never equal EOF where char is unsigned
	int c;
	int wordLength;
	Coordinate lineLength, kmerCount;
	Kmer word;
	Kmer antiWord;
	KmerOccurenceTable *kmerTable = NULL;
	KmerOccurence *kmerOccurences, *kmerOccurencePtr;
	Coordinate kmerOccurenceIndex;
	IDnum index;
	IDnum nodeID = 0;
	IDnum *accelPtr = NULL;
	KmerKey lastHeader = 0;
	KmerKey header;
	Nucleotide nucleotide;

	if (file == NULL)
		exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename);

	// Count kmers
	printf("Scanning pre-graph file %s for k-mers\n",
	       preGraphFilename);

	// First  line
	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
	// Without this check a malformed header would leave wordLength
	// uninitialised
	if (sscanf(line, "%*i\t%*i\t%i\n", &wordLength) != 1)
		exitErrorf(EXIT_FAILURE, true, "Could not read word length from PreGraph file header");

	// Initialize kmer occurence table:
	kmerTable = mallocOrExit(1, KmerOccurenceTable);
	if (accelerationBits > 2 * wordLength)
		accelerationBits = 2 * wordLength;

	if (accelerationBits > 32)
		accelerationBits = 32;

	if (accelerationBits > 0) {
		kmerTable->accelerationBits = accelerationBits;
		// One extra entry so that entry key + 1 always exists
		kmerTable->accelerationTable =
		    callocOrExit((((size_t) 1) << accelerationBits) + 1,
			   IDnum);
		accelPtr = kmerTable->accelerationTable;
		kmerTable->accelerationShift =
		    (short int) 2 * wordLength - accelerationBits;
	} else {
		kmerTable->accelerationBits = 0;
		kmerTable->accelerationTable = NULL;
		kmerTable->accelerationShift = 0;
	}

	// Read nodes
	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
	kmerCount = 0;
	while (line[0] == 'N') {
		// The sequence line may be longer than the line buffer,
		// so it is measured character by character
		lineLength = 0;
		while ((c = getc(file)) != EOF && c != '\n')
			lineLength++;
		kmerCount += lineLength - wordLength + 1;
		if (fgets(line, maxline, file) == NULL)
			break;
	}
	fclose(file);

	// Create table
	printf("%li kmers found\n", (long) kmerCount);
	kmerOccurences = callocOrExit(kmerCount, KmerOccurence);
	kmerOccurencePtr = kmerOccurences;
	kmerOccurenceIndex = 0;
	kmerTable->kmerTable = kmerOccurences;
	kmerTable->kmerTableSize = kmerCount;

	// Fill table
	file = fopen(preGraphFilename, "r");
	if (file == NULL)
		exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename);

	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");

	// Read nodes
	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
	while (line[0] == 'N') {
		nodeID++;

		// Fill in the initial word :
		clearKmer(&word);
		clearKmer(&antiWord);

		for (index = 0; index < wordLength - 1; index++) {
			c = getc(file);
			if (c == 'A')
				nucleotide = ADENINE;
			else if (c == 'C')
				nucleotide = CYTOSINE;
			else if (c == 'G')
				nucleotide = GUANINE;
			else if (c == 'T')
				nucleotide = THYMINE;
			else if (c == '\n' || c == EOF)
				// Sequence shorter than a k-mer, or file cut short
				exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
			else
				nucleotide = ADENINE;

			pushNucleotide(&word, nucleotide);
			if (double_strand) {
#ifdef COLOR
				reversePushNucleotide(&antiWord, nucleotide);
#else
				reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
			}
		}

		// Scan through node: each character completes one k-mer
		index = 0;
		while ((c = getc(file)) != '\n' && c != EOF) {
			if (c == 'A')
				nucleotide = ADENINE;
			else if (c == 'C')
				nucleotide = CYTOSINE;
			else if (c == 'G')
				nucleotide = GUANINE;
			else if (c == 'T')
				nucleotide = THYMINE;
			else
				nucleotide = ADENINE;

			pushNucleotide(&word, nucleotide);
			if (double_strand) {
#ifdef COLOR
				reversePushNucleotide(&antiWord, nucleotide);
#else
				reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
			}

			if (!double_strand || compareKmers(&word, &antiWord) <= 0) {
				copyKmers(&kmerOccurencePtr->kmer, &word);
				kmerOccurencePtr->nodeID = nodeID;
				kmerOccurencePtr->position =
				    index;
			} else {
				// Store the reverse k-mer: negated node ID and
				// position counted from the other end
				copyKmers(&kmerOccurencePtr->kmer, &antiWord);
				kmerOccurencePtr->nodeID = -nodeID;
				kmerOccurencePtr->position =
				    getNodeLength(getNodeInGraph(graph, nodeID)) - 1 - index;
			}

			kmerOccurencePtr++;
			kmerOccurenceIndex++;
			index++;
		}

		if (fgets(line, maxline, file) == NULL)
			break;
	}

	fclose(file);

	// Sort table
	qsort(kmerOccurences, kmerCount, sizeof(KmerOccurence),
	      compareKmerOccurences);

	// Fill up acceleration table
	if (kmerTable->accelerationTable != NULL) {
		*accelPtr = (IDnum) 0;
		for (kmerOccurenceIndex = 0;
		     kmerOccurenceIndex < kmerCount;
		     kmerOccurenceIndex++) {
			header =
			    keyInAccelerationTable(&kmerOccurences
						   [kmerOccurenceIndex].
						   kmer, kmerTable);
			// Every key value passed over since the previous
			// occurrence starts at the current index
			while (lastHeader < header) {
				lastHeader++;
				accelPtr++;
				*accelPtr = kmerOccurenceIndex;
			}
		}

		// Remaining keys have no occurrence: point them past the end
		while (lastHeader < (KmerKey) 1 << accelerationBits) {
			lastHeader++;
			accelPtr++;
			*accelPtr = kmerCount;
		}
	}

	return kmerTable;
}