static KmerOccurence *findKmerOccurenceInSortedTable(Kmer * kmer, KmerOccurenceTable * table) { KmerOccurence *array = table->kmerTable; KmerKey key = keyInAccelerationTable(kmer, table); Coordinate leftIndex, rightIndex, middleIndex; if (table->accelerationTable != NULL) { leftIndex = table->accelerationTable[key]; rightIndex = table->accelerationTable[key + 1]; } else { leftIndex = 0; rightIndex = table->kmerTableSize; } while (true) { middleIndex = (rightIndex + leftIndex) / 2; if (leftIndex >= rightIndex) return NULL; else if (compareKmers(&(array[middleIndex]).kmer, kmer) == 0) return &(array[middleIndex]); else if (leftIndex == middleIndex) return NULL; else if (compareKmers(&(array[middleIndex]).kmer, kmer) > 0) rightIndex = middleIndex; else leftIndex = middleIndex; } }
void insertIntoTree(Kmer * kmer, SplayTree ** T) { SplayNode *newNode; if (*T == NULL) { newNode = allocateSplayNode(); copyKmers(&(newNode->kmer), kmer); newNode->left = newNode->right = NULL; *T = newNode; return; } *T = Splay(kmer, *T); if (compareKmers(kmer, &((*T)->kmer)) < 0) { newNode = allocateSplayNode(); copyKmers(&(newNode->kmer), kmer); newNode->left = (*T)->left; newNode->right = *T; (*T)->left = NULL; *T = newNode; } else if (compareKmers(&((*T)->kmer), kmer) < 0) { newNode = allocateSplayNode(); copyKmers(&(newNode->kmer), kmer); newNode->right = (*T)->right; newNode->left = *T; (*T)->right = NULL; *T = newNode; } }
boolean findOrInsertOccurenceInSplayTree(Kmer * kmer, IDnum * seqID, Coordinate * position, SplayTree ** T) { SplayNode *newNode; if (*T == NULL) { newNode = allocateSplayNode(); copyKmers(&(newNode->kmer), kmer); newNode->seqID = *seqID; newNode->position = *position; newNode->left = newNode->right = NULL; *T = newNode; return false; } *T = Splay(kmer, *T); if (compareKmers(kmer, &((*T)->kmer)) < 0) { newNode = allocateSplayNode(); copyKmers(&(newNode->kmer), kmer); newNode->seqID = *seqID; newNode->position = *position; newNode->left = (*T)->left; newNode->right = *T; (*T)->left = NULL; *T = newNode; printf("1: sequenceID = %d\n", *seqID); return false; } else if (compareKmers(kmer, &((*T)->kmer)) > 0) { newNode = allocateSplayNode(); copyKmers(&(newNode->kmer), kmer); newNode->seqID = *seqID; newNode->position = *position; newNode->right = (*T)->right; newNode->left = *T; (*T)->right = NULL; *T = newNode; printf("2: sequenceID = %d\n", *seqID); return false; } else { *seqID = (*T)->seqID; *position = (*T)->position; printf("3: sequenceID = %d\n", *seqID); return true; } }
int compareKmerOccurences(void const *A, void const *B) { KmerOccurence *a = (KmerOccurence *) A; KmerOccurence *b = (KmerOccurence *) B; if (compareKmers(&(a->kmer), &(b->kmer)) < 0) return -1; else if (compareKmers(&(a->kmer), &(b->kmer)) > 0) return 1; else return 0; }
KmerOccurence * getNextKmerOccurence(KmerOccurence * current) { register KmerOccurence * next = current + 1; if (next->nodeID == 0) return NULL; if (compareKmers(¤t->kmer, &next->kmer)) return NULL; return next; }
static SplayTree *Splay(Kmer * kmer, SplayTree * T) { SplayNode Header; SplayNode *LeftTreeMax, *RightTreeMin; if (T == NULL) return NULL; Header.left = Header.right = NULL; LeftTreeMax = RightTreeMin = &Header; while (compareKmers(kmer, &(T->kmer))) { if (compareKmers(kmer, &(T->kmer)) < 0) { if (T->left == NULL) break; if (compareKmers(kmer, &(T->left->kmer)) < 0) T = SingleRotateWithLeft(T); if (T->left == NULL) break; /* Link right */ RightTreeMin->left = T; RightTreeMin = T; T = T->left; } else { if (T->right == NULL) break; if (compareKmers(kmer, &(T->right->kmer)) > 0) T = SingleRotateWithRight(T); if (T->right == NULL) break; /* Link left */ LeftTreeMax->right = T; LeftTreeMax = T; T = T->right; } } /* while kmer != T->kmer */ /* Reassemble */ LeftTreeMax->right = T->left; RightTreeMin->left = T->right; T->left = Header.right; T->right = Header.left; return T; }
KmerOccurence *findKmerInKmerOccurenceTable(Kmer * kmer, KmerOccurenceTable * table) { KmerOccurence *array = table->kmerTable; KmerKey key = keyInAccelerationTable(kmer, table); Coordinate leftIndex, rightIndex, middleIndex; int diff; if (table->accelerationTable != NULL) { leftIndex = table->accelerationTable[key]; rightIndex = table->accelerationTable[key + 1]; } else { leftIndex = 0; rightIndex = table->kmerTableSize; } while (true) { middleIndex = (rightIndex + leftIndex) / 2; if (leftIndex >= rightIndex) return NULL; diff = compareKmers(&(array[middleIndex].kmer), kmer); if (diff == 0) { while (middleIndex > 0 && compareKmers(&(array[middleIndex - 1].kmer), kmer) == 0) middleIndex--; return &(array[middleIndex]); } else if (leftIndex == middleIndex) return NULL; else if (diff > 0) rightIndex = middleIndex; else leftIndex = middleIndex; } }
int compareKmerOccurences(void const *A, void const *B) { KmerOccurence *a = (KmerOccurence *) A; KmerOccurence *b = (KmerOccurence *) B; return compareKmers(&(a->kmer), &(b->kmer)); }
static void threadSequenceThroughGraph(TightString * tString, KmerOccurenceTable * kmerTable, Graph * graph, IDnum seqID, Category category, boolean readTracking, boolean double_strand, ReferenceMapping * referenceMappings, Coordinate referenceMappingCount, IDnum refCount, Annotation * annotations, IDnum annotationCount, boolean second_in_pair) { Kmer word; Kmer antiWord; Coordinate readNucleotideIndex; Coordinate kmerIndex; KmerOccurence *kmerOccurence; int wordLength = getWordLength(graph); PassageMarkerI marker = NULL_IDX; PassageMarkerI previousMarker = NULL_IDX; Node *node = NULL; Node *previousNode = NULL; Coordinate coord = 0; Coordinate previousCoord = 0; Nucleotide nucleotide; boolean reversed; IDnum refID; Coordinate refCoord = 0; ReferenceMapping * refMap; Annotation * annotation = annotations; Coordinate index = 0; Coordinate uniqueIndex = 0; Coordinate annotIndex = 0; IDnum annotCount = 0; SmallNodeList * nodePile = NULL; // Neglect any string shorter than WORDLENGTH : if (getLength(tString) < wordLength) return; clearKmer(&word); clearKmer(&antiWord); // Fill in the initial word : for (readNucleotideIndex = 0; readNucleotideIndex < wordLength - 1; readNucleotideIndex++) { nucleotide = getNucleotide(readNucleotideIndex, tString); pushNucleotide(&word, nucleotide); if (double_strand || second_in_pair) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Go through sequence while (readNucleotideIndex < getLength(tString)) { nucleotide = getNucleotide(readNucleotideIndex++, tString); pushNucleotide(&word, nucleotide); if (double_strand || second_in_pair) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } // Update annotation if necessary if (annotCount < annotationCount && annotIndex == getAnnotationLength(annotation)) { annotation = getNextAnnotation(annotation); annotCount++; annotIndex = 0; } // Search for reference mapping if (category == REFERENCE) { if (referenceMappings) refMap = findReferenceMapping(seqID, index, referenceMappings, referenceMappingCount); else refMap = NULL; if (refMap) { node = getNodeInGraph(graph, refMap->nodeID); if (refMap->nodeID > 0) { coord = refMap->nodeStart + (index - refMap->referenceStart); } else { coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (index - refMap->referenceStart); } } else { node = NULL; if (previousNode) break; } } // Search for reference-based mapping else if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation) && getAnnotSequenceID(annotation) <= refCount && getAnnotSequenceID(annotation) >= -refCount) { refID = getAnnotSequenceID(annotation); if (refID > 0) refCoord = getStart(annotation) + annotIndex; else refCoord = getStart(annotation) - annotIndex; refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount); // If success if (refMap) { if (refID > 0) { node = getNodeInGraph(graph, refMap->nodeID); if (refMap->nodeID > 0) { coord = refMap->nodeStart + (refCoord - refMap->referenceStart); } else { coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (refCoord - refMap->referenceStart); } } else { node = getNodeInGraph(graph, -refMap->nodeID); if (refMap->nodeID > 0) { coord = getNodeLength(node) - refMap->nodeStart - (refCoord - refMap->referenceStart) - 1; } else { coord = refMap->nodeStart + refMap->length - (refCoord - refMap->referenceStart) - 1; } } } else { node = NULL; if (previousNode) break; } } // Search in table else { reversed = false; if (double_strand) { if (compareKmers(&word, &antiWord) <= 0) { kmerOccurence = findKmerInKmerOccurenceTable(&word, kmerTable); } else { kmerOccurence = findKmerInKmerOccurenceTable(&antiWord, kmerTable); reversed = true; } } else { if (!second_in_pair) { kmerOccurence = findKmerInKmerOccurenceTable(&word, kmerTable); } else { kmerOccurence = findKmerInKmerOccurenceTable(&antiWord, kmerTable); reversed = true; } } if (kmerOccurence) { if (!reversed) { node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence)); coord = getKmerOccurencePosition(kmerOccurence); } else { node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence)); coord = getNodeLength(node) - getKmerOccurencePosition(kmerOccurence) - 1; } } else { node = NULL; if (previousNode) break; } } // Increment positions if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation)) annotIndex++; else uniqueIndex++; // Fill in graph if (node) { #ifdef OPENMP lockNode(node); #endif kmerIndex = readNucleotideIndex - wordLength; if (previousNode == node && previousCoord == coord - 1) { if (category / 2 >= CATEGORIES) { setPassageMarkerFinish(marker, kmerIndex + 1); setFinishOffset(marker, getNodeLength(node) - coord - 1); } else { #ifndef SINGLE_COV_CAT incrementVirtualCoverage(node, category / 2, 1); incrementOriginalVirtualCoverage(node, category / 2, 1); #else incrementVirtualCoverage(node, 1); #endif } #ifdef OPENMP unLockNode(node); #endif } else { if (category / 2 >= CATEGORIES) { marker = newPassageMarker(seqID, kmerIndex, kmerIndex + 1, coord, getNodeLength (node) - coord - 1); transposePassageMarker(marker, node); connectPassageMarkers (previousMarker, marker, graph); previousMarker = marker; } else { if (readTracking) { if (!isNodeMemorized(node, nodePile)) { addReadStart(node, seqID, coord, graph, kmerIndex); memorizeNode(node, &nodePile); } else { blurLastShortReadMarker (node, graph); } } #ifndef SINGLE_COV_CAT incrementVirtualCoverage(node, category / 2, 1); incrementOriginalVirtualCoverage(node, category / 2, 1); #else incrementVirtualCoverage(node, 1); #endif } #ifdef OPENMP lockTwoNodes(node, previousNode); #endif createArc(previousNode, node, graph); #ifdef OPENMP unLockTwoNodes(node, previousNode); #endif } previousNode = node; previousCoord = coord; } index++; } if (readTracking && category / 2 < CATEGORIES) unMemorizeNodes(&nodePile); }
static void ghostThreadSequenceThroughGraph(TightString * tString, KmerOccurenceTable * kmerTable, Graph * graph, IDnum seqID, Category category, boolean readTracking, boolean double_strand, ReferenceMapping * referenceMappings, Coordinate referenceMappingCount, IDnum refCount, Annotation * annotations, IDnum annotationCount, boolean second_in_pair) { Kmer word; Kmer antiWord; Coordinate readNucleotideIndex; KmerOccurence *kmerOccurence; int wordLength = getWordLength(graph); Nucleotide nucleotide; IDnum refID; Coordinate refCoord; ReferenceMapping * refMap = NULL; Coordinate uniqueIndex = 0; Coordinate annotIndex = 0; IDnum annotCount = 0; boolean reversed; SmallNodeList * nodePile = NULL; Annotation * annotation = annotations; Node *node; Node *previousNode = NULL; // Neglect any read which will not be short paired if ((!readTracking && category % 2 == 0) || category / 2 >= CATEGORIES) return; // Neglect any string shorter than WORDLENGTH : if (getLength(tString) < wordLength) return; // Verify that all short reads are reasonnably short if (getLength(tString) > USHRT_MAX) { velvetLog("Short read of length %lli, longer than limit %i\n", (long long) getLength(tString), SHRT_MAX); velvetLog("You should better declare this sequence as long, because it genuinely is!\n"); exit(1); } clearKmer(&word); clearKmer(&antiWord); // Fill in the initial word : for (readNucleotideIndex = 0; readNucleotideIndex < wordLength - 1; readNucleotideIndex++) { nucleotide = getNucleotide(readNucleotideIndex, tString); pushNucleotide(&word, nucleotide); if (double_strand || second_in_pair) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Go through sequence while (readNucleotideIndex < getLength(tString)) { // Shift word: nucleotide = getNucleotide(readNucleotideIndex++, tString); pushNucleotide(&word, nucleotide); if (double_strand || second_in_pair) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } // Update annotation if necessary if (annotCount < annotationCount && annotIndex == getAnnotationLength(annotation)) { annotation = getNextAnnotation(annotation); annotCount++; annotIndex = 0; } // Search for reference mapping if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation) && getAnnotSequenceID(annotation) <= refCount && getAnnotSequenceID(annotation) >= -refCount) { refID = getAnnotSequenceID(annotation); if (refID > 0) refCoord = getStart(annotation) + annotIndex; else refCoord = getStart(annotation) - annotIndex; refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount); // If success if (refMap) { if (refID > 0) node = getNodeInGraph(graph, refMap->nodeID); else node = getNodeInGraph(graph, -refMap->nodeID); } else { node = NULL; if (previousNode) break; } } // if not.. look in table else { reversed = false; if (double_strand) { if (compareKmers(&word, &antiWord) <= 0) { kmerOccurence = findKmerInKmerOccurenceTable(&word, kmerTable); } else { kmerOccurence = findKmerInKmerOccurenceTable(&antiWord, kmerTable); reversed = true; } } else { if (!second_in_pair) { kmerOccurence = findKmerInKmerOccurenceTable(&word, kmerTable); } else { kmerOccurence = findKmerInKmerOccurenceTable(&antiWord, kmerTable); reversed = true; } } if (kmerOccurence) { if (!reversed) node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence)); else node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence)); } else { node = NULL; if (previousNode) break; } } if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation)) annotIndex++; else uniqueIndex++; previousNode = node; // Fill in graph if (node && !isNodeMemorized(node, nodePile)) { #ifdef OPENMP lockNode(node); #endif incrementReadStartCount(node, graph); #ifdef OPENMP unLockNode(node); #endif memorizeNode(node, &nodePile); } } unMemorizeNodes(&nodePile); }
static KmerOccurenceTable *referenceGraphKmers(char *preGraphFilename, short int accelerationBits, Graph * graph, boolean double_strand, NodeMask * nodeMasks, Coordinate nodeMaskCount) { FILE *file = fopen(preGraphFilename, "r"); const int maxline = MAXLINE; char line[MAXLINE]; char c; int wordLength; Coordinate lineLength, kmerCount; Kmer word; Kmer antiWord; KmerOccurenceTable *kmerTable; IDnum index; IDnum nodeID = 0; Nucleotide nucleotide; NodeMask * nodeMask = nodeMasks; Coordinate nodeMaskIndex = 0; if (file == NULL) exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename); // Count kmers velvetLog("Scanning pre-graph file %s for k-mers\n", preGraphFilename); // First line if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); sscanf(line, "%*i\t%*i\t%i\n", &wordLength); kmerTable = newKmerOccurenceTable(accelerationBits, wordLength); // Read nodes if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); kmerCount = 0; while (line[0] == 'N') { lineLength = 0; while ((c = getc(file)) != EOF && c != '\n') lineLength++; kmerCount += lineLength - wordLength + 1; if (fgets(line, maxline, file) == NULL) break; } velvetLog("%li kmers found\n", (long) kmerCount); for(nodeMaskIndex = 0; nodeMaskIndex < nodeMaskCount; nodeMaskIndex++) { kmerCount -= nodeMasks[nodeMaskIndex].finish - nodeMasks[nodeMaskIndex].start; } nodeMaskIndex = 0; fclose(file); // Create table allocateKmerOccurences(kmerCount, kmerTable); // Fill table file = fopen(preGraphFilename, "r"); if (file == NULL) exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename); if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); // Read nodes if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); while (line[0] == 'N') { nodeID++; // Fill in the initial word : clearKmer(&word); clearKmer(&antiWord); for (index = 0; index < wordLength - 1; index++) { c = getc(file); if (c == 'A') nucleotide = ADENINE; else if (c == 'C') nucleotide = CYTOSINE; else if (c == 'G') nucleotide = GUANINE; else if (c == 'T') nucleotide = THYMINE; else if (c == '\n') exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); else nucleotide = ADENINE; pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Scan through node index = 0; while((c = getc(file)) != '\n' && c != EOF) { if (c == 'A') nucleotide = ADENINE; else if (c == 'C') nucleotide = CYTOSINE; else if (c == 'G') nucleotide = GUANINE; else if (c == 'T') nucleotide = THYMINE; else nucleotide = ADENINE; pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } // Update mask if necessary if (nodeMask) { if (nodeMask->nodeID < nodeID || (nodeMask->nodeID == nodeID && index >= nodeMask->finish)) { if (++nodeMaskIndex == nodeMaskCount) nodeMask = NULL; else nodeMask++; } } // Check if not masked! if (nodeMask) { if (nodeMask->nodeID == nodeID && index >= nodeMask->start && index < nodeMask->finish) { index++; continue; } } if (!double_strand || compareKmers(&word, &antiWord) <= 0) recordKmerOccurence(&word, nodeID, index, kmerTable); else recordKmerOccurence(&antiWord, -nodeID, getNodeLength(getNodeInGraph(graph, nodeID)) - 1 - index, kmerTable); index++; } if (fgets(line, maxline, file) == NULL) break; } fclose(file); // Sort table sortKmerOccurenceTable(kmerTable); return kmerTable; }
static void threadSequenceThroughGraph(TightString * tString, KmerOccurenceTable * kmerOccurences, Graph * graph, IDnum seqID, Category category, boolean readTracking, boolean double_strand) { Kmer word; Kmer antiWord; Coordinate readNucleotideIndex; Coordinate kmerIndex; KmerOccurence *kmerOccurence; int wordLength = getWordLength(graph); PassageMarker *marker = NULL; PassageMarker *previousMarker = NULL; Node *node; Node *previousNode = NULL; Coordinate coord; Coordinate previousCoord = 0; Nucleotide nucleotide; clearKmer(&word); clearKmer(&antiWord); // Neglect any string shorter than WORDLENGTH : if (getLength(tString) < wordLength) return; // Fill in the initial word : for (readNucleotideIndex = 0; readNucleotideIndex < wordLength - 1; readNucleotideIndex++) { nucleotide = getNucleotide(readNucleotideIndex, tString); pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Go through sequence while (readNucleotideIndex < getLength(tString)) { nucleotide = getNucleotide(readNucleotideIndex++, tString); pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } // Search in table if ((!double_strand || compareKmers(&word, &antiWord) <= 0) && (kmerOccurence = findKmerOccurenceInSortedTable(&word, kmerOccurences))) { node = getNodeInGraph(graph, kmerOccurence->nodeID); coord = kmerOccurence->position; } else if ((double_strand && compareKmers(&word, &antiWord) > 0) && (kmerOccurence = findKmerOccurenceInSortedTable(&antiWord, kmerOccurences))) { node = getNodeInGraph(graph, -kmerOccurence->nodeID); coord = getNodeLength(node) - kmerOccurence->position - 1; } else { node = NULL; if (previousNode) { break; } } // Fill in graph if (node) { kmerIndex = readNucleotideIndex - wordLength; if (previousNode == node && previousCoord == coord - 1) { if (category / 2 >= CATEGORIES) { setPassageMarkerFinish(marker, kmerIndex + 1); setFinishOffset(marker, getNodeLength(node) - coord - 1); } else { incrementVirtualCoverage(node, category / 2, 1); incrementOriginalVirtualCoverage (node, category / 2, 1); } } else { if (category / 2 >= CATEGORIES) { marker = newPassageMarker(seqID, kmerIndex, kmerIndex + 1, coord, getNodeLength (node) - coord - 1); transposePassageMarker(marker, node); connectPassageMarkers (previousMarker, marker, graph); previousMarker = marker; } else { if (readTracking) { if (!getNodeStatus(node)) { addReadStart(node, seqID, coord, graph, kmerIndex); setSingleNodeStatus (node, true); memorizeNode(node); } else { blurLastShortReadMarker (node, graph); } } incrementVirtualCoverage(node, category / 2, 1); incrementOriginalVirtualCoverage (node, category / 2, 1); } createArc(previousNode, node, graph); } previousNode = node; previousCoord = coord; } } unlockMemorizedNodes(); }
static void ghostThreadSequenceThroughGraph(TightString * tString, KmerOccurenceTable * kmerOccurences, Graph * graph, IDnum seqID, Category category, boolean readTracking, boolean double_strand) { Kmer word; Kmer antiWord; Coordinate readNucleotideIndex; KmerOccurence *kmerOccurence; int wordLength = getWordLength(graph); Nucleotide nucleotide; Node *node; Node *previousNode = NULL; clearKmer(&word); clearKmer(&antiWord); // Neglect any read which will not be short paired if ((!readTracking && category % 2 == 0) || category / 2 >= CATEGORIES) return; // Neglect any string shorter than WORDLENGTH : if (getLength(tString) < wordLength) return; // Verify that all short reads are reasonnably short if (getLength(tString) > USHRT_MAX) { printf("Short read of length %lli, longer than limit %i\n", (long long) getLength(tString), SHRT_MAX); puts("You should better declare this sequence as long, because it genuinely is!"); exit(1); } // Allocate memory for the read pairs if (!readStartsAreActivated(graph)) activateReadStarts(graph); // Fill in the initial word : for (readNucleotideIndex = 0; readNucleotideIndex < wordLength - 1; readNucleotideIndex++) { nucleotide = getNucleotide(readNucleotideIndex, tString); pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Go through sequence while (readNucleotideIndex < getLength(tString)) { // Shift word: nucleotide = getNucleotide(readNucleotideIndex++, tString); pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } // Search in table if ((!double_strand || compareKmers(&word, &antiWord) <= 0) && (kmerOccurence = findKmerOccurenceInSortedTable(&word, kmerOccurences))) { node = getNodeInGraph(graph, kmerOccurence->nodeID); } else if ((double_strand && compareKmers(&word, &antiWord) > 0) && (kmerOccurence = findKmerOccurenceInSortedTable(&antiWord, kmerOccurences))) { node = getNodeInGraph(graph, -kmerOccurence->nodeID); } else { node = NULL; if (previousNode) break; } previousNode = node; // Fill in graph if (node && !getNodeStatus(node)) { incrementReadStartCount(node, graph); setSingleNodeStatus(node, true); memorizeNode(node); } } unlockMemorizedNodes(); }
static KmerOccurenceTable *referenceGraphKmers(char *preGraphFilename, short int accelerationBits, Graph * graph, boolean double_strand) { FILE *file = fopen(preGraphFilename, "r"); const int maxline = MAXLINE; char line[MAXLINE]; char c; int wordLength; Coordinate lineLength, kmerCount; Kmer word; Kmer antiWord; KmerOccurenceTable *kmerTable = NULL; KmerOccurence *kmerOccurences, *kmerOccurencePtr; Coordinate kmerOccurenceIndex; IDnum index; IDnum nodeID = 0; IDnum *accelPtr = NULL; KmerKey lastHeader = 0; KmerKey header; Nucleotide nucleotide; if (file == NULL) exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename); // Count kmers printf("Scanning pre-graph file %s for k-mers\n", preGraphFilename); // First line if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); sscanf(line, "%*i\t%*i\t%i\n", &wordLength); // Initialize kmer occurence table: kmerTable = mallocOrExit(1, KmerOccurenceTable); if (accelerationBits > 2 * wordLength) accelerationBits = 2 * wordLength; if (accelerationBits > 32) accelerationBits = 32; if (accelerationBits > 0) { kmerTable->accelerationBits = accelerationBits; kmerTable->accelerationTable = callocOrExit((((size_t) 1) << accelerationBits) + 1, IDnum); accelPtr = kmerTable->accelerationTable; kmerTable->accelerationShift = (short int) 2 *wordLength - accelerationBits; } else { kmerTable->accelerationBits = 0; kmerTable->accelerationTable = NULL; kmerTable->accelerationShift = 0; } // Read nodes if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); kmerCount = 0; while (line[0] == 'N') { lineLength = 0; while ((c = getc(file)) != EOF && c != '\n') lineLength++; kmerCount += lineLength - wordLength + 1; if (fgets(line, maxline, file) == NULL) break; } fclose(file); // Create table printf("%li kmers found\n", (long) kmerCount); kmerOccurences = callocOrExit(kmerCount, KmerOccurence); kmerOccurencePtr = kmerOccurences; kmerOccurenceIndex = 0; kmerTable->kmerTable = kmerOccurences; kmerTable->kmerTableSize = kmerCount; // Fill table file = fopen(preGraphFilename, "r"); if (file == NULL) exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename); if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); // Read nodes if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); while (line[0] == 'N') { nodeID++; // Fill in the initial word : clearKmer(&word); clearKmer(&antiWord); for (index = 0; index < wordLength - 1; index++) { c = getc(file); if (c == 'A') nucleotide = ADENINE; else if (c == 'C') nucleotide = CYTOSINE; else if (c == 'G') nucleotide = GUANINE; else if (c == 'T') nucleotide = THYMINE; else if (c == '\n') exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); else nucleotide = ADENINE; pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Scan through node index = 0; while((c = getc(file)) != '\n' && c != EOF) { if (c == 'A') nucleotide = ADENINE; else if (c == 'C') nucleotide = CYTOSINE; else if (c == 'G') nucleotide = GUANINE; else if (c == 'T') nucleotide = THYMINE; else nucleotide = ADENINE; pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } if (!double_strand || compareKmers(&word, &antiWord) <= 0) { copyKmers(&kmerOccurencePtr->kmer, &word); kmerOccurencePtr->nodeID = nodeID; kmerOccurencePtr->position = index; } else { copyKmers(&kmerOccurencePtr->kmer, &antiWord); kmerOccurencePtr->nodeID = -nodeID; kmerOccurencePtr->position = getNodeLength(getNodeInGraph(graph, nodeID)) - 1 - index; } kmerOccurencePtr++; kmerOccurenceIndex++; index++; } if (fgets(line, maxline, file) == NULL) break; } fclose(file); // Sort table qsort(kmerOccurences, kmerCount, sizeof(KmerOccurence), compareKmerOccurences); // Fill up acceleration table if (kmerTable->accelerationTable != NULL) { *accelPtr = (IDnum) 0; for (kmerOccurenceIndex = 0; kmerOccurenceIndex < kmerCount; kmerOccurenceIndex++) { header = keyInAccelerationTable(&kmerOccurences [kmerOccurenceIndex]. kmer, kmerTable); while (lastHeader < header) { lastHeader++; accelPtr++; *accelPtr = kmerOccurenceIndex; } } while (lastHeader < (KmerKey) 1 << accelerationBits) { lastHeader++; accelPtr++; *accelPtr = kmerCount; } } return kmerTable; }