/*
 * Inserts a copy of `kmer` into the splay tree rooted at *T.
 *
 * An empty tree simply receives a single new node. Otherwise the tree is
 * splayed around `kmer` and the new node becomes the root, with the old root
 * hanging on the side dictated by the comparison. When neither comparison
 * is strictly negative, nothing is inserted and the splayed root is kept.
 *
 * kmer: key to insert (copied into the new node with copyKmers).
 * T:    address of the tree root pointer; updated in place.
 */
void insertIntoTree(Kmer * kmer, SplayTree ** T)
{
	SplayNode *root;
	SplayNode *fresh;

	if (*T != NULL) {
		root = Splay(kmer, *T);
		*T = root;

		if (compareKmers(kmer, &(root->kmer)) < 0) {
			// New key sorts before the root: old root goes right
			fresh = allocateSplayNode();
			copyKmers(&(fresh->kmer), kmer);
			fresh->left = root->left;
			fresh->right = root;
			root->left = NULL;
		} else if (compareKmers(&(root->kmer), kmer) < 0) {
			// Root sorts before the new key: old root goes left
			fresh = allocateSplayNode();
			copyKmers(&(fresh->kmer), kmer);
			fresh->right = root->right;
			fresh->left = root;
			root->right = NULL;
		} else {
			// No strict ordering either way: leave the tree as splayed
			return;
		}

		*T = fresh;
		return;
	}

	// Empty tree: the new node is the whole tree
	fresh = allocateSplayNode();
	copyKmers(&(fresh->kmer), kmer);
	fresh->left = NULL;
	fresh->right = NULL;
	*T = fresh;
}
/*
 * Looks `kmer` up in the splay tree rooted at *T, inserting it if absent.
 *
 * kmer:     key to look up (copied into a new node when inserted).
 * seqID:    in/out. On insertion, *seqID is stored in the new node. When the
 *           key is already present, *seqID is overwritten with the stored
 *           value.
 * position: in/out, handled exactly like seqID.
 * T:        address of the tree root pointer; updated in place.
 *
 * Returns false when a new node was inserted, true when the key was already
 * present (in which case *seqID and *position now hold the stored values).
 *
 * A trace line is printed to stdout on every call that reaches a non-empty
 * tree; the leading digit identifies which branch was taken.
 */
boolean findOrInsertOccurenceInSplayTree(Kmer * kmer, IDnum * seqID,
					 Coordinate * position,
					 SplayTree ** T)
{
	SplayNode *newNode;

	if (*T == NULL) {
		newNode = allocateSplayNode();
		copyKmers(&(newNode->kmer), kmer);
		newNode->seqID = *seqID;
		newNode->position = *position;

		newNode->left = newNode->right = NULL;

		*T = newNode;

		return false;
	}

	*T = Splay(kmer, *T);

	if (compareKmers(kmer, &((*T)->kmer)) < 0) {
		// New key sorts before the splayed root: old root goes right
		newNode = allocateSplayNode();
		copyKmers(&(newNode->kmer), kmer);
		newNode->seqID = *seqID;
		newNode->position = *position;

		newNode->left = (*T)->left;
		newNode->right = *T;
		(*T)->left = NULL;

		*T = newNode;

		// IDnum is a project typedef: cast so the argument always
		// matches the conversion specifier
		printf("1: sequenceID = %li\n", (long) *seqID);
		return false;
	} else if (compareKmers(kmer, &((*T)->kmer)) > 0) {
		// New key sorts after the splayed root: old root goes left
		newNode = allocateSplayNode();
		copyKmers(&(newNode->kmer), kmer);
		newNode->seqID = *seqID;
		newNode->position = *position;

		newNode->right = (*T)->right;
		newNode->left = *T;
		(*T)->right = NULL;

		*T = newNode;

		printf("2: sequenceID = %li\n", (long) *seqID);
		return false;
	} else {
		// Key already present: hand the stored occurrence back
		*seqID = (*T)->seqID;
		*position = (*T)->position;

		printf("3: sequenceID = %li\n", (long) *seqID);
		return true;
	}
}
/*
 * Appends one k-mer occurrence to `table`.
 *
 * The next free slot is claimed by advancing table->kmerOccurencePtr and
 * table->kmerOccurenceIndex; when compiled with OPENMP that step runs inside
 * an omp critical section. The claimed slot is then filled in outside the
 * critical section.
 *
 * No capacity check is made here.
 *
 * kmer:     k-mer to record (copied with copyKmers).
 * nodeID:   node identifier stored with the occurrence.
 * position: position stored with the occurrence.
 * table:    occurrence table being filled.
 */
void recordKmerOccurence(Kmer * kmer, IDnum nodeID, Coordinate position,
			 KmerOccurenceTable * table)
{
	KmerOccurence *slot;

#ifdef OPENMP
#pragma omp critical
#endif
	{
		slot = table->kmerOccurencePtr;
		table->kmerOccurencePtr = slot + 1;
		table->kmerOccurenceIndex += 1;
	}

	copyKmers(&(slot->kmer), kmer);
	slot->nodeID = nodeID;
	slot->position = position;
}
/*
 * Computes the acceleration-table key of `kmer`.
 *
 * Works on a private copy of the k-mer, so the caller's value is untouched.
 * First, one nucleotide is popped and discarded for every two bits of
 * table->accelerationShift. Then, for every two bits of
 * table->accelerationBits, one nucleotide is popped, added in at bit offset
 * accelerationBits, and the key is shifted right by two; nucleotides popped
 * earlier therefore end up in lower bits of the key.
 *
 * NOTE(review): assumes popNucleotide removes and returns a 2-bit value from
 * the k-mer -- confirm against its definition.
 *
 * kmer:  k-mer whose key is wanted (read only).
 * table: supplies accelerationShift and accelerationBits.
 *
 * Returns the computed key (0 when accelerationBits is 0).
 */
static inline KmerKey keyInAccelerationTable(Kmer * kmer,
					     KmerOccurenceTable * table)
{
	KmerKey key = 0;
	Kmer scratch;
	int i;

	copyKmers(&scratch, kmer);

	// Drop the nucleotides that do not take part in the key
	for (i = 0; i < table->accelerationShift; i += 2)
		popNucleotide(&scratch);

	// Fold the remaining nucleotides into the key, two bits at a time
	for (i = 0; i < table->accelerationBits; i += 2) {
		key += ((KmerKey) popNucleotide(&scratch)) << table->accelerationBits;
		key >>= 2;
	}

	return key;
}
static KmerOccurenceTable *referenceGraphKmers(char *preGraphFilename, short int accelerationBits, Graph * graph, boolean double_strand) { FILE *file = fopen(preGraphFilename, "r"); const int maxline = MAXLINE; char line[MAXLINE]; char c; int wordLength; Coordinate lineLength, kmerCount; Kmer word; Kmer antiWord; KmerOccurenceTable *kmerTable = NULL; KmerOccurence *kmerOccurences, *kmerOccurencePtr; Coordinate kmerOccurenceIndex; IDnum index; IDnum nodeID = 0; IDnum *accelPtr = NULL; KmerKey lastHeader = 0; KmerKey header; Nucleotide nucleotide; if (file == NULL) exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename); // Count kmers printf("Scanning pre-graph file %s for k-mers\n", preGraphFilename); // First line if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); sscanf(line, "%*i\t%*i\t%i\n", &wordLength); // Initialize kmer occurence table: kmerTable = mallocOrExit(1, KmerOccurenceTable); if (accelerationBits > 2 * wordLength) accelerationBits = 2 * wordLength; if (accelerationBits > 32) accelerationBits = 32; if (accelerationBits > 0) { kmerTable->accelerationBits = accelerationBits; kmerTable->accelerationTable = callocOrExit((((size_t) 1) << accelerationBits) + 1, IDnum); accelPtr = kmerTable->accelerationTable; kmerTable->accelerationShift = (short int) 2 *wordLength - accelerationBits; } else { kmerTable->accelerationBits = 0; kmerTable->accelerationTable = NULL; kmerTable->accelerationShift = 0; } // Read nodes if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); kmerCount = 0; while (line[0] == 'N') { lineLength = 0; while ((c = getc(file)) != EOF && c != '\n') lineLength++; kmerCount += lineLength - wordLength + 1; if (fgets(line, maxline, file) == NULL) break; } fclose(file); // Create table printf("%li kmers found\n", (long) kmerCount); kmerOccurences = callocOrExit(kmerCount, KmerOccurence); kmerOccurencePtr = kmerOccurences; kmerOccurenceIndex = 0; 
kmerTable->kmerTable = kmerOccurences; kmerTable->kmerTableSize = kmerCount; // Fill table file = fopen(preGraphFilename, "r"); if (file == NULL) exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename); if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); // Read nodes if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); while (line[0] == 'N') { nodeID++; // Fill in the initial word : clearKmer(&word); clearKmer(&antiWord); for (index = 0; index < wordLength - 1; index++) { c = getc(file); if (c == 'A') nucleotide = ADENINE; else if (c == 'C') nucleotide = CYTOSINE; else if (c == 'G') nucleotide = GUANINE; else if (c == 'T') nucleotide = THYMINE; else if (c == '\n') exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); else nucleotide = ADENINE; pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Scan through node index = 0; while((c = getc(file)) != '\n' && c != EOF) { if (c == 'A') nucleotide = ADENINE; else if (c == 'C') nucleotide = CYTOSINE; else if (c == 'G') nucleotide = GUANINE; else if (c == 'T') nucleotide = THYMINE; else nucleotide = ADENINE; pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } if (!double_strand || compareKmers(&word, &antiWord) <= 0) { copyKmers(&kmerOccurencePtr->kmer, &word); kmerOccurencePtr->nodeID = nodeID; kmerOccurencePtr->position = index; } else { copyKmers(&kmerOccurencePtr->kmer, &antiWord); kmerOccurencePtr->nodeID = -nodeID; kmerOccurencePtr->position = getNodeLength(getNodeInGraph(graph, nodeID)) - 1 - index; } kmerOccurencePtr++; kmerOccurenceIndex++; index++; } if (fgets(line, maxline, file) == NULL) break; } fclose(file); // Sort table qsort(kmerOccurences, 
kmerCount, sizeof(KmerOccurence), compareKmerOccurences); // Fill up acceleration table if (kmerTable->accelerationTable != NULL) { *accelPtr = (IDnum) 0; for (kmerOccurenceIndex = 0; kmerOccurenceIndex < kmerCount; kmerOccurenceIndex++) { header = keyInAccelerationTable(&kmerOccurences [kmerOccurenceIndex]. kmer, kmerTable); while (lastHeader < header) { lastHeader++; accelPtr++; *accelPtr = kmerOccurenceIndex; } } while (lastHeader < (KmerKey) 1 << accelerationBits) { lastHeader++; accelPtr++; *accelPtr = kmerCount; } } return kmerTable; }