/*
 * Binary search for a k-mer in the occurrence table.
 *
 * kmer:  the k-mer to look for.
 * table: occurrence table whose kmerTable array must already be ordered
 *        consistently with compareKmers (the search relies on it).
 *
 * When the table has an acceleration table, the search is restricted to the
 * slice [accelerationTable[key], accelerationTable[key + 1]) selected by the
 * k-mer's key; otherwise the whole array is searched.
 *
 * Returns a pointer to an entry whose k-mer compares equal to `kmer` (not
 * necessarily the first one if several are equal), or NULL if none is found.
 */
static KmerOccurence *findKmerOccurenceInSortedTable(Kmer * kmer,
						     KmerOccurenceTable * table)
{
	KmerOccurence *array = table->kmerTable;
	KmerKey key = keyInAccelerationTable(kmer, table);
	Coordinate leftIndex, rightIndex, middleIndex;
	int diff;

	if (table->accelerationTable != NULL) {
		leftIndex = table->accelerationTable[key];
		rightIndex = table->accelerationTable[key + 1];
	} else {
		leftIndex = 0;
		rightIndex = table->kmerTableSize;
	}

	while (leftIndex < rightIndex) {
		/* Midpoint written so the sum of both bounds is never formed */
		middleIndex = leftIndex + (rightIndex - leftIndex) / 2;

		/* Compare once per iteration and reuse the result below */
		diff = compareKmers(&(array[middleIndex]).kmer, kmer);

		if (diff == 0)
			return &(array[middleIndex]);

		/* Interval is down to a single non-matching element */
		if (leftIndex == middleIndex)
			return NULL;

		if (diff > 0)
			rightIndex = middleIndex;
		else
			leftIndex = middleIndex;
	}

	return NULL;
}
/*
 * Sorts the k-mer occurrence array of `table` with compareKmerOccurences and,
 * if the table owns an acceleration table, rebuilds it.
 *
 * After the rebuild, slot 0 holds 0, slot h (for each key h that was reached)
 * holds the index of the first sorted entry whose key is >= h, and every
 * remaining slot up to and including index (1 << accelerationBits) holds
 * kmerTableSize.
 */
void sortKmerOccurenceTable(KmerOccurenceTable * table)
{
	IDnum *accel = table->accelerationTable;
	KmerKey filled = 0;	/* highest slot written so far */
	KmerKey bucket;
	KmerKey bucketCount;
	IDnum entry;

	velvetLog("Sorting kmer occurence table ... \n");
	qsort(table->kmerTable, table->kmerTableSize,
	      sizeof(KmerOccurence), compareKmerOccurences);
	velvetLog("Sorting done.\n");

	/* Nothing more to do without an acceleration table */
	if (accel == NULL)
		return;

	accel[0] = (IDnum) 0;

	for (entry = 0; entry < table->kmerTableSize; entry++) {
		bucket = keyInAccelerationTable(&table->kmerTable[entry].kmer,
						table);

		/* Every slot up to this entry's key starts at this entry */
		while (filled < bucket) {
			filled++;
			accel[filled] = entry;
		}
	}

	/* Slots past the last key seen all point to the end of the array */
	bucketCount = (KmerKey) 1 << table->accelerationBits;
	while (filled < bucketCount) {
		filled++;
		accel[filled] = table->kmerTableSize;
	}
}
/*
 * Looks up a k-mer in the occurrence table by binary search and returns the
 * lowest-indexed entry of the run of equal k-mers that contains the hit.
 *
 * kmer:  the k-mer to look for.
 * table: occurrence table; the search relies on kmerTable being ordered
 *        consistently with compareKmers.
 *
 * The search window is the acceleration-table slice for the k-mer's key when
 * an acceleration table exists, and the whole array otherwise.
 *
 * Returns NULL when no entry compares equal to `kmer`.
 */
KmerOccurence *findKmerInKmerOccurenceTable(Kmer * kmer,
					    KmerOccurenceTable * table)
{
	KmerOccurence *entries = table->kmerTable;
	KmerKey key = keyInAccelerationTable(kmer, table);
	Coordinate lo = 0;
	Coordinate hi = table->kmerTableSize;
	Coordinate mid;
	int cmp;

	if (table->accelerationTable != NULL) {
		lo = table->accelerationTable[key];
		hi = table->accelerationTable[key + 1];
	}

	for (mid = (hi + lo) / 2; lo < hi; mid = (hi + lo) / 2) {
		cmp = compareKmers(&(entries[mid].kmer), kmer);

		if (cmp != 0) {
			/* A one-element window that does not match: give up */
			if (lo == mid)
				break;

			if (cmp > 0)
				hi = mid;
			else
				lo = mid;
			continue;
		}

		/* Hit: walk back to the first of the equal entries */
		while (mid > 0
		       && compareKmers(&(entries[mid - 1].kmer), kmer) == 0)
			mid--;

		return &(entries[mid]);
	}

	return NULL;
}
static KmerOccurenceTable *referenceGraphKmers(char *preGraphFilename, short int accelerationBits, Graph * graph, boolean double_strand) { FILE *file = fopen(preGraphFilename, "r"); const int maxline = MAXLINE; char line[MAXLINE]; char c; int wordLength; Coordinate lineLength, kmerCount; Kmer word; Kmer antiWord; KmerOccurenceTable *kmerTable = NULL; KmerOccurence *kmerOccurences, *kmerOccurencePtr; Coordinate kmerOccurenceIndex; IDnum index; IDnum nodeID = 0; IDnum *accelPtr = NULL; KmerKey lastHeader = 0; KmerKey header; Nucleotide nucleotide; if (file == NULL) exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename); // Count kmers printf("Scanning pre-graph file %s for k-mers\n", preGraphFilename); // First line if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); sscanf(line, "%*i\t%*i\t%i\n", &wordLength); // Initialize kmer occurence table: kmerTable = mallocOrExit(1, KmerOccurenceTable); if (accelerationBits > 2 * wordLength) accelerationBits = 2 * wordLength; if (accelerationBits > 32) accelerationBits = 32; if (accelerationBits > 0) { kmerTable->accelerationBits = accelerationBits; kmerTable->accelerationTable = callocOrExit((((size_t) 1) << accelerationBits) + 1, IDnum); accelPtr = kmerTable->accelerationTable; kmerTable->accelerationShift = (short int) 2 *wordLength - accelerationBits; } else { kmerTable->accelerationBits = 0; kmerTable->accelerationTable = NULL; kmerTable->accelerationShift = 0; } // Read nodes if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); kmerCount = 0; while (line[0] == 'N') { lineLength = 0; while ((c = getc(file)) != EOF && c != '\n') lineLength++; kmerCount += lineLength - wordLength + 1; if (fgets(line, maxline, file) == NULL) break; } fclose(file); // Create table printf("%li kmers found\n", (long) kmerCount); kmerOccurences = callocOrExit(kmerCount, KmerOccurence); kmerOccurencePtr = kmerOccurences; kmerOccurenceIndex = 0; 
kmerTable->kmerTable = kmerOccurences; kmerTable->kmerTableSize = kmerCount; // Fill table file = fopen(preGraphFilename, "r"); if (file == NULL) exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename); if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); // Read nodes if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); while (line[0] == 'N') { nodeID++; // Fill in the initial word : clearKmer(&word); clearKmer(&antiWord); for (index = 0; index < wordLength - 1; index++) { c = getc(file); if (c == 'A') nucleotide = ADENINE; else if (c == 'C') nucleotide = CYTOSINE; else if (c == 'G') nucleotide = GUANINE; else if (c == 'T') nucleotide = THYMINE; else if (c == '\n') exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); else nucleotide = ADENINE; pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Scan through node index = 0; while((c = getc(file)) != '\n' && c != EOF) { if (c == 'A') nucleotide = ADENINE; else if (c == 'C') nucleotide = CYTOSINE; else if (c == 'G') nucleotide = GUANINE; else if (c == 'T') nucleotide = THYMINE; else nucleotide = ADENINE; pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } if (!double_strand || compareKmers(&word, &antiWord) <= 0) { copyKmers(&kmerOccurencePtr->kmer, &word); kmerOccurencePtr->nodeID = nodeID; kmerOccurencePtr->position = index; } else { copyKmers(&kmerOccurencePtr->kmer, &antiWord); kmerOccurencePtr->nodeID = -nodeID; kmerOccurencePtr->position = getNodeLength(getNodeInGraph(graph, nodeID)) - 1 - index; } kmerOccurencePtr++; kmerOccurenceIndex++; index++; } if (fgets(line, maxline, file) == NULL) break; } fclose(file); // Sort table qsort(kmerOccurences, 
kmerCount, sizeof(KmerOccurence), compareKmerOccurences); // Fill up acceleration table if (kmerTable->accelerationTable != NULL) { *accelPtr = (IDnum) 0; for (kmerOccurenceIndex = 0; kmerOccurenceIndex < kmerCount; kmerOccurenceIndex++) { header = keyInAccelerationTable(&kmerOccurences [kmerOccurenceIndex]. kmer, kmerTable); while (lastHeader < header) { lastHeader++; accelPtr++; *accelPtr = kmerOccurenceIndex; } } while (lastHeader < (KmerKey) 1 << accelerationBits) { lastHeader++; accelPtr++; *accelPtr = kmerCount; } } return kmerTable; }