// Projects all the reads attached to node nodeID onto the scaffold.
// Nothing is done when the node does not exist or is not flagged as unique.
// Short reads whose entry in dubious[] (indexed by read ID - 1) is set are
// skipped; passage markers are only projected when their sequence ID is
// strictly positive.
static void projectFromNode(IDnum nodeID,
			    ReadOccurence ** readNodes,
			    IDnum * readNodeCounts,
			    IDnum * readPairs, Category * cats,
			    boolean * dubious, Coordinate * lengths)
{
	Node *origin = getNodeInGraph(graph, nodeID);
	ShortReadMarker *shortReads;
	PassageMarker *longRead;
	IDnum shortReadTotal;
	IDnum i;

	if (origin == NULL)
		return;
	if (!getUniqueness(origin))
		return;

	// Short reads
	shortReads = getNodeReads(origin, graph);
	shortReadTotal = getNodeReadCount(origin, graph);

	for (i = 0; i < shortReadTotal; i++) {
		ShortReadMarker *current =
		    getShortReadMarkerAtIndex(shortReads, i);

		if (!dubious[getShortReadMarkerID(current) - 1])
			projectFromShortRead(origin, current, readPairs,
					     cats, readNodes,
					     readNodeCounts, lengths);
	}

	// Long reads
	longRead = getMarker(origin);
	while (longRead != NULL) {
		if (getPassageMarkerSequenceID(longRead) > 0)
			projectFromLongRead(origin, longRead, readPairs,
					    cats, readNodes,
					    readNodeCounts, lengths);
		longRead = getNextInNode(longRead);
	}
}
// Allocates a connection going from node nodeID to node node2ID, fills it
// with the given counts, distance and variance (weight 0, status false) and
// pushes it at the head of the scaffold list of nodeID.
// Unless both IDs are equal, the mirrored connection is created through
// createTwinConnection(); otherwise the twin pointer is set to NULL.
// Returns the new connection.
static Connection *createNewConnection(IDnum nodeID, IDnum node2ID,
				       IDnum direct_count,
				       IDnum paired_count,
				       Coordinate distance,
				       double variance)
{
	Node *target = getNodeInGraph(graph, node2ID);
	IDnum slot = nodeID + nodeCount(graph);
	Connection *fresh = allocateConnection();
	Connection *formerHead;

	// Payload
	fresh->destination = target;
	fresh->direct_count = direct_count;
	fresh->paired_count = paired_count;
	fresh->distance = (double) distance;
	fresh->variance = variance;
	fresh->weight = 0;
	fresh->status = false;

	// Head insertion into the doubly linked list of that node
	formerHead = scaffold[slot];
	fresh->previous = NULL;
	fresh->next = formerHead;
	if (formerHead != NULL)
		formerHead->previous = fresh;
	scaffold[slot] = fresh;

	// Mirror image
	if (node2ID == nodeID)
		fresh->twin = NULL;
	else
		createTwinConnection(node2ID, nodeID, fresh);

	return fresh;
}
// Creates the mirror image of connect: a connection from node nodeID to
// node node2ID carrying the same distance, variance and counts.
// Both connections are linked through their twin pointers, and the new one
// is pushed at the head of the scaffold list of nodeID.
static void createTwinConnection(IDnum nodeID, IDnum node2ID,
				 Connection * connect)
{
	Connection *mirror = allocateConnection();
	IDnum slot = nodeID + nodeCount(graph);
	Connection *formerHead;

	// Copy of the payload
	mirror->distance = connect->distance;
	mirror->variance = connect->variance;
	mirror->direct_count = connect->direct_count;
	mirror->paired_count = connect->paired_count;
	mirror->destination = getNodeInGraph(graph, node2ID);
	mirror->weight = 0;
	mirror->status = false;

	// Cross-reference the two connections
	mirror->twin = connect;
	connect->twin = mirror;

	// Head insertion into the doubly linked list of that node
	formerHead = scaffold[slot];
	mirror->previous = NULL;
	mirror->next = formerHead;
	if (formerHead != NULL)
		formerHead->previous = mirror;
	scaffold[slot] = mirror;
}
// Appends, for every read found on node nodeID, one occurrence record to
// the array of that read in readNodes[], using readNodeCounts[] as the
// per-read fill cursor.
// Short reads always get a record. Passage markers with a non-positive
// sequence ID or belonging to a REFERENCE category are ignored; a read that
// goes several times through the node keeps a single record whose position
// and offset are both set to -1.
// readMarker[] is scratch space: it is used to detect repeated passages and
// is cleared again before returning.
static void computePartialReadToNodeMapping(IDnum nodeID,
					    ReadOccurence ** readNodes,
					    IDnum * readNodeCounts,
					    boolean * readMarker,
					    ReadSet * reads)
{
	Node *node = getNodeInGraph(graph, nodeID);
	ShortReadMarker *shortReads = getNodeReads(node, graph);
	IDnum shortReadTotal = getNodeReadCount(node, graph);
	PassageMarkerI cursor;
	ReadOccurence *slot;
	IDnum i, seqID;

	// Short reads
	for (i = 0; i < shortReadTotal; i++) {
		ShortReadMarker *current =
		    getShortReadMarkerAtIndex(shortReads, i);

		seqID = getShortReadMarkerID(current);
		slot = &readNodes[seqID][readNodeCounts[seqID]];
		slot->nodeID = nodeID;
		slot->position = getShortReadMarkerPosition(current);
		slot->offset = getShortReadMarkerOffset(current);
		readNodeCounts[seqID]++;
	}

	// Long reads
	cursor = getMarker(node);
	while (cursor != NULL_IDX) {
		seqID = getPassageMarkerSequenceID(cursor);

		if (seqID > 0 && reads->categories[seqID - 1] != REFERENCE) {
			if (readMarker[seqID]) {
				// Second passage: blank out the record
				// written at the first passage
				slot =
				    &readNodes[seqID][readNodeCounts[seqID] - 1];
				slot->position = -1;
				slot->offset = -1;
			} else {
				slot =
				    &readNodes[seqID][readNodeCounts[seqID]];
				slot->nodeID = nodeID;
				slot->position = getStartOffset(cursor);
				slot->offset = getPassageMarkerStart(cursor);
				readNodeCounts[seqID]++;
				readMarker[seqID] = true;
			}
		}

		cursor = getNextInNode(cursor);
	}

	// Leave the scratch array clean for the next node
	cursor = getMarker(node);
	while (cursor != NULL_IDX) {
		seqID = getPassageMarkerSequenceID(cursor);
		if (seqID > 0)
			readMarker[seqID] = false;
		cursor = getNextInNode(cursor);
	}
}
static IDnum *computeReadToNodeCounts() { IDnum readIndex, nodeIndex; IDnum maxNodeIndex = 2 * nodeCount(graph) + 1; IDnum maxReadIndex = sequenceCount(graph) + 1; IDnum *readNodeCounts = callocOrExit(maxReadIndex, IDnum); boolean *readMarker = callocOrExit(maxReadIndex, boolean); ShortReadMarker *nodeArray, *shortMarker; PassageMarkerI marker; Node *node; IDnum nodeReadCount; //puts("Computing read to node mapping array sizes"); for (nodeIndex = 0; nodeIndex < maxNodeIndex; nodeIndex++) { node = getNodeInGraph(graph, nodeIndex - nodeCount(graph)); if (node == NULL) continue; // Short reads if (readStartsAreActivated(graph)) { nodeArray = getNodeReads(node, graph); nodeReadCount = getNodeReadCount(node, graph); for (readIndex = 0; readIndex < nodeReadCount; readIndex++) { shortMarker = getShortReadMarkerAtIndex(nodeArray, readIndex); readNodeCounts[getShortReadMarkerID (shortMarker)]++; } } // Long reads for (marker = getMarker(node); marker != NULL_IDX; marker = getNextInNode(marker)) { readIndex = getPassageMarkerSequenceID(marker); if (readIndex < 0) continue; if (readMarker[readIndex]) continue; readNodeCounts[readIndex]++; readMarker[readIndex] = true; } // Clean up marker array for (marker = getMarker(node); marker != NULL_IDX; marker = getNextInNode(marker)) { readIndex = getPassageMarkerSequenceID(marker); if (readIndex > 0) readMarker[readIndex] = false; } } free(readMarker); return readNodeCounts; }
static void measureCoOccurences(Coordinate ** coOccurences, boolean * interestingReads, ReadOccurence ** readNodes, IDnum * readNodeCounts, IDnum * readPairs, Category * cats) { IDnum coOccurencesIndex[CATEGORIES + 1]; IDnum observationIndex; IDnum readIndex, readPairIndex; IDnum readNodeCount; IDnum readOccurenceIndex, readPairOccurenceIndex; ReadOccurence * readOccurence, *readPairOccurence; Category libID; for (libID = 0; libID < CATEGORIES + 1; libID++) coOccurencesIndex[libID] = 0; for (readIndex = 0; readIndex < sequenceCount(graph); readIndex++) { // Eliminating dodgy, unpaired, already counted or user-specified reads if (!interestingReads[readIndex]) continue; // Find co-occurence // We know that for each read the read occurences are ordered by increasing node ID libID = cats[readIndex]/2; readPairIndex = readPairs[readIndex]; observationIndex = coOccurencesIndex[libID]; readOccurence = readNodes[readIndex + 1]; readOccurenceIndex = 0; readNodeCount = readNodeCounts[readIndex + 1]; readPairOccurenceIndex = readNodeCounts[readPairIndex + 1] - 1; readPairOccurence = &(readNodes[readPairIndex + 1][readPairOccurenceIndex]); while (readOccurenceIndex < readNodeCount && readPairOccurenceIndex >= 0) { if (readOccurence->nodeID == -readPairOccurence->nodeID) { if (readOccurence->position > 0 && readPairOccurence->position > 0) { coOccurences[libID][observationIndex] = getNodeLength(getNodeInGraph(graph, readOccurence->nodeID)) + getWordLength(graph) - 1 - (readOccurence->position - readOccurence->offset) - (readPairOccurence->position - readPairOccurence->offset); coOccurencesIndex[libID]++; break; } else { readOccurence++; readOccurenceIndex++; readPairOccurence--; readPairOccurenceIndex--; } } else if (readOccurence->nodeID < -readPairOccurence->nodeID) { readOccurence++; readOccurenceIndex++; } else { readPairOccurence--; readPairOccurenceIndex--; } } } }
// Simplifies the graph using long read coherency.
//
// inGraph   graph to work on; stored in the file-scope variable graph
// isUnique  predicate used to flag nodes as unique
// coverage  stored in the file-scope variable expected_coverage
// reads     read set; only its tSequences member is read here
//
// After flagging unique nodes and trimming long read tips, every unique
// node and its twin are repeatedly extended with bypass() as long as
// uniqueNodesConnect() succeeds. The whole pass is repeated, renumbering the
// nodes each time, until the node count no longer changes.
//
// The recycle bins are destroyed at the end, and the file-scope pointers to
// them are reset to NULL so that no dangling pointer survives the call.
void readCoherentGraph(Graph * inGraph, boolean(*isUnique) (Node * node),
		       double coverage, ReadSet * reads)
{
	IDnum nodeIndex;
	Node *node;
	IDnum previousNodeCount = 0;

	graph = inGraph;
	listMemory = newRecycleBin(sizeof(PassageMarkerList), 100000);
	expected_coverage = coverage;
	sequences = reads->tSequences;

	velvetLog("Read coherency...\n");

	resetNodeStatus(graph);
	identifyUniqueNodes(isUnique);
	trimLongReadTips();

	// Iterate until a full pass leaves the node count unchanged
	while (previousNodeCount != nodeCount(graph)) {
		previousNodeCount = nodeCount(graph);

		for (nodeIndex = 1; nodeIndex <= nodeCount(graph);
		     nodeIndex++) {
			node = getNodeInGraph(graph, nodeIndex);

			if (node == NULL || !getUniqueness(node))
				continue;

			// Forward extension
			while (uniqueNodesConnect(node))
				node = bypass();

			// Backward extension
			node = getTwinNode(node);

			while (uniqueNodesConnect(node))
				node = bypass();
		}

		renumberNodes(graph);
	}

	destroyRecycleBin(listMemory);
	listMemory = NULL;
	destroyRecycleBin(nodeListMemory);
	nodeListMemory = NULL;

	velvetLog("Confronted to %li multiple hits and %li null over %li\n",
		  (long) multCounter, (long) nullCounter, (long) dbgCounter);

	velvetLog("Read coherency over!\n");
}
// Estimates how many paired connections of library cat are expected
// between node IDA and the destination of connect, given the insert length
// (mean mu, standard deviation sigma) of that library and the distance
// stored in the connection.
// The estimate is scaled by the lower of the two per-node densities
// (counts[cat][node] divided by the node length).
// Returns 0 when the library has no positive insert length, when either
// node has length 0, or when the estimate is not strictly positive.
static IDnum expectedNumberOfConnections(IDnum IDA, Connection * connect,
					 IDnum ** counts, Category cat)
{
	Node *A = getNodeInGraph(graph, IDA);
	Node *B = connect->destination;
	IDnum IDB = getNodeID(B);
	Coordinate mu = getInsertLength(graph, cat);
	double sigma = sqrt(getInsertLength_var(graph, cat));
	Coordinate lengthA, lengthB;
	Coordinate longer, shorter, D;
	double M, N, O, P;
	double rampUp, plateau, rampDown;
	double densityA, densityB, lowestDensity;
	double estimate;

	if (mu <= 0)
		return 0;

	lengthA = getNodeLength(A);
	lengthB = getNodeLength(B);

	if (lengthA == 0 || lengthB == 0)
		return 0;

	if (lengthA < lengthB) {
		longer = lengthB;
		shorter = lengthA;
	} else {
		longer = lengthA;
		shorter = lengthB;
	}

	densityA = counts[cat][IDA + nodeCount(graph)] / (double) lengthA;
	densityB = counts[cat][IDB + nodeCount(graph)] / (double) lengthB;
	if (densityA > densityB)
		lowestDensity = densityB;
	else
		lowestDensity = densityA;

	// Normalised bounds of the three integration intervals
	D = getConnectionDistance(connect) - (longer + shorter) / 2;
	M = (D - mu) / sigma;
	N = (D + shorter - mu) / sigma;
	O = (D + longer - mu) / sigma;
	P = (D + shorter + longer - mu) / sigma;

	rampUp = ((norm(M) - norm(N)) - M * normInt(M, N)) * sigma;
	plateau = shorter * normInt(N, O);
	rampDown = ((norm(O) - norm(P)) - P * normInt(O, P)) * (-sigma);

	estimate = (lowestDensity * (rampUp + plateau + rampDown));

	return estimate > 0 ? (IDnum) estimate : 0;
}
// Creates the direct connection(s) implied by a single read which lies both
// on node (at position / offset) and on the twin of the node designated by
// readOccurence.
// A negative position on either side means that the exact location is
// unknown: the variance is then inflated and a connection is created in
// both orientations, unless the relative offsets contradict the estimated
// gap by more than 10. Otherwise a single connection is created, oriented
// according to the sign of the estimated distance.
static void projectFromSingleRead(Node * node,
				  ReadOccurence * readOccurence,
				  Coordinate position, Coordinate offset,
				  Coordinate length)
{
	Node *target = getNodeInGraph(graph, -readOccurence->nodeID);
	Coordinate separation = 0;
	double spread = 1;
	Coordinate nodeLength, targetLength, gap;

	if (target == getTwinNode(node) || target == node)
		return;

	nodeLength = getNodeLength(node);
	targetLength = getNodeLength(target);

	// Contribution of the first node
	if (position < 0)
		spread += nodeLength * nodeLength / 16;
	else
		separation += position - offset - nodeLength / 2;

	// Contribution of the second node
	if (readOccurence->position < 0)
		spread += targetLength * targetLength / 16;
	else
		separation +=
		    -readOccurence->position + readOccurence->offset +
		    targetLength / 2;

	// Both positions known: a single, oriented connection
	if (position >= 0 && readOccurence->position >= 0) {
		if (separation > 0)
			createConnection(getNodeID(node),
					 getNodeID(target), 1, 0,
					 separation, spread);
		else
			createConnection(-getNodeID(node),
					 -getNodeID(target), 1, 0,
					 -separation, spread);
		return;
	}

	// At least one position unknown: check consistency first
	gap = separation - nodeLength / 2 - targetLength / 2;
	if (offset < readOccurence->offset && gap < -10)
		return;
	if (offset > readOccurence->offset && gap > 10)
		return;

	spread += length * length / 16;
	createConnection(getNodeID(node), getNodeID(target), 1, 0,
			 separation, spread);
	createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
			 -separation, spread);
}
// Looks up, in the scaffold list of node nodeID, the connection whose
// destination is node node2ID.
// Returns NULL when node2ID does not designate a node of the graph or when
// no such connection exists.
static Connection *findConnection(IDnum nodeID, IDnum node2ID)
{
	Node *wanted = getNodeInGraph(graph, node2ID);
	Connection *cursor;

	if (wanted == NULL)
		return NULL;

	cursor = scaffold[nodeID + nodeCount(graph)];
	while (cursor != NULL && cursor->destination != wanted)
		cursor = cursor->next;

	return cursor;
}
// Runs computeLocalNodeToNodeMappingsFromNode() on every existing unique
// node, in both orientations (IDs from -nodeCount to +nodeCount).
// The arc lookup table of the graph is activated for the duration of the
// scan and deactivated afterwards.
static void computeLocalNodeToNodeMappings()
{
	IDnum nodeID;

	puts("Computing local connections");
	activateArcLookupTable(graph);

	nodeID = -nodeCount(graph);
	while (nodeID <= nodeCount(graph)) {
		Node *candidate = getNodeInGraph(graph, nodeID);

		if (candidate != NULL && getUniqueness(candidate))
			computeLocalNodeToNodeMappingsFromNode(candidate);

		nodeID++;
	}

	deactivateArcLookupTable(graph);
}
// Counts the connected components seeded by unique nodes.
// All node statuses are reset first; every unique node whose status is
// still unset then starts a new component, which propagateComponent()
// spreads from that node.
// Empty node slots (getNodeInGraph() returning NULL) are skipped, as the
// other node loops of this file do, instead of being handed to the node
// accessors.
// Returns the number of components found.
static IDnum countConnectedComponents(Graph * graph)
{
	IDnum index;
	IDnum count = 0;
	Node *node;

	resetNodeStatus(graph);

	for (index = 1; index <= nodeCount(graph); index++) {
		node = getNodeInGraph(graph, index);
		if (node == NULL)
			continue;

		if (!getNodeStatus(node) && getUniqueness(node)) {
			count++;
			propagateComponent(node);
		}
	}

	return count;
}
// Estimates how many paired connections of library cat are expected
// between node IDA and the destination of connect, given the insert length
// (mean mu, standard deviation sigma) of that library and the distance
// stored in the connection.
// The estimate is scaled by the read count of the longer of the two nodes
// divided by its length.
// Returns 0 when the library has no positive insert length or when the
// estimate is not strictly positive.
static IDnum expectedNumberOfConnections(IDnum IDA, Connection * connect,
					 IDnum ** counts, Category cat)
{
	Node *A = getNodeInGraph(graph, IDA);
	Node *B = connect->destination;
	Coordinate mu = getInsertLength(graph, cat);
	double sigma = sqrt(getInsertLength_var(graph, cat));
	Coordinate lengthA, lengthB;
	Coordinate longer, shorter, D;
	IDnum countOnLonger;
	double M, N, O, P;
	double rampUp, plateau, rampDown;
	double estimate;

	if (mu <= 0)
		return 0;

	lengthA = getNodeLength(A);
	lengthB = getNodeLength(B);

	if (lengthA < lengthB) {
		longer = lengthB;
		shorter = lengthA;
		countOnLonger =
		    counts[cat][getNodeID(B) + nodeCount(graph)];
	} else {
		longer = lengthA;
		shorter = lengthB;
		countOnLonger = counts[cat][IDA + nodeCount(graph)];
	}

	// Normalised bounds of the three integration intervals
	D = connect->distance - (longer + shorter) / 2;
	M = (D - mu) / sigma;
	N = (D + shorter - mu) / sigma;
	O = (D + longer - mu) / sigma;
	P = (D + shorter + longer - mu) / sigma;

	rampUp = ((norm(M) - norm(N)) - M * normInt(M, N)) * sigma;
	plateau = shorter * normInt(N, O);
	rampDown = ((norm(O) - norm(P)) - P * normInt(O, P)) * (-sigma);

	estimate =
	    (countOnLonger * (rampUp + plateau + rampDown)) / longer;

	return estimate > 0 ? (IDnum) estimate : 0;
}
// Builds one locus per connected component seeded by a unique node.
//
// locusCount  number of loci to allocate; presumably the value returned by
//             countConnectedComponents() - TODO confirm against the caller
//
// For each unique node whose status is still unset, the long contigs of
// the component are collected first (fillUpComponent), then the secondary
// contigs (extendComponent). Long contigs are left marked so that their
// twins are not reused; secondary contigs are unmarked so that other loci
// may pick them up again.
// Empty node slots (getNodeInGraph() returning NULL) are skipped, as the
// other node loops of this file do.
// Returns the array of loci.
static Locus *extractConnectedComponents(IDnum locusCount)
{
	Locus *loci = allocateLocusArray(locusCount);
	Locus *locus;
	IDnum index;
	IDnum locusIndex = 0;
	IDnum nodeIndex;
	Node *node;

	resetNodeStatus(graph);

	for (index = 1; index <= nodeCount(graph); index++) {
		node = getNodeInGraph(graph, index);
		if (node == NULL)
			continue;

		if (!getNodeStatus(node) && getUniqueness(node)) {
			locus = getLocus(loci, locusIndex++);
			clearLocus(locus);

			// Long contigs
			fillUpComponent(node);
			setLongContigCount(locus, countMarkedNodes());
			while (existsMarkedNode())
				addContig(locus, popNodeRecord());

			// Secondary contigs
			extendComponent(locus);
			setContigCount(locus,
				       getLongContigCount(locus) +
				       countMarkedNodes());
			while (existsMarkedNode())
				addContig(locus, popNodeRecord());

			// Mark primary nodes so that their twins are not reused
			for (nodeIndex = 0;
			     nodeIndex < getLongContigCount(locus);
			     nodeIndex++)
				setNodeStatus(getContig(locus, nodeIndex),
					      true);

			// Unmark secondary nodes so that they are available to other loci
			for (nodeIndex = getLongContigCount(locus);
			     nodeIndex < getContigCount(locus);
			     nodeIndex++)
				setNodeStatus(getContig(locus, nodeIndex),
					      false);
		}
	}

	return loci;
}
// Detects sequences that could be simplified through concatentation // Iterates till graph cannot be more simplified // Useless nodes are freed from memory and remaining ones are renumbered void concatenateGraph(Graph * graph) { IDnum nodeIndex; Node *node, *twin; velvetLog("Concatenation...\n"); for (nodeIndex = 1; nodeIndex < nodeCount(graph); nodeIndex++) { node = getNodeInGraph(graph, nodeIndex); if (node == NULL) continue; twin = getTwinNode(node); while (simpleArcCount(node) == 1 && simpleArcCount(getTwinNode (getDestination(getArc(node)))) == 1) { if (getDestination(getArc(node)) == twin || getDestination(getArc(node)) == node) break; concatenateStringOfNodes(node, graph); } while (simpleArcCount(twin) == 1 && simpleArcCount(getTwinNode (getDestination(getArc(twin)))) == 1) { if (getDestination(getArc(twin)) == node || getDestination(getArc(twin)) == twin) break; concatenateStringOfNodes(twin, graph); } } renumberNodes(graph); sortGapMarkers(graph); velvetLog("Concatenation over!\n"); }
// Creates the paired connection implied by a read pair whose first read
// lies on node (at position / offset) and whose mate lies on the node
// designated by readOccurence.
// Nothing is created when the target is the node itself or its twin, when
// the target is a unique node with a lower ID than node, or when the
// estimated gap between the two nodes is below -6 standard deviations of
// the insert length. Distances implying overlapping nodes are raised to the
// sum of the two half lengths.
// NOTE(review): the local variance is a Coordinate although insertVariance
// is a double - presumably truncation is acceptable here; confirm.
static void projectFromReadPair(Node * node, ReadOccurence * readOccurence,
				Coordinate position, Coordinate offset,
				Coordinate insertLength,
				double insertVariance)
{
	Node *target = getNodeInGraph(graph, readOccurence->nodeID);
	Coordinate distance = insertLength;
	Coordinate variance = insertVariance;
	Coordinate nodeLength, targetLength, halfSpan;

	if (target == getTwinNode(node) || target == node)
		return;

	if (getUniqueness(target) && getNodeID(target) < getNodeID(node))
		return;

	nodeLength = getNodeLength(node);
	targetLength = getNodeLength(target);
	halfSpan = nodeLength / 2 + targetLength / 2;

	// Contribution of the first node
	if (position < 0)
		variance += nodeLength * nodeLength / 16;
	else
		distance += position - offset - nodeLength / 2;

	// Contribution of the second node
	if (readOccurence->position < 0)
		variance += targetLength * targetLength / 16;
	else
		distance +=
		    readOccurence->position - readOccurence->offset -
		    targetLength / 2;

	// Implausibly large overlap: discard
	if (distance - nodeLength / 2 - targetLength / 2 <
	    -6 * sqrt(insertVariance))
		return;

	// Moderate overlap: place the nodes end to end
	if (distance < halfSpan)
		distance = halfSpan;

	createConnection(getNodeID(node), getNodeID(target), 0, 1,
			 distance, variance);
}
static void trimLongReadTips() { IDnum index; Node *node; PassageMarkerI marker, next; velvetLog("Trimming read tips\n"); for (index = 1; index <= nodeCount(graph); index++) { node = getNodeInGraph(graph, index); if (getUniqueness(node)) continue; for (marker = getMarker(node); marker != NULL_IDX; marker = next) { next = getNextInNode(marker); if (!isInitial(marker) && !isTerminal(marker)) continue; if (isTerminal(marker)) marker = getTwinMarker(marker); while (!getUniqueness(getNode(marker))) { if (next != NULL_IDX && (marker == next || marker == getTwinMarker(next))) next = getNextInNode(next); if (getNextInSequence(marker) != NULL_IDX) { marker = getNextInSequence(marker); destroyPassageMarker (getPreviousInSequence (marker)); } else { destroyPassageMarker(marker); break; } } } } }
// Calls expandLongNode() on every existing unique node and on its twin.
// Returns true if at least one of these calls reported a modification.
static boolean expandLongNodes(boolean force_jumps)
{
	boolean changed = false;
	IDnum id;

	for (id = 1; id <= nodeCount(graph); id++) {
		Node *candidate = getNodeInGraph(graph, id);

		if (candidate == NULL || !getUniqueness(candidate))
			continue;

		// Both orientations are always attempted
		if (expandLongNode(candidate, force_jumps))
			changed = true;
		if (expandLongNode(getTwinNode(candidate), force_jumps))
			changed = true;
	}

	return changed;
}
// Sets the uniqueness flag of every existing node to the value returned by
// isUniqueFunction for that node, and logs how many nodes end up flagged
// as unique.
static void identifyUniqueNodes(boolean(*isUniqueFunction) (Node *))
{
	IDnum uniqueTotal = 0;
	IDnum id;

	velvetLog("Identifying unique nodes\n");

	for (id = 1; id <= nodeCount(graph); id++) {
		Node *current = getNodeInGraph(graph, id);

		if (current != NULL) {
			setUniqueness(current, isUniqueFunction(current));

			if (getUniqueness(current))
				uniqueTotal++;
		}
	}

	velvetLog("Done, %li unique nodes counted\n", (long) uniqueTotal);
}
// Builds the sorted table of all the k-mers found in the nodes of a
// pre-graph file.
//
// preGraphFilename  path of the pre-graph file, which is read twice: once
//                   to count the k-mers, once to fill the table
// accelerationBits  requested number of bits for the acceleration table;
//                   capped to 2 * word length and to 32, and disabled when
//                   not strictly positive
// graph             graph used to obtain node lengths for reverse strand
//                   positions
// double_strand     when set, each k-mer is stored as the lower of itself
//                   and its reverse complement (negative node ID if the
//                   latter is chosen)
//
// Returns the newly allocated table. Exits with an error message if the
// file cannot be opened, is truncated, or has an unreadable first line.
//
// The characters returned by getc() are kept in an int, so that EOF can be
// told apart from any valid character whatever the signedness of char.
static KmerOccurenceTable *referenceGraphKmers(char *preGraphFilename,
					       short int accelerationBits,
					       Graph * graph,
					       boolean double_strand)
{
	FILE *file = fopen(preGraphFilename, "r");
	const int maxline = MAXLINE;
	char line[MAXLINE];
	int c;
	int wordLength;
	Coordinate lineLength, kmerCount;
	Kmer word;
	Kmer antiWord;
	KmerOccurenceTable *kmerTable = NULL;
	KmerOccurence *kmerOccurences, *kmerOccurencePtr;
	Coordinate kmerOccurenceIndex;
	IDnum index;
	IDnum nodeID = 0;
	IDnum *accelPtr = NULL;
	KmerKey lastHeader = 0;
	KmerKey header;
	Nucleotide nucleotide;

	if (file == NULL)
		exitErrorf(EXIT_FAILURE, true, "Could not open %s",
			   preGraphFilename);

	// Count kmers
	printf("Scanning pre-graph file %s for k-mers\n", preGraphFilename);

	// First line
	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
	// The word length drives every size below: refuse to go on without it
	if (sscanf(line, "%*i\t%*i\t%i\n", &wordLength) != 1)
		exitErrorf(EXIT_FAILURE, true,
			   "Could not read word length from %s",
			   preGraphFilename);

	// Initialize kmer occurence table:
	kmerTable = mallocOrExit(1, KmerOccurenceTable);
	if (accelerationBits > 2 * wordLength)
		accelerationBits = 2 * wordLength;

	if (accelerationBits > 32)
		accelerationBits = 32;

	if (accelerationBits > 0) {
		kmerTable->accelerationBits = accelerationBits;
		kmerTable->accelerationTable =
		    callocOrExit((((size_t) 1) << accelerationBits) + 1,
				 IDnum);
		accelPtr = kmerTable->accelerationTable;
		kmerTable->accelerationShift =
		    (short int) 2 *wordLength - accelerationBits;
	} else {
		kmerTable->accelerationBits = 0;
		kmerTable->accelerationTable = NULL;
		kmerTable->accelerationShift = 0;
	}

	// Read nodes
	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");

	kmerCount = 0;
	while (line[0] == 'N') {
		// Measure the sequence line without storing it
		lineLength = 0;
		while ((c = getc(file)) != EOF && c != '\n')
			lineLength++;
		kmerCount += lineLength - wordLength + 1;
		if (fgets(line, maxline, file) == NULL)
			break;
	}
	fclose(file);

	// Create table
	printf("%li kmers found\n", (long) kmerCount);
	kmerOccurences = callocOrExit(kmerCount, KmerOccurence);
	kmerOccurencePtr = kmerOccurences;
	kmerOccurenceIndex = 0;
	kmerTable->kmerTable = kmerOccurences;
	kmerTable->kmerTableSize = kmerCount;

	// Fill table
	file = fopen(preGraphFilename, "r");
	if (file == NULL)
		exitErrorf(EXIT_FAILURE, true, "Could not open %s",
			   preGraphFilename);

	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");

	// Read nodes
	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
	while (line[0] == 'N') {
		nodeID++;

		// Fill in the initial word :
		clearKmer(&word);
		clearKmer(&antiWord);

		for (index = 0; index < wordLength - 1; index++) {
			c = getc(file);
			if (c == 'A')
				nucleotide = ADENINE;
			else if (c == 'C')
				nucleotide = CYTOSINE;
			else if (c == 'G')
				nucleotide = GUANINE;
			else if (c == 'T')
				nucleotide = THYMINE;
			else if (c == '\n' || c == EOF)
				// Sequence shorter than a k-mer
				exitErrorf(EXIT_FAILURE, true,
					   "PreGraph file incomplete");
			else
				nucleotide = ADENINE;

			pushNucleotide(&word, nucleotide);
			if (double_strand) {
#ifdef COLOR
				reversePushNucleotide(&antiWord, nucleotide);
#else
				reversePushNucleotide(&antiWord,
						      3 - nucleotide);
#endif
			}
		}

		// Scan through node
		index = 0;
		while ((c = getc(file)) != '\n' && c != EOF) {
			if (c == 'A')
				nucleotide = ADENINE;
			else if (c == 'C')
				nucleotide = CYTOSINE;
			else if (c == 'G')
				nucleotide = GUANINE;
			else if (c == 'T')
				nucleotide = THYMINE;
			else
				nucleotide = ADENINE;

			pushNucleotide(&word, nucleotide);
			if (double_strand) {
#ifdef COLOR
				reversePushNucleotide(&antiWord, nucleotide);
#else
				reversePushNucleotide(&antiWord,
						      3 - nucleotide);
#endif
			}

			// Store whichever strand compares lower
			if (!double_strand
			    || compareKmers(&word, &antiWord) <= 0) {
				copyKmers(&kmerOccurencePtr->kmer, &word);
				kmerOccurencePtr->nodeID = nodeID;
				kmerOccurencePtr->position = index;
			} else {
				copyKmers(&kmerOccurencePtr->kmer,
					  &antiWord);
				kmerOccurencePtr->nodeID = -nodeID;
				kmerOccurencePtr->position =
				    getNodeLength(getNodeInGraph
						  (graph, nodeID)) - 1 - index;
			}

			kmerOccurencePtr++;
			kmerOccurenceIndex++;
			index++;
		}

		if (fgets(line, maxline, file) == NULL)
			break;
	}

	fclose(file);

	// Sort table
	qsort(kmerOccurences, kmerCount, sizeof(KmerOccurence),
	      compareKmerOccurences);

	// Fill up acceleration table
	// NOTE(review): the shift below is only defined if KmerKey is wider
	// than accelerationBits (up to 32) - confirm the width of KmerKey.
	if (kmerTable->accelerationTable != NULL) {
		*accelPtr = (IDnum) 0;
		for (kmerOccurenceIndex = 0;
		     kmerOccurenceIndex < kmerCount;
		     kmerOccurenceIndex++) {
			header =
			    keyInAccelerationTable(&kmerOccurences
						   [kmerOccurenceIndex].kmer,
						   kmerTable);
			while (lastHeader < header) {
				lastHeader++;
				accelPtr++;
				*accelPtr = kmerOccurenceIndex;
			}
		}

		while (lastHeader < (KmerKey) 1 << accelerationBits) {
			lastHeader++;
			accelPtr++;
			*accelPtr = kmerCount;
		}
	}

	return kmerTable;
}
static void projectFromReadPair(Node * node, ReadOccurence * readOccurence, Coordinate position, Coordinate offset, Coordinate insertLength, double insertVariance, boolean weight) { Coordinate distance = insertLength; Coordinate variance = insertVariance; Node *target = getNodeInGraph(graph, readOccurence->nodeID); Connection *connect; double score; // Filter for useless reads: if (readOccurence->position == -1 && readOccurence->offset == -1) return; if (target == getTwinNode(node) || target == node) return; if (getUniqueness(target) && getNodeID(target) < getNodeID(node)) return; if (weight) { if (position > 0 && readOccurence->position > 0 && (connect = getConnectionBetweenNodes(node, target))) { distance = getConnectionDistance(connect); distance -= position - offset - getNodeLength(node) / 2; distance -= readOccurence->position - readOccurence->offset - getNodeLength(target) / 2; score = K * exp((insertLength - distance) * (distance - insertLength) / (2 * insertVariance)); incrementConnectionWeight(connect, score); } return; } if (position < 0) { variance += getNodeLength(node) * getNodeLength(node) / 16; // distance += 0; } else { // variance += 0; distance += position - offset - getNodeLength(node) / 2; } if (readOccurence->position < 0) { variance += getNodeLength(target) * getNodeLength(target) / 16; //distance += 0; } else { // variance += 0; distance += readOccurence->position - readOccurence->offset - getNodeLength(target) / 2; } if (distance - getNodeLength(node) / 2 - getNodeLength(target) / 2 < -6 * sqrt(insertVariance)) return; createConnection(getNodeID(node), getNodeID(target), 0, 1, distance, variance); }
static void projectFromSingleRead(Node * node, ReadOccurence * readOccurence, Coordinate position, Coordinate offset, Coordinate length, boolean weight) { Coordinate distance = 0; Connection *connect; Node *target = getNodeInGraph(graph, -readOccurence->nodeID); double variance = 1; // Filter out troublemakers if (readOccurence->position == -1 && readOccurence->offset == -1) return; if (offset < 0 || readOccurence->offset < 0) return; if (target == getTwinNode(node) || target == node) return; if (weight) { if ((connect = getConnectionBetweenNodes(node, target))) { incrementConnectionWeight(connect, 1); } else if ((connect = getConnectionBetweenNodes(getTwinNode(node), getTwinNode(target)))) { incrementConnectionWeight(connect, 1); } return; } if (position < 0) { variance += getNodeLength(node) * getNodeLength(node) / 16; distance += getNodeLength(node) / 2; } else { // variance += 0; distance += position - offset - getNodeLength(node) / 2; } if (readOccurence->position < 0) { variance += getNodeLength(target) * getNodeLength(target) / 16; distance += getNodeLength(target) / 2; } else { // variance += 0; distance += -readOccurence->position + readOccurence->offset + getNodeLength(target) / 2; } if (offset < readOccurence->offset) { if (getNodeLength(node) % 2) distance--; createConnection(getNodeID(node), getNodeID(target), 1, 0, distance, variance); } else { if (getNodeLength(target) % 2) distance++; createConnection(-getNodeID(node), -getNodeID(target), 1, 0, -distance, variance); } }
// Creates the direct connection(s) implied by a single read which lies both
// on node (at position / offset) and on the twin of the node designated by
// readOccurence.
//
// Nothing is created when the target is the node itself or its twin, or
// when the target is a unique node with a lower ID than node.
//
// A negative position or offset means that the corresponding value is
// unknown; the variance is inflated accordingly. The orientation of the
// connection is then decided as follows:
//  - both offsets strictly positive: their relative order decides, the
//    distance is raised to the sum of the half lengths when the nodes would
//    overlap, and nothing is created when the overlap exceeds 10;
//  - only one side fully known: the extent of the read relative to that
//    node decides, and both orientations are created when it is ambiguous;
//  - nothing known: both orientations are created.
//
// The third case chains its two tests with "else if", like the second case:
// the "else" was missing, so that a read resolved to the reverse
// orientation fell through to the ambiguous case and created both
// connections on top of the one just created.
static void projectFromSingleRead(Node * node,
				  ReadOccurence * readOccurence,
				  Coordinate position, Coordinate offset,
				  Coordinate length)
{
	Coordinate distance = 0;
	Node *target = getNodeInGraph(graph, -readOccurence->nodeID);
	double variance = 1;
	Coordinate nodeLength, targetLength, halfSpan;
	Coordinate extent;
	IDnum nodeID, targetID;

	if (target == getTwinNode(node) || target == node)
		return;
	if (getUniqueness(target) && getNodeID(target) < getNodeID(node))
		return;

	nodeLength = getNodeLength(node);
	targetLength = getNodeLength(target);
	// Distance between the centres of two nodes placed end to end
	halfSpan = nodeLength / 2 + targetLength / 2;
	nodeID = getNodeID(node);
	targetID = getNodeID(target);

	if (position < 0)
		variance += nodeLength * nodeLength / 16;
	else
		distance += position - nodeLength / 2;

	if (readOccurence->position < 0)
		variance += targetLength * targetLength / 16;
	else
		distance += -readOccurence->position + targetLength / 2;

	if (readOccurence->offset < 0 || offset < 0)
		variance += length * length / 16;
	else
		distance += readOccurence->offset - offset;

	// Relative ordering
	if (offset > 0 && readOccurence->offset > 0) {
		if (offset < readOccurence->offset) {
			if (distance - nodeLength / 2 - targetLength / 2 < -10)
				;	// Overlap too large: drop
			else if (distance < halfSpan)
				createConnection(nodeID, targetID, 1, 0,
						 halfSpan, variance);
			else
				createConnection(nodeID, targetID, 1, 0,
						 distance, variance);
		} else if (offset > readOccurence->offset) {
			if (-distance - nodeLength / 2 - targetLength / 2 < -10)
				;	// Overlap too large: drop
			else if (-distance < halfSpan)
				createConnection(-nodeID, -targetID, 1, 0,
						 halfSpan, variance);
			else
				createConnection(-nodeID, -targetID, 1, 0,
						 -distance, variance);
		}
	} else if (offset > 0 && position > 0) {
		extent = distance - offset;

		if (extent > -nodeLength / 2
		    && extent + length > nodeLength / 2) {
			createConnection(nodeID, targetID, 1, 0, halfSpan,
					 variance);
		} else if (extent < -nodeLength / 2
			   && extent + length < nodeLength / 2) {
			createConnection(-nodeID, -targetID, 1, 0, halfSpan,
					 variance);
		} else {
			createConnection(nodeID, targetID, 1, 0, halfSpan,
					 variance);
			createConnection(-nodeID, -targetID, 1, 0, halfSpan,
					 variance);
		}
	} else if (readOccurence->offset > 0 && readOccurence->position > 0) {
		extent = -distance - readOccurence->offset;

		if (extent > -targetLength / 2
		    && extent + length > targetLength / 2) {
			createConnection(-nodeID, -targetID, 1, 0, halfSpan,
					 variance);
		} else if (extent < -targetLength / 2
			   && extent + length < targetLength / 2) {
			createConnection(nodeID, targetID, 1, 0, halfSpan,
					 variance);
		} else {
			createConnection(nodeID, targetID, 1, 0, halfSpan,
					 variance);
			createConnection(-nodeID, -targetID, 1, 0, halfSpan,
					 variance);
		}
	} else {
		createConnection(nodeID, targetID, 1, 0, halfSpan, variance);
		createConnection(-nodeID, -targetID, 1, 0, halfSpan,
				 variance);
	}
}
static void ghostThreadSequenceThroughGraph(TightString * tString, KmerOccurenceTable * kmerOccurences, Graph * graph, IDnum seqID, Category category, boolean readTracking, boolean double_strand) { Kmer word; Kmer antiWord; Coordinate readNucleotideIndex; KmerOccurence *kmerOccurence; int wordLength = getWordLength(graph); Nucleotide nucleotide; Node *node; Node *previousNode = NULL; clearKmer(&word); clearKmer(&antiWord); // Neglect any read which will not be short paired if ((!readTracking && category % 2 == 0) || category / 2 >= CATEGORIES) return; // Neglect any string shorter than WORDLENGTH : if (getLength(tString) < wordLength) return; // Verify that all short reads are reasonnably short if (getLength(tString) > USHRT_MAX) { printf("Short read of length %lli, longer than limit %i\n", (long long) getLength(tString), SHRT_MAX); puts("You should better declare this sequence as long, because it genuinely is!"); exit(1); } // Allocate memory for the read pairs if (!readStartsAreActivated(graph)) activateReadStarts(graph); // Fill in the initial word : for (readNucleotideIndex = 0; readNucleotideIndex < wordLength - 1; readNucleotideIndex++) { nucleotide = getNucleotide(readNucleotideIndex, tString); pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Go through sequence while (readNucleotideIndex < getLength(tString)) { // Shift word: nucleotide = getNucleotide(readNucleotideIndex++, tString); pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } // Search in table if ((!double_strand || compareKmers(&word, &antiWord) <= 0) && (kmerOccurence = findKmerOccurenceInSortedTable(&word, kmerOccurences))) { node = getNodeInGraph(graph, kmerOccurence->nodeID); } else if ((double_strand && compareKmers(&word, 
&antiWord) > 0) && (kmerOccurence = findKmerOccurenceInSortedTable(&antiWord, kmerOccurences))) { node = getNodeInGraph(graph, -kmerOccurence->nodeID); } else { node = NULL; if (previousNode) break; } previousNode = node; // Fill in graph if (node && !getNodeStatus(node)) { incrementReadStartCount(node, graph); setSingleNodeStatus(node, true); memorizeNode(node); } } unlockMemorizedNodes(); }
static void threadSequenceThroughGraph(TightString * tString, KmerOccurenceTable * kmerOccurences, Graph * graph, IDnum seqID, Category category, boolean readTracking, boolean double_strand) { Kmer word; Kmer antiWord; Coordinate readNucleotideIndex; Coordinate kmerIndex; KmerOccurence *kmerOccurence; int wordLength = getWordLength(graph); PassageMarker *marker = NULL; PassageMarker *previousMarker = NULL; Node *node; Node *previousNode = NULL; Coordinate coord; Coordinate previousCoord = 0; Nucleotide nucleotide; clearKmer(&word); clearKmer(&antiWord); // Neglect any string shorter than WORDLENGTH : if (getLength(tString) < wordLength) return; // Fill in the initial word : for (readNucleotideIndex = 0; readNucleotideIndex < wordLength - 1; readNucleotideIndex++) { nucleotide = getNucleotide(readNucleotideIndex, tString); pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Go through sequence while (readNucleotideIndex < getLength(tString)) { nucleotide = getNucleotide(readNucleotideIndex++, tString); pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } // Search in table if ((!double_strand || compareKmers(&word, &antiWord) <= 0) && (kmerOccurence = findKmerOccurenceInSortedTable(&word, kmerOccurences))) { node = getNodeInGraph(graph, kmerOccurence->nodeID); coord = kmerOccurence->position; } else if ((double_strand && compareKmers(&word, &antiWord) > 0) && (kmerOccurence = findKmerOccurenceInSortedTable(&antiWord, kmerOccurences))) { node = getNodeInGraph(graph, -kmerOccurence->nodeID); coord = getNodeLength(node) - kmerOccurence->position - 1; } else { node = NULL; if (previousNode) { break; } } // Fill in graph if (node) { kmerIndex = readNucleotideIndex - wordLength; if (previousNode == node && 
previousCoord == coord - 1) { if (category / 2 >= CATEGORIES) { setPassageMarkerFinish(marker, kmerIndex + 1); setFinishOffset(marker, getNodeLength(node) - coord - 1); } else { incrementVirtualCoverage(node, category / 2, 1); incrementOriginalVirtualCoverage (node, category / 2, 1); } } else { if (category / 2 >= CATEGORIES) { marker = newPassageMarker(seqID, kmerIndex, kmerIndex + 1, coord, getNodeLength (node) - coord - 1); transposePassageMarker(marker, node); connectPassageMarkers (previousMarker, marker, graph); previousMarker = marker; } else { if (readTracking) { if (!getNodeStatus(node)) { addReadStart(node, seqID, coord, graph, kmerIndex); setSingleNodeStatus (node, true); memorizeNode(node); } else { blurLastShortReadMarker (node, graph); } } incrementVirtualCoverage(node, category / 2, 1); incrementOriginalVirtualCoverage (node, category / 2, 1); } createArc(previousNode, node, graph); } previousNode = node; previousCoord = coord; } } unlockMemorizedNodes(); }
static void threadSequenceThroughGraph(TightString * tString, KmerOccurenceTable * kmerTable, Graph * graph, IDnum seqID, Category category, boolean readTracking, boolean double_strand, ReferenceMapping * referenceMappings, Coordinate referenceMappingCount, IDnum refCount, Annotation * annotations, IDnum annotationCount, boolean second_in_pair) { Kmer word; Kmer antiWord; Coordinate readNucleotideIndex; Coordinate kmerIndex; KmerOccurence *kmerOccurence; int wordLength = getWordLength(graph); PassageMarkerI marker = NULL_IDX; PassageMarkerI previousMarker = NULL_IDX; Node *node = NULL; Node *previousNode = NULL; Coordinate coord = 0; Coordinate previousCoord = 0; Nucleotide nucleotide; boolean reversed; IDnum refID; Coordinate refCoord = 0; ReferenceMapping * refMap; Annotation * annotation = annotations; Coordinate index = 0; Coordinate uniqueIndex = 0; Coordinate annotIndex = 0; IDnum annotCount = 0; SmallNodeList * nodePile = NULL; // Neglect any string shorter than WORDLENGTH : if (getLength(tString) < wordLength) return; clearKmer(&word); clearKmer(&antiWord); // Fill in the initial word : for (readNucleotideIndex = 0; readNucleotideIndex < wordLength - 1; readNucleotideIndex++) { nucleotide = getNucleotide(readNucleotideIndex, tString); pushNucleotide(&word, nucleotide); if (double_strand || second_in_pair) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Go through sequence while (readNucleotideIndex < getLength(tString)) { nucleotide = getNucleotide(readNucleotideIndex++, tString); pushNucleotide(&word, nucleotide); if (double_strand || second_in_pair) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } // Update annotation if necessary if (annotCount < annotationCount && annotIndex == getAnnotationLength(annotation)) { annotation = getNextAnnotation(annotation); annotCount++; annotIndex = 0; } // Search 
for reference mapping if (category == REFERENCE) { if (referenceMappings) refMap = findReferenceMapping(seqID, index, referenceMappings, referenceMappingCount); else refMap = NULL; if (refMap) { node = getNodeInGraph(graph, refMap->nodeID); if (refMap->nodeID > 0) { coord = refMap->nodeStart + (index - refMap->referenceStart); } else { coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (index - refMap->referenceStart); } } else { node = NULL; if (previousNode) break; } } // Search for reference-based mapping else if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation) && getAnnotSequenceID(annotation) <= refCount && getAnnotSequenceID(annotation) >= -refCount) { refID = getAnnotSequenceID(annotation); if (refID > 0) refCoord = getStart(annotation) + annotIndex; else refCoord = getStart(annotation) - annotIndex; refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount); // If success if (refMap) { if (refID > 0) { node = getNodeInGraph(graph, refMap->nodeID); if (refMap->nodeID > 0) { coord = refMap->nodeStart + (refCoord - refMap->referenceStart); } else { coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (refCoord - refMap->referenceStart); } } else { node = getNodeInGraph(graph, -refMap->nodeID); if (refMap->nodeID > 0) { coord = getNodeLength(node) - refMap->nodeStart - (refCoord - refMap->referenceStart) - 1; } else { coord = refMap->nodeStart + refMap->length - (refCoord - refMap->referenceStart) - 1; } } } else { node = NULL; if (previousNode) break; } } // Search in table else { reversed = false; if (double_strand) { if (compareKmers(&word, &antiWord) <= 0) { kmerOccurence = findKmerInKmerOccurenceTable(&word, kmerTable); } else { kmerOccurence = findKmerInKmerOccurenceTable(&antiWord, kmerTable); reversed = true; } } else { if (!second_in_pair) { kmerOccurence = findKmerInKmerOccurenceTable(&word, kmerTable); } else { kmerOccurence = findKmerInKmerOccurenceTable(&antiWord, 
kmerTable); reversed = true; } } if (kmerOccurence) { if (!reversed) { node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence)); coord = getKmerOccurencePosition(kmerOccurence); } else { node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence)); coord = getNodeLength(node) - getKmerOccurencePosition(kmerOccurence) - 1; } } else { node = NULL; if (previousNode) break; } } // Increment positions if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation)) annotIndex++; else uniqueIndex++; // Fill in graph if (node) { #ifdef OPENMP lockNode(node); #endif kmerIndex = readNucleotideIndex - wordLength; if (previousNode == node && previousCoord == coord - 1) { if (category / 2 >= CATEGORIES) { setPassageMarkerFinish(marker, kmerIndex + 1); setFinishOffset(marker, getNodeLength(node) - coord - 1); } else { #ifndef SINGLE_COV_CAT incrementVirtualCoverage(node, category / 2, 1); incrementOriginalVirtualCoverage(node, category / 2, 1); #else incrementVirtualCoverage(node, 1); #endif } #ifdef OPENMP unLockNode(node); #endif } else { if (category / 2 >= CATEGORIES) { marker = newPassageMarker(seqID, kmerIndex, kmerIndex + 1, coord, getNodeLength (node) - coord - 1); transposePassageMarker(marker, node); connectPassageMarkers (previousMarker, marker, graph); previousMarker = marker; } else { if (readTracking) { if (!isNodeMemorized(node, nodePile)) { addReadStart(node, seqID, coord, graph, kmerIndex); memorizeNode(node, &nodePile); } else { blurLastShortReadMarker (node, graph); } } #ifndef SINGLE_COV_CAT incrementVirtualCoverage(node, category / 2, 1); incrementOriginalVirtualCoverage(node, category / 2, 1); #else incrementVirtualCoverage(node, 1); #endif } #ifdef OPENMP lockTwoNodes(node, previousNode); #endif createArc(previousNode, node, graph); #ifdef OPENMP unLockTwoNodes(node, previousNode); #endif } previousNode = node; previousCoord = coord; } index++; } if (readTracking && category / 2 < CATEGORIES) unMemorizeNodes(&nodePile); }
static void ghostThreadSequenceThroughGraph(TightString * tString, KmerOccurenceTable * kmerTable, Graph * graph, IDnum seqID, Category category, boolean readTracking, boolean double_strand, ReferenceMapping * referenceMappings, Coordinate referenceMappingCount, IDnum refCount, Annotation * annotations, IDnum annotationCount, boolean second_in_pair) { Kmer word; Kmer antiWord; Coordinate readNucleotideIndex; KmerOccurence *kmerOccurence; int wordLength = getWordLength(graph); Nucleotide nucleotide; IDnum refID; Coordinate refCoord; ReferenceMapping * refMap = NULL; Coordinate uniqueIndex = 0; Coordinate annotIndex = 0; IDnum annotCount = 0; boolean reversed; SmallNodeList * nodePile = NULL; Annotation * annotation = annotations; Node *node; Node *previousNode = NULL; // Neglect any read which will not be short paired if ((!readTracking && category % 2 == 0) || category / 2 >= CATEGORIES) return; // Neglect any string shorter than WORDLENGTH : if (getLength(tString) < wordLength) return; // Verify that all short reads are reasonnably short if (getLength(tString) > USHRT_MAX) { velvetLog("Short read of length %lli, longer than limit %i\n", (long long) getLength(tString), SHRT_MAX); velvetLog("You should better declare this sequence as long, because it genuinely is!\n"); exit(1); } clearKmer(&word); clearKmer(&antiWord); // Fill in the initial word : for (readNucleotideIndex = 0; readNucleotideIndex < wordLength - 1; readNucleotideIndex++) { nucleotide = getNucleotide(readNucleotideIndex, tString); pushNucleotide(&word, nucleotide); if (double_strand || second_in_pair) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Go through sequence while (readNucleotideIndex < getLength(tString)) { // Shift word: nucleotide = getNucleotide(readNucleotideIndex++, tString); pushNucleotide(&word, nucleotide); if (double_strand || second_in_pair) { #ifdef COLOR reversePushNucleotide(&antiWord, 
nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } // Update annotation if necessary if (annotCount < annotationCount && annotIndex == getAnnotationLength(annotation)) { annotation = getNextAnnotation(annotation); annotCount++; annotIndex = 0; } // Search for reference mapping if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation) && getAnnotSequenceID(annotation) <= refCount && getAnnotSequenceID(annotation) >= -refCount) { refID = getAnnotSequenceID(annotation); if (refID > 0) refCoord = getStart(annotation) + annotIndex; else refCoord = getStart(annotation) - annotIndex; refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount); // If success if (refMap) { if (refID > 0) node = getNodeInGraph(graph, refMap->nodeID); else node = getNodeInGraph(graph, -refMap->nodeID); } else { node = NULL; if (previousNode) break; } } // if not.. look in table else { reversed = false; if (double_strand) { if (compareKmers(&word, &antiWord) <= 0) { kmerOccurence = findKmerInKmerOccurenceTable(&word, kmerTable); } else { kmerOccurence = findKmerInKmerOccurenceTable(&antiWord, kmerTable); reversed = true; } } else { if (!second_in_pair) { kmerOccurence = findKmerInKmerOccurenceTable(&word, kmerTable); } else { kmerOccurence = findKmerInKmerOccurenceTable(&antiWord, kmerTable); reversed = true; } } if (kmerOccurence) { if (!reversed) node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence)); else node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence)); } else { node = NULL; if (previousNode) break; } } if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation)) annotIndex++; else uniqueIndex++; previousNode = node; // Fill in graph if (node && !isNodeMemorized(node, nodePile)) { #ifdef OPENMP lockNode(node); #endif incrementReadStartCount(node, graph); #ifdef OPENMP unLockNode(node); #endif memorizeNode(node, &nodePile); } } unMemorizeNodes(&nodePile); }
static KmerOccurenceTable *referenceGraphKmers(char *preGraphFilename, short int accelerationBits, Graph * graph, boolean double_strand, NodeMask * nodeMasks, Coordinate nodeMaskCount) { FILE *file = fopen(preGraphFilename, "r"); const int maxline = MAXLINE; char line[MAXLINE]; char c; int wordLength; Coordinate lineLength, kmerCount; Kmer word; Kmer antiWord; KmerOccurenceTable *kmerTable; IDnum index; IDnum nodeID = 0; Nucleotide nucleotide; NodeMask * nodeMask = nodeMasks; Coordinate nodeMaskIndex = 0; if (file == NULL) exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename); // Count kmers velvetLog("Scanning pre-graph file %s for k-mers\n", preGraphFilename); // First line if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); sscanf(line, "%*i\t%*i\t%i\n", &wordLength); kmerTable = newKmerOccurenceTable(accelerationBits, wordLength); // Read nodes if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); kmerCount = 0; while (line[0] == 'N') { lineLength = 0; while ((c = getc(file)) != EOF && c != '\n') lineLength++; kmerCount += lineLength - wordLength + 1; if (fgets(line, maxline, file) == NULL) break; } velvetLog("%li kmers found\n", (long) kmerCount); for(nodeMaskIndex = 0; nodeMaskIndex < nodeMaskCount; nodeMaskIndex++) { kmerCount -= nodeMasks[nodeMaskIndex].finish - nodeMasks[nodeMaskIndex].start; } nodeMaskIndex = 0; fclose(file); // Create table allocateKmerOccurences(kmerCount, kmerTable); // Fill table file = fopen(preGraphFilename, "r"); if (file == NULL) exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename); if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); // Read nodes if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); while (line[0] == 'N') { nodeID++; // Fill in the initial word : clearKmer(&word); clearKmer(&antiWord); for (index = 0; index < wordLength - 
1; index++) { c = getc(file); if (c == 'A') nucleotide = ADENINE; else if (c == 'C') nucleotide = CYTOSINE; else if (c == 'G') nucleotide = GUANINE; else if (c == 'T') nucleotide = THYMINE; else if (c == '\n') exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); else nucleotide = ADENINE; pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Scan through node index = 0; while((c = getc(file)) != '\n' && c != EOF) { if (c == 'A') nucleotide = ADENINE; else if (c == 'C') nucleotide = CYTOSINE; else if (c == 'G') nucleotide = GUANINE; else if (c == 'T') nucleotide = THYMINE; else nucleotide = ADENINE; pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } // Update mask if necessary if (nodeMask) { if (nodeMask->nodeID < nodeID || (nodeMask->nodeID == nodeID && index >= nodeMask->finish)) { if (++nodeMaskIndex == nodeMaskCount) nodeMask = NULL; else nodeMask++; } } // Check if not masked! if (nodeMask) { if (nodeMask->nodeID == nodeID && index >= nodeMask->start && index < nodeMask->finish) { index++; continue; } } if (!double_strand || compareKmers(&word, &antiWord) <= 0) recordKmerOccurence(&word, nodeID, index, kmerTable); else recordKmerOccurence(&antiWord, -nodeID, getNodeLength(getNodeInGraph(graph, nodeID)) - 1 - index, kmerTable); index++; } if (fgets(line, maxline, file) == NULL) break; } fclose(file); // Sort table sortKmerOccurenceTable(kmerTable); return kmerTable; }