// DEBUG void checkNode(Node* node) { PassageMarkerI marker1 = getMarker(node); if (marker1 == NULL_IDX) return; PassageMarkerI marker2 = getNextInNode(marker1); if (marker2 == NULL_IDX) return; if (getStartOffset(marker1) == getStartOffset(marker2)) abort(); if (getFinishOffset(marker1) == getFinishOffset(marker2)) abort(); printf(">>>> Node %li\n", (long) getNodeID(node)); printf("Marker1: %li - %li > %li (%li) \n", (long) getStartOffset(marker1), (long) getPassageMarkerLength(marker1), (long) (getNodeLength(node) - getFinishOffset(marker1)), (long) getFinishOffset(marker1)); printf("%s\n", readPassageMarker(marker1)); printf("Marker2: %li - %li > %li (%li) \n", (long) getStartOffset(marker2), (long) getPassageMarkerLength(marker2), (long) (getNodeLength(node) - getFinishOffset(marker2)), (long) getFinishOffset(marker2)); printf("%s\n", readPassageMarker(marker2)); if (getStartOffset(marker1) < getNodeLength(node) - getFinishOffset(marker2) && getStartOffset(marker2) < getNodeLength(node) - getFinishOffset(marker1)) { //abort(); ; } }
// Adjusts passage markers when `candidate` is appended onto `node`.
//  - Markers on `node` that do not go on to `candidate` have their finish
//    offset increased by the length of `candidate`.
//  - Markers on `candidate` that do not come from `node` are moved onto
//    `node`, with start offset (and the twin's finish offset) increased by the
//    length of `node`; the others go through reconnectPassageMarker().
// The `graph` parameter is not used by this function.
static void concatenateLongReads(Node * node, Node * candidate, Graph * graph)
{
	PassageMarkerI current, following;

	// Markers already sitting on node
	current = getMarker(node);
	while (current != NULL_IDX) {
		if (!goesToNode(current, candidate))
			incrementFinishOffset(current, getNodeLength(candidate));
		current = getNextInNode(current);
	}

	// Markers sitting on candidate
	current = getMarker(candidate);
	while (current != NULL_IDX) {
		// Read the successor first: the marker is about to leave this list
		following = getNextInNode(current);

		if (comesFromNode(current, node)) {
			// The helper receives the successor by address and may update it
			reconnectPassageMarker(current, node, &following);
		} else {
			extractPassageMarker(current);
			incrementStartOffset(current, getNodeLength(node));
			transposePassageMarker(current, node);
			incrementFinishOffset(getTwinMarker(current),
					      getNodeLength(node));
		}

		current = following;
	}
}
// Builds a transcript from the nodes returned by successive popNodeRecord()
// calls and attaches it to `locus`.
// For every pair of consecutive contigs the stored distance is the connection
// distance minus half of each node length (minus one more if either length is
// odd), floored at zero.
void produceTranscript(Locus * locus, IDnum nodesInList)
{
	Transcript *transcript =
	    newTranscript(nodesInList,
			  ((double) nodesInList) / getContigCount(locus));
	IDnum filled = 0;
	IDnum gap;
	Node *current;
	Node *before;

	for (current = popNodeRecord(); current; current = popNodeRecord()) {
		transcript->contigs[filled] = current;

		if (filled > 0) {
			gap = filled - 1;
			before = transcript->contigs[gap];

			transcript->distances[gap] =
			    getConnectionDistance(getConnectionBetweenNodes
						  (before, getTwinNode(current)));
			transcript->distances[gap] -= getNodeLength(current) / 2;
			transcript->distances[gap] -= getNodeLength(before) / 2;

			// The integer halves above drop a base for odd lengths
			if (getNodeLength(current) % 2 > 0
			    || getNodeLength(before) % 2 > 0)
				transcript->distances[gap] -= 1;

			if (transcript->distances[gap] < 0)
				transcript->distances[gap] = 0;
		}

		filled++;
	}

	transcript->contigCount = filled;
	addTranscript(locus, transcript);
}
// Copies into `sequence` the node sequences visited by the markers that follow
// `path`, stopping before the first marker for which isTerminal() is true.
// Returns false (leaving `sequence` untouched) when the summed node lengths
// exceed MAXREADLENGTH, true otherwise.
static boolean extractSequence(PassageMarkerI path, TightString * sequence)
{
	PassageMarkerI cursor;
	Coordinate totalLength = 0;
	Coordinate writeAt = 0;

	// First pass: measure
	cursor = getNextInSequence(path);
	while (!isTerminal(cursor)) {
		totalLength += getNodeLength(getNode(cursor));
		cursor = getNextInSequence(cursor);
	}

	if (totalLength > MAXREADLENGTH)
		return false;

	setTightStringLength(sequence, totalLength);

	// Second pass: copy each node sequence at its running offset
	cursor = getNextInSequence(path);
	while (!isTerminal(cursor)) {
		appendNodeSequence(getNode(cursor), sequence, writeAt);
		writeAt += getNodeLength(getNode(cursor));
		cursor = getNextInSequence(cursor);
	}

	return true;
}
// Shifts every local scaffold distance after `node` changed length from
// `oldLength` to its current length.
// Each marked node has its distance reduced by half the length change; nodes
// that end up closer than (half node length - BACKTRACK_CUTOFF) and hold no
// front or back reference are unmarked, the others have their status reset to
// +1 or -1 according to its current sign.
static void recenterLocalScaffold(Node * node, Coordinate oldLength)
{
	Coordinate shift = (getNodeLength(node) - oldLength) / 2;
	Coordinate floorDistance = getNodeLength(node) / 2 - BACKTRACK_CUTOFF;
	NodeList *entry = markedNodes;
	NodeList *following;
	Node *other;
	MiniConnection *mini;

	while (entry != NULL) {
		// Save the link first: unmarkNode() is handed this entry's node
		following = entry->next;
		other = entry->node;

		if (other == node) {
			setSingleNodeStatus(other, 1);
		} else {
			mini = &localScaffold[getNodeID(other) + nodeCount(graph)];
			mini->distance -= shift;

			if (mini->distance < floorDistance
			    && mini->backReference == NULL
			    && mini->frontReference == NULL)
				unmarkNode(other, mini);
			else if (getNodeStatus(other) > 0)
				setSingleNodeStatus(other, 1);
			else if (getNodeStatus(other) < 0)
				setSingleNodeStatus(other, -1);
		}

		entry = following;
	}
}
// Looks for a CYTOSINE immediately followed by an ADENINE around the junction
// formed by the twin of nodeB followed by the twin of nodeA.
// The scan covers the last SPLICE_FUZZINESS nucleotides of twin(nodeB), the
// junction itself, and the first SPLICE_FUZZINESS + 2 nucleotides of
// twin(nodeA). Returns true on the first match, false if none is found.
// NOTE(review): nothing here checks that the nodes are at least that long;
// confirm that callers guarantee it or that getNucleotideInNode() tolerates
// out-of-range positions.
static boolean acceptorSiteAtJunction(Node * nodeA, Node * nodeB)
{
	Node *leading = getTwinNode(nodeB);
	Node *trailing = getTwinNode(nodeA);
	Coordinate leadingLength = getNodeLength(leading);
	Coordinate pos;
	Nucleotide before, here;

	// Tail of the leading node
	here = getNucleotideInNode(leading, leadingLength - SPLICE_FUZZINESS);
	for (pos = leadingLength - SPLICE_FUZZINESS + 1; pos < leadingLength; pos++) {
		before = here;
		here = getNucleotideInNode(leading, pos);
		if (before == CYTOSINE && here == ADENINE)
			return true;
	}

	// Junction, then head of the trailing node
	for (pos = 0; pos < SPLICE_FUZZINESS + 2; pos++) {
		before = here;
		here = getNucleotideInNode(trailing, pos);
		if (before == CYTOSINE && here == ADENINE)
			return true;
	}

	return false;
}
// Basic uniqueness test: a node is reported unique when it is at least
// LONG_NODE_CUTOFF long and its read coverage divided by its length does not
// exceed 1.5 times expected_coverage.
boolean isUniqueBasic(Node * node)
{
	double density;

	if (getNodeLength(node) < LONG_NODE_CUTOFF)
		return false;

	density = readCoverage(node) / (double) getNodeLength(node);

	// Written as a negation so the result matches the original comparison
	return !(density > 1.5 * expected_coverage);
}
// Estimates how many read pairs of category `cat` should link node IDA to the
// destination of `connect`, given the insert length mean and variance of that
// category and the lower of the two nodes' read densities taken from `counts`.
// Returns 0 when the insert length is not positive, when either node has
// length zero, or when the estimate is not positive.
static IDnum expectedNumberOfConnections(IDnum IDA, Connection * connect,
					  IDnum ** counts, Category cat)
{
	Node *A = getNodeInGraph(graph, IDA);
	Node *B = connect->destination;
	IDnum IDB = getNodeID(B);
	Coordinate mu = getInsertLength(graph, cat);
	double sigma = sqrt(getInsertLength_var(graph, cat));
	Coordinate lengthA, lengthB;
	Coordinate longLength, shortLength, D;
	double densityA, densityB, minDensity;
	double M, N, O, P;
	double left, middle, right;
	double result;

	if (mu <= 0)
		return 0;

	lengthA = getNodeLength(A);
	lengthB = getNodeLength(B);
	if (lengthA == 0 || lengthB == 0)
		return 0;

	longLength = lengthA < lengthB ? lengthB : lengthA;
	shortLength = lengthA < lengthB ? lengthA : lengthB;

	densityA = counts[cat][IDA + nodeCount(graph)] / (double) lengthA;
	densityB = counts[cat][IDB + nodeCount(graph)] / (double) lengthB;
	if (densityA > densityB)
		minDensity = densityB;
	else
		minDensity = densityA;

	// Gap between the node ends (the connection distance is centre to centre)
	D = getConnectionDistance(connect) - (longLength + shortLength) / 2;

	// Breakpoints of the three integration pieces, in units of sigma
	M = (D - mu) / sigma;
	N = (D + shortLength - mu) / sigma;
	O = (D + longLength - mu) / sigma;
	P = (D + shortLength + longLength - mu) / sigma;

	left = ((norm(M) - norm(N)) - M * normInt(M, N)) * sigma;
	middle = shortLength * normInt(N, O);
	right = ((norm(O) - norm(P)) - P * normInt(O, P)) * (-sigma);

	result = (minDensity * (left + middle + right));

	return result > 0 ? (IDnum) result : 0;
}
// Creates scaffold connections between `node` and the node designated by the
// negated ID of `readOccurence`, based on a single read seen on both.
// A negative position on either side adds (node length)^2 / 16 to the variance
// instead of contributing to the distance. When at least one position is
// negative, connections are created in both orientations unless the estimated
// gap contradicts the order of the read offsets by more than 10; otherwise a
// single connection is created in the orientation given by the sign of the
// distance.
static void projectFromSingleRead(Node * node, ReadOccurence * readOccurence,
				  Coordinate position, Coordinate offset,
				  Coordinate length)
{
	Coordinate distance = 0;
	Node *target = getNodeInGraph(graph, -readOccurence->nodeID);
	double variance = 1;
	Coordinate nodeLength, targetLength, gap;

	if (target == getTwinNode(node) || target == node)
		return;

	nodeLength = getNodeLength(node);
	targetLength = getNodeLength(target);

	if (position < 0)
		variance += nodeLength * nodeLength / 16;
	else
		distance += position - offset - nodeLength / 2;

	if (readOccurence->position < 0)
		variance += targetLength * targetLength / 16;
	else
		distance += -readOccurence->position + readOccurence->offset
		    + targetLength / 2;

	if (position < 0 || readOccurence->position < 0) {
		gap = distance - nodeLength / 2 - targetLength / 2;

		if (offset < readOccurence->offset && gap < -10)
			return;
		if (offset > readOccurence->offset && gap > 10)
			return;

		variance += length * length / 16;
		createConnection(getNodeID(node), getNodeID(target), 1, 0,
				 distance, variance);
		createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
				 -distance, variance);
		return;
	}

	if (distance > 0)
		createConnection(getNodeID(node), getNodeID(target), 1, 0,
				 distance, variance);
	else
		createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
				 -distance, variance);
}
// Decides whether a scaffold connection should be kept.
// Rejects connections with weight below 0.1 or with fewer than
// UNRELIABLE_CONNECTION_CUTOFF supporting observations. Connections to nodes no
// longer than LENGTHCUTOFF, or for which no paired support is expected, are
// kept only if they have direct support; otherwise the paired count must reach
// the expected total scaled by pairedThreshold.
static boolean testConnection(IDnum IDA, Connection * connect, IDnum ** counts)
{
	IDnum expected = 0;
	Category cat = 0;

	if (connect->weight < 0.1
	    || connect->paired_count + connect->direct_count <
	    UNRELIABLE_CONNECTION_CUTOFF)
		return false;

	if (getNodeLength(connect->destination) <= LENGTHCUTOFF)
		return connect->direct_count > 0;

	// Sum the expectation over every category, CATEGORIES included
	while (cat <= CATEGORIES) {
		expected += expectedNumberOfConnections(IDA, connect, counts, cat);
		cat++;
	}

	return expected == 0
	    ? connect->direct_count > 0
	    : connect->paired_count >= expected * pairedThreshold;
}
// Coverage-based uniqueness test.
// Only nodes longer than LONG_NODE_CUTOFF can be reported unique: their
// density (sum of virtual coverages 0 and 1 divided by the length) is turned
// into a score that must exceed PROBABILITY_CUTOFF. Empty and short nodes are
// never unique.
// The former short-node formula
//   expected_coverage * nodeLength - nodeCoverage / LN2 > 0
// sat behind an unconditional `return false;` and could never run; it has been
// removed, which leaves the function's results unchanged.
boolean isUniqueSolexa(Node * node)
{
	Coordinate nodeLength = getNodeLength(node);
	Coordinate nodeCoverage =
	    (getVirtualCoverage(node, 0) + getVirtualCoverage(node, 1));
	double nodeDensity, probability;

	if (nodeLength == 0)
		return false;

	if (nodeLength > LONG_NODE_CUTOFF) {
		nodeDensity = nodeCoverage / (double) nodeLength;

		probability =
		    LN2 / 2 +
		    nodeLength / (2 * expected_coverage) *
		    (expected_coverage * expected_coverage -
		     nodeDensity * nodeDensity / 2);
		return probability > PROBABILITY_CUTOFF;
	}

	return false;
}
// Seeds the local scaffold around `node` from its primary connections.
// The node itself receives an empty mini-connection. Each connection of `node`
// registers the twin of its destination with the connection as front
// reference; each connection of the twin of `node` registers its destination
// at the negated distance with the connection as back reference. Nodes that
// already carry a status are readjusted, the others are reset. After each
// connection, integrateDerivativeDistances() is called on it.
static void markInterestingNodes(Node * node)
{
	Coordinate floorDistance = getNodeLength(node) / 2 - BACKTRACK_CUTOFF;
	Connection *link;
	Node *neighbour;
	MiniConnection *mini;

	// Mark own node
	setEmptyMiniConnection(node);

	// Connections leaving the node itself
	link = getConnection(node);
	while (link != NULL) {
		neighbour = getTwinNode(getConnectionDestination(link));
		mini = &localScaffold[getNodeID(neighbour) + nodeCount(graph)];

		if (!getNodeStatus(neighbour)) {
			resetMiniConnection(neighbour, mini,
					    getConnectionDistance(link),
					    getConnectionVariance(link),
					    link, NULL, true);
		} else {
			readjustMiniConnection(neighbour, mini,
					       getConnectionDistance(link),
					       floorDistance,
					       getConnectionVariance(link),
					       link, NULL);
			mini->backReference = NULL;
		}

		integrateDerivativeDistances(link, floorDistance, true);
		link = getNextConnection(link);
	}

	// Connections leaving the twin: distances are negated and the connection
	// is stored as back reference
	link = getConnection(getTwinNode(node));
	while (link != NULL) {
		neighbour = getConnectionDestination(link);
		mini = &localScaffold[getNodeID(neighbour) + nodeCount(graph)];

		if (!getNodeStatus(neighbour))
			resetMiniConnection(neighbour, mini,
					    -getConnectionDistance(link),
					    getConnectionVariance(link),
					    NULL, link, -1);
		else
			readjustMiniConnection(neighbour, mini,
					       -getConnectionDistance(link),
					       floorDistance,
					       getConnectionVariance(link),
					       NULL, link);

		integrateDerivativeDistances(link, floorDistance, false);
		link = getNextConnection(link);
	}
}
// Merges the short read markers of `source` into `target` (and likewise for
// their twins) when `source` is appended to `target`.
// Before merging, positions on `source` are shifted by the length of `target`
// and positions on the twin of `target` by the length of `source`; a position
// of -1 is left untouched. Does nothing when read starts are not activated or
// when either node is NULL.
void concatenateReadStarts(Node * target, Node * source, Graph * graph)
{
	ShortReadMarker *reads, *entry;
	IDnum readCount, i;
	Coordinate shift, where;

	if (!readStartsAreActivated(graph) || target == NULL || source == NULL)
		return;

	// Reads on source now sit after the whole of target
	reads = getNodeReads(source, graph);
	readCount = getNodeReadCount(source, graph);
	shift = getNodeLength(target);
	for (i = 0; i < readCount; i++) {
		entry = getShortReadMarkerAtIndex(reads, i);
		where = getShortReadMarkerPosition(entry);
		if (where == -1)
			continue;
		setShortReadMarkerPosition(entry, where + shift);
	}

	// Symmetrically, reads on the twin of target sit after the twin of source
	reads = getNodeReads(getTwinNode(target), graph);
	readCount = getNodeReadCount(getTwinNode(target), graph);
	shift = getNodeLength(source);
	for (i = 0; i < readCount; i++) {
		entry = getShortReadMarkerAtIndex(reads, i);
		where = getShortReadMarkerPosition(entry);
		if (where == -1)
			continue;
		setShortReadMarkerPosition(entry, where + shift);
	}

	// Merging lists
	mergeNodeReads(target, source, graph);
	mergeNodeReads(getTwinNode(target), getTwinNode(source), graph);
}
static void measureCoOccurences(Coordinate ** coOccurences, boolean * interestingReads, ReadOccurence ** readNodes, IDnum * readNodeCounts, IDnum * readPairs, Category * cats) { IDnum coOccurencesIndex[CATEGORIES + 1]; IDnum observationIndex; IDnum readIndex, readPairIndex; IDnum readNodeCount; IDnum readOccurenceIndex, readPairOccurenceIndex; ReadOccurence * readOccurence, *readPairOccurence; Category libID; for (libID = 0; libID < CATEGORIES + 1; libID++) coOccurencesIndex[libID] = 0; for (readIndex = 0; readIndex < sequenceCount(graph); readIndex++) { // Eliminating dodgy, unpaired, already counted or user-specified reads if (!interestingReads[readIndex]) continue; // Find co-occurence // We know that for each read the read occurences are ordered by increasing node ID libID = cats[readIndex]/2; readPairIndex = readPairs[readIndex]; observationIndex = coOccurencesIndex[libID]; readOccurence = readNodes[readIndex + 1]; readOccurenceIndex = 0; readNodeCount = readNodeCounts[readIndex + 1]; readPairOccurenceIndex = readNodeCounts[readPairIndex + 1] - 1; readPairOccurence = &(readNodes[readPairIndex + 1][readPairOccurenceIndex]); while (readOccurenceIndex < readNodeCount && readPairOccurenceIndex >= 0) { if (readOccurence->nodeID == -readPairOccurence->nodeID) { if (readOccurence->position > 0 && readPairOccurence->position > 0) { coOccurences[libID][observationIndex] = getNodeLength(getNodeInGraph(graph, readOccurence->nodeID)) + getWordLength(graph) - 1 - (readOccurence->position - readOccurence->offset) - (readPairOccurence->position - readPairOccurence->offset); coOccurencesIndex[libID]++; break; } else { readOccurence++; readOccurenceIndex++; readPairOccurence--; readPairOccurenceIndex--; } } else if (readOccurence->nodeID < -readPairOccurence->nodeID) { readOccurence++; readOccurenceIndex++; } else { readPairOccurence--; readPairOccurenceIndex--; } } } }
// Re-expresses the connections of `node` after its length changed from
// `oldLength` to its current length.
// Connections leaving `node` (and their twins) are shortened by half the
// length change; those falling below (half node length - BACKTRACK_CUTOFF) are
// destroyed after their destination's twin is unmarked from the local
// scaffold. Connections leaving the twin of `node` (and their twins) are
// lengthened by the same amount.
static void recenterNode(Node * node, Coordinate oldLength)
{
	IDnum ownID = getNodeID(node);
	Coordinate shift = (getNodeLength(node) - oldLength) / 2;
	Coordinate floorDistance = getNodeLength(node) / 2 - BACKTRACK_CUTOFF;
	Connection *link, *following, *mirror;
	Node *neighbour;
	MiniConnection *mini;

	link = getConnection(node);
	while (link != NULL) {
		// The connection may be destroyed below, so fetch its successor now
		following = getNextConnection(link);

		incrementConnectionDistance(link, -shift);

		if (getConnectionDistance(link) < floorDistance) {
			neighbour = getConnectionDestination(link);
			mini = &localScaffold[-getNodeID(neighbour) + nodeCount(graph)];
			mini->frontReference = NULL;
			unmarkNode(getTwinNode(neighbour), mini);
			destroyConnection(link, ownID);
		} else {
			mirror = getTwinConnection(link);
			if (mirror != NULL)
				incrementConnectionDistance(mirror, -shift);
		}

		link = following;
	}

	link = getConnection(getTwinNode(node));
	while (link != NULL) {
		following = getNextConnection(link);

		incrementConnectionDistance(link, shift);

		mirror = getTwinConnection(link);
		if (mirror != NULL)
			incrementConnectionDistance(mirror, shift);

		link = following;
	}
}
// Given two connections, considers the twins of their destinations and, when
// neither of those two nodes is flagged unique, turns a graph arc between them
// into a scaffold connection (in either direction), provided no such connection
// exists yet.
// The new connection spans half of each node length, takes the arc
// multiplicity as its direct count and 1/multiplicity as its variance, and then
// has its weight increased by the multiplicity.
static void computeLocalNodeToNodeMappingsFromConnections(Connection * connect,
							   Connection * connect2)
{
	Node *first = getTwinNode(getConnectionDestination(connect));
	Node *second = getTwinNode(getConnectionDestination(connect2));
	IDnum firstID = getNodeID(first);
	IDnum secondID = getNodeID(second);
	Coordinate span = getNodeLength(first) / 2 + getNodeLength(second) / 2;
	Arc *arc;

	if (getUniqueness(first) || getUniqueness(second))
		return;

	// first -> second
	arc = getArcBetweenNodes(first, second, graph);
	if (arc && !getConnectionBetweenNodes(first, getTwinNode(second))) {
		createConnection(firstID, -secondID, getMultiplicity(arc), 0,
				 span, 1 / (double) getMultiplicity(arc));
		incrementConnectionWeight(getConnectionBetweenNodes
					  (first, getTwinNode(second)),
					  getMultiplicity(arc));
	}

	// second -> first
	arc = getArcBetweenNodes(second, first, graph);
	if (arc && !getConnectionBetweenNodes(second, getTwinNode(first))) {
		createConnection(secondID, -firstID, getMultiplicity(arc), 0,
				 span, 1 / (double) getMultiplicity(arc));
		incrementConnectionWeight(getConnectionBetweenNodes
					  (second, getTwinNode(first)),
					  getMultiplicity(arc));
	}
}
// Moves every passage marker of `source` onto `bypass`, increasing the finish
// offset of each marker's twin by the length of `bypass`.
static void admitGroupies(Node * source, Node * bypass)
{
	PassageMarkerI current = getMarker(source);
	PassageMarkerI following;

	while (current != NULL_IDX) {
		// Read the successor before the marker leaves the list of source
		following = getNextInNode(current);

		extractPassageMarker(current);
		transposePassageMarker(current, bypass);
		incrementFinishOffset(getTwinMarker(current),
				      getNodeLength(bypass));

		current = following;
	}
}
// Looks for a GUANINE immediately followed by a THYMINE around the junction
// formed by nodeA followed by nodeB.
// The scan covers the last SPLICE_FUZZINESS nucleotides of nodeA, the junction
// itself, and the first SPLICE_FUZZINESS + 2 nucleotides of nodeB. Returns true
// on the first match, false if none is found.
// NOTE(review): nothing here checks that the nodes are at least that long;
// confirm that callers guarantee it or that getNucleotideInNode() tolerates
// out-of-range positions.
static boolean donorSiteAtJunction(Node * nodeA, Node * nodeB)
{
	Coordinate lengthA = getNodeLength(nodeA);
	Coordinate pos;
	Nucleotide before, here;

	// Tail of nodeA
	here = getNucleotideInNode(nodeA, lengthA - SPLICE_FUZZINESS);
	for (pos = lengthA - SPLICE_FUZZINESS + 1; pos < lengthA; pos++) {
		before = here;
		here = getNucleotideInNode(nodeA, pos);
		if (before == GUANINE && here == THYMINE)
			return true;
	}

	// Junction, then head of nodeB
	for (pos = 0; pos < SPLICE_FUZZINESS + 2; pos++) {
		before = here;
		here = getNucleotideInNode(nodeB, pos);
		if (before == GUANINE && here == THYMINE)
			return true;
	}

	return false;
}
// Reports whether the expanded sequence of `node` contains either of the
// hexamers "AATAAA" or "ATTAAA".
// The fragment requested from expandNodeFragment() runs from 0 to the node
// length, so the match may occur anywhere in it, not only at its end.
// The expanded string is freed before returning.
static boolean finishesWithPAS(Node * node)
{
	char *sequence = expandNodeFragment(node, 0, getNodeLength(node),
					    getWordLength(graph));
	boolean found = strstr(sequence, "AATAAA") != NULL
	    || strstr(sequence, "ATTAAA") != NULL;

	free(sequence);
	return found;
}
// Creates a paired-end scaffold connection from `node` to the node designated
// by `readOccurence`, starting from the library insert length and variance.
//
// node / position / offset : node holding the first read and where it maps
// readOccurence             : occurrence of the mate read
// insertLength              : expected insert length, initial distance
// insertVariance            : insert length variance, initial variance
//
// A negative position on either side adds (node length)^2 / 16 to the variance
// instead of adjusting the distance. Nothing is created when the target is the
// node or its twin, when the target is unique with a smaller ID than `node`, or
// when the implied gap is below -6 standard deviations.
//
// The variance is accumulated as a double: it used to be held in a Coordinate,
// which silently dropped the fractional part of insertVariance (and turned any
// variance below 1 into 0) before it was handed to createConnection().
static void projectFromReadPair(Node * node, ReadOccurence * readOccurence,
				Coordinate position, Coordinate offset,
				Coordinate insertLength,
				double insertVariance)
{
	Coordinate distance = insertLength;
	double variance = insertVariance;
	Node *target = getNodeInGraph(graph, readOccurence->nodeID);

	if (target == getTwinNode(node) || target == node)
		return;

	if (getUniqueness(target) && getNodeID(target) < getNodeID(node))
		return;

	if (position < 0) {
		// Unknown position: widen the variance, leave the distance alone
		variance += getNodeLength(node) * getNodeLength(node) / 16;
	} else {
		distance += position - offset - getNodeLength(node) / 2;
	}

	if (readOccurence->position < 0) {
		variance += getNodeLength(target) * getNodeLength(target) / 16;
	} else {
		distance += readOccurence->position - readOccurence->offset
		    - getNodeLength(target) / 2;
	}

	// Reject pairs implying an overlap deeper than 6 standard deviations
	if (distance - getNodeLength(node) / 2 - getNodeLength(target) / 2 <
	    -6 * sqrt(insertVariance))
		return;

	createConnection(getNodeID(node), getNodeID(target), 0, 1, distance,
			 variance);
}
// Relaxes one arc during the local time-ordered traversal.
// The arc costs (length of origin) / (arc multiplicity). A destination never
// reached before (time -1) is timed and inserted into the heap. A destination
// reached earlier at a later time is re-keyed, provided it still has a heap
// entry, and its path is compared against its former predecessor. Otherwise
// the destination's path is compared against `origin`.
static void tourBusArc_local(Node * origin, Arc * arc, Time originTime)
{
	Node *destination = getDestination(arc);
	IDnum slot = getNodeID(destination) + nodeCount(graph);
	Node *formerPrevious = previous[slot];
	Time arcTime, arrival, recorded;

	// Already reached from this very origin: nothing to do
	if (formerPrevious == origin)
		return;

	arcTime = ((Time) getNodeLength(origin)) / ((Time) getMultiplicity(arc));
	arrival = originTime + arcTime;
	recorded = times[slot];

	if (recorded == -1) {
		// New destination
		setNodeTime(destination, arrival);
		dheapNodes[slot] = insertNodeIntoDHeap(dheap, arrival, destination);
		previous[slot] = origin;
		return;
	}

	if (recorded > arrival) {
		// Previously reached through a slower route
		if (dheapNodes[slot] == NULL)
			return;

		setNodeTime(destination, arrival);
		replaceKeyInDHeap(dheap, dheapNodes[slot], arrival);
		previous[slot] = origin;
		comparePaths_local(destination, formerPrevious);
		return;
	}

	// Previously reached through a route at least as fast
	comparePaths_local(destination, origin);
}
// Coverage-based uniqueness test using the node's total coverage.
// Nodes no longer than LONG_NODE_CUTOFF are never unique; for longer nodes the
// density (total coverage / length) is turned into a score that must exceed
// PROBABILITY_CUTOFF.
boolean isUniqueSolexa(Node * node)
{
	Coordinate length = getNodeLength(node);
	Coordinate coverage = getTotalCoverage(node);
	double density, score;

	if (length <= LONG_NODE_CUTOFF)
		return false;

	density = coverage / (double) length;
	score = LN2 / 2 +
	    length / (2 * expected_coverage) *
	    (expected_coverage * expected_coverage - density * density / 2);

	return score > PROBABILITY_CUTOFF;
}
// Updates the passage markers of `bypass` once `nextNode` has been absorbed
// into it.
//  - A marker whose next marker sits on nextNode loses that next marker.
//  - When nextNode is unique and the marker reaches it indirectly, every
//    following marker up to and including the one on nextNode is removed.
//  - Any other marker has its finish offset increased by the length of
//    nextNode.
// The original had two separate final branches ("goes somewhere else" and
// "goes nowhere") that performed exactly the same increment, the first one
// after a dead store to `next`; they are merged here with identical effect.
static void updateMembers(Node * bypass, Node * nextNode)
{
	PassageMarkerI marker, next, tmp;
	Coordinate nextLength = getNodeLength(nextNode);

	for (marker = getMarker(bypass); marker != NULL_IDX; marker = tmp) {
		// Read the successor in the node list before editing the marker
		tmp = getNextInNode(marker);

		if (!isTerminal(marker)
		    && getNode(getNextInSequence(marker)) == nextNode) {
			// Marker steps right into target
			next = getNextInSequence(marker);
			disconnectNextPassageMarker(marker, graph);
			destroyPassageMarker(next);
		} else if (getUniqueness(nextNode)
			   && goesToNode(marker, nextNode)) {
			// Marker goes indirectly to target: drop the detour...
			while (getNode(getNextInSequence(marker)) != nextNode) {
				next = getNextInSequence(marker);
				disconnectNextPassageMarker(marker, graph);
				destroyPassageMarker(next);
			}
			// ...then the marker on the target itself
			next = getNextInSequence(marker);
			disconnectNextPassageMarker(marker, graph);
			destroyPassageMarker(next);
		} else {
			// Marker goes elsewhere or nowhere
			incrementFinishOffset(marker, nextLength);
		}
	}
}
// Shifts the short read markers held by the twin of `target` by the length of
// `source`. Markers whose position is -1 are left untouched. Does nothing when
// read starts are not activated on the graph.
static void adjustShortReads(Node * target, Node * source)
{
	ShortReadMarker *reads, *entry;
	IDnum readCount, i;
	Coordinate shift, where;

	if (!readStartsAreActivated(graph))
		return;

	reads = getNodeReads(getTwinNode(target), graph);
	readCount = getNodeReadCount(getTwinNode(target), graph);
	shift = getNodeLength(source);

	for (i = 0; i < readCount; i++) {
		entry = getShortReadMarkerAtIndex(reads, i);
		where = getShortReadMarkerPosition(entry);
		if (where == -1)
			continue;
		setShortReadMarkerPosition(entry, where + shift);
	}
}
// Estimates how many read pairs of category `cat` should link node IDA to the
// destination of `connect`, given the insert length mean and variance of that
// category and the read count of the longer of the two nodes.
// Returns 0 when the insert length is not positive or when the estimate is not
// positive.
static IDnum expectedNumberOfConnections(IDnum IDA, Connection * connect,
					  IDnum ** counts, Category cat)
{
	Node *A = getNodeInGraph(graph, IDA);
	Node *B = connect->destination;
	Coordinate mu = getInsertLength(graph, cat);
	double sigma = sqrt(getInsertLength_var(graph, cat));
	Coordinate longLength, shortLength, D;
	IDnum longCount;
	double M, N, O, P;
	double left, middle, right;
	double result;

	if (mu <= 0)
		return 0;

	// Pick the longer node; its read count drives the estimate
	if (getNodeLength(A) >= getNodeLength(B)) {
		longLength = getNodeLength(A);
		shortLength = getNodeLength(B);
		longCount = counts[cat][IDA + nodeCount(graph)];
	} else {
		longLength = getNodeLength(B);
		shortLength = getNodeLength(A);
		longCount = counts[cat][getNodeID(B) + nodeCount(graph)];
	}

	// Gap between the node ends (the connection distance is centre to centre)
	D = connect->distance - (longLength + shortLength) / 2;

	// Breakpoints of the three integration pieces, in units of sigma
	M = (D - mu) / sigma;
	N = (D + shortLength - mu) / sigma;
	O = (D + longLength - mu) / sigma;
	P = (D + shortLength + longLength - mu) / sigma;

	left = ((norm(M) - norm(N)) - M * normInt(M, N)) * sigma;
	middle = shortLength * normInt(N, O);
	right = ((norm(O) - norm(P)) - P * normInt(O, P)) * (-sigma);

	result = (longCount * (left + middle + right)) / longLength;

	return result > 0 ? (IDnum) result : 0;
}
static void projectFromReadPair(Node * node, ReadOccurence * readOccurence, Coordinate position, Coordinate offset, Coordinate insertLength, double insertVariance, boolean weight) { Coordinate distance = insertLength; Coordinate variance = insertVariance; Node *target = getNodeInGraph(graph, readOccurence->nodeID); Connection *connect; double score; // Filter for useless reads: if (readOccurence->position == -1 && readOccurence->offset == -1) return; if (target == getTwinNode(node) || target == node) return; if (getUniqueness(target) && getNodeID(target) < getNodeID(node)) return; if (weight) { if (position > 0 && readOccurence->position > 0 && (connect = getConnectionBetweenNodes(node, target))) { distance = getConnectionDistance(connect); distance -= position - offset - getNodeLength(node) / 2; distance -= readOccurence->position - readOccurence->offset - getNodeLength(target) / 2; score = K * exp((insertLength - distance) * (distance - insertLength) / (2 * insertVariance)); incrementConnectionWeight(connect, score); } return; } if (position < 0) { variance += getNodeLength(node) * getNodeLength(node) / 16; // distance += 0; } else { // variance += 0; distance += position - offset - getNodeLength(node) / 2; } if (readOccurence->position < 0) { variance += getNodeLength(target) * getNodeLength(target) / 16; //distance += 0; } else { // variance += 0; distance += readOccurence->position - readOccurence->offset - getNodeLength(target) / 2; } if (distance - getNodeLength(node) / 2 - getNodeLength(target) / 2 < -6 * sqrt(insertVariance)) return; createConnection(getNodeID(node), getNodeID(target), 0, 1, distance, variance); }
static void projectFromSingleRead(Node * node, ReadOccurence * readOccurence, Coordinate position, Coordinate offset, Coordinate length, boolean weight) { Coordinate distance = 0; Connection *connect; Node *target = getNodeInGraph(graph, -readOccurence->nodeID); double variance = 1; // Filter out troublemakers if (readOccurence->position == -1 && readOccurence->offset == -1) return; if (offset < 0 || readOccurence->offset < 0) return; if (target == getTwinNode(node) || target == node) return; if (weight) { if ((connect = getConnectionBetweenNodes(node, target))) { incrementConnectionWeight(connect, 1); } else if ((connect = getConnectionBetweenNodes(getTwinNode(node), getTwinNode(target)))) { incrementConnectionWeight(connect, 1); } return; } if (position < 0) { variance += getNodeLength(node) * getNodeLength(node) / 16; distance += getNodeLength(node) / 2; } else { // variance += 0; distance += position - offset - getNodeLength(node) / 2; } if (readOccurence->position < 0) { variance += getNodeLength(target) * getNodeLength(target) / 16; distance += getNodeLength(target) / 2; } else { // variance += 0; distance += -readOccurence->position + readOccurence->offset + getNodeLength(target) / 2; } if (offset < readOccurence->offset) { if (getNodeLength(node) % 2) distance--; createConnection(getNodeID(node), getNodeID(target), 1, 0, distance, variance); } else { if (getNodeLength(target) % 2) distance++; createConnection(-getNodeID(node), -getNodeID(target), 1, 0, -distance, variance); } }
static KmerOccurenceTable *referenceGraphKmers(char *preGraphFilename, short int accelerationBits, Graph * graph, boolean double_strand, NodeMask * nodeMasks, Coordinate nodeMaskCount) { FILE *file = fopen(preGraphFilename, "r"); const int maxline = MAXLINE; char line[MAXLINE]; char c; int wordLength; Coordinate lineLength, kmerCount; Kmer word; Kmer antiWord; KmerOccurenceTable *kmerTable; IDnum index; IDnum nodeID = 0; Nucleotide nucleotide; NodeMask * nodeMask = nodeMasks; Coordinate nodeMaskIndex = 0; if (file == NULL) exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename); // Count kmers velvetLog("Scanning pre-graph file %s for k-mers\n", preGraphFilename); // First line if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); sscanf(line, "%*i\t%*i\t%i\n", &wordLength); kmerTable = newKmerOccurenceTable(accelerationBits, wordLength); // Read nodes if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); kmerCount = 0; while (line[0] == 'N') { lineLength = 0; while ((c = getc(file)) != EOF && c != '\n') lineLength++; kmerCount += lineLength - wordLength + 1; if (fgets(line, maxline, file) == NULL) break; } velvetLog("%li kmers found\n", (long) kmerCount); for(nodeMaskIndex = 0; nodeMaskIndex < nodeMaskCount; nodeMaskIndex++) { kmerCount -= nodeMasks[nodeMaskIndex].finish - nodeMasks[nodeMaskIndex].start; } nodeMaskIndex = 0; fclose(file); // Create table allocateKmerOccurences(kmerCount, kmerTable); // Fill table file = fopen(preGraphFilename, "r"); if (file == NULL) exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename); if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); // Read nodes if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); while (line[0] == 'N') { nodeID++; // Fill in the initial word : clearKmer(&word); clearKmer(&antiWord); for (index = 0; index < wordLength - 
1; index++) { c = getc(file); if (c == 'A') nucleotide = ADENINE; else if (c == 'C') nucleotide = CYTOSINE; else if (c == 'G') nucleotide = GUANINE; else if (c == 'T') nucleotide = THYMINE; else if (c == '\n') exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete"); else nucleotide = ADENINE; pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Scan through node index = 0; while((c = getc(file)) != '\n' && c != EOF) { if (c == 'A') nucleotide = ADENINE; else if (c == 'C') nucleotide = CYTOSINE; else if (c == 'G') nucleotide = GUANINE; else if (c == 'T') nucleotide = THYMINE; else nucleotide = ADENINE; pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } // Update mask if necessary if (nodeMask) { if (nodeMask->nodeID < nodeID || (nodeMask->nodeID == nodeID && index >= nodeMask->finish)) { if (++nodeMaskIndex == nodeMaskCount) nodeMask = NULL; else nodeMask++; } } // Check if not masked! if (nodeMask) { if (nodeMask->nodeID == nodeID && index >= nodeMask->start && index < nodeMask->finish) { index++; continue; } } if (!double_strand || compareKmers(&word, &antiWord) <= 0) recordKmerOccurence(&word, nodeID, index, kmerTable); else recordKmerOccurence(&antiWord, -nodeID, getNodeLength(getNodeInGraph(graph, nodeID)) - 1 - index, kmerTable); index++; } if (fgets(line, maxline, file) == NULL) break; } fclose(file); // Sort table sortKmerOccurenceTable(kmerTable); return kmerTable; }
// Threads one sequence through the graph, k-mer by k-mer.
//
// For every k-mer of tString the function determines a (node, coord) location
// using one of three strategies, tried in this order:
//   1. category == REFERENCE: look the current index up in referenceMappings;
//   2. an annotation covers the current position and refers to a sequence ID
//      within [-refCount, refCount]: look up the annotated reference
//      coordinate in referenceMappings;
//   3. otherwise: look the k-mer (or its reverse word) up in kmerTable.
// When a location is found, the graph is updated: for category / 2 >=
// CATEGORIES passage markers are created or extended, otherwise virtual
// coverage is incremented and, with readTracking set, read starts are
// recorded. An arc is created from the previous node whenever the k-mer does
// not simply continue the previous one on the same node.
// As soon as a lookup fails after at least one node has been visited, the
// walk stops. Sequences shorter than the word length are ignored.
//
// The reverse word is maintained only when double_strand or second_in_pair is
// set; with second_in_pair and no double_strand, strategy 3 always looks up
// the reverse word.
static void threadSequenceThroughGraph(TightString * tString,
				       KmerOccurenceTable * kmerTable,
				       Graph * graph,
				       IDnum seqID, Category category,
				       boolean readTracking,
				       boolean double_strand,
				       ReferenceMapping * referenceMappings,
				       Coordinate referenceMappingCount,
				       IDnum refCount,
				       Annotation * annotations,
				       IDnum annotationCount,
				       boolean second_in_pair)
{
	Kmer word;
	Kmer antiWord;
	Coordinate readNucleotideIndex;
	Coordinate kmerIndex;
	KmerOccurence *kmerOccurence;
	int wordLength = getWordLength(graph);

	PassageMarkerI marker = NULL_IDX;
	PassageMarkerI previousMarker = NULL_IDX;
	Node *node = NULL;
	Node *previousNode = NULL;
	Coordinate coord = 0;
	Coordinate previousCoord = 0;
	Nucleotide nucleotide;
	boolean reversed;

	IDnum refID;
	Coordinate refCoord = 0;
	ReferenceMapping * refMap;
	Annotation * annotation = annotations;
	Coordinate index = 0;
	Coordinate uniqueIndex = 0;
	Coordinate annotIndex = 0;
	IDnum annotCount = 0;
	SmallNodeList * nodePile = NULL;

	// Neglect any string shorter than WORDLENGTH :
	if (getLength(tString) < wordLength)
		return;

	clearKmer(&word);
	clearKmer(&antiWord);

	// Fill in the initial word (first wordLength - 1 nucleotides) :
	for (readNucleotideIndex = 0;
	     readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
		nucleotide = getNucleotide(readNucleotideIndex, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand || second_in_pair) {
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}
	}

	// Go through sequence, one new nucleotide (hence one new k-mer) per pass
	while (readNucleotideIndex < getLength(tString)) {
		nucleotide = getNucleotide(readNucleotideIndex++, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand || second_in_pair) {
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}

		// Update annotation if necessary: move on once the current one is used up
		if (annotCount < annotationCount
		    && annotIndex == getAnnotationLength(annotation)) {
			annotation = getNextAnnotation(annotation);
			annotCount++;
			annotIndex = 0;
		}

		// Search for reference mapping
		if (category == REFERENCE) {
			if (referenceMappings)
				refMap = findReferenceMapping(seqID, index, referenceMappings, referenceMappingCount);
			else
				refMap = NULL;

			if (refMap) {
				node = getNodeInGraph(graph, refMap->nodeID);
				if (refMap->nodeID > 0) {
					coord = refMap->nodeStart + (index - refMap->referenceStart);
				} else {
					// Negative node ID: coordinate derived from the node length
					coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (index - refMap->referenceStart);
				}
			} else {
				node = NULL;
				// A gap after at least one hit ends the walk
				if (previousNode)
					break;
			}
		}
		// Search for reference-based mapping
		else if (annotCount < annotationCount
			 && uniqueIndex >= getPosition(annotation)
			 && getAnnotSequenceID(annotation) <= refCount
			 && getAnnotSequenceID(annotation) >= -refCount) {
			refID = getAnnotSequenceID(annotation);
			// A negative sequence ID walks the reference backwards
			if (refID > 0)
				refCoord = getStart(annotation) + annotIndex;
			else
				refCoord = getStart(annotation) - annotIndex;

			refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount);
			// If success
			if (refMap) {
				if (refID > 0) {
					node = getNodeInGraph(graph, refMap->nodeID);
					if (refMap->nodeID > 0) {
						coord = refMap->nodeStart + (refCoord - refMap->referenceStart);
					} else {
						coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (refCoord - refMap->referenceStart);
					}
				} else {
					// Negative sequence ID: the node with the negated ID is used
					node = getNodeInGraph(graph, -refMap->nodeID);
					if (refMap->nodeID > 0) {
						coord = getNodeLength(node) - refMap->nodeStart - (refCoord - refMap->referenceStart) - 1;
					} else {
						coord = refMap->nodeStart + refMap->length - (refCoord - refMap->referenceStart) - 1;
					}
				}
			} else {
				node = NULL;
				if (previousNode)
					break;
			}
		}
		// Search in table
		else {
			reversed = false;
			if (double_strand) {
				// Look up whichever of the two words compares lower
				if (compareKmers(&word, &antiWord) <= 0) {
					kmerOccurence =
					    findKmerInKmerOccurenceTable(&word, kmerTable);
				} else {
					kmerOccurence =
					    findKmerInKmerOccurenceTable(&antiWord, kmerTable);
					reversed = true;
				}
			} else {
				if (!second_in_pair) {
					kmerOccurence =
					    findKmerInKmerOccurenceTable(&word, kmerTable);
				} else {
					kmerOccurence =
					    findKmerInKmerOccurenceTable(&antiWord, kmerTable);
					reversed = true;
				}
			}

			if (kmerOccurence) {
				if (!reversed) {
					node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence));
					coord = getKmerOccurencePosition(kmerOccurence);
				} else {
					// Reverse word hit: negated node ID, coordinate from the other end
					node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence));
					coord = getNodeLength(node) - getKmerOccurencePosition(kmerOccurence) - 1;
				}
			} else {
				node = NULL;
				if (previousNode)
					break;
			}
		}

		// Increment positions: inside an annotation only annotIndex advances
		if (annotCount < annotationCount
		    && uniqueIndex >= getPosition(annotation))
			annotIndex++;
		else
			uniqueIndex++;

		// Fill in graph
		if (node) {
#ifdef OPENMP
			lockNode(node);
#endif
			// Index of the first nucleotide of the current k-mer in the sequence
			kmerIndex = readNucleotideIndex - wordLength;

			// Same node, next coordinate: the k-mer extends the previous hit
			if (previousNode == node
			    && previousCoord == coord - 1) {
				if (category / 2 >= CATEGORIES) {
					setPassageMarkerFinish(marker, kmerIndex + 1);
					setFinishOffset(marker, getNodeLength(node) - coord - 1);
				} else {
#ifndef SINGLE_COV_CAT
					incrementVirtualCoverage(node, category / 2, 1);
					incrementOriginalVirtualCoverage(node, category / 2, 1);
#else
					incrementVirtualCoverage(node, 1);
#endif
				}
#ifdef OPENMP
				unLockNode(node);
#endif
			} else {
				// New node or a jump within the node
				if (category / 2 >= CATEGORIES) {
					marker =
					    newPassageMarker(seqID,
							     kmerIndex,
							     kmerIndex + 1,
							     coord,
							     getNodeLength
							     (node) - coord - 1);
					transposePassageMarker(marker, node);
					connectPassageMarkers
					    (previousMarker, marker, graph);
					previousMarker = marker;
				} else {
					if (readTracking) {
						// Only the first visit of a node by this sequence adds a read start
						if (!isNodeMemorized(node, nodePile)) {
							addReadStart(node, seqID, coord, graph, kmerIndex);
							memorizeNode(node, &nodePile);
						} else {
							blurLastShortReadMarker
							    (node, graph);
						}
					}
#ifndef SINGLE_COV_CAT
					incrementVirtualCoverage(node, category / 2, 1);
					incrementOriginalVirtualCoverage(node, category / 2, 1);
#else
					incrementVirtualCoverage(node, 1);
#endif
				}
				// NOTE(review): on this path the lock taken by lockNode(node)
				// above is not released through unLockNode() before
				// lockTwoNodes() is called; confirm that lockTwoNodes() /
				// unLockTwoNodes() account for the node already being locked.
#ifdef OPENMP
				lockTwoNodes(node, previousNode);
#endif
				createArc(previousNode, node, graph);
#ifdef OPENMP
				unLockTwoNodes(node, previousNode);
#endif
			}

			previousNode = node;
			previousCoord = coord;
		}
		index++;
	}

	// Release the list of memorized nodes built for read tracking
	if (readTracking && category / 2 < CATEGORIES)
		unMemorizeNodes(&nodePile);
}
static void extractNodeASEvents(Node * node, Locus * locus) { Node *nodeA, *nodeB, *nodeC; Event *event; // If linear or more than 2 outgoing arcs: ignore if (countActiveConnections(node) != 2) return; // Follow the two active arcs nodeA = getTwinNode(getConnectionDestination (getActiveConnection(node))); nodeB = getTwinNode(getConnectionDestination (getSecondActiveConnection(node))); // A should be the longer of the two if (getNodeLength(nodeA) < getNodeLength(nodeB)) { nodeC = nodeA; nodeA = nodeB; nodeB = nodeC; nodeC = NULL; } // If both very short, ignore: if (getNodeLength(nodeA) < 2 * getWordLength(graph) - 1) return; if (getNodeLength(nodeB) < 2 * getWordLength(graph) - 1) { if (countActiveConnections(nodeA) != 1 || countActiveConnections(nodeB) != 1 || getConnectionDestination(getActiveConnection(nodeA)) != getConnectionDestination(getActiveConnection(nodeB))) return; nodeC = getTwinNode(getConnectionDestination (getActiveConnection(nodeA))); // Intron retention if (donorSiteAtJunction(node, nodeA) && acceptorSiteAtJunction(nodeA, nodeC)) { event = allocateEvent(); event->type = intron_retention; event->nodes[0] = node; event->nodes[1] = nodeA; event->nodes[2] = nodeB; event->nodes[3] = nodeC; event->next = locus->event; locus->event = event; } // Alternative 5' splice site else if (donorSiteAtJunction(node, nodeA)) { event = allocateEvent(); event->type = alternative_5prime_splice; event->nodes[0] = node; event->nodes[1] = nodeA; event->nodes[2] = nodeB; event->nodes[3] = nodeC; event->next = locus->event; locus->event = event; } // Alternative 3' splice site else if (acceptorSiteAtJunction(nodeA, nodeC)) { event = allocateEvent(); event->type = alternative_3prime_splice; event->nodes[0] = node; event->nodes[1] = nodeA; event->nodes[2] = nodeB; event->nodes[3] = nodeC; event->next = locus->event; locus->event = event; } // Skipped exon else { event = allocateEvent(); event->type = skipped_exon; event->nodes[0] = node; event->nodes[1] = nodeA; 
event->nodes[2] = nodeB; event->nodes[3] = nodeC; event->next = locus->event; locus->event = event; } } else { // Alt. poly A: if (finishesWithPAS(node) && finishesWithPAS(nodeA)) { event = allocateEvent(); event->type = alternative_polyA; event->nodes[0] = node; event->nodes[1] = nodeA; event->nodes[2] = nodeB; event->nodes[3] = NULL; event->next = locus->event; locus->event = event; } // Mutually exclusive exons if (countActiveConnections(nodeA) == 1 && countActiveConnections(nodeB) == 1 && getConnectionDestination(getActiveConnection(nodeA)) == getConnectionDestination(getActiveConnection(nodeB))) { event = allocateEvent(); event->type = mutually_exclusive_exons; event->nodes[0] = node; event->nodes[1] = nodeA; event->nodes[2] = nodeB; event->nodes[3] = getTwinNode(getConnectionDestination (getActiveConnection(nodeA))); event->next = locus->event; locus->event = event; } } }