/*
 * Computes a distance score between two k-mers of kmer_length nucleotides.
 *
 * A two-row rolling table records, for every pair of positions (is, it)
 * whose nucleotides are equal, the length of the run of equal nucleotides
 * that ends at source[is] / target[it].  Every such run yields a candidate
 *
 *     ms + (kmer_length - currentCount) + mt
 *
 * and the smallest candidate seen is returned.
 *
 * source, target  k-mers to compare (read through getNucleotide()).
 * kmer_length     number of nucleotides examined in each k-mer.
 *
 * Returns kmer_length when no candidate is smaller; this includes the case
 * where no nucleotide matches and the case kmer_length == 0.
 */
unsigned getDistance(Kmer source, Kmer target, unsigned kmer_length)
{
	unsigned minDistance = kmer_length;

	/* A variable-length array must have a size greater than zero. */
	if (kmer_length == 0)
		return minDistance;

	unsigned dpArray[2][kmer_length];
	bzero(dpArray, sizeof dpArray);

	for (unsigned is = 0; is < kmer_length; ++is) {
		/* The two rows swap roles on every iteration. */
		unsigned *previousRow = &dpArray[(is + 1) % 2][0];
		unsigned *currentRow = &dpArray[is % 2][0];

		/* Only matching cells are written below, so clear the row first. */
		bzero(currentRow, kmer_length * sizeof *currentRow);

		for (unsigned it = 0; it < kmer_length; ++it) {
			if (getNucleotide(target, it) != getNucleotide(source, is))
				continue;

			/* Extend the run that ended on the previous diagonal cell. */
			unsigned previousCount = (it == 0 ? 0 : previousRow[it - 1]);
			unsigned currentCount = previousCount + 1;
			currentRow[it] = currentCount;

			/*
			 * calculate distance
			 *
			 * NOTE(review): the subtractions below are carried out in
			 * unsigned arithmetic and wrap around whenever
			 * is + currentCount (or it + currentCount) exceeds
			 * kmer_length.  The expressions are kept exactly as they
			 * were; confirm the wrap-around is intended.
			 */
			unsigned ms = min(is, kmer_length - is - currentCount);
			unsigned mt = (ms < is ? it : kmer_length - it - currentCount);
			unsigned distance = ms + (kmer_length - currentCount) + mt;

			if (distance < minDistance)
				minDistance = distance;
		}
	}

	return minDistance;
}
/*
 * Tells whether two sequences align well enough to be considered similar.
 *
 * The file-level Fmatrix table is filled with a global alignment score:
 * each cell takes the best of the diagonal move (scored through SIM) and
 * the two gap moves (scored with INDEL).  The score in the last cell is
 * then tested against MAXGAPS and MAXDIVERGENCE.
 *
 * Returns false when either sequence is empty, when one sequence is shorter
 * than WORDLENGTH and the lengths differ by more than MAXGAPS, or when the
 * final score fails either threshold; returns true otherwise.
 */
static boolean compareSequences(TightString * sequence1, TightString * sequence2)
{
	Coordinate length1 = getLength(sequence1);
	Coordinate length2 = getLength(sequence2);
	Coordinate longest;
	Coordinate row, col;
	Time diagonal, fromAbove, fromLeft;
	Time finalScore;

	if (length1 == 0 || length2 == 0)
		return false;

	longest = (length2 < length1 ? length1 : length2);

	/* Short sequences are only accepted when their lengths are close. */
	if ((length1 < WORDLENGTH || length2 < WORDLENGTH)
	    && (longest - length1 > MAXGAPS || longest - length2 > MAXGAPS))
		return false;

	/* First column and first row start at zero. */
	row = 0;
	while (row <= length1) {
		Fmatrix[row][0] = 0;
		row++;
	}

	col = 0;
	while (col <= length2) {
		Fmatrix[0][col] = 0;
		col++;
	}

	/* Fill the remaining cells row by row. */
	row = 1;
	while (row <= length1) {
		col = 1;
		while (col <= length2) {
			diagonal = Fmatrix[row - 1][col - 1]
			    + SIM[(int) getNucleotide(row - 1, sequence1)]
				 [(int) getNucleotide(col - 1, sequence2)];
			fromAbove = Fmatrix[row - 1][col] + INDEL;
			fromLeft = Fmatrix[row][col - 1] + INDEL;

			Fmatrix[row][col] = max(diagonal, fromAbove, fromLeft);
			col++;
		}
		row++;
	}

	finalScore = Fmatrix[length1][length2];

	/* Reject on too many gaps or too much divergence. */
	if (finalScore < longest - MAXGAPS
	    || (1 - finalScore / longest) > MAXDIVERGENCE)
		return false;

	return true;
}
/*
** Fetch the character found at a 1-based column of a labelled sequence.
**
** The list starting at seqList is searched for an entry whose label equals
** `label`.  The character at index position-1 of that entry's sequence is
** written to data[0], followed by a terminating '\0', so `data` must have
** room for two characters.  The result of getNucleotide (data) is then used
** as an index into obsCombs, and that slot is set to 1.
**
** Returns 0 on success, -1 when an argument is invalid, when the label is
** not found (a message is written to stderr), or when getNucleotide ()
** returns -1.
*/
int getDataSingle (char *label, char *data, int position)
{
  struct sequence *seq;
  int i;

  if (label == NULL || data == NULL)
    return (-1);

  /*
  ** Columns are 1-based; anything below 1 would index before the start of
  ** the sequence buffer.
  ** NOTE(review): the length of seq->sequence is not visible here, so only
  ** the lower bound can be checked.
  */
  if (position < 1) {
    fprintf (stderr, "Invalid sequence position %d for sequence '%s'.\n",
             position, label);
    return (-1);
  }

  /*
  ** Find the sequence label.
  */
  for (seq = &(seqList); strcmp (seq->label, label) != 0; seq = seq->next) {
    if (seq->next == NULL) {
      fprintf (stderr, "Unmatched sequence label in tree: '%s'.\n", label);
      return (-1);
    }
  }

  /*
  ** Get the data from the requested column.
  */
  data[0] = seq->sequence[position-1];
  data[1] = '\0';

  /*
  ** Remember the observed combination.
  */
  i = getNucleotide (data);
  if (i == -1)
    return (-1);
  obsCombs[i] = 1;

  return (0);
}
/*
 * Builds a new TightString holding the nucleotides covered by a passage
 * marker, read in increasing order from the marker's source sequence.
 *
 * marker      passage marker to read.
 * seqs        table of sequences; the entry used is the one at index
 *             getPassageMarkerSequenceID(marker) - 1.
 * WORDLENGTH  added (minus one) to the marker start to obtain the first
 *             position read from the source sequence.
 *
 * Returns a TightString created with newTightString(); the caller receives
 * it as is.
 */
static TightString *readPositivePassageMarker(PassageMarkerI marker,
					      TightString ** seqs,
					      int WORDLENGTH)
{
	Coordinate index;
	Nucleotide nucleotide;
	TightString *tString = seqs[getPassageMarkerSequenceID(marker) - 1];
	TightString *res = newTightString(getPassageMarkerLength(marker));

	/*
	 * Copy exactly as many nucleotides as the result was sized for.
	 * Bounding the loop by the length of the whole source sequence would
	 * write past the end of res and read past the end of tString.
	 */
	for (index = 0; index < getPassageMarkerLength(marker); index++) {
		nucleotide =
		    getNucleotide(getPassageMarkerStart(marker) + index +
				  WORDLENGTH - 1, tString);
		writeNucleotideAtPosition(nucleotide, index, res);
	}

	return res;
}
/*
 * Builds a new TightString from a passage marker by walking its source
 * sequence backwards, starting at getPassageMarkerStart(marker).
 *
 * Unless COLOR is defined, every nucleotide n is stored as 3 - n
 * (presumably its complement in the 2-bit encoding -- confirm).  With COLOR
 * defined the value is copied unchanged.
 *
 * Returns a TightString created with newTightString() for the marker's
 * length.
 */
static TightString *readNegativePassageMarker(PassageMarkerI marker,
					      TightString ** seqs)
{
	TightString *source = seqs[getAbsolutePassMarkerSeqID(marker) - 1];
	TightString *copy = newTightString(getPassageMarkerLength(marker));
	Coordinate offset = 0;
	Nucleotide base;

	while (offset < getPassageMarkerLength(marker)) {
		/* Step backwards from the marker start. */
		base = getNucleotide(getPassageMarkerStart(marker) - offset,
				     source);
#ifdef COLOR
		writeNucleotideAtPosition(base, offset, copy);
#else
		writeNucleotideAtPosition(3 - base, offset, copy);
#endif
		offset++;
	}

	return copy;
}
/*
 * Walks a sequence word by word and records its path in the graph.
 *
 * The first wordLength - 1 nucleotides are pushed into `word` (and, when
 * double_strand or second_in_pair is set, into `antiWord`).  Then, for every
 * further nucleotide, the word is shifted and a node/coordinate pair is
 * looked up by one of three means:
 *   - category == REFERENCE: through findReferenceMapping() on (seqID, index);
 *   - an active annotation whose sequence ID lies within [-refCount, refCount]:
 *     through findReferenceMapping() on the annotated reference coordinate;
 *   - otherwise: through the k-mer occurrence table.
 * When a node is found, either a passage marker is created/extended
 * (category / 2 >= CATEGORIES) or virtual coverage is incremented, and an
 * arc is created from the previous node whenever the walk does not simply
 * advance by one position inside the same node.
 *
 * The walk stops at the first failed lookup that follows a successful one.
 * Sequences shorter than wordLength are ignored.
 */
static void threadSequenceThroughGraph(TightString * tString,
				       KmerOccurenceTable * kmerTable,
				       Graph * graph,
				       IDnum seqID, Category category,
				       boolean readTracking,
				       boolean double_strand,
				       ReferenceMapping * referenceMappings,
				       Coordinate referenceMappingCount,
				       IDnum refCount,
				       Annotation * annotations,
				       IDnum annotationCount,
				       boolean second_in_pair)
{
	Kmer word;
	Kmer antiWord;
	Coordinate readNucleotideIndex;
	Coordinate kmerIndex;
	KmerOccurence *kmerOccurence;
	int wordLength = getWordLength(graph);

	PassageMarkerI marker = NULL_IDX;
	PassageMarkerI previousMarker = NULL_IDX;
	Node *node = NULL;
	Node *previousNode = NULL;
	Coordinate coord = 0;
	Coordinate previousCoord = 0;
	Nucleotide nucleotide;
	boolean reversed;

	IDnum refID;
	Coordinate refCoord = 0;
	ReferenceMapping * refMap;
	Annotation * annotation = annotations;
	// index counts every word; uniqueIndex only those outside an annotation,
	// annotIndex those inside the current annotation
	Coordinate index = 0;
	Coordinate uniqueIndex = 0;
	Coordinate annotIndex = 0;
	IDnum annotCount = 0;
	// Nodes already given a read start by this sequence
	SmallNodeList * nodePile = NULL;

	// Neglect any string shorter than WORDLENGTH :
	if (getLength(tString) < wordLength)
		return;

	clearKmer(&word);
	clearKmer(&antiWord);

	// Fill in the initial word :
	for (readNucleotideIndex = 0;
	     readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
		nucleotide = getNucleotide(readNucleotideIndex, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand || second_in_pair) {
			// 3 - nucleotide: presumably the complementary base -- confirm
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}
	}

	// Go through sequence
	while (readNucleotideIndex < getLength(tString)) {
		// Shift the word by one nucleotide
		nucleotide = getNucleotide(readNucleotideIndex++, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand || second_in_pair) {
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}

		// Update annotation if necessary:
		// move on once the current annotation has been fully consumed
		if (annotCount < annotationCount
		    && annotIndex == getAnnotationLength(annotation)) {
			annotation = getNextAnnotation(annotation);
			annotCount++;
			annotIndex = 0;
		}

		// Search for reference mapping
		if (category == REFERENCE) {
			if (referenceMappings)
				refMap = findReferenceMapping(seqID, index, referenceMappings, referenceMappingCount);
			else
				refMap = NULL;

			if (refMap) {
				node = getNodeInGraph(graph, refMap->nodeID);
				// The coordinate formula depends on the sign of the mapped node ID
				if (refMap->nodeID > 0) {
					coord = refMap->nodeStart + (index - refMap->referenceStart);
				} else {
					coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (index - refMap->referenceStart);
				}
			} else {
				node = NULL;
				// A miss after a hit ends the walk
				if (previousNode)
					break;
			}
		}
		// Search for reference-based mapping
		else if (annotCount < annotationCount
			 && uniqueIndex >= getPosition(annotation)
			 && getAnnotSequenceID(annotation) <= refCount
			 && getAnnotSequenceID(annotation) >= -refCount) {
			refID = getAnnotSequenceID(annotation);
			// A negative refID walks the reference coordinate backwards
			if (refID > 0)
				refCoord = getStart(annotation) + annotIndex;
			else
				refCoord = getStart(annotation) - annotIndex;

			refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount);
			// If success
			if (refMap) {
				if (refID > 0) {
					node = getNodeInGraph(graph, refMap->nodeID);
					if (refMap->nodeID > 0) {
						coord = refMap->nodeStart + (refCoord - refMap->referenceStart);
					} else {
						coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (refCoord - refMap->referenceStart);
					}
				} else {
					// Negative refID: use the node with the opposite ID
					node = getNodeInGraph(graph, -refMap->nodeID);
					if (refMap->nodeID > 0) {
						coord = getNodeLength(node) - refMap->nodeStart - (refCoord - refMap->referenceStart) - 1;
					} else {
						coord = refMap->nodeStart + refMap->length - (refCoord - refMap->referenceStart) - 1;
					}
				}
			} else {
				node = NULL;
				// A miss after a hit ends the walk
				if (previousNode)
					break;
			}
		}
		// Search in table
		else {
			reversed = false;
			if (double_strand) {
				// Look up whichever of word / antiWord compares lower
				if (compareKmers(&word, &antiWord) <= 0) {
					kmerOccurence = findKmerInKmerOccurenceTable(&word, kmerTable);
				} else {
					kmerOccurence = findKmerInKmerOccurenceTable(&antiWord, kmerTable);
					reversed = true;
				}
			} else {
				// Single strand: second reads of a pair are looked up through antiWord
				if (!second_in_pair) {
					kmerOccurence = findKmerInKmerOccurenceTable(&word, kmerTable);
				} else {
					kmerOccurence = findKmerInKmerOccurenceTable(&antiWord, kmerTable);
					reversed = true;
				}
			}

			if (kmerOccurence) {
				if (!reversed) {
					node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence));
					coord = getKmerOccurencePosition(kmerOccurence);
				} else {
					// Opposite node ID, coordinate counted from the other end
					node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence));
					coord = getNodeLength(node) - getKmerOccurencePosition(kmerOccurence) - 1;
				}
			} else {
				node = NULL;
				// A miss after a hit ends the walk
				if (previousNode)
					break;
			}
		}

		// Increment positions
		if (annotCount < annotationCount
		    && uniqueIndex >= getPosition(annotation))
			annotIndex++;
		else
			uniqueIndex++;

		// Fill in graph
		if (node) {
#ifdef OPENMP
			lockNode(node);
#endif
			// Position in the sequence of the first nucleotide of the current word
			kmerIndex = readNucleotideIndex - wordLength;

			// Same node, next coordinate: extend what is already there
			if (previousNode == node
			    && previousCoord == coord - 1) {
				if (category / 2 >= CATEGORIES) {
					setPassageMarkerFinish(marker, kmerIndex + 1);
					setFinishOffset(marker, getNodeLength(node) - coord - 1);
				} else {
#ifndef SINGLE_COV_CAT
					incrementVirtualCoverage(node, category / 2, 1);
					incrementOriginalVirtualCoverage(node, category / 2, 1);
#else
					incrementVirtualCoverage(node, 1);
#endif
				}
#ifdef OPENMP
				unLockNode(node);
#endif
			} else {
				// New node or a jump inside the node: start a new marker / read start
				if (category / 2 >= CATEGORIES) {
					marker = newPassageMarker(seqID, kmerIndex, kmerIndex + 1, coord, getNodeLength (node) - coord - 1);
					transposePassageMarker(marker, node);
					connectPassageMarkers (previousMarker, marker, graph);
					previousMarker = marker;
				} else {
					if (readTracking) {
						// Only the first visit of a node records a read start
						if (!isNodeMemorized(node, nodePile)) {
							addReadStart(node, seqID, coord, graph, kmerIndex);
							memorizeNode(node, &nodePile);
						} else {
							blurLastShortReadMarker (node, graph);
						}
					}
#ifndef SINGLE_COV_CAT
					incrementVirtualCoverage(node, category / 2, 1);
					incrementOriginalVirtualCoverage(node, category / 2, 1);
#else
					incrementVirtualCoverage(node, 1);
#endif
				}
				// NOTE(review): on this path the lock taken by lockNode() above
				// is not visibly released with unLockNode(); presumably
				// lockTwoNodes()/unLockTwoNodes() account for it -- confirm.
#ifdef OPENMP
				lockTwoNodes(node, previousNode);
#endif
				createArc(previousNode, node, graph);
#ifdef OPENMP
				unLockTwoNodes(node, previousNode);
#endif
			}

			previousNode = node;
			previousCoord = coord;
		}
		index++;
	}

	// Release the list of memorized nodes built while tracking reads
	if (readTracking && category / 2 < CATEGORIES)
		unMemorizeNodes(&nodePile);
}
static void ghostThreadSequenceThroughGraph(TightString * tString, KmerOccurenceTable * kmerTable, Graph * graph, IDnum seqID, Category category, boolean readTracking, boolean double_strand, ReferenceMapping * referenceMappings, Coordinate referenceMappingCount, IDnum refCount, Annotation * annotations, IDnum annotationCount, boolean second_in_pair) { Kmer word; Kmer antiWord; Coordinate readNucleotideIndex; KmerOccurence *kmerOccurence; int wordLength = getWordLength(graph); Nucleotide nucleotide; IDnum refID; Coordinate refCoord; ReferenceMapping * refMap = NULL; Coordinate uniqueIndex = 0; Coordinate annotIndex = 0; IDnum annotCount = 0; boolean reversed; SmallNodeList * nodePile = NULL; Annotation * annotation = annotations; Node *node; Node *previousNode = NULL; // Neglect any read which will not be short paired if ((!readTracking && category % 2 == 0) || category / 2 >= CATEGORIES) return; // Neglect any string shorter than WORDLENGTH : if (getLength(tString) < wordLength) return; // Verify that all short reads are reasonnably short if (getLength(tString) > USHRT_MAX) { velvetLog("Short read of length %lli, longer than limit %i\n", (long long) getLength(tString), SHRT_MAX); velvetLog("You should better declare this sequence as long, because it genuinely is!\n"); exit(1); } clearKmer(&word); clearKmer(&antiWord); // Fill in the initial word : for (readNucleotideIndex = 0; readNucleotideIndex < wordLength - 1; readNucleotideIndex++) { nucleotide = getNucleotide(readNucleotideIndex, tString); pushNucleotide(&word, nucleotide); if (double_strand || second_in_pair) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Go through sequence while (readNucleotideIndex < getLength(tString)) { // Shift word: nucleotide = getNucleotide(readNucleotideIndex++, tString); pushNucleotide(&word, nucleotide); if (double_strand || second_in_pair) { #ifdef COLOR reversePushNucleotide(&antiWord, 
nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } // Update annotation if necessary if (annotCount < annotationCount && annotIndex == getAnnotationLength(annotation)) { annotation = getNextAnnotation(annotation); annotCount++; annotIndex = 0; } // Search for reference mapping if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation) && getAnnotSequenceID(annotation) <= refCount && getAnnotSequenceID(annotation) >= -refCount) { refID = getAnnotSequenceID(annotation); if (refID > 0) refCoord = getStart(annotation) + annotIndex; else refCoord = getStart(annotation) - annotIndex; refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount); // If success if (refMap) { if (refID > 0) node = getNodeInGraph(graph, refMap->nodeID); else node = getNodeInGraph(graph, -refMap->nodeID); } else { node = NULL; if (previousNode) break; } } // if not.. look in table else { reversed = false; if (double_strand) { if (compareKmers(&word, &antiWord) <= 0) { kmerOccurence = findKmerInKmerOccurenceTable(&word, kmerTable); } else { kmerOccurence = findKmerInKmerOccurenceTable(&antiWord, kmerTable); reversed = true; } } else { if (!second_in_pair) { kmerOccurence = findKmerInKmerOccurenceTable(&word, kmerTable); } else { kmerOccurence = findKmerInKmerOccurenceTable(&antiWord, kmerTable); reversed = true; } } if (kmerOccurence) { if (!reversed) node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence)); else node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence)); } else { node = NULL; if (previousNode) break; } } if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation)) annotIndex++; else uniqueIndex++; previousNode = node; // Fill in graph if (node && !isNodeMemorized(node, nodePile)) { #ifdef OPENMP lockNode(node); #endif incrementReadStartCount(node, graph); #ifdef OPENMP unLockNode(node); #endif memorizeNode(node, &nodePile); } } unMemorizeNodes(&nodePile); }
/*
 * Walks a sequence word by word and records its path in the graph.
 *
 * The first wordLength - 1 nucleotides are pushed into `word` (and into
 * `antiWord` when double_strand is set).  Then, for every further
 * nucleotide, the word is shifted and looked up in the sorted k-mer
 * occurrence table.  When a node is found, either a passage marker is
 * created/extended (category / 2 >= CATEGORIES) or virtual coverage is
 * incremented, and an arc is created from the previous node whenever the
 * walk does not simply advance by one position inside the same node.
 *
 * The walk stops at the first failed lookup that follows a successful one.
 * Sequences shorter than wordLength are ignored.
 */
static void threadSequenceThroughGraph(TightString * tString,
				       KmerOccurenceTable * kmerOccurences,
				       Graph * graph,
				       IDnum seqID, Category category,
				       boolean readTracking,
				       boolean double_strand)
{
	Kmer word;
	Kmer antiWord;
	Coordinate readNucleotideIndex;
	Coordinate kmerIndex;
	KmerOccurence *kmerOccurence;
	int wordLength = getWordLength(graph);

	PassageMarker *marker = NULL;
	PassageMarker *previousMarker = NULL;
	Node *node;
	Node *previousNode = NULL;
	Coordinate coord;
	Coordinate previousCoord = 0;
	Nucleotide nucleotide;

	clearKmer(&word);
	clearKmer(&antiWord);

	// Neglect any string shorter than WORDLENGTH :
	if (getLength(tString) < wordLength)
		return;

	// Fill in the initial word :
	for (readNucleotideIndex = 0;
	     readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
		nucleotide = getNucleotide(readNucleotideIndex, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand) {
			// 3 - nucleotide: presumably the complementary base -- confirm
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}
	}

	// Go through sequence
	while (readNucleotideIndex < getLength(tString)) {
		// Shift the word by one nucleotide
		nucleotide = getNucleotide(readNucleotideIndex++, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand) {
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}

		// Search in table:
		// the word itself when single-stranded or when it compares lower
		// than (or equal to) antiWord, otherwise antiWord
		if ((!double_strand
		     || compareKmers(&word, &antiWord) <= 0)
		    && (kmerOccurence =
			findKmerOccurenceInSortedTable(&word, kmerOccurences))) {
			node = getNodeInGraph(graph, kmerOccurence->nodeID);
			coord = kmerOccurence->position;
		} else if ((double_strand
			    && compareKmers(&word, &antiWord) > 0)
			   && (kmerOccurence =
			       findKmerOccurenceInSortedTable(&antiWord, kmerOccurences))) {
			// Opposite node ID, coordinate counted from the other end
			node = getNodeInGraph(graph, -kmerOccurence->nodeID);
			coord = getNodeLength(node) - kmerOccurence->position - 1;
		} else {
			node = NULL;
			// A miss after a hit ends the walk
			if (previousNode) {
				break;
			}
		}

		// Fill in graph
		if (node) {
			// Position in the sequence of the first nucleotide of the current word
			kmerIndex = readNucleotideIndex - wordLength;

			// Same node, next coordinate: extend what is already there
			if (previousNode == node
			    && previousCoord == coord - 1) {
				if (category / 2 >= CATEGORIES) {
					setPassageMarkerFinish(marker, kmerIndex + 1);
					setFinishOffset(marker, getNodeLength(node) - coord - 1);
				} else {
					incrementVirtualCoverage(node, category / 2, 1);
					incrementOriginalVirtualCoverage (node, category / 2, 1);
				}
			} else {
				// New node or a jump inside the node: start a new marker / read start
				if (category / 2 >= CATEGORIES) {
					marker = newPassageMarker(seqID, kmerIndex, kmerIndex + 1, coord, getNodeLength (node) - coord - 1);
					transposePassageMarker(marker, node);
					connectPassageMarkers (previousMarker, marker, graph);
					previousMarker = marker;
				} else {
					if (readTracking) {
						// The node status flags nodes this sequence already visited
						if (!getNodeStatus(node)) {
							addReadStart(node, seqID, coord, graph, kmerIndex);
							setSingleNodeStatus (node, true);
							memorizeNode(node);
						} else {
							blurLastShortReadMarker (node, graph);
						}
					}

					incrementVirtualCoverage(node, category / 2, 1);
					incrementOriginalVirtualCoverage (node, category / 2, 1);
				}

				createArc(previousNode, node, graph);
			}

			previousNode = node;
			previousCoord = coord;
		}
	}

	// Counterpart of the memorizeNode() calls made above
	unlockMemorizedNodes();
}
static void ghostThreadSequenceThroughGraph(TightString * tString, KmerOccurenceTable * kmerOccurences, Graph * graph, IDnum seqID, Category category, boolean readTracking, boolean double_strand) { Kmer word; Kmer antiWord; Coordinate readNucleotideIndex; KmerOccurence *kmerOccurence; int wordLength = getWordLength(graph); Nucleotide nucleotide; Node *node; Node *previousNode = NULL; clearKmer(&word); clearKmer(&antiWord); // Neglect any read which will not be short paired if ((!readTracking && category % 2 == 0) || category / 2 >= CATEGORIES) return; // Neglect any string shorter than WORDLENGTH : if (getLength(tString) < wordLength) return; // Verify that all short reads are reasonnably short if (getLength(tString) > USHRT_MAX) { printf("Short read of length %lli, longer than limit %i\n", (long long) getLength(tString), SHRT_MAX); puts("You should better declare this sequence as long, because it genuinely is!"); exit(1); } // Allocate memory for the read pairs if (!readStartsAreActivated(graph)) activateReadStarts(graph); // Fill in the initial word : for (readNucleotideIndex = 0; readNucleotideIndex < wordLength - 1; readNucleotideIndex++) { nucleotide = getNucleotide(readNucleotideIndex, tString); pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Go through sequence while (readNucleotideIndex < getLength(tString)) { // Shift word: nucleotide = getNucleotide(readNucleotideIndex++, tString); pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } // Search in table if ((!double_strand || compareKmers(&word, &antiWord) <= 0) && (kmerOccurence = findKmerOccurenceInSortedTable(&word, kmerOccurences))) { node = getNodeInGraph(graph, kmerOccurence->nodeID); } else if ((double_strand && compareKmers(&word, 
&antiWord) > 0) && (kmerOccurence = findKmerOccurenceInSortedTable(&antiWord, kmerOccurences))) { node = getNodeInGraph(graph, -kmerOccurence->nodeID); } else { node = NULL; if (previousNode) break; } previousNode = node; // Fill in graph if (node && !getNodeStatus(node)) { incrementReadStartCount(node, graph); setSingleNodeStatus(node, true); memorizeNode(node); } } unlockMemorizedNodes(); }