int MRC::writePixel(void *buf, int nimage, int nline, int npixel) { size_t ImSize=getImSize(); size_t LineLength=getNy()*getWordLength(); if(npixel>=LineLength) return 0; size_t offset=1024+getSymdatasize()+nimage*ImSize+nline*LineLength+npixel*getWordLength(); if(fseek(m_fp, offset, SEEK_SET)!=0) return 0; return fwrite(buf, 1, getWordLength(), m_fp); }
/*
 * Reads words from a file through two independent file pointers:
 *   - fp2 runs ahead and reports the length of the next word, so a buffer of
 *     the right size can be allocated;
 *   - fp then reads that word into the freshly allocated buffer.
 *
 * Every word is entered into bst unless it is found in the stop tree. When
 * stop is NULL (i.e. the STOP_WORDS file itself is being loaded) every word
 * is entered. Reading ends when getWordLength() returns 0.
 */
void read(struct BST* bst, struct BST* stop, FILE *fp, FILE *fp2){
    int nextLength = getWordLength(fp2);

    while (nextLength != 0) {
        char *current = getWord(fp, nextLength);

        /* Look ahead for the following word before deciding about this one. */
        nextLength = getWordLength(fp2);

        if (stop != NULL && search(stop, current) != NULL) {
            /* The word is listed in STOP_WORDS: discard it. */
            free(current);
            continue;
        }

        enter(bst, current);
    }
}
/** Write a string to the display with word wrapping, starting at the
  * cursor's current position. The display is rendered once the whole
  * string has been placed in the text buffer.
  * \param str The null-terminated string to write.
  */
void writeStringToDisplayWordWrap(const char *str)
{
	while (*str != '\0')
	{
		const uint32_t word_length = getWordLength(str);
		const char *word_end = str + word_length;

		// Move to the next line if the word does not fit on this one.
		if ((cursor_pos + word_length) > CHARACTERS_PER_LINE)
		{
			nextLine();
		}

		// Emit the word itself.
		while (str != word_end)
		{
			writeCharacterToTextBuffer(*str);
			str++;
		}

		// Emit the spaces that follow, except at the start of a line,
		// where a line break already stands in for any run of spaces.
		for (; *str == ' '; str++)
		{
			if (cursor_pos != 0)
			{
				writeCharacterToTextBuffer(' ');
			}
		}
	}

	renderDisplay();
}
/*
 * Initialises the file-level state used by the local correction code:
 * stores the graph and its word length, then allocates and resets the
 * working arrays (times, previous, dheapNodes), the heap, the two sequence
 * buffers and the (MAXREADLENGTH + 1) x (MAXREADLENGTH + 1) matrix Fmatrix.
 */
void prepareGraphForLocalCorrections(Graph * argGraph)
{
	IDnum nodes = nodeCount(argGraph);
	IDnum slot;
	IDnum row;

	// File-level parameters
	graph = argGraph;
	WORDLENGTH = getWordLength(graph);

	velvetLog("Preparing to correct graph with cutoff %f\n",
		  MAXDIVERGENCE);

	// Working arrays, 2 * nodes + 1 entries each
	times = mallocOrExit(2 * nodes + 1, Time);
	previous = mallocOrExit(2 * nodes + 1, Node *);
	dheapNodes = mallocOrExit(2 * nodes + 1, DFibHeapNode *);

	dheap = newDFibHeap();

	fastSequence = newTightString(MAXREADLENGTH);
	slowSequence = newTightString(MAXREADLENGTH);

	// Reset every entry of the working arrays
	slot = 0;
	while (slot < 2 * nodes + 1) {
		times[slot] = -1;
		dheapNodes[slot] = NULL;
		previous[slot] = NULL;
		slot++;
	}

	// Zero-initialised square matrix
	Fmatrix = callocOrExit(MAXREADLENGTH + 1, double *);
	row = 0;
	while (row < MAXREADLENGTH + 1) {
		Fmatrix[row] = callocOrExit(MAXREADLENGTH + 1, double);
		row++;
	}
}
int MRC::writeLine(void *buf, int nimage, int nline) { size_t ImSize=getImSize(); size_t LineLength=getNy()*getWordLength(); size_t offset=1024+getSymdatasize()+nimage*ImSize+nline*LineLength; if(fseek(m_fp, offset, SEEK_SET)!=0) return 0; return fwrite(buf, 1, LineLength, m_fp); }
// For every read flagged in interestingReads, looks for a node on which the
// read and its mate (readPairs[read]) both occur in opposite orientations
// (nodeID == -nodeID of the mate) with positive positions. The first such hit
// yields one observation, stored in coOccurences[libID][...], where
// libID = cats[read] / 2.
//
// The read's occurrences are scanned forwards and the mate's backwards,
// relying on occurrences being ordered by increasing node ID.
static void measureCoOccurences(Coordinate ** coOccurences, boolean * interestingReads, ReadOccurence ** readNodes, IDnum * readNodeCounts, IDnum * readPairs, Category * cats)
{
	IDnum observationCounts[CATEGORIES + 1];
	IDnum readIndex;
	Category libID;

	for (libID = 0; libID < CATEGORIES + 1; libID++)
		observationCounts[libID] = 0;

	for (readIndex = 0; readIndex < sequenceCount(graph); readIndex++) {
		ReadOccurence *ownHits;
		ReadOccurence *mateHits;
		IDnum mateIndex;
		IDnum ownCount;
		IDnum ownCursor;
		IDnum mateCursor;

		// Skip dodgy, unpaired, already counted or user-excluded reads
		if (!interestingReads[readIndex])
			continue;

		libID = cats[readIndex] / 2;
		mateIndex = readPairs[readIndex];

		ownHits = readNodes[readIndex + 1];
		ownCount = readNodeCounts[readIndex + 1];
		mateHits = readNodes[mateIndex + 1];

		ownCursor = 0;
		mateCursor = readNodeCounts[mateIndex + 1] - 1;

		while (ownCursor < ownCount && mateCursor >= 0) {
			ReadOccurence *own = &ownHits[ownCursor];
			ReadOccurence *mate = &mateHits[mateCursor];

			// Read is behind the mate: advance the read
			if (own->nodeID < -mate->nodeID) {
				ownCursor++;
				continue;
			}

			// Mate is behind the read: step the mate back
			if (own->nodeID > -mate->nodeID) {
				mateCursor--;
				continue;
			}

			// Both sit on the same node
			if (own->position > 0 && mate->position > 0) {
				coOccurences[libID][observationCounts[libID]] =
				    getNodeLength(getNodeInGraph(graph, own->nodeID))
				    + getWordLength(graph) - 1
				    - (own->position - own->offset)
				    - (mate->position - mate->offset);
				observationCounts[libID]++;
				break;
			}

			// Unusable hit: move on with both lists
			ownCursor++;
			mateCursor--;
		}
	}
}
/*
 * Sums getWordLength(i) for every i from 1 to 1000 (inclusive) and prints
 * the total followed by a newline.
 */
int main(void)
{
	long length = 0;

	for (int i = 1; i <= 1000; ++i) {
		length += getWordLength(i);
	}

	/* length is a long, so it has to be printed with %ld; passing it to %d
	 * is a format/argument mismatch (undefined behaviour). */
	printf("%ld\n", length);

	return 0;
}
// Expands the full sequence of the node (from 0 to getNodeLength(node)) and
// reports whether it contains either of the hexamers "AATAAA" or "ATTAAA"
// anywhere in that sequence.
static boolean finishesWithPAS(Node * node)
{
	boolean found = false;
	char *sequence = expandNodeFragment(node, 0, getNodeLength(node),
					    getWordLength(graph));

	if (strstr(sequence, "AATAAA") != NULL
	    || strstr(sequence, "ATTAAA") != NULL)
		found = true;

	free(sequence);
	return found;
}
// Decides whether startingNode is connected, through passage markers, to
// exactly one other unique node, and whether that connection is reciprocal.
//
// Step 1: every marker on startingNode whose finish offset is at most
//         2 * getWordLength(graph) is followed along its sequence up to the
//         first unique node; the nodes reached are tallied in a temporary
//         RBConnection list (one entry per node, with a multiplicity).
// Step 2: entries whose multiplicity reaches MULTIPLICITY_CUTOFF are
//         candidate destinations. More than one distinct candidate marks
//         startingNode as non-unique and fails.
// Step 3: the same tally is run from the twin of the destination; any
//         sufficiently supported node other than the twin of startingNode
//         marks destination as non-unique and fails.
//
// Side effects visible here: increments dbgCounter, multCounter and
// nullCounter, stores the marker leading to the destination in the
// file-level variable path, and may clear the uniqueness flag of
// startingNode or destination. The node status flag is used as a temporary
// "already in list" mark and is reset while each list is torn down.
//
// Returns true only when a single, reciprocal destination was found.
static boolean uniqueNodesConnect(Node * startingNode)
{
	Node *destination = NULL;
	PassageMarkerI startMarker, currentMarker;
	RBConnection *newList;
	RBConnection *list = NULL;
	boolean multipleHits = false;

	// Nothing to follow without arcs or markers
	if (arcCount(startingNode) == 0)
		return false;

	if (getMarker(startingNode) == NULL_IDX)
		return false;

	dbgCounter++;

	// Checking for multiple destinations
	for (startMarker = getMarker(startingNode); startMarker != NULL_IDX;
	     startMarker = getNextInNode(startMarker)) {
		// Only markers finishing close to the end of the node are followed
		if (getFinishOffset(startMarker) >
		    2 * getWordLength(graph))
			continue;

		// Walk down the sequence until the first unique node; every
		// branch below other than the "not unique" one ends the walk.
		for (currentMarker = getNextInSequence(startMarker);
		     currentMarker != NULL_IDX;
		     currentMarker = getNextInSequence(currentMarker)) {
			if (!getUniqueness(getNode(currentMarker))) {
				continue;
			} else if (getNodeStatus(getNode(currentMarker))) {
				// Node already tallied: bump its multiplicity
				if (getStartOffset(currentMarker) >
				    2 * getWordLength(graph))
					break;
				for (newList = list; newList != NULL;
				     newList = newList->next) {
					if (newList->node ==
					    getNode(currentMarker)) {
						newList->multiplicity++;
						break;
					}
				}
				// A flagged node must be in the list; anything
				// else is an internal inconsistency.
				if (newList == NULL)
					abort();
				break;
			} else {
				// First time this node is reached: flag it and
				// push a new entry at the head of the list.
				if (getStartOffset(currentMarker) >
				    2 * getWordLength(graph))
					break;
				setSingleNodeStatus(getNode(currentMarker),
						    true);
				newList = allocateRBConnection();
				newList->node = getNode(currentMarker);
				newList->multiplicity = 1;
				newList->marker = startMarker;
				newList->next = list;
				list = newList;
				break;
			}
		}
	}

	// Consume the list: reset the status flags, pick the destination and
	// detect competing destinations.
	while (list != NULL) {
		newList = list;
		list = newList->next;
		setSingleNodeStatus(newList->node, false);
		if (newList->multiplicity >= MULTIPLICITY_CUTOFF) {
			if (destination == NULL) {
				destination = newList->node;
				path = newList->marker;
			} else if (destination != newList->node)
				multipleHits = true;
		}
		deallocateRBConnection(newList);
	}

	if (multipleHits) {
		multCounter++;
		setUniqueness(startingNode, false);
		return false;
	}

	// No destination, or a destination that is the node itself / its twin
	if (destination == NULL || destination == startingNode
	    || destination == getTwinNode(startingNode)) {
		nullCounter++;
		return false;
	}
	// Check for reciprocity
	for (startMarker = getMarker(getTwinNode(destination));
	     startMarker != NULL_IDX;
	     startMarker = getNextInNode(startMarker)) {
		if (getFinishOffset(startMarker) >
		    2 * getWordLength(graph))
			continue;

		for (currentMarker = getNextInSequence(startMarker);
		     currentMarker != NULL_IDX;
		     currentMarker = getNextInSequence(currentMarker)) {
			if (!getUniqueness(getNode(currentMarker))) {
				continue;
			} else if (getNodeStatus(getNode(currentMarker))) {
				if (getStartOffset(currentMarker) >
				    2 * getWordLength(graph))
					break;
				for (newList = list; newList != NULL;
				     newList = newList->next) {
					if (newList->node ==
					    getNode(currentMarker)) {
						newList->multiplicity++;
						break;
					}
				}
				if (newList == NULL)
					abort();
				break;
			} else {
				if (getStartOffset(currentMarker) >
				    2 * getWordLength(graph))
					break;
				setSingleNodeStatus(getNode(currentMarker),
						    true);
				newList = allocateRBConnection();
				newList->node = getNode(currentMarker);
				newList->multiplicity = 1;
				// Unlike the first pass, the marker field is not
				// filled in here; it is not read below either.
				newList->next = list;
				list = newList;
				break;
			}
		}
	}

	// Any well supported node other than the twin of startingNode breaks
	// reciprocity.
	while (list != NULL) {
		newList = list;
		list = newList->next;
		setSingleNodeStatus(newList->node, false);
		if (newList->multiplicity >= MULTIPLICITY_CUTOFF
		    && newList->node != getTwinNode(startingNode))
			multipleHits = true;
		deallocateRBConnection(newList);
	}

	if (multipleHits) {
		multCounter++;
		setUniqueness(destination, false);
		return false;
	}
	// Aligning long reads to each other:
	// TODO

	// Merge pairwise alignments and produce consensus
	// TODO

	return true;
}
// Fills column `pos` of the edit-distance trellis for `node` and, recursively,
// for every child of `node`.
//
// For each trie node the function writes
//   node.costTrellis[pos] - cost of aligning input[0..pos] to the path ending
//                           at this node,
//   node.costPre[pos]     - the move that produced that cost
//                           (DUMMY_DIAG, DIAG, INS or DEL).
// When the node ends a word (its next[0] child carries END_OF_WORD) it also
// competes for the best word transition at this column, updating
// root.costTrellis[pos], transition[pos] and previousLength[pos].
//
// node  - trie node to process; a node whose character is '*' is treated as
//         the start node and gets no cost computation of its own.
// input - string being matched; only input[pos] is read here.
// pos   - column index; column 0 is initialised from INIT_STATE, later columns
//         from column pos - 1.
//
// NOTE(review): root, transition and previousLength are not declared in this
// function; presumably file-level state shared across columns - confirm.
void buildTrellisCol(trieNode & node, string & input, int pos)
{
    //First column
    if (pos == 0) {
        //Start node doesn't have a previous node
        if (node.c != '*') {
            //The node is in the first row (directly below the start node)
            if (node.pre->c == '*') {
                if (node.c == input[pos])
                    node.costTrellis[pos] = INIT_STATE;
                else {
                    node.costTrellis[pos] = INIT_STATE + 1;
                }
                node.costPre[pos] = DUMMY_DIAG;
            } else {
                // Deeper rows of the first column derive from the parent
                // in the same column.
                if (node.c == input[pos])
                    node.costTrellis[pos] = node.pre->costTrellis[pos];
                else {
                    node.costTrellis[pos] = node.pre->costTrellis[pos] + 1;
                }
                node.costPre[pos] = DEL;
            }
        }
        //Save the last node for transition calculation
        if (node.next[0] != NULL && node.next[0]->c == END_OF_WORD) {
            int newCost = node.costTrellis[pos] + TRANS_PENALTY;
            //Replace the shortest path with lower cost
            if (newCost < root.costTrellis[pos]) {
                root.costTrellis[pos] = newCost;
                transition[pos] = &node;
                previousLength[pos] = getWordLength(node);
            //Cost is the same but word length is different: keep the longer word
            } else if (newCost == root.costTrellis[pos]) {
                int wordLength = getWordLength(node);
                if (previousLength[pos] < wordLength) {
                    transition[pos] = &node;
                    previousLength[pos] = getWordLength(node);
                }
            }
        }
        //Not the last node, visit further
        if (node.childCnt != 0) {
            int cnt = 0;
            // The loop stops once childCnt non-null children were visited;
            // i itself is not bounded here.
            // NOTE(review): relies on childCnt matching the number of
            // non-null entries in next[1..] - confirm.
            for (int i = 1; cnt < node.childCnt; i++) {
                if (node.next[i] != NULL) {
                    buildTrellisCol(*(node.next[i]), input, pos);
                    cnt++;
                }
            }
        }
    } else {
        // Every later column starts from "unreachable"
        node.costTrellis[pos] = INF;
        //Start node doesn't have a previous node
        if (node.c != '*') {
            //The first row
            if (node.pre->c == '*') {
                // Diagonal move out of the best word end of the previous column
                if (node.c == input[pos]) {
                    if (node.costTrellis[pos] > root.costTrellis[pos - 1])
                        node.costTrellis[pos] = root.costTrellis[pos - 1];
                } else if (node.costTrellis[pos] > root.costTrellis[pos - 1] + 1)
                    node.costTrellis[pos] = root.costTrellis[pos - 1] + 1;
                node.costPre[pos] = DUMMY_DIAG;
                // Insertion from the same node in the previous column; in
                // this branch it is only considered when the characters match.
                if (node.c == input[pos]) {
                    if (node.costTrellis[pos] > node.costTrellis[pos - 1]) {
                        node.costTrellis[pos] = node.costTrellis[pos - 1];
                        node.costPre[pos] = INS;
                    // NOTE(review): this condition implies the one above,
                    // so the branch looks unreachable, and it assigns the
                    // cost without the + 1 used in the non-first-row case
                    // below. Confirm the intended insertion rule.
                    } else if (node.costTrellis[pos] > node.costTrellis[pos - 1] + 1) {
                        node.costTrellis[pos] = node.costTrellis[pos - 1];
                        node.costPre[pos] = INS;
                    }
                }
            //Not the first row
            } else {
                // Diagonal move from the parent in the previous column
                if (node.c == input[pos]) {
                    if (node.costTrellis[pos] > node.pre->costTrellis[pos - 1])
                        node.costTrellis[pos] = node.pre->costTrellis[pos - 1];
                } else if (node.costTrellis[pos] > node.pre->costTrellis[pos - 1] + 1)
                    node.costTrellis[pos] = node.pre->costTrellis[pos - 1] + 1;
                node.costPre[pos] = DIAG;
                // Insertion: same node, previous column
                if (node.costTrellis[pos] > node.costTrellis[pos - 1] + 1) {
                    node.costTrellis[pos] = node.costTrellis[pos - 1] + 1;
                    node.costPre[pos] = INS;
                }
                // Deletion: parent node, same column
                if (node.costTrellis[pos] > node.pre->costTrellis[pos] + 1) {
                    node.costTrellis[pos] = node.pre->costTrellis[pos] + 1;
                    node.costPre[pos] = DEL;
                }
            }
        }
        //Save the last node for transition calculation
        if (node.next[0] != NULL && node.next[0]->c == END_OF_WORD) {
            int newCost = node.costTrellis[pos] + TRANS_PENALTY;
            //Replace the shortest path with lower cost
            if (newCost < root.costTrellis[pos]) {
                root.costTrellis[pos] = newCost;
                transition[pos] = &node;
                previousLength[pos] = getWordLength(node);
            } else if (newCost == root.costTrellis[pos]) {
                // Tie on cost: break it by insertion count, then word
                // length, then by the kind of the last move.
                int wordLength = getWordLength(node);
                int oldInsNum = getInsNum(transition, transition[pos], pos);
                int newInsNum = getInsNum(transition, &node, pos);
                //Give priority to the path with fewer insertions
                if (oldInsNum > newInsNum) {
                    transition[pos] = &node;
                    previousLength[pos] = getWordLength(node);
                //Pick the longest word if the insertion count is the same
                } else if (oldInsNum == newInsNum && previousLength[pos] < wordLength) {
                    transition[pos] = &node;
                    previousLength[pos] = getWordLength(node);
                }
                //Give priority to diag transition
                // NOTE(review): this branch is also reached when the new
                // path has MORE insertions than the old one (oldInsNum <
                // newInsNum) - confirm that is intended.
                else if (previousLength[pos] == wordLength) {
                    if (transition[pos]->costPre[pos] == INS || transition[pos]->costPre[pos] == DEL)
                        if (node.costPre[pos] == DIAG || node.costPre[pos] == DUMMY_DIAG) {
                            transition[pos] = &node;
                            previousLength[pos] = getWordLength(node);
                        }
                }
            }
        }
        //Not the last node, visit further
        if (node.childCnt != 0) {
            //Build the column for the subtrees
            int cnt = 0;
            for (int i = 1; cnt < node.childCnt; i++) {
                if (node.next[i] != NULL) {
                    buildTrellisCol(*(node.next[i]), input, pos);
                    cnt++;
                }
            }
        }
    }
}
// Allocates an event holding the four given nodes and pushes it at the head
// of the locus' event list. The caller sets the event type on the result.
static Event *pushLocusEvent(Locus * locus, Node * first, Node * second,
			     Node * third, Node * fourth)
{
	Event *event = allocateEvent();

	event->nodes[0] = first;
	event->nodes[1] = second;
	event->nodes[2] = third;
	event->nodes[3] = fourth;
	event->next = locus->event;
	locus->event = event;

	return event;
}

// Inspects a node with exactly two active outgoing connections and records
// on the locus the alternative splicing event(s) it gives rise to.
//
// The two branch nodes are ordered so that the first is the longer one.
// Nodes shorter than 2 * getWordLength(graph) - 1 count as "short":
//   - both short: nothing is recorded;
//   - only the shorter branch is short and both branches lead to the same
//     single destination: one of intron retention, alternative 5' splice
//     site, alternative 3' splice site or skipped exon is recorded, chosen
//     from the donor / acceptor site tests;
//   - both long: alternative polyA and/or mutually exclusive exons may be
//     recorded.
static void extractNodeASEvents(Node * node, Locus * locus)
{
	Node *longBranch, *shortBranch, *rejoin;
	int shortLimit;

	// Linear nodes and nodes with more than 2 outgoing arcs are ignored
	if (countActiveConnections(node) != 2)
		return;

	// Follow the two active arcs
	longBranch = getTwinNode(getConnectionDestination
				 (getActiveConnection(node)));
	shortBranch = getTwinNode(getConnectionDestination
				  (getSecondActiveConnection(node)));

	// Make sure longBranch really is the longer of the two
	if (getNodeLength(longBranch) < getNodeLength(shortBranch)) {
		Node *swap = longBranch;
		longBranch = shortBranch;
		shortBranch = swap;
	}

	shortLimit = 2 * getWordLength(graph) - 1;

	// The longer branch is short, hence both are: ignore
	if (getNodeLength(longBranch) < shortLimit)
		return;

	if (getNodeLength(shortBranch) >= shortLimit) {
		// Both branches are long

		// Alt. poly A:
		if (finishesWithPAS(node) && finishesWithPAS(longBranch))
			pushLocusEvent(locus, node, longBranch, shortBranch,
				       NULL)->type = alternative_polyA;

		// Mutually exclusive exons
		if (countActiveConnections(longBranch) == 1
		    && countActiveConnections(shortBranch) == 1
		    && getConnectionDestination(getActiveConnection(longBranch)) ==
		    getConnectionDestination(getActiveConnection(shortBranch)))
			pushLocusEvent(locus, node, longBranch, shortBranch,
				       getTwinNode(getConnectionDestination
						   (getActiveConnection(longBranch))))->type =
			    mutually_exclusive_exons;

		return;
	}

	// One long and one short branch: they must rejoin at a common node
	if (countActiveConnections(longBranch) != 1
	    || countActiveConnections(shortBranch) != 1
	    || getConnectionDestination(getActiveConnection(longBranch)) !=
	    getConnectionDestination(getActiveConnection(shortBranch)))
		return;

	rejoin = getTwinNode(getConnectionDestination
			     (getActiveConnection(longBranch)));

	if (donorSiteAtJunction(node, longBranch)
	    && acceptorSiteAtJunction(longBranch, rejoin)) {
		// Intron retention
		pushLocusEvent(locus, node, longBranch, shortBranch,
			       rejoin)->type = intron_retention;
	} else if (donorSiteAtJunction(node, longBranch)) {
		// Alternative 5' splice site
		pushLocusEvent(locus, node, longBranch, shortBranch,
			       rejoin)->type = alternative_5prime_splice;
	} else if (acceptorSiteAtJunction(longBranch, rejoin)) {
		// Alternative 3' splice site
		pushLocusEvent(locus, node, longBranch, shortBranch,
			       rejoin)->type = alternative_3prime_splice;
	} else {
		// Skipped exon
		pushLocusEvent(locus, node, longBranch, shortBranch,
			       rejoin)->type = skipped_exon;
	}
}
// Threads one sequence through the graph.
//
// A window of wordLength = getWordLength(graph) nucleotides is slid along
// tString. For every window position the corresponding (node, coord) pair is
// looked up in one of three ways:
//   1. category == REFERENCE: through referenceMappings, keyed by seqID and
//      the running window index;
//   2. an annotation is active at the current position and refers to a
//      sequence ID within [-refCount, refCount]: through referenceMappings,
//      keyed by the annotated sequence ID and coordinate;
//   3. otherwise: through the k-mer occurrence table.
// When a node is found the graph is updated: for categories with
// category / 2 >= CATEGORIES passage markers are created or extended, for
// the other categories virtual coverage is incremented (and, with
// readTracking, read starts are recorded once per node). An arc is created
// between consecutive, distinct hits.
//
// The walk stops at the first window that cannot be placed after at least
// one window has been placed.
//
// double_strand   - look up whichever of the word and its reverse complement
//                   compares lower (compareKmers).
// second_in_pair  - when double_strand is off, look up the reverse
//                   complement instead of the word itself.
static void threadSequenceThroughGraph(TightString * tString,
				       KmerOccurenceTable * kmerTable,
				       Graph * graph,
				       IDnum seqID, Category category,
				       boolean readTracking,
				       boolean double_strand,
				       ReferenceMapping * referenceMappings,
				       Coordinate referenceMappingCount,
				       IDnum refCount,
				       Annotation * annotations,
				       IDnum annotationCount,
				       boolean second_in_pair)
{
	Kmer word;
	Kmer antiWord;
	Coordinate readNucleotideIndex;
	Coordinate kmerIndex;
	KmerOccurence *kmerOccurence;
	int wordLength = getWordLength(graph);

	PassageMarkerI marker = NULL_IDX;
	PassageMarkerI previousMarker = NULL_IDX;
	Node *node = NULL;
	Node *previousNode = NULL;
	Coordinate coord = 0;
	Coordinate previousCoord = 0;
	Nucleotide nucleotide;
	boolean reversed;

	IDnum refID;
	Coordinate refCoord = 0;
	ReferenceMapping * refMap;
	Annotation * annotation = annotations;
	Coordinate index = 0;
	Coordinate uniqueIndex = 0;
	Coordinate annotIndex = 0;
	IDnum annotCount = 0;
	SmallNodeList * nodePile = NULL;

	// Neglect any string shorter than WORDLENGTH :
	if (getLength(tString) < wordLength)
		return;

	clearKmer(&word);
	clearKmer(&antiWord);

	// Fill in the initial word :
	// (only wordLength - 1 nucleotides; the main loop pushes the last one)
	for (readNucleotideIndex = 0;
	     readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
		nucleotide = getNucleotide(readNucleotideIndex, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand || second_in_pair) {
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}
	}

	// Go through sequence
	while (readNucleotideIndex < getLength(tString)) {
		// Shift the window by one nucleotide
		nucleotide = getNucleotide(readNucleotideIndex++, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand || second_in_pair) {
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}

		// Update annotation if necessary
		// (the current annotation has been consumed entirely)
		if (annotCount < annotationCount
		    && annotIndex == getAnnotationLength(annotation)) {
			annotation = getNextAnnotation(annotation);
			annotCount++;
			annotIndex = 0;
		}

		// Search for reference mapping
		if (category == REFERENCE) {
			if (referenceMappings)
				refMap = findReferenceMapping(seqID, index, referenceMappings, referenceMappingCount);
			else
				refMap = NULL;

			if (refMap) {
				node = getNodeInGraph(graph, refMap->nodeID);
				if (refMap->nodeID > 0) {
					coord = refMap->nodeStart + (index - refMap->referenceStart);
				} else {
					coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (index - refMap->referenceStart);
				}
			} else {
				// Unmapped window: give up once something was placed
				node = NULL;
				if (previousNode)
					break;
			}
		}
		// Search for reference-based mapping
		else if (annotCount < annotationCount
			 && uniqueIndex >= getPosition(annotation)
			 && getAnnotSequenceID(annotation) <= refCount
			 && getAnnotSequenceID(annotation) >= -refCount) {
			refID = getAnnotSequenceID(annotation);
			// A negative sequence ID is walked with decreasing coordinates
			if (refID > 0)
				refCoord = getStart(annotation) + annotIndex;
			else
				refCoord = getStart(annotation) - annotIndex;

			refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount);
			// If success
			if (refMap) {
				if (refID > 0) {
					node = getNodeInGraph(graph, refMap->nodeID);
					if (refMap->nodeID > 0) {
						coord = refMap->nodeStart + (refCoord - refMap->referenceStart);
					} else {
						coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (refCoord - refMap->referenceStart);
					}
				} else {
					// Negative sequence ID: use the node with the
					// opposite sign and mirror the coordinate
					node = getNodeInGraph(graph, -refMap->nodeID);
					if (refMap->nodeID > 0) {
						coord = getNodeLength(node) - refMap->nodeStart - (refCoord - refMap->referenceStart) - 1;
					} else {
						coord = refMap->nodeStart + refMap->length - (refCoord - refMap->referenceStart) - 1;
					}
				}
			} else {
				node = NULL;
				if (previousNode)
					break;
			}
		}
		// Search in table
		else {
			reversed = false;
			if (double_strand) {
				if (compareKmers(&word, &antiWord) <= 0) {
					kmerOccurence =
					    findKmerInKmerOccurenceTable(&word,
									 kmerTable);
				} else {
					kmerOccurence =
					    findKmerInKmerOccurenceTable(&antiWord,
									 kmerTable);
					reversed = true;
				}
			} else {
				if (!second_in_pair) {
					kmerOccurence =
					    findKmerInKmerOccurenceTable(&word,
									 kmerTable);
				} else {
					kmerOccurence =
					    findKmerInKmerOccurenceTable(&antiWord,
									 kmerTable);
					reversed = true;
				}
			}

			if (kmerOccurence) {
				if (!reversed) {
					node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence));
					coord = getKmerOccurencePosition(kmerOccurence);
				} else {
					// Hit through the reverse complement: negated
					// node ID and mirrored position
					node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence));
					coord = getNodeLength(node) - getKmerOccurencePosition(kmerOccurence) - 1;
				}
			} else {
				node = NULL;
				if (previousNode)
					break;
			}
		}

		// Increment positions
		if (annotCount < annotationCount
		    && uniqueIndex >= getPosition(annotation))
			annotIndex++;
		else
			uniqueIndex++;

		// Fill in graph
		if (node) {
#ifdef OPENMP
			lockNode(node);
#endif
			// Start position of the current window in the sequence
			kmerIndex = readNucleotideIndex - wordLength;

			// Same node, next coordinate: extend what is already there
			if (previousNode == node
			    && previousCoord == coord - 1) {
				if (category / 2 >= CATEGORIES) {
					setPassageMarkerFinish(marker,
							       kmerIndex +
							       1);
					setFinishOffset(marker,
							getNodeLength(node)
							- coord - 1);
				} else {
#ifndef SINGLE_COV_CAT
					incrementVirtualCoverage(node, category / 2, 1);
					incrementOriginalVirtualCoverage(node, category / 2, 1);
#else
					incrementVirtualCoverage(node, 1);
#endif
				}
#ifdef OPENMP
				unLockNode(node);
#endif
			} else {
				// New node or a jump inside the node
				if (category / 2 >= CATEGORIES) {
					marker =
					    newPassageMarker(seqID,
							     kmerIndex,
							     kmerIndex + 1,
							     coord,
							     getNodeLength
							     (node) -
							     coord - 1);
					transposePassageMarker(marker,
							       node);
					connectPassageMarkers
					    (previousMarker, marker,
					     graph);
					previousMarker = marker;
				} else {
					if (readTracking) {
						// Record one read start per node and per
						// sequence; later visits only blur the
						// marker already present
						if (!isNodeMemorized(node, nodePile)) {
							addReadStart(node,
								     seqID,
								     coord,
								     graph,
								     kmerIndex);
							memorizeNode(node, &nodePile);
						} else {
							blurLastShortReadMarker
							    (node, graph);
						}
					}
#ifndef SINGLE_COV_CAT
					incrementVirtualCoverage(node, category / 2, 1);
					incrementOriginalVirtualCoverage(node, category / 2, 1);
#else
					incrementVirtualCoverage(node, 1);
#endif
				}
				// NOTE(review): in OPENMP builds this branch never
				// calls unLockNode(node) for the lockNode(node) taken
				// above before locking the node pair - confirm that
				// lockTwoNodes / unLockTwoNodes account for it.
#ifdef OPENMP
				lockTwoNodes(node, previousNode);
#endif
				createArc(previousNode, node, graph);
#ifdef OPENMP
				unLockTwoNodes(node, previousNode);
#endif
			}

			previousNode = node;
			previousCoord = coord;
		}
		index++;
	}

	// Release the per-sequence list of visited nodes
	if (readTracking && category / 2 < CATEGORIES)
		unMemorizeNodes(&nodePile);
}
// Dry-run counterpart of the sequence threading: slides a window of
// getWordLength(graph) nucleotides along tString, resolves every window to a
// node (through an annotation-driven reference mapping or through the k-mer
// occurrence table) and calls incrementReadStartCount() once for every
// distinct node met by this sequence. Nothing else in the graph is modified
// here.
//
// Sequences are ignored when they are shorter than the word length, when
// category / 2 >= CATEGORIES, or when read tracking is off and the category
// is even. A sequence longer than USHRT_MAX terminates the program.
//
// The walk stops at the first window that cannot be placed directly after a
// window that was placed.
//
// double_strand   - look up whichever of the word and its reverse complement
//                   compares lower (compareKmers).
// second_in_pair  - when double_strand is off, look up the reverse
//                   complement instead of the word itself.
static void ghostThreadSequenceThroughGraph(TightString * tString,
					    KmerOccurenceTable * kmerTable,
					    Graph * graph,
					    IDnum seqID, Category category,
					    boolean readTracking,
					    boolean double_strand,
					    ReferenceMapping * referenceMappings,
					    Coordinate referenceMappingCount,
					    IDnum refCount,
					    Annotation * annotations,
					    IDnum annotationCount,
					    boolean second_in_pair)
{
	Kmer word;
	Kmer antiWord;
	Coordinate readNucleotideIndex;
	KmerOccurence *kmerOccurence;
	int wordLength = getWordLength(graph);
	Nucleotide nucleotide;
	IDnum refID;
	Coordinate refCoord;
	ReferenceMapping * refMap = NULL;
	Coordinate uniqueIndex = 0;
	Coordinate annotIndex = 0;
	IDnum annotCount = 0;
	boolean reversed;
	SmallNodeList * nodePile = NULL;
	Annotation * annotation = annotations;

	Node *node;
	Node *previousNode = NULL;

	// Neglect any read which will not be short paired
	if ((!readTracking && category % 2 == 0)
	    || category / 2 >= CATEGORIES)
		return;

	// Neglect any string shorter than WORDLENGTH :
	if (getLength(tString) < wordLength)
		return;

	// Verify that all short reads are reasonably short.
	// The message reports USHRT_MAX because that is the limit this test
	// enforces (it previously printed SHRT_MAX).
	// NOTE(review): if short read coordinates are stored in a signed 16-bit
	// type elsewhere, it is the test that should be tightened instead.
	if (getLength(tString) > USHRT_MAX) {
		velvetLog("Short read of length %lli, longer than limit %i\n",
			  (long long) getLength(tString), USHRT_MAX);
		velvetLog("You should better declare this sequence as long, because it genuinely is!\n");
		exit(1);
	}

	clearKmer(&word);
	clearKmer(&antiWord);

	// Fill in the initial word :
	// (only wordLength - 1 nucleotides; the main loop pushes the last one)
	for (readNucleotideIndex = 0;
	     readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
		nucleotide = getNucleotide(readNucleotideIndex, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand || second_in_pair) {
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}
	}

	// Go through sequence
	while (readNucleotideIndex < getLength(tString)) {
		// Shift word:
		nucleotide = getNucleotide(readNucleotideIndex++, tString);
		pushNucleotide(&word, nucleotide);
		if (double_strand || second_in_pair) {
#ifdef COLOR
			reversePushNucleotide(&antiWord, nucleotide);
#else
			reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
		}

		// Update annotation if necessary
		// (the current annotation has been consumed entirely)
		if (annotCount < annotationCount
		    && annotIndex == getAnnotationLength(annotation)) {
			annotation = getNextAnnotation(annotation);
			annotCount++;
			annotIndex = 0;
		}

		// Search for reference mapping
		if (annotCount < annotationCount
		    && uniqueIndex >= getPosition(annotation)
		    && getAnnotSequenceID(annotation) <= refCount
		    && getAnnotSequenceID(annotation) >= -refCount) {
			refID = getAnnotSequenceID(annotation);
			// A negative sequence ID is walked with decreasing coordinates
			if (refID > 0)
				refCoord = getStart(annotation) + annotIndex;
			else
				refCoord = getStart(annotation) - annotIndex;

			refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount);
			// If success
			if (refMap) {
				if (refID > 0)
					node = getNodeInGraph(graph, refMap->nodeID);
				else
					node = getNodeInGraph(graph, -refMap->nodeID);
			} else {
				// Unmapped window: give up once something was placed
				node = NULL;
				if (previousNode)
					break;
			}
		}
		// if not.. look in table
		else {
			reversed = false;
			if (double_strand) {
				if (compareKmers(&word, &antiWord) <= 0) {
					kmerOccurence =
					    findKmerInKmerOccurenceTable(&word,
									 kmerTable);
				} else {
					kmerOccurence =
					    findKmerInKmerOccurenceTable(&antiWord,
									 kmerTable);
					reversed = true;
				}
			} else {
				if (!second_in_pair) {
					kmerOccurence =
					    findKmerInKmerOccurenceTable(&word,
									 kmerTable);
				} else {
					kmerOccurence =
					    findKmerInKmerOccurenceTable(&antiWord,
									 kmerTable);
					reversed = true;
				}
			}

			if (kmerOccurence) {
				// A hit through the reverse complement maps to the
				// node with the negated ID
				if (!reversed)
					node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence));
				else
					node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence));
			} else {
				node = NULL;
				if (previousNode)
					break;
			}
		}

		// Increment positions
		if (annotCount < annotationCount
		    && uniqueIndex >= getPosition(annotation))
			annotIndex++;
		else
			uniqueIndex++;

		previousNode = node;

		// Fill in graph
		// (each node is counted at most once per sequence)
		if (node && !isNodeMemorized(node, nodePile)) {
#ifdef OPENMP
			lockNode(node);
#endif
			incrementReadStartCount(node, graph);
#ifdef OPENMP
			unLockNode(node);
#endif
			memorizeNode(node, &nodePile);
		}
	}

	// Release the per-sequence list of visited nodes
	unMemorizeNodes(&nodePile);
}
int main(int argc, char **argv) { ReadSet *sequences = NULL; RoadMapArray *rdmaps; PreGraph *preGraph; Graph *graph; char *directory, *graphFilename, *preGraphFilename, *seqFilename, *roadmapFilename; double coverageCutoff = -1; double maxCoverageCutoff = -1; double expectedCoverage = -1; int longMultCutoff = -1; Coordinate minContigLength = -1; Coordinate minContigKmerLength; boolean *dubious = NULL; Coordinate insertLength[CATEGORIES]; Coordinate insertLengthLong = -1; Coordinate std_dev[CATEGORIES]; Coordinate std_dev_long = -1; short int accelerationBits = 24; boolean readTracking = false; boolean exportAssembly = false; boolean unusedReads = false; boolean estimateCoverage = false; boolean estimateCutoff = false; FILE *file; int arg_index, arg_int; double arg_double; char *arg; Coordinate *sequenceLengths = NULL; Category cat; boolean scaffolding = true; int pebbleRounds = 1; long long longlong_var; short int short_var; setProgramName("velvetg"); for (cat = 0; cat < CATEGORIES; cat++) { insertLength[cat] = -1; std_dev[cat] = -1; } // Error message if (argc == 1) { puts("velvetg - de Bruijn graph construction, error removal and repeat resolution"); printf("Version %i.%i.%2.2i\n", VERSION_NUMBER, RELEASE_NUMBER, UPDATE_NUMBER); puts("\nCopyright 2007, 2008 Daniel Zerbino ([email protected])"); puts("This is free software; see the source for copying conditions. 
There is NO"); puts("warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"); puts("Compilation settings:"); printf("CATEGORIES = %i\n", CATEGORIES); printf("MAXKMERLENGTH = %i\n", MAXKMERLENGTH); puts(""); printUsage(); return 1; } if (strcmp(argv[1], "--help") == 0) { printUsage(); return 0; } // Memory allocation directory = argv[1]; graphFilename = mallocOrExit(strlen(directory) + 100, char); preGraphFilename = mallocOrExit(strlen(directory) + 100, char); roadmapFilename = mallocOrExit(strlen(directory) + 100, char); seqFilename = mallocOrExit(strlen(directory) + 100, char); // Argument parsing for (arg_index = 2; arg_index < argc; arg_index++) { arg = argv[arg_index++]; if (arg_index >= argc) { puts("Unusual number of arguments!"); printUsage(); exit(1); } if (strcmp(arg, "-cov_cutoff") == 0) { if (strcmp(argv[arg_index], "auto") == 0) { estimateCutoff = true; } else { sscanf(argv[arg_index], "%lf", &coverageCutoff); } } else if (strcmp(arg, "-exp_cov") == 0) { if (strcmp(argv[arg_index], "auto") == 0) { estimateCoverage = true; readTracking = true; } else { sscanf(argv[arg_index], "%lf", &expectedCoverage); if (expectedCoverage > 0) readTracking = true; } } else if (strcmp(arg, "-ins_length") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); insertLength[0] = (Coordinate) longlong_var; if (insertLength[0] < 0) { printf("Invalid insert length: %lli\n", (long long) insertLength[0]); exit(1); } } else if (strcmp(arg, "-ins_length_sd") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); std_dev[0] = (Coordinate) longlong_var; if (std_dev[0] < 0) { printf("Invalid std deviation: %lli\n", (long long) std_dev[0]); exit(1); } } else if (strcmp(arg, "-ins_length_long") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); insertLengthLong = (Coordinate) longlong_var; } else if (strcmp(arg, "-ins_length_long_sd") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); std_dev_long = (Coordinate) longlong_var; } else if (strncmp(arg, 
"-ins_length", 11) == 0 && strchr(arg, 'd') == NULL) { sscanf(arg, "-ins_length%hi", &short_var); cat = (Category) short_var; if (cat < 1 || cat > CATEGORIES) { printf("Unknown option: %s\n", arg); exit(1); } sscanf(argv[arg_index], "%lli", &longlong_var); insertLength[cat - 1] = (Coordinate) longlong_var; if (insertLength[cat - 1] < 0) { printf("Invalid insert length: %lli\n", (long long) insertLength[cat - 1]); exit(1); } } else if (strncmp(arg, "-ins_length", 11) == 0) { sscanf(arg, "-ins_length%hi_sd", &short_var); cat = (Category) short_var; if (cat < 1 || cat > CATEGORIES) { printf("Unknown option: %s\n", arg); exit(1); } sscanf(argv[arg_index], "%lli", &longlong_var); std_dev[cat - 1] = (Coordinate) longlong_var; if (std_dev[cat - 1] < 0) { printf("Invalid std deviation: %lli\n", (long long) std_dev[cat - 1]); exit(1); } } else if (strcmp(arg, "-read_trkg") == 0) { readTracking = (strcmp(argv[arg_index], "yes") == 0); } else if (strcmp(arg, "-scaffolding") == 0) { scaffolding = (strcmp(argv[arg_index], "yes") == 0); } else if (strcmp(arg, "-amos_file") == 0) { exportAssembly = (strcmp(argv[arg_index], "yes") == 0); } else if (strcmp(arg, "-min_contig_lgth") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); minContigLength = (Coordinate) longlong_var; } else if (strcmp(arg, "-accel_bits") == 0) { sscanf(argv[arg_index], "%hi", &accelerationBits); if (accelerationBits < 0) { printf ("Illegal acceleration parameter: %s\n", argv[arg_index]); printUsage(); return -1; } } else if (strcmp(arg, "-max_branch_length") == 0) { sscanf(argv[arg_index], "%i", &arg_int); setMaxReadLength(arg_int); setLocalMaxReadLength(arg_int); } else if (strcmp(arg, "-max_divergence") == 0) { sscanf(argv[arg_index], "%lf", &arg_double); setMaxDivergence(arg_double); setLocalMaxDivergence(arg_double); } else if (strcmp(arg, "-max_gap_count") == 0) { sscanf(argv[arg_index], "%i", &arg_int); setMaxGaps(arg_int); setLocalMaxGaps(arg_int); } else if (strcmp(arg, "-min_pair_count") == 0) 
{ sscanf(argv[arg_index], "%i", &arg_int); setUnreliableConnectionCutoff(arg_int); } else if (strcmp(arg, "-max_coverage") == 0) { sscanf(argv[arg_index], "%lf", &maxCoverageCutoff); } else if (strcmp(arg, "-long_mult_cutoff") == 0) { sscanf(argv[arg_index], "%i", &longMultCutoff); setMultiplicityCutoff(longMultCutoff); } else if (strcmp(arg, "-unused_reads") == 0) { unusedReads = (strcmp(argv[arg_index], "yes") == 0); if (unusedReads) readTracking = true; } else if (strcmp(arg, "--help") == 0) { printUsage(); return 0; } else { printf("Unknown option: %s;\n", arg); printUsage(); return 1; } } // Bookkeeping logInstructions(argc, argv, directory); strcpy(seqFilename, directory); strcat(seqFilename, "/Sequences"); strcpy(roadmapFilename, directory); strcat(roadmapFilename, "/Roadmaps"); strcpy(preGraphFilename, directory); strcat(preGraphFilename, "/PreGraph"); if (!readTracking) { strcpy(graphFilename, directory); strcat(graphFilename, "/Graph"); } else { strcpy(graphFilename, directory); strcat(graphFilename, "/Graph2"); } // Graph uploading or creation if ((file = fopen(graphFilename, "r")) != NULL) { fclose(file); graph = importGraph(graphFilename); } else if ((file = fopen(preGraphFilename, "r")) != NULL) { fclose(file); sequences = importReadSet(seqFilename); convertSequences(sequences); graph = importPreGraph(preGraphFilename, sequences, readTracking, accelerationBits); sequenceLengths = getSequenceLengths(sequences, getWordLength(graph)); correctGraph(graph, sequenceLengths); exportGraph(graphFilename, graph, sequences->tSequences); } else if ((file = fopen(roadmapFilename, "r")) != NULL) { fclose(file); rdmaps = importRoadMapArray(roadmapFilename); preGraph = newPreGraph_pg(rdmaps, seqFilename); clipTips_pg(preGraph); exportPreGraph_pg(preGraphFilename, preGraph); destroyPreGraph_pg(preGraph); sequences = importReadSet(seqFilename); convertSequences(sequences); graph = importPreGraph(preGraphFilename, sequences, readTracking, accelerationBits); 
sequenceLengths = getSequenceLengths(sequences, getWordLength(graph)); correctGraph(graph, sequenceLengths); exportGraph(graphFilename, graph, sequences->tSequences); } else { puts("No Roadmap file to build upon! Please run velveth (see manual)"); exit(1); } // Set insert lengths and their standard deviations for (cat = 0; cat < CATEGORIES; cat++) { if (insertLength[cat] > -1 && std_dev[cat] < 0) std_dev[cat] = insertLength[cat] / 10; setInsertLengths(graph, cat, insertLength[cat], std_dev[cat]); } if (insertLengthLong > -1 && std_dev_long < 0) std_dev_long = insertLengthLong / 10; setInsertLengths(graph, CATEGORIES, insertLengthLong, std_dev_long); // Coverage cutoff if (expectedCoverage < 0 && estimateCoverage == true) { expectedCoverage = estimated_cov(graph); if (coverageCutoff < 0) { coverageCutoff = expectedCoverage / 2; estimateCutoff = true; } } else { estimateCoverage = false; if (coverageCutoff < 0 && estimateCutoff) coverageCutoff = estimated_cov(graph) / 2; else estimateCutoff = false; } if (coverageCutoff < 0) { puts("WARNING: NO COVERAGE CUTOFF PROVIDED"); puts("Velvet will probably leave behind many detectable errors"); puts("See manual for instructions on how to set the coverage cutoff parameter"); } dubious = removeLowCoverageNodesAndDenounceDubiousReads(graph, coverageCutoff); removeHighCoverageNodes(graph, maxCoverageCutoff); clipTipsHard(graph); if (expectedCoverage > 0) { if (sequences == NULL) { sequences = importReadSet(seqFilename); convertSequences(sequences); } // Mixed length sequencing readCoherentGraph(graph, isUniqueSolexa, expectedCoverage, sequences); // Paired ends module createReadPairingArray(sequences); for (cat = 0; cat < CATEGORIES; cat++) if(pairUpReads(sequences, 2 * cat + 1)) pebbleRounds++; if (pairUpReads(sequences, 2 * CATEGORIES + 1)) pebbleRounds++; detachDubiousReads(sequences, dubious); activateGapMarkers(graph); for ( ;pebbleRounds > 0; pebbleRounds--) exploitShortReadPairs(graph, sequences, dubious, scaffolding); } 
else { puts("WARNING: NO EXPECTED COVERAGE PROVIDED"); puts("Velvet will be unable to resolve any repeats"); puts("See manual for instructions on how to set the expected coverage parameter"); } free(dubious); concatenateGraph(graph); if (minContigLength < 2 * getWordLength(graph)) minContigKmerLength = getWordLength(graph); else minContigKmerLength = minContigLength - getWordLength(graph) + 1; strcpy(graphFilename, directory); strcat(graphFilename, "/contigs.fa"); exportLongNodeSequences(graphFilename, graph, minContigKmerLength); strcpy(graphFilename, directory); strcat(graphFilename, "/stats.txt"); displayGeneralStatistics(graph, graphFilename); if (sequences == NULL) { sequences = importReadSet(seqFilename); convertSequences(sequences); } strcpy(graphFilename, directory); strcat(graphFilename, "/LastGraph"); exportGraph(graphFilename, graph, sequences->tSequences); if (exportAssembly) { strcpy(graphFilename, directory); strcat(graphFilename, "/velvet_asm.afg"); exportAMOSContigs(graphFilename, graph, minContigKmerLength, sequences); } if (unusedReads) exportUnusedReads(graph, sequences, minContigKmerLength, directory); if (estimateCoverage) printf("Estimated Coverage = %f\n", expectedCoverage); if (estimateCutoff) printf("Estimated Coverage cutoff = %f\n", coverageCutoff); logFinalStats(graph, minContigKmerLength, directory); destroyGraph(graph); free(graphFilename); free(preGraphFilename); free(seqFilename); free(roadmapFilename); destroyReadSet(sequences); return 0; }
/**
 * Size of a single image.
 *
 * @return getNx() * getNy() * getWordLength(), as an int.
 */
int MRC::getImSize()
{
	// Same left-to-right product as a flat a*b*c expression, with the
	// grouping made explicit: (Nx * Ny) scaled by the word length.
	return (getNx() * getNy()) * getWordLength();
}
static void threadSequenceThroughGraph(TightString * tString, KmerOccurenceTable * kmerOccurences, Graph * graph, IDnum seqID, Category category, boolean readTracking, boolean double_strand) { Kmer word; Kmer antiWord; Coordinate readNucleotideIndex; Coordinate kmerIndex; KmerOccurence *kmerOccurence; int wordLength = getWordLength(graph); PassageMarker *marker = NULL; PassageMarker *previousMarker = NULL; Node *node; Node *previousNode = NULL; Coordinate coord; Coordinate previousCoord = 0; Nucleotide nucleotide; clearKmer(&word); clearKmer(&antiWord); // Neglect any string shorter than WORDLENGTH : if (getLength(tString) < wordLength) return; // Fill in the initial word : for (readNucleotideIndex = 0; readNucleotideIndex < wordLength - 1; readNucleotideIndex++) { nucleotide = getNucleotide(readNucleotideIndex, tString); pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Go through sequence while (readNucleotideIndex < getLength(tString)) { nucleotide = getNucleotide(readNucleotideIndex++, tString); pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } // Search in table if ((!double_strand || compareKmers(&word, &antiWord) <= 0) && (kmerOccurence = findKmerOccurenceInSortedTable(&word, kmerOccurences))) { node = getNodeInGraph(graph, kmerOccurence->nodeID); coord = kmerOccurence->position; } else if ((double_strand && compareKmers(&word, &antiWord) > 0) && (kmerOccurence = findKmerOccurenceInSortedTable(&antiWord, kmerOccurences))) { node = getNodeInGraph(graph, -kmerOccurence->nodeID); coord = getNodeLength(node) - kmerOccurence->position - 1; } else { node = NULL; if (previousNode) { break; } } // Fill in graph if (node) { kmerIndex = readNucleotideIndex - wordLength; if (previousNode == node && 
previousCoord == coord - 1) { if (category / 2 >= CATEGORIES) { setPassageMarkerFinish(marker, kmerIndex + 1); setFinishOffset(marker, getNodeLength(node) - coord - 1); } else { incrementVirtualCoverage(node, category / 2, 1); incrementOriginalVirtualCoverage (node, category / 2, 1); } } else { if (category / 2 >= CATEGORIES) { marker = newPassageMarker(seqID, kmerIndex, kmerIndex + 1, coord, getNodeLength (node) - coord - 1); transposePassageMarker(marker, node); connectPassageMarkers (previousMarker, marker, graph); previousMarker = marker; } else { if (readTracking) { if (!getNodeStatus(node)) { addReadStart(node, seqID, coord, graph, kmerIndex); setSingleNodeStatus (node, true); memorizeNode(node); } else { blurLastShortReadMarker (node, graph); } } incrementVirtualCoverage(node, category / 2, 1); incrementOriginalVirtualCoverage (node, category / 2, 1); } createArc(previousNode, node, graph); } previousNode = node; previousCoord = coord; } } unlockMemorizedNodes(); }
static void ghostThreadSequenceThroughGraph(TightString * tString, KmerOccurenceTable * kmerOccurences, Graph * graph, IDnum seqID, Category category, boolean readTracking, boolean double_strand) { Kmer word; Kmer antiWord; Coordinate readNucleotideIndex; KmerOccurence *kmerOccurence; int wordLength = getWordLength(graph); Nucleotide nucleotide; Node *node; Node *previousNode = NULL; clearKmer(&word); clearKmer(&antiWord); // Neglect any read which will not be short paired if ((!readTracking && category % 2 == 0) || category / 2 >= CATEGORIES) return; // Neglect any string shorter than WORDLENGTH : if (getLength(tString) < wordLength) return; // Verify that all short reads are reasonnably short if (getLength(tString) > USHRT_MAX) { printf("Short read of length %lli, longer than limit %i\n", (long long) getLength(tString), SHRT_MAX); puts("You should better declare this sequence as long, because it genuinely is!"); exit(1); } // Allocate memory for the read pairs if (!readStartsAreActivated(graph)) activateReadStarts(graph); // Fill in the initial word : for (readNucleotideIndex = 0; readNucleotideIndex < wordLength - 1; readNucleotideIndex++) { nucleotide = getNucleotide(readNucleotideIndex, tString); pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } } // Go through sequence while (readNucleotideIndex < getLength(tString)) { // Shift word: nucleotide = getNucleotide(readNucleotideIndex++, tString); pushNucleotide(&word, nucleotide); if (double_strand) { #ifdef COLOR reversePushNucleotide(&antiWord, nucleotide); #else reversePushNucleotide(&antiWord, 3 - nucleotide); #endif } // Search in table if ((!double_strand || compareKmers(&word, &antiWord) <= 0) && (kmerOccurence = findKmerOccurenceInSortedTable(&word, kmerOccurences))) { node = getNodeInGraph(graph, kmerOccurence->nodeID); } else if ((double_strand && compareKmers(&word, 
&antiWord) > 0) && (kmerOccurence = findKmerOccurenceInSortedTable(&antiWord, kmerOccurences))) { node = getNodeInGraph(graph, -kmerOccurence->nodeID); } else { node = NULL; if (previousNode) break; } previousNode = node; // Fill in graph if (node && !getNodeStatus(node)) { incrementReadStartCount(node, graph); setSingleNodeStatus(node, true); memorizeNode(node); } } unlockMemorizedNodes(); }