static IDnum *computeReadToNodeCounts() { IDnum readIndex, nodeIndex; IDnum maxNodeIndex = 2 * nodeCount(graph) + 1; IDnum maxReadIndex = sequenceCount(graph) + 1; IDnum *readNodeCounts = callocOrExit(maxReadIndex, IDnum); boolean *readMarker = callocOrExit(maxReadIndex, boolean); ShortReadMarker *nodeArray, *shortMarker; PassageMarkerI marker; Node *node; IDnum nodeReadCount; //puts("Computing read to node mapping array sizes"); for (nodeIndex = 0; nodeIndex < maxNodeIndex; nodeIndex++) { node = getNodeInGraph(graph, nodeIndex - nodeCount(graph)); if (node == NULL) continue; // Short reads if (readStartsAreActivated(graph)) { nodeArray = getNodeReads(node, graph); nodeReadCount = getNodeReadCount(node, graph); for (readIndex = 0; readIndex < nodeReadCount; readIndex++) { shortMarker = getShortReadMarkerAtIndex(nodeArray, readIndex); readNodeCounts[getShortReadMarkerID (shortMarker)]++; } } // Long reads for (marker = getMarker(node); marker != NULL_IDX; marker = getNextInNode(marker)) { readIndex = getPassageMarkerSequenceID(marker); if (readIndex < 0) continue; if (readMarker[readIndex]) continue; readNodeCounts[readIndex]++; readMarker[readIndex] = true; } // Clean up marker array for (marker = getMarker(node); marker != NULL_IDX; marker = getNextInNode(marker)) { readIndex = getPassageMarkerSequenceID(marker); if (readIndex > 0) readMarker[readIndex] = false; } } free(readMarker); return readNodeCounts; }
KmerOccurenceTable * newKmerOccurenceTable(short int accelerationBits, int wordLength) { KmerOccurenceTable * kmerTable = mallocOrExit(1, KmerOccurenceTable); if (accelerationBits > 2 * wordLength) accelerationBits = 2 * wordLength; if (accelerationBits > 32) accelerationBits = 32; if (accelerationBits > 0) { resetKeyFilter(accelerationBits); kmerTable->accelerationBits = accelerationBits; kmerTable->accelerationTable = callocOrExit((((size_t) 1) << accelerationBits) + 1, IDnum); kmerTable->accelerationShift = (short int) 2 *wordLength - accelerationBits; } else { kmerTable->accelerationBits = 0; kmerTable->accelerationTable = NULL; kmerTable->accelerationShift = 0; } return kmerTable; }
// // Creates a tightString from a tradionnal string of A,T,G, and C of length size // TightString *newTightStringFromString(char *sequence) { TightString *newTString = mallocOrExit(1, TightString); int size = (int) strlen(sequence); int arrayLength = size / 4; int index; if (size % 4 > 0) arrayLength++; newTString->length = size; newTString->sequence = callocOrExit(arrayLength, Codon); for (index = 0; index < arrayLength; index++) newTString->sequence[index] = 0; for (index = 0; index < size; index++) writeNucleotide(sequence[index], &(newTString->sequence[index / 4]), index % 4); free(sequence); return newTString; }
/*
 * Builds a sorted array of node masks from an array of reference mappings.
 *
 * Each mask receives the absolute value of the mapping's node ID and the
 * interval [nodeStart, nodeStart + length).  The resulting array is sorted
 * with compareNodeMasks.
 *
 * Returns NULL when referenceMappings is NULL, otherwise a newly allocated
 * array of arrayLength masks owned by the caller.  The graph argument is
 * not used by this function.
 */
static NodeMask *computeNodeMasks(ReferenceMapping * referenceMappings, Coordinate arrayLength, Graph * graph)
{
	NodeMask *masks;
	Coordinate i;

	if (referenceMappings == NULL)
		return NULL;

	masks = callocOrExit(arrayLength, NodeMask);

	for (i = 0; i < arrayLength; i++) {
		ReferenceMapping *mapping = &referenceMappings[i];
		NodeMask *mask = &masks[i];

		// Masks are stored against the positive node ID
		if (mapping->nodeID > 0)
			mask->nodeID = mapping->nodeID;
		else
			mask->nodeID = -mapping->nodeID;

		mask->start = mapping->nodeStart;
		mask->finish = mapping->nodeStart + mapping->length;
	}

	qsort(masks, arrayLength, sizeof(NodeMask), compareNodeMasks);

	return masks;
}
// The full monty, wrapped up in one function PreGraph *newPreGraph_pg(RoadMapArray * rdmapArray, SequencesReader *seqReadInfo) { int WORDLENGTH = rdmapArray->WORDLENGTH; IDnum sequenceCount = rdmapArray->length; IDnum *markerCounters = callocOrExit(sequenceCount + 1, IDnum); IDnum *chains = callocOrExit(sequenceCount + 1, IDnum); InsertionMarker *insertionMarkers; InsertionMarker *veryLastMarker; PreGraph *preGraph = emptyPreGraph_pg(sequenceCount, rdmapArray->referenceCount, rdmapArray->WORDLENGTH, rdmapArray->double_strand); velvetLog("Creating insertion markers\n"); setInsertionMarkers(rdmapArray, markerCounters, &veryLastMarker, &insertionMarkers); velvetLog("Counting preNodes\n"); countPreNodes(rdmapArray, preGraph, markerCounters, insertionMarkers, veryLastMarker); velvetLog("%li preNodes counted, creating them now\n", (long) preNodeCount_pg(preGraph)); createPreNodes(rdmapArray, preGraph, markerCounters, insertionMarkers, veryLastMarker, chains, seqReadInfo, WORDLENGTH); velvetLog("Adjusting marker info...\n"); convertInsertionMarkers(insertionMarkers, veryLastMarker, chains); #ifdef _OPENMP createNodeLocks(preGraph); #endif velvetLog("Connecting preNodes\n"); connectPreNodes(rdmapArray, preGraph, chains); velvetLog("Cleaning up memory\n"); cleanUpMemory(preGraph, rdmapArray, chains); #ifdef _OPENMP free(nodeLocks); nodeLocks = NULL; #endif velvetLog("Done creating preGraph\n"); return preGraph; }
/*
 * Creates a transcript with room for contigCount contigs.
 *
 * The transcript starts out empty (contigCount field set to 0) with its
 * contig and distance arrays allocated for contigCount entries each, and
 * carries the given confidence value.
 */
Transcript *newTranscript(IDnum contigCount, double confidence)
{
	Transcript *result = allocateTranscript();

	result->contigCount = 0;
	result->confidence = confidence;
	result->contigs = callocOrExit(contigCount, Node *);
	result->distances = callocOrExit(contigCount, Coordinate);

	return result;
}
StringBuffer *newStringBuffer(size_t size) { StringBuffer *buffer; buffer = callocOrExit(1, StringBuffer); if (size > 0) { buffer->str = callocOrExit(size, char); buffer->allocated = size; }
/*
 * Allocates the k-mer occurrence array of a table.
 *
 * kmerCount + 1 entries are allocated; the extra last entry is given
 * position -1 and nodeID 0.  The table's size, write pointer and write
 * index are reset so that filling starts at the first entry.
 */
void allocateKmerOccurences(IDnum kmerCount, KmerOccurenceTable * table)
{
	KmerOccurence *entries = callocOrExit(kmerCount + 1, KmerOccurence);
	KmerOccurence *lastEntry = &entries[kmerCount];

	// Trailing entry placed one past the kmerCount real slots
	lastEntry->position = -1;
	lastEntry->nodeID = 0;

	table->kmerTable = entries;
	table->kmerTableSize = kmerCount;
	table->kmerOccurencePtr = entries;
	table->kmerOccurenceIndex = 0;
}
static boolean * countCoOccurences(IDnum * coOccurencesCount, ReadOccurence ** readNodes, IDnum * readNodeCounts, IDnum * readPairs, Category * cats) { IDnum readIndex, readPairIndex; IDnum readNodeCount; IDnum readOccurenceIndex, readPairOccurenceIndex; ReadOccurence * readOccurence, *readPairOccurence; boolean * interestingReads = callocOrExit(sequenceCount(graph), boolean); Category libID; for (libID = 0; libID < CATEGORIES + 1; libID++) coOccurencesCount[libID] = 0; for (readIndex = 0; readIndex < sequenceCount(graph); readIndex++) { // Eliminating dodgy, unpaired, already counted or user-specified reads if ( readPairs[readIndex] < readIndex || getInsertLength(graph, cats[readIndex]) > -1) continue; // Check for co-occurence // We know that for each read the read occurences are ordered by increasing node ID // Therefore one list is followed by increasing index, whereas the other is followed // by decreasing index libID = cats[readIndex]/2; readPairIndex = readPairs[readIndex]; readOccurenceIndex = 0; readOccurence = readNodes[readIndex + 1]; readNodeCount = readNodeCounts[readIndex + 1]; readPairOccurenceIndex = readNodeCounts[readPairIndex + 1] - 1; readPairOccurence = &(readNodes[readPairIndex + 1][readPairOccurenceIndex]); while (readOccurenceIndex < readNodeCount && readPairOccurenceIndex >= 0) { if (readOccurence->nodeID == -readPairOccurence->nodeID) { if (readOccurence->position > 0 && readPairOccurence->position > 0) { coOccurencesCount[libID]++; interestingReads[readIndex] = true; break; } else { readOccurence++; readOccurenceIndex++; readPairOccurence--; readPairOccurenceIndex--; } } else if (readOccurence->nodeID < -readPairOccurence->nodeID) { readOccurence++; readOccurenceIndex++; } else { readPairOccurence--; readPairOccurenceIndex--; } } } return interestingReads; }
/*
 * Runs the pebble resolution on argGraph using short read pairs.
 *
 * The file-level `graph` is pointed at argGraph.  Nothing else happens
 * when read starts are not activated on the graph.  Otherwise the
 * scaffold is built, the graph is prepared for local corrections, the
 * file-level localScaffold array is allocated and expandLongNodes() is
 * called repeatedly until it reports no further modification.
 */
void exploitShortReadPairs(Graph * argGraph, ReadSet * reads, boolean * dubious, boolean * shadows, boolean force_jumps)
{
	boolean changed;

	graph = argGraph;

	if (!readStartsAreActivated(graph))
		return;

	velvetLog("Starting pebble resolution...\n");

	resetNodeStatus(graph);

	// Prepare scaffold
	buildScaffold(graph, reads, dubious, shadows);

	// Prepare graph
	prepareGraphForLocalCorrections(graph);

	// Prepare local scaffold: one slot per node in either orientation
	localScaffold = callocOrExit(2 * nodeCount(graph) + 1, MiniConnection);

	// Loop until convergence (always runs at least once)
	do {
		changed = expandLongNodes(force_jumps);
	} while (changed);

	// Clean up memory
	cleanMemory();
	deactivateLocalCorrectionSettings();

	sortGapMarkers(graph);

	velvetLog("Pebble done.\n");
}
/*
 * Estimates the insert length of the libraries from read-pair
 * co-occurrences.
 *
 * Co-occurrences are first counted per library, then one sample array per
 * library is allocated with exactly that many entries, filled by
 * measureCoOccurences() and passed to estimateLibraryInsertLengths().
 * All temporary arrays are released before returning.
 */
static void estimateMissingInsertLengths(ReadOccurence ** readNodes, IDnum * readNodeCounts, IDnum * readPairs, Category * cats)
{
	Coordinate *samples[CATEGORIES + 1];
	IDnum sampleCounts[CATEGORIES + 1];
	boolean *flaggedReads;
	Category lib;

	puts("Estimating library insert lengths...");

	// First pass: how many samples does each library provide?
	flaggedReads = countCoOccurences(sampleCounts, readNodes,
					 readNodeCounts, readPairs, cats);

	lib = 0;
	while (lib < CATEGORIES + 1) {
		samples[lib] = callocOrExit(sampleCounts[lib], Coordinate);
		lib++;
	}

	// Second pass: record the samples and derive the estimates
	measureCoOccurences(samples, flaggedReads, readNodes,
			    readNodeCounts, readPairs, cats);
	estimateLibraryInsertLengths(samples, sampleCounts);

	lib = 0;
	while (lib < CATEGORIES + 1) {
		free(samples[lib]);
		lib++;
	}
	free(flaggedReads);

	puts("Done");
}
/*
 * Estimates the insert length of the libraries from read-pair
 * co-occurrences, logging progress through velvetLog().
 *
 * Co-occurrences are counted per library, one array per library is
 * allocated with exactly that many entries, the arrays are filled by
 * measureCoOccurences() and handed to estimateLibraryInsertLengths().
 * Every temporary array is freed before returning.
 */
static void estimateMissingInsertLengths(ReadOccurence ** readNodes, IDnum * readNodeCounts, IDnum * readPairs, Category * cats)
{
	/* SF TODO This could probably be done with IDnum */
	Coordinate *perLibrary[CATEGORIES + 1];
	IDnum perLibraryCounts[CATEGORIES + 1];
	boolean *selectedReads;
	Category lib;

	velvetLog("Estimating library insert lengths...\n");

	// Counting pass: sizes the per-library arrays
	selectedReads = countCoOccurences(perLibraryCounts, readNodes,
					  readNodeCounts, readPairs, cats);

	lib = 0;
	while (lib < CATEGORIES + 1) {
		/* SF TODO This could probably be done with IDnum */
		perLibrary[lib] = callocOrExit(perLibraryCounts[lib], Coordinate);
		lib++;
	}

	// Measuring pass followed by the actual estimation
	measureCoOccurences(perLibrary, selectedReads, readNodes,
			    readNodeCounts, readPairs, cats);
	estimateLibraryInsertLengths(perLibrary, perLibraryCounts);

	lib = 0;
	while (lib < CATEGORIES + 1) {
		free(perLibrary[lib]);
		lib++;
	}
	free(selectedReads);

	velvetLog("Done\n");
}
/*
 * Reads the reference mappings stored after the NODE blocks of a PreGraph
 * file.
 *
 * Lines are skipped until the first one starting with 'S'.  From there on,
 * every line starting with 'S' is parsed as "SEQ\t<referenceID>" and every
 * other line as "<nodeID>\t<nodeStart>\t<referenceStart>\t<length>", which
 * is stored together with the most recent reference ID.
 *
 * preGraphFilename: path of the PreGraph file.
 * arrayLength:      number of mapping lines expected in the file.
 *
 * Returns a newly allocated array of arrayLength mappings owned by the
 * caller.  The program exits through exitErrorf() if the file cannot be
 * opened or if it holds more mapping lines than arrayLength.
 */
static ReferenceMapping *recordReferenceMappings(char * preGraphFilename, IDnum arrayLength)
{
	ReferenceMapping *mappings = callocOrExit(arrayLength, ReferenceMapping);
	FILE *file = fopen(preGraphFilename, "r");
	const int maxline = MAXLINE;
	char line[MAXLINE];
	ReferenceMapping *current = mappings;
	IDnum referenceID;
	long long_var = 0;
	long long coord1, coord2, coord3;

	if (file == NULL)
		exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename);

	// Keeps the sscanf() below well defined if the file is empty
	line[0] = '\0';

	// Go past NODE blocks
	while (fgets(line, maxline, file))
		if (line[0] == 'S')
			break;

	sscanf(line, "SEQ\t%li\n", &long_var);
	referenceID = long_var;

	// Go relevant lines
	while (fgets(line, maxline, file)) {
		if (line[0] != 'S') {
			// Never write past the array sized by the caller
			if (current - mappings >= arrayLength)
				exitErrorf(EXIT_FAILURE, true,
					   "More reference mappings than expected in %s",
					   preGraphFilename);

			sscanf(line, "%li\t%lli\t%lli\t%lli\n", &long_var,
			       &coord1, &coord2, &coord3);
			current->referenceID = referenceID;
			current->nodeID = long_var;
			current->nodeStart = coord1;
			current->referenceStart = coord2;
			current->length = coord3;
			current++;
		} else {
			sscanf(line, "SEQ\t%li\n", &long_var);
			referenceID = long_var;
		}
	}

	fclose(file);
	return mappings;
}
// Imports roadmap from the appropriate file format // Memory allocated within the function RoadMapArray *importRoadMapArray(char *filename) { FILE *file; const int maxline = 100; char *line = mallocOrExit(maxline, char); RoadMap *array; RoadMap *rdmap = NULL; IDnum rdmapIndex = 0; IDnum seqID; Coordinate position, start, finish; Annotation *nextAnnotation; RoadMapArray *result = mallocOrExit(1, RoadMapArray); IDnum sequenceCount; IDnum annotationCount = 0; short short_var; long long_var; long long longlong_var, longlong_var2, longlong_var3; printf("Reading roadmap file %s\n", filename); file = fopen(filename, "r"); if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "%s incomplete.", filename); sscanf(line, "%ld\t%i\t%hi\n", &long_var, &(result->WORDLENGTH), &short_var); sequenceCount = (IDnum) long_var; resetWordFilter(result->WORDLENGTH); result->length = sequenceCount; array = mallocOrExit(sequenceCount, RoadMap); result->array = array; result->double_strand = (boolean) short_var; while (fgets(line, maxline, file) != NULL) if (line[0] != 'R') annotationCount++; result->annotations = callocOrExit(annotationCount, Annotation); nextAnnotation = result->annotations; fclose(file); file = fopen(filename, "r"); if (!fgets(line, maxline, file)) exitErrorf(EXIT_FAILURE, true, "%s incomplete.", filename); while (fgets(line, maxline, file) != NULL) { if (line[0] == 'R') { rdmap = getRoadMapInArray(result, rdmapIndex++); rdmap->annotationCount = 0; } else { sscanf(line, "%ld\t%lld\t%lld\t%lld\n", &long_var, &longlong_var, &longlong_var2, &longlong_var3); seqID = (IDnum) long_var; position = (Coordinate) longlong_var; start = (Coordinate) longlong_var2; finish = (Coordinate) longlong_var3; nextAnnotation->sequenceID = seqID; nextAnnotation->position = position; nextAnnotation->start.coord = start; nextAnnotation->finish.coord = finish; if (seqID > 0) nextAnnotation->length = finish - start; else nextAnnotation->length = start - finish; rdmap->annotationCount++; 
nextAnnotation++; } } printf("%d roadmaps reads\n", rdmapIndex); fclose(file); free(line); return result; }
/*
 * Builds the sorted k-mer occurrence table of a PreGraph file.
 *
 * The file is read twice.  The first pass reads the word length from the
 * header line and counts the k-mers of every NODE block (sequence length
 * - wordLength + 1 each).  The second pass slides a window over every node
 * sequence and records, for each k-mer, its node ID and position; when
 * double_strand is set and the reverse word compares lower, the reverse
 * word is stored with a negated node ID and a mirrored position.
 * The occurrences are then sorted with compareKmerOccurences and, when
 * acceleration is enabled, the acceleration table is filled with the
 * index of the first occurrence of each key.
 *
 * accelerationBits is clamped to at most 2 * wordLength and at most 32.
 * Characters other than A, C, G and T are read as ADENINE.
 *
 * Returns a newly allocated table.  The program exits through
 * exitErrorf() if the file cannot be opened or is incomplete.
 */
static KmerOccurenceTable *referenceGraphKmers(char *preGraphFilename, short int accelerationBits, Graph * graph, boolean double_strand)
{
	FILE *file = fopen(preGraphFilename, "r");
	const int maxline = MAXLINE;
	char line[MAXLINE];
	// Must be an int: getc() returns an int so that EOF can be told
	// apart from every valid character value.
	int c;
	int wordLength;
	Coordinate lineLength, kmerCount;
	Kmer word;
	Kmer antiWord;
	KmerOccurenceTable *kmerTable = NULL;
	KmerOccurence *kmerOccurences, *kmerOccurencePtr;
	Coordinate kmerOccurenceIndex;
	IDnum index;
	IDnum nodeID = 0;
	IDnum *accelPtr = NULL;
	KmerKey lastHeader = 0;
	KmerKey header;
	Nucleotide nucleotide;

	if (file == NULL)
		exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename);

	// Count kmers
	printf("Scanning pre-graph file %s for k-mers\n", preGraphFilename);

	// First line
	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
	sscanf(line, "%*i\t%*i\t%i\n", &wordLength);

	// Initialize kmer occurence table:
	kmerTable = mallocOrExit(1, KmerOccurenceTable);
	if (accelerationBits > 2 * wordLength)
		accelerationBits = 2 * wordLength;

	if (accelerationBits > 32)
		accelerationBits = 32;

	if (accelerationBits > 0) {
		kmerTable->accelerationBits = accelerationBits;
		kmerTable->accelerationTable =
		    callocOrExit((((size_t) 1) << accelerationBits) + 1, IDnum);
		accelPtr = kmerTable->accelerationTable;
		kmerTable->accelerationShift =
		    (short int) 2 * wordLength - accelerationBits;
	} else {
		kmerTable->accelerationBits = 0;
		kmerTable->accelerationTable = NULL;
		kmerTable->accelerationShift = 0;
	}

	// Read nodes
	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");

	kmerCount = 0;
	while (line[0] == 'N') {
		// The sequence line may be longer than the line buffer, so it
		// is measured character by character.
		lineLength = 0;
		while ((c = getc(file)) != EOF && c != '\n')
			lineLength++;
		kmerCount += lineLength - wordLength + 1;
		if (fgets(line, maxline, file) == NULL)
			break;
	}
	fclose(file);

	// Create table
	printf("%li kmers found\n", (long) kmerCount);
	kmerOccurences = callocOrExit(kmerCount, KmerOccurence);
	kmerOccurencePtr = kmerOccurences;
	kmerOccurenceIndex = 0;
	kmerTable->kmerTable = kmerOccurences;
	kmerTable->kmerTableSize = kmerCount;

	// Fill table
	file = fopen(preGraphFilename, "r");
	if (file == NULL)
		exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename);

	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");

	// Read nodes
	if (!fgets(line, maxline, file))
		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");

	while (line[0] == 'N') {
		nodeID++;

		// Fill in the initial word :
		clearKmer(&word);
		clearKmer(&antiWord);

		for (index = 0; index < wordLength - 1; index++) {
			c = getc(file);
			if (c == 'A')
				nucleotide = ADENINE;
			else if (c == 'C')
				nucleotide = CYTOSINE;
			else if (c == 'G')
				nucleotide = GUANINE;
			else if (c == 'T')
				nucleotide = THYMINE;
			else if (c == '\n' || c == EOF)
				// Sequence shorter than a word: truncated file
				exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
			else
				nucleotide = ADENINE;

			pushNucleotide(&word, nucleotide);
			if (double_strand) {
#ifdef COLOR
				reversePushNucleotide(&antiWord, nucleotide);
#else
				reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
			}
		}

		// Scan through node
		index = 0;
		while ((c = getc(file)) != '\n' && c != EOF) {
			if (c == 'A')
				nucleotide = ADENINE;
			else if (c == 'C')
				nucleotide = CYTOSINE;
			else if (c == 'G')
				nucleotide = GUANINE;
			else if (c == 'T')
				nucleotide = THYMINE;
			else
				nucleotide = ADENINE;

			pushNucleotide(&word, nucleotide);
			if (double_strand) {
#ifdef COLOR
				reversePushNucleotide(&antiWord, nucleotide);
#else
				reversePushNucleotide(&antiWord, 3 - nucleotide);
#endif
			}

			// Store whichever of the word and its reverse compares
			// lower (always the word itself when single stranded)
			if (!double_strand || compareKmers(&word, &antiWord) <= 0) {
				copyKmers(&kmerOccurencePtr->kmer, &word);
				kmerOccurencePtr->nodeID = nodeID;
				kmerOccurencePtr->position = index;
			} else {
				copyKmers(&kmerOccurencePtr->kmer, &antiWord);
				kmerOccurencePtr->nodeID = -nodeID;
				kmerOccurencePtr->position =
				    getNodeLength(getNodeInGraph(graph, nodeID)) - 1 - index;
			}

			kmerOccurencePtr++;
			kmerOccurenceIndex++;
			index++;
		}

		if (fgets(line, maxline, file) == NULL)
			break;
	}

	fclose(file);

	// Sort table
	qsort(kmerOccurences, kmerCount, sizeof(KmerOccurence),
	      compareKmerOccurences);

	// Fill up acceleration table
	if (kmerTable->accelerationTable != NULL) {
		*accelPtr = (IDnum) 0;
		for (kmerOccurenceIndex = 0;
		     kmerOccurenceIndex < kmerCount;
		     kmerOccurenceIndex++) {
			header =
			    keyInAccelerationTable(&kmerOccurences
						   [kmerOccurenceIndex].kmer,
						   kmerTable);
			// Every key up to this header starts at this index
			while (lastHeader < header) {
				lastHeader++;
				accelPtr++;
				*accelPtr = kmerOccurenceIndex;
			}
		}

		// Remaining keys point past the end of the table
		while (lastHeader < (KmerKey) 1 << accelerationBits) {
			lastHeader++;
			accelPtr++;
			*accelPtr = kmerCount;
		}
	}

	return kmerTable;
}
// Creates empty RoadMap RoadMap *newRoadMap() { return callocOrExit(1, RoadMap); }
// Creates a new ReadSet: a single ReadSet obtained from callocOrExit()
ReadSet *newReadSet()
{
	return callocOrExit(1, ReadSet);
}
// Replaces two consecutive preNodes into a single equivalent preNode
// The extra memory is freed
//
// Starting at preNodeAID and following oldPreArc, the function walks the
// chain of preNodes for as long as the current preNode and the other end
// of its arc each have a single preArc, the arc is not a loop and the walk
// would not come back to preNodeAID. The whole chain is then merged into
// preNodeAID: reference markers and descriptors are concatenated, the
// preArcs of the last preNode are recreated on preNodeAID and the
// absorbed preNodes are destroyed.
static void concatenatePreNodes(IDnum preNodeAID, PreArcI oldPreArc, PreGraph * preGraph)
{
	// preNodeBID ends up as the last preNode of the chain
	IDnum preNodeBID = preNodeAID;
	IDnum currentPreNodeID, nextPreNodeID;
	PreArcI preArc = oldPreArc;
	Coordinate totalLength = 0;
	Coordinate arrayLength;
	Descriptor * descr, * ptr;
	int writeOffset = 0;
	int wordLength = getWordLength_pg(preGraph);
	Coordinate totalOffset = 0;

	//velvetLog("Concatenating nodes %li and %li\n", preNodeAID, preNodeBID);

	// Find the end of the chain, summing the lengths of the preNodes
	// which come before it
	while(hasSinglePreArc_pg(preNodeBID, preGraph)
	      && hasSinglePreArc_pg(getOtherEnd_pg (preArc, preNodeBID), preGraph)
	      && !isLoop_pg(preArc)
	      && getDestination_pg(preArc, preNodeBID) != preNodeAID) {
		totalLength += getPreNodeLength_pg(preNodeBID, preGraph);
		preNodeBID = getDestination_pg(preArc, preNodeBID);
		preArc = getPreArc_pg(preNodeBID, preGraph);
	}
	// Add the last preNode, then wordLength - 1 extra positions
	totalLength += getPreNodeLength_pg(preNodeBID, preGraph);
	totalLength += wordLength - 1;

	// Reference marker management
	if (referenceMarkersAreActivated_pg(preGraph)) {
		currentPreNodeID = preNodeAID;
		preArc = getPreArc_pg(currentPreNodeID, preGraph);

		// Markers of every preNode after preNodeAID are concatenated
		// onto preNodeAID; totalOffset is passed along and grows by the
		// length of each preNode handled in the loop
		for (currentPreNodeID = getDestination_pg(preArc, currentPreNodeID);
		     currentPreNodeID != preNodeBID;
		     currentPreNodeID = getDestination_pg(preArc, currentPreNodeID)) {
			concatenateReferenceMarkers_pg(preNodeAID, currentPreNodeID, preGraph, totalOffset);
			preArc = getPreArc_pg(currentPreNodeID, preGraph);
			totalOffset += getPreNodeLength_pg(currentPreNodeID, preGraph);
		}
		// Last preNode of the chain (preNodeBID)
		concatenateReferenceMarkers_pg(preNodeAID, currentPreNodeID, preGraph, totalOffset);
	}

	// Descriptor management (preNode)
	// One Descriptor element is allocated per 4 positions, rounded up
	arrayLength = totalLength / 4;
	if (totalLength % 4)
		arrayLength++;
	descr = callocOrExit(arrayLength, Descriptor);
	ptr = descr;

	if (preNodeAID > 0) {
		// Positive preNodeAID: append from preNodeAID towards preNodeBID.
		// Only the first call passes true as last argument.
		currentPreNodeID = preNodeAID;
		appendDescriptors_pg(&ptr, &writeOffset, currentPreNodeID, preGraph, true);
		preArc = getPreArc_pg(currentPreNodeID, preGraph);
		currentPreNodeID = getDestination_pg(preArc, currentPreNodeID);
		while (currentPreNodeID != preNodeBID) {
			appendDescriptors_pg(&ptr, &writeOffset, currentPreNodeID, preGraph, false);
			preArc = getPreArc_pg(currentPreNodeID, preGraph);
			currentPreNodeID = getDestination_pg(preArc, currentPreNodeID);
		}
		appendDescriptors_pg(&ptr, &writeOffset, currentPreNodeID, preGraph, false);
	} else {
		// Otherwise: append from -preNodeBID towards -preNodeAID
		currentPreNodeID = -preNodeBID;
		appendDescriptors_pg(&ptr, &writeOffset ,currentPreNodeID, preGraph, true);
		preArc = getPreArc_pg(currentPreNodeID, preGraph);
		currentPreNodeID = getDestination_pg(preArc, currentPreNodeID);
		while (currentPreNodeID != -preNodeAID) {
			appendDescriptors_pg(&ptr, &writeOffset ,currentPreNodeID, preGraph, false);
			preArc = getPreArc_pg(currentPreNodeID, preGraph);
			currentPreNodeID = getDestination_pg(preArc, currentPreNodeID);
		}
		appendDescriptors_pg(&ptr, &writeOffset ,currentPreNodeID, preGraph, false);
	}

	// If the last element is only partly written, shift it right by two
	// bits for every slot left, until writeOffset reaches 4
	if (writeOffset != 0)
		while (writeOffset++ != 4)
			(*ptr) >>= 2;

	// The length given here is totalLength without the wordLength - 1
	// positions added above
	setPreNodeDescriptor_pg(descr, totalLength - wordLength + 1, preNodeAID, preGraph);

	// Correct preArcs
	// Every preArc of preNodeBID gets an analogous preArc on preNodeAID;
	// an arc leading to -preNodeBID is redirected to -preNodeAID
	for (preArc = getPreArc_pg(preNodeBID, preGraph); preArc != NULL_IDX;
	     preArc = getNextPreArc_pg(preArc, preNodeBID)) {
		if (getDestination_pg(preArc, preNodeBID) != -preNodeBID)
			createAnalogousPreArc_pg(preNodeAID, getDestination_pg(preArc, preNodeBID), preArc, preGraph);
		else
			createAnalogousPreArc_pg(preNodeAID, -preNodeAID, preArc, preGraph);
	}

	// Freeing gobbled preNode
	// Walk from -preNodeBID until -preNodeAID is reached; the next ID is
	// read before the current preNode is destroyed
	currentPreNodeID = -preNodeBID;
	while (currentPreNodeID != -preNodeAID) {
		preArc = getPreArc_pg(currentPreNodeID, preGraph);
		nextPreNodeID = getDestination_pg(preArc, currentPreNodeID);
		destroyPreNode_pg(currentPreNodeID, preGraph);
		currentPreNodeID = nextPreNodeID;
	}
}
/*
 * Makes room for extraLength more entries in a reference coordinate table.
 *
 * A table without an array gets a fresh array of extraLength entries;
 * otherwise the existing array is reallocated to hold
 * arrayLength + extraLength entries.  The arrayLength field itself is not
 * modified by this function.
 */
static void resizeReferenceCoordinateTable(ReferenceCoordinateTable * table, IDnum extraLength)
{
	if (table->array != NULL) {
		table->array = reallocOrExit(table->array,
					     table->arrayLength + extraLength,
					     ReferenceCoordinate);
		return;
	}

	// First allocation
	table->array = callocOrExit(extraLength, ReferenceCoordinate);
}
/*
 * Creates an empty reference coordinate table: no array attached and an
 * array length of 0.
 */
static ReferenceCoordinateTable *newReferenceCoordinateTable()
{
	ReferenceCoordinateTable *emptyTable;

	emptyTable = callocOrExit(1, ReferenceCoordinateTable);
	emptyTable->arrayLength = 0;
	emptyTable->array = NULL;

	return emptyTable;
}
int main(int argc, char **argv) { ReadSet *sequences = NULL; RoadMapArray *rdmaps; PreGraph *preGraph; Graph *graph; char *directory, *graphFilename, *connectedGraphFilename, *preGraphFilename, *seqFilename, *roadmapFilename, *lowCovContigsFilename, *highCovContigsFilename; double coverageCutoff = -1; double longCoverageCutoff = -1; double maxCoverageCutoff = -1; double expectedCoverage = -1; Coordinate minContigLength = -1; Coordinate minContigKmerLength; boolean *dubious = NULL; Coordinate insertLength[CATEGORIES]; Coordinate insertLengthLong = -1; Coordinate std_dev[CATEGORIES]; Coordinate std_dev_long = -1; short int accelerationBits = 24; boolean readTracking = false; boolean exportAssembly = false; boolean unusedReads = false; boolean estimateCoverage = false; boolean estimateCutoff = false; boolean exportAlignments = false; FILE *file; int arg_index, arg_int; double arg_double; char *arg; ShortLength *sequenceLengths = NULL; Category cat; boolean scaffolding = true; int pebbleRounds = 1; long long longlong_var; short int short_var; boolean exportFilteredNodes = false; int clean = 0; boolean conserveLong = false; boolean shadows[CATEGORIES]; int coverageMask = 1; SequencesReader *seqReadInfo = NULL; setProgramName("velvetg"); for (cat = 0; cat < CATEGORIES; cat++) { insertLength[cat] = -1; std_dev[cat] = -1; shadows[cat] = false; } // Error message if (argc == 1) { puts("velvetg - de Bruijn graph construction, error removal and repeat resolution"); printf("Version %i.%i.%2.2i\n", VERSION_NUMBER, RELEASE_NUMBER, UPDATE_NUMBER); puts("Copyright 2007, 2008 Daniel Zerbino ([email protected])"); puts("This is free software; see the source for copying conditions. 
There is NO"); puts("warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."); puts("Compilation settings:"); printf("CATEGORIES = %i\n", CATEGORIES); printf("MAXKMERLENGTH = %i\n", MAXKMERLENGTH); #ifdef _OPENMP puts("OPENMP"); #endif #ifdef LONGSEQUENCES puts("LONGSEQUENCES"); #endif #ifdef BIGASSEMBLY puts("BIGASSEMBLY"); #endif #ifdef COLOR puts("COLOR"); #endif #ifdef DEBUG puts("DEBUG"); #endif puts(""); printUsage(); return 1; } if (strcmp(argv[1], "--help") == 0) { printUsage(); return 0; } // Memory allocation directory = argv[1]; graphFilename = mallocOrExit(strlen(directory) + 100, char); connectedGraphFilename = mallocOrExit(strlen(directory) + 100, char); preGraphFilename = mallocOrExit(strlen(directory) + 100, char); roadmapFilename = mallocOrExit(strlen(directory) + 100, char); seqFilename = mallocOrExit(strlen(directory) + 100, char); lowCovContigsFilename = mallocOrExit(strlen(directory) + 100, char); highCovContigsFilename = mallocOrExit(strlen(directory) + 100, char); // Argument parsing for (arg_index = 2; arg_index < argc; arg_index++) { arg = argv[arg_index++]; if (arg_index >= argc) { velvetLog("Unusual number of arguments!\n"); printUsage(); #ifdef DEBUG abort(); #endif exit(1); } if (strcmp(arg, "-cov_cutoff") == 0) { if (strcmp(argv[arg_index], "auto") == 0) { estimateCutoff = true; } else { sscanf(argv[arg_index], "%lf", &coverageCutoff); } } else if (strcmp(arg, "-long_cov_cutoff") == 0) { sscanf(argv[arg_index], "%lf", &longCoverageCutoff); } else if (strcmp(arg, "-exp_cov") == 0) { if (strcmp(argv[arg_index], "auto") == 0) { estimateCoverage = true; readTracking = true; } else { sscanf(argv[arg_index], "%lf", &expectedCoverage); if (expectedCoverage > 0) readTracking = true; } } else if (strcmp(arg, "-ins_length") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); insertLength[0] = (Coordinate) longlong_var; if (insertLength[0] < 0) { velvetLog("Invalid insert length: %lli\n", (long long) insertLength[0]); 
#ifdef DEBUG abort(); #endif exit(1); } } else if (strcmp(arg, "-ins_length_sd") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); std_dev[0] = (Coordinate) longlong_var; if (std_dev[0] < 0) { velvetLog("Invalid std deviation: %lli\n", (long long) std_dev[0]); #ifdef DEBUG abort(); #endif exit(1); } } else if (strcmp(arg, "-ins_length_long") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); insertLengthLong = (Coordinate) longlong_var; } else if (strcmp(arg, "-ins_length_long_sd") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); std_dev_long = (Coordinate) longlong_var; } else if (strncmp(arg, "-ins_length", 11) == 0 && strchr(arg, 'd') == NULL) { sscanf(arg, "-ins_length%hi", &short_var); cat = (Category) short_var; if (cat < 1 || cat > CATEGORIES) { velvetLog("Unknown option: %s\n", arg); #ifdef DEBUG abort(); #endif exit(1); } sscanf(argv[arg_index], "%lli", &longlong_var); insertLength[cat - 1] = (Coordinate) longlong_var; if (insertLength[cat - 1] < 0) { velvetLog("Invalid insert length: %lli\n", (long long) insertLength[cat - 1]); #ifdef DEBUG abort(); #endif exit(1); } } else if (strncmp(arg, "-ins_length", 11) == 0) { sscanf(arg, "-ins_length%hi_sd", &short_var); cat = (Category) short_var; if (cat < 1 || cat > CATEGORIES) { velvetLog("Unknown option: %s\n", arg); #ifdef DEBUG abort(); #endif exit(1); } sscanf(argv[arg_index], "%lli", &longlong_var); std_dev[cat - 1] = (Coordinate) longlong_var; if (std_dev[cat - 1] < 0) { velvetLog("Invalid std deviation: %lli\n", (long long) std_dev[cat - 1]); #ifdef DEBUG abort(); #endif exit(1); } } else if (strcmp(arg, "-read_trkg") == 0) { readTracking = (strcmp(argv[arg_index], "yes") == 0); } else if (strcmp(arg, "-scaffolding") == 0) { scaffolding = (strcmp(argv[arg_index], "yes") == 0); } else if (strcmp(arg, "-exportFiltered") == 0) { exportFilteredNodes = (strcmp(argv[arg_index], "yes") == 0); } else if (strcmp(arg, "-amos_file") == 0) { exportAssembly = (strcmp(argv[arg_index], "yes") == 0); 
} else if (strcmp(arg, "-alignments") == 0) { exportAlignments = (strcmp(argv[arg_index], "yes") == 0); } else if (strcmp(arg, "-min_contig_lgth") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); minContigLength = (Coordinate) longlong_var; } else if (strcmp(arg, "-coverage_mask") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); coverageMask = (IDnum) longlong_var; } else if (strcmp(arg, "-accel_bits") == 0) { sscanf(argv[arg_index], "%hi", &accelerationBits); if (accelerationBits < 0) { velvetLog ("Illegal acceleration parameter: %s\n", argv[arg_index]); printUsage(); return -1; } } else if (strcmp(arg, "-max_branch_length") == 0) { sscanf(argv[arg_index], "%i", &arg_int); setMaxReadLength(arg_int); setLocalMaxReadLength(arg_int); } else if (strcmp(arg, "-max_divergence") == 0) { sscanf(argv[arg_index], "%lf", &arg_double); setMaxDivergence(arg_double); setLocalMaxDivergence(arg_double); } else if (strcmp(arg, "-max_gap_count") == 0) { sscanf(argv[arg_index], "%i", &arg_int); setMaxGaps(arg_int); setLocalMaxGaps(arg_int); } else if (strcmp(arg, "-min_pair_count") == 0) { sscanf(argv[arg_index], "%i", &arg_int); setUnreliableConnectionCutoff(arg_int); } else if (strcmp(arg, "-max_coverage") == 0) { sscanf(argv[arg_index], "%lf", &maxCoverageCutoff); } else if (strcmp(arg, "-long_mult_cutoff") == 0) { sscanf(argv[arg_index], "%i", &arg_int); setMultiplicityCutoff(arg_int); } else if (strcmp(arg, "-paired_exp_fraction") == 0) { sscanf(argv[arg_index], "%lf", &arg_double); setPairedExpFraction(arg_double); } else if (strcmp(arg, "-clean") == 0) { if (strcmp(argv[arg_index], "yes") == 0) clean = 1; } else if (strcmp(arg, "-very_clean") == 0) { if (strcmp(argv[arg_index], "yes") == 0) clean = 2; } else if (strcmp(arg, "-conserveLong") == 0) { if (strcmp(argv[arg_index], "yes") == 0) conserveLong = 2; } else if (strcmp(arg, "-unused_reads") == 0) { unusedReads = (strcmp(argv[arg_index], "yes") == 0); if (unusedReads) readTracking = true; } else if 
(strcmp(arg, "-shortMatePaired") == 0) { shadows[0] = (strcmp(argv[arg_index], "yes") == 0); } else if (strncmp(arg, "-shortMatePaired", 16) == 0) { sscanf(arg, "-shortMatePaired%hi", &short_var); cat = (Category) short_var; if (cat < 1 || cat > CATEGORIES) { velvetLog("Unknown option: %s\n", arg); #ifdef DEBUG abort(); #endif exit(1); } shadows[cat - 1] = (strcmp(argv[arg_index], "yes") == 0); } else if (strcmp(arg, "--help") == 0) { printUsage(); return 0; } else { velvetLog("Unknown option: %s;\n", arg); printUsage(); return 1; } } // Bookkeeping logInstructions(argc, argv, directory); seqReadInfo = callocOrExit(1, SequencesReader); strcpy(seqFilename, directory); // if binary CnyUnifiedSeq exists, use it. Otherwise try Sequences strcat(seqFilename, "/CnyUnifiedSeq"); if (access(seqFilename, R_OK) == 0) { seqReadInfo->m_bIsBinary = true; } else { seqReadInfo->m_bIsBinary = false; strcpy(seqFilename, directory); strcat(seqFilename, "/Sequences"); } seqReadInfo->m_seqFilename = seqFilename; strcpy(roadmapFilename, directory); strcat(roadmapFilename, "/Roadmaps"); strcpy(preGraphFilename, directory); strcat(preGraphFilename, "/PreGraph"); strcpy(connectedGraphFilename, directory); strcat(connectedGraphFilename, "/ConnectedGraph"); if (!readTracking) { strcpy(graphFilename, directory); strcat(graphFilename, "/Graph"); } else { strcpy(graphFilename, directory); strcat(graphFilename, "/Graph2"); } strcpy(lowCovContigsFilename, directory); strcat(lowCovContigsFilename, "/lowCoverageContigs.fa"); strcpy(highCovContigsFilename, directory); strcat(highCovContigsFilename, "/highCoverageContigs.fa"); // Graph uploading or creation if ((file = fopen(graphFilename, "r")) != NULL) { fclose(file); graph = importGraph(graphFilename); } else if ((file = fopen(connectedGraphFilename, "r")) != NULL) { fclose(file); if (seqReadInfo->m_bIsBinary) { sequences = importCnyReadSet(seqFilename); #if 0 // compare to velvet's version of a seq ReadSet *compareSequences = NULL; 
compareSeqFilename = mallocOrExit(strlen(directory) + 100, char); strcpy(compareSeqFilename, directory); strcat(compareSeqFilename, "/Sequences"); compareSequences = importReadSet(compareSeqFilename); convertSequences(compareSequences); if (sequences->readCount != compareSequences->readCount) { printf("read count mismatch\n"); exit(1); } int i; for (i = 0; i < sequences->readCount; i++) { TightString *tString = getTightStringInArray(sequences->tSequences, i); TightString *tStringCmp = getTightStringInArray(compareSequences->tSequences, i); if (getLength(tString) != getLength(tStringCmp)) { printf("sequence %d len mismatch\n", i); exit(1); } if (strcmp(readTightString(tString), readTightString(tStringCmp)) != 0) { printf("sequence %d cmp mismatch\n", i); printf("seq %s != cmp %s\n", readTightString(tString), readTightString(tStringCmp)); exit(1); } } #endif } else {
static void fillUpGraph(ReadSet * reads, KmerOccurenceTable * kmerTable, Graph * graph, boolean readTracking, boolean double_strand, ReferenceMapping * referenceMappings, Coordinate referenceMappingCount, IDnum refCount, char * roadmapFilename) { IDnum readIndex; RoadMapArray *roadmap = NULL; Coordinate *annotationOffset = NULL; struct timeval start, end, diff; if (referenceMappings) { roadmap = importRoadMapArray(roadmapFilename); annotationOffset = callocOrExit(reads->readCount, Coordinate); for (readIndex = 1; readIndex < reads->readCount; readIndex++) annotationOffset[readIndex] = annotationOffset[readIndex - 1] + getAnnotationCount(getRoadMapInArray(roadmap, readIndex - 1)); } resetNodeStatus(graph); // Allocate memory for the read pairs if (!readStartsAreActivated(graph)) activateReadStarts(graph); gettimeofday(&start, NULL); #ifdef OPENMP initSmallNodeListMemory(); createNodeLocks(graph); #pragma omp parallel for #endif for (readIndex = refCount; readIndex < reads->readCount; readIndex++) { Annotation * annotations = NULL; IDnum annotationCount = 0; Category category; boolean second_in_pair; if (readIndex % 1000000 == 0) velvetLog("Ghost Threading through reads %ld / %ld\n", (long) readIndex, (long) reads->readCount); category = reads->categories[readIndex]; second_in_pair = reads->categories[readIndex] & 1 && isSecondInPair(reads, readIndex); if (referenceMappings) { annotationCount = getAnnotationCount(getRoadMapInArray(roadmap, readIndex)); annotations = getAnnotationInArray(roadmap->annotations, annotationOffset[readIndex]); } ghostThreadSequenceThroughGraph(getTightStringInArray(reads->tSequences, readIndex), kmerTable, graph, readIndex + 1, category, readTracking, double_strand, referenceMappings, referenceMappingCount, refCount, annotations, annotationCount, second_in_pair); } createNodeReadStartArrays(graph); gettimeofday(&end, NULL); timersub(&end, &start, &diff); velvetLog(" === Ghost-Threaded in %ld.%06ld s\n", diff.tv_sec, diff.tv_usec); 
gettimeofday(&start, NULL); #ifdef OPENMP int threads = omp_get_max_threads(); if (threads > 32) threads = 32; #pragma omp parallel for num_threads(threads) #endif for (readIndex = 0; readIndex < reads->readCount; readIndex++) { Annotation * annotations = NULL; IDnum annotationCount = 0; Category category; boolean second_in_pair; if (readIndex % 1000000 == 0) velvetLog("Threading through reads %li / %li\n", (long) readIndex, (long) reads->readCount); category = reads->categories[readIndex]; second_in_pair = reads->categories[readIndex] % 2 && isSecondInPair(reads, readIndex); if (referenceMappings) { annotationCount = getAnnotationCount(getRoadMapInArray(roadmap, readIndex)); annotations = getAnnotationInArray(roadmap->annotations, annotationOffset[readIndex]); } threadSequenceThroughGraph(getTightStringInArray(reads->tSequences, readIndex), kmerTable, graph, readIndex + 1, category, readTracking, double_strand, referenceMappings, referenceMappingCount, refCount, annotations, annotationCount, second_in_pair); } gettimeofday(&end, NULL); timersub(&end, &start, &diff); velvetLog(" === Threaded in %ld.%06ld s\n", diff.tv_sec, diff.tv_usec); #ifdef OPENMP free(nodeLocks); nodeLocks = NULL; #endif if (referenceMappings) { destroyRoadMapArray(roadmap); free (annotationOffset); } orderNodeReadStartArrays(graph); destroySmallNodeListMemmory(); destroyKmerOccurenceTable(kmerTable); }
// Creates insertion marker lists static void setInsertionMarkers(RoadMapArray * rdmaps, IDnum * markerCounters, InsertionMarker ** veryLastMarker, InsertionMarker ** insertionMarkers) { IDnum sequenceCounter = rdmaps->length; IDnum sequenceIndex, sequenceIndex2; Coordinate totalCount = 0; RoadMap *rdmap; Annotation *annot = rdmaps->annotations; InsertionMarker *nextMarker, *newMarker; IDnum annotIndex, lastAnnotIndex; InsertionMarker **insMarkers = callocOrExit(rdmaps->length + 1, InsertionMarker *); // Counting insertion markers for (sequenceIndex = 1; sequenceIndex < sequenceCounter + 1; sequenceIndex++) { //velvetLog("Going through sequence %d\n", sequenceIndex); rdmap = getRoadMapInArray(rdmaps, sequenceIndex - 1); lastAnnotIndex = getAnnotationCount(rdmap); // Set insertion markers in previous sequences : for (annotIndex = 0; annotIndex < lastAnnotIndex; annotIndex++) { if (getAnnotSequenceID(annot) > 0) { markerCounters[getAnnotSequenceID(annot)] += 2; } else { markerCounters[-getAnnotSequenceID(annot)] += 2; } totalCount += 2; annot = getNextAnnotation(annot); } } // Allocating space *insertionMarkers = callocOrExit(totalCount, InsertionMarker); *veryLastMarker = *insertionMarkers + totalCount; // Pointing each node to its space nextMarker = *insertionMarkers; for (sequenceIndex = 1; sequenceIndex < sequenceCounter + 1; sequenceIndex++) { insMarkers[sequenceIndex] = nextMarker; nextMarker = nextMarker + markerCounters[sequenceIndex]; markerCounters[sequenceIndex] = 0; } // Filling up space with data annot = rdmaps->annotations; for (sequenceIndex = 1; sequenceIndex < sequenceCounter + 1; sequenceIndex++) { //velvetLog("Going through sequence %d\n", sequenceIndex); rdmap = getRoadMapInArray(rdmaps, sequenceIndex - 1); lastAnnotIndex = getAnnotationCount(rdmap); // Set insertion markers in previous sequences : for (annotIndex = 0; annotIndex < lastAnnotIndex; annotIndex++) { sequenceIndex2 = getAnnotSequenceID(annot); if (sequenceIndex2 > 0) { newMarker = 
insMarkers[sequenceIndex2] + (markerCounters[sequenceIndex2])++; newMarker->annot = annot; newMarker->isStart = true; newMarker = insMarkers[sequenceIndex2] + (markerCounters[sequenceIndex2])++; newMarker->annot = annot; newMarker->isStart = false; } else { incrementAnnotationCoordinates(annot); newMarker = insMarkers[-sequenceIndex2] + (markerCounters[-sequenceIndex2])++; newMarker->annot = annot; newMarker->isStart = true; newMarker = insMarkers[-sequenceIndex2] + (markerCounters[-sequenceIndex2])++; newMarker->annot = annot; newMarker->isStart = false; } annot = getNextAnnotation(annot); } } orderInsertionMarkers(insMarkers, markerCounters, rdmaps); free(insMarkers); }