static void fillUpGraph(ReadSet * reads, KmerOccurenceTable * kmerTable, Graph * graph, boolean readTracking, boolean double_strand, ReferenceMapping * referenceMappings, Coordinate referenceMappingCount, IDnum refCount, char * roadmapFilename) { IDnum readIndex; RoadMapArray *roadmap = NULL; Coordinate *annotationOffset = NULL; struct timeval start, end, diff; if (referenceMappings) { roadmap = importRoadMapArray(roadmapFilename); annotationOffset = callocOrExit(reads->readCount, Coordinate); for (readIndex = 1; readIndex < reads->readCount; readIndex++) annotationOffset[readIndex] = annotationOffset[readIndex - 1] + getAnnotationCount(getRoadMapInArray(roadmap, readIndex - 1)); } resetNodeStatus(graph); // Allocate memory for the read pairs if (!readStartsAreActivated(graph)) activateReadStarts(graph); gettimeofday(&start, NULL); #ifdef OPENMP initSmallNodeListMemory(); createNodeLocks(graph); #pragma omp parallel for #endif for (readIndex = refCount; readIndex < reads->readCount; readIndex++) { Annotation * annotations = NULL; IDnum annotationCount = 0; Category category; boolean second_in_pair; if (readIndex % 1000000 == 0) velvetLog("Ghost Threading through reads %ld / %ld\n", (long) readIndex, (long) reads->readCount); category = reads->categories[readIndex]; second_in_pair = reads->categories[readIndex] & 1 && isSecondInPair(reads, readIndex); if (referenceMappings) { annotationCount = getAnnotationCount(getRoadMapInArray(roadmap, readIndex)); annotations = getAnnotationInArray(roadmap->annotations, annotationOffset[readIndex]); } ghostThreadSequenceThroughGraph(getTightStringInArray(reads->tSequences, readIndex), kmerTable, graph, readIndex + 1, category, readTracking, double_strand, referenceMappings, referenceMappingCount, refCount, annotations, annotationCount, second_in_pair); } createNodeReadStartArrays(graph); gettimeofday(&end, NULL); timersub(&end, &start, &diff); velvetLog(" === Ghost-Threaded in %ld.%06ld s\n", diff.tv_sec, diff.tv_usec); gettimeofday(&start, NULL); #ifdef OPENMP int threads = omp_get_max_threads(); if (threads > 32) threads = 32; #pragma omp parallel for num_threads(threads) #endif for (readIndex = 0; readIndex < reads->readCount; readIndex++) { Annotation * annotations = NULL; IDnum annotationCount = 0; Category category; boolean second_in_pair; if (readIndex % 1000000 == 0) velvetLog("Threading through reads %li / %li\n", (long) readIndex, (long) reads->readCount); category = reads->categories[readIndex]; second_in_pair = reads->categories[readIndex] % 2 && isSecondInPair(reads, readIndex); if (referenceMappings) { annotationCount = getAnnotationCount(getRoadMapInArray(roadmap, readIndex)); annotations = getAnnotationInArray(roadmap->annotations, annotationOffset[readIndex]); } threadSequenceThroughGraph(getTightStringInArray(reads->tSequences, readIndex), kmerTable, graph, readIndex + 1, category, readTracking, double_strand, referenceMappings, referenceMappingCount, refCount, annotations, annotationCount, second_in_pair); } gettimeofday(&end, NULL); timersub(&end, &start, &diff); velvetLog(" === Threaded in %ld.%06ld s\n", diff.tv_sec, diff.tv_usec); #ifdef OPENMP free(nodeLocks); nodeLocks = NULL; #endif if (referenceMappings) { destroyRoadMapArray(roadmap); free (annotationOffset); } orderNodeReadStartArrays(graph); destroySmallNodeListMemmory(); destroyKmerOccurenceTable(kmerTable); }
// Creates the preNode using insertion marker and annotation lists for each sequence static void // Creates the preNode using insertion marker and annotation lists for each sequence createPreNodes(RoadMapArray * rdmaps, PreGraph * preGraph, IDnum * markerCounters, InsertionMarker * insertionMarkers, InsertionMarker * veryLastMarker, IDnum * chains, SequencesReader *seqReadInfo, int WORDLENGTH) { char *sequenceFilename = seqReadInfo->m_seqFilename; Annotation *annot = rdmaps->annotations; IDnum latestPreNodeID; InsertionMarker *currentMarker = insertionMarkers; IDnum sequenceIndex; Coordinate currentPosition, nextStop; IDnum preNodeCounter = 1; FILE *file = NULL; char line[50000]; int lineLength = 50000; Coordinate readIndex; boolean tooShort; Kmer initialKmer; char c; RoadMap *rdmap; IDnum annotIndex, lastAnnotIndex; IDnum markerIndex, lastMarkerIndex; if (!seqReadInfo->m_bIsBinary) { file = fopen(sequenceFilename, "r"); if (file == NULL) exitErrorf(EXIT_FAILURE, true, "Could not read %s", sequenceFilename); // Reading sequence descriptor in first line if (sequenceCount_pg(preGraph) > 0 && !fgets(line, lineLength, file)) exitErrorf(EXIT_FAILURE, true, "%s incomplete.", sequenceFilename); seqReadInfo->m_pFile = file; } // Now that we have read all of the annotations, we go on to create the preNodes and tie them up for (sequenceIndex = 1; sequenceIndex <= sequenceCount_pg(preGraph); sequenceIndex++) { if (sequenceIndex % 1000000 == 0) velvetLog("Sequence %li / %li\n", (long) sequenceIndex, (long) sequenceCount_pg(preGraph)); if (!seqReadInfo->m_bIsBinary) { while (line[0] != '>') if (!fgets(line, lineLength, file)) exitErrorf(EXIT_FAILURE, true, "%s incomplete.", sequenceFilename); } rdmap = getRoadMapInArray(rdmaps, sequenceIndex - 1); annotIndex = 0; lastAnnotIndex = getAnnotationCount(rdmap); markerIndex = 0; lastMarkerIndex = markerCounters[sequenceIndex]; currentPosition = 0; // Reading first (k-1) nucleotides tooShort = false; clearKmer(&initialKmer); //velvetLog("Initial kmer: "); TightString *tString = NULL; char *strString = NULL; if (seqReadInfo->m_bIsBinary) { tString = getTightStringInArray(seqReadInfo->m_sequences->tSequences, sequenceIndex - 1); strString = readTightString(tString); } for (readIndex = 0; readIndex < WORDLENGTH - 1; readIndex++) { if (seqReadInfo->m_bIsBinary) { if (readIndex >= tString->length) { tooShort = true; break; } c = strString[readIndex]; } else { c = getc(file); while (c == '\n' || c == '\r') c = getc(file); if (c == '>' || c == 'M' || c == EOF) { ungetc(c, file); tooShort = true; break; } } switch (c) { case 'A': case 'N': pushNucleotide(&initialKmer, ADENINE); break; case 'C': pushNucleotide(&initialKmer, CYTOSINE); break; case 'G': pushNucleotide(&initialKmer, GUANINE); break; case 'T': pushNucleotide(&initialKmer, THYMINE); break; default: velvetLog ("Irregular sequence file: are you sure your Sequence and Roadmap file come from the same source?\n"); fflush(stdout); abort(); } } if (tooShort) { //velvetLog("Skipping short read.. %d\n", sequenceIndex); chains[sequenceIndex] = preNodeCounter; if (seqReadInfo->m_bIsBinary) { free(strString); } else { if (!fgets(line, lineLength, file) && sequenceIndex < sequenceCount_pg(preGraph)) exitErrorf(EXIT_FAILURE, true, "%s incomplete.", sequenceFilename); } continue; } char *currString = NULL; if (seqReadInfo->m_bIsBinary) { currString = &strString[readIndex]; seqReadInfo->m_ppCurrString = &currString; } latestPreNodeID = 0; while (annotIndex < lastAnnotIndex) { if (markerIndex == lastMarkerIndex || getPosition(annot) <= getInsertionMarkerPosition(currentMarker)) nextStop = getPosition(annot); else { nextStop = getInsertionMarkerPosition (currentMarker); } if (currentPosition != nextStop) { if (seqReadInfo->m_bIsBinary) { if (readIndex >= tString->length) { velvetLog("readIndex %ld beyond string len %ld\n", (uint64_t) readIndex, (uint64_t) tString->length); exit(1); } } //if (sequenceIndex == 481) // velvetLog("Adding pre nodes from %lli to %lli\n", (long long) currentPosition, (long long) nextStop); addPreNodeToPreGraph_pg(preGraph, currentPosition, nextStop, seqReadInfo, &initialKmer, preNodeCounter); if (latestPreNodeID == 0) { chains[sequenceIndex] = preNodeCounter; } latestPreNodeID = preNodeCounter++; currentPosition = nextStop; } while (markerIndex < lastMarkerIndex && getInsertionMarkerPosition(currentMarker) == nextStop) { convertMarker(currentMarker, latestPreNodeID); currentMarker++; markerIndex++; } while (annotIndex < lastAnnotIndex && getPosition(annot) == nextStop) { for (readIndex = 0; readIndex < getAnnotationLength(annot); readIndex++) { if (seqReadInfo->m_bIsBinary) { c = *currString; currString += 1; // increment the pointer } else { c = getc(file); while (!isalpha(c)) c = getc(file); } //if (sequenceIndex == 481) // velvetLog("(%c)", c); switch (c) { case 'A': case 'N': pushNucleotide(&initialKmer, ADENINE); break; case 'C': pushNucleotide(&initialKmer, CYTOSINE); break; case 'G': pushNucleotide(&initialKmer, GUANINE); break; case 'T': pushNucleotide(&initialKmer, THYMINE); break; default: velvetLog ("Irregular sequence file: are you sure your Sequence and Roadmap file come from the same source?\n"); fflush(stdout); #ifdef DEBUG abort(); #endif exit(1); } } annot = getNextAnnotation(annot); annotIndex++; } } while (markerIndex < lastMarkerIndex) { if (currentPosition == getInsertionMarkerPosition(currentMarker)) { convertMarker(currentMarker, latestPreNodeID); currentMarker++; markerIndex++; } else { nextStop = getInsertionMarkerPosition (currentMarker); //if (sequenceIndex == 481) // velvetLog("Adding pre nodes from %lli to %lli\n", (long long) currentPosition, (long long) nextStop); addPreNodeToPreGraph_pg(preGraph, currentPosition, nextStop, seqReadInfo, &initialKmer, preNodeCounter); if (latestPreNodeID == 0) chains[sequenceIndex] = preNodeCounter; latestPreNodeID = preNodeCounter++; currentPosition = getInsertionMarkerPosition (currentMarker); } } if (seqReadInfo->m_bIsBinary) { free(strString); } else { // End of sequence if (!fgets(line, lineLength, file) && sequenceIndex < sequenceCount_pg(preGraph)) exitErrorf(EXIT_FAILURE, true, "%s incomplete.", sequenceFilename); //velvetLog(" \n"); } if (latestPreNodeID == 0) chains[sequenceIndex] = preNodeCounter; } free(markerCounters); if (!seqReadInfo->m_bIsBinary) { fclose(file); } }
int main(int argc, char **argv) { ReadSet *sequences = NULL; RoadMapArray *rdmaps; PreGraph *preGraph; Graph *graph; char *directory, *graphFilename, *connectedGraphFilename, *preGraphFilename, *seqFilename, *roadmapFilename, *lowCovContigsFilename, *highCovContigsFilename; double coverageCutoff = -1; double longCoverageCutoff = -1; double maxCoverageCutoff = -1; double expectedCoverage = -1; Coordinate minContigLength = -1; Coordinate minContigKmerLength; boolean *dubious = NULL; Coordinate insertLength[CATEGORIES]; Coordinate insertLengthLong = -1; Coordinate std_dev[CATEGORIES]; Coordinate std_dev_long = -1; short int accelerationBits = 24; boolean readTracking = false; boolean exportAssembly = false; boolean unusedReads = false; boolean estimateCoverage = false; boolean estimateCutoff = false; boolean exportAlignments = false; FILE *file; int arg_index, arg_int; double arg_double; char *arg; ShortLength *sequenceLengths = NULL; Category cat; boolean scaffolding = true; int pebbleRounds = 1; long long longlong_var; short int short_var; boolean exportFilteredNodes = false; int clean = 0; boolean conserveLong = false; boolean shadows[CATEGORIES]; int coverageMask = 1; SequencesReader *seqReadInfo = NULL; setProgramName("velvetg"); for (cat = 0; cat < CATEGORIES; cat++) { insertLength[cat] = -1; std_dev[cat] = -1; shadows[cat] = false; } // Error message if (argc == 1) { puts("velvetg - de Bruijn graph construction, error removal and repeat resolution"); printf("Version %i.%i.%2.2i\n", VERSION_NUMBER, RELEASE_NUMBER, UPDATE_NUMBER); puts("Copyright 2007, 2008 Daniel Zerbino ([email protected])"); puts("This is free software; see the source for copying conditions. There is NO"); puts("warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."); puts("Compilation settings:"); printf("CATEGORIES = %i\n", CATEGORIES); printf("MAXKMERLENGTH = %i\n", MAXKMERLENGTH); #ifdef _OPENMP puts("OPENMP"); #endif #ifdef LONGSEQUENCES puts("LONGSEQUENCES"); #endif #ifdef BIGASSEMBLY puts("BIGASSEMBLY"); #endif #ifdef COLOR puts("COLOR"); #endif #ifdef DEBUG puts("DEBUG"); #endif puts(""); printUsage(); return 1; } if (strcmp(argv[1], "--help") == 0) { printUsage(); return 0; } // Memory allocation directory = argv[1]; graphFilename = mallocOrExit(strlen(directory) + 100, char); connectedGraphFilename = mallocOrExit(strlen(directory) + 100, char); preGraphFilename = mallocOrExit(strlen(directory) + 100, char); roadmapFilename = mallocOrExit(strlen(directory) + 100, char); seqFilename = mallocOrExit(strlen(directory) + 100, char); lowCovContigsFilename = mallocOrExit(strlen(directory) + 100, char); highCovContigsFilename = mallocOrExit(strlen(directory) + 100, char); // Argument parsing for (arg_index = 2; arg_index < argc; arg_index++) { arg = argv[arg_index++]; if (arg_index >= argc) { velvetLog("Unusual number of arguments!\n"); printUsage(); #ifdef DEBUG abort(); #endif exit(1); } if (strcmp(arg, "-cov_cutoff") == 0) { if (strcmp(argv[arg_index], "auto") == 0) { estimateCutoff = true; } else { sscanf(argv[arg_index], "%lf", &coverageCutoff); } } else if (strcmp(arg, "-long_cov_cutoff") == 0) { sscanf(argv[arg_index], "%lf", &longCoverageCutoff); } else if (strcmp(arg, "-exp_cov") == 0) { if (strcmp(argv[arg_index], "auto") == 0) { estimateCoverage = true; readTracking = true; } else { sscanf(argv[arg_index], "%lf", &expectedCoverage); if (expectedCoverage > 0) readTracking = true; } } else if (strcmp(arg, "-ins_length") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); insertLength[0] = (Coordinate) longlong_var; if (insertLength[0] < 0) { velvetLog("Invalid insert length: %lli\n", (long long) insertLength[0]); #ifdef DEBUG abort(); #endif exit(1); } } else if (strcmp(arg, "-ins_length_sd") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); std_dev[0] = (Coordinate) longlong_var; if (std_dev[0] < 0) { velvetLog("Invalid std deviation: %lli\n", (long long) std_dev[0]); #ifdef DEBUG abort(); #endif exit(1); } } else if (strcmp(arg, "-ins_length_long") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); insertLengthLong = (Coordinate) longlong_var; } else if (strcmp(arg, "-ins_length_long_sd") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); std_dev_long = (Coordinate) longlong_var; } else if (strncmp(arg, "-ins_length", 11) == 0 && strchr(arg, 'd') == NULL) { sscanf(arg, "-ins_length%hi", &short_var); cat = (Category) short_var; if (cat < 1 || cat > CATEGORIES) { velvetLog("Unknown option: %s\n", arg); #ifdef DEBUG abort(); #endif exit(1); } sscanf(argv[arg_index], "%lli", &longlong_var); insertLength[cat - 1] = (Coordinate) longlong_var; if (insertLength[cat - 1] < 0) { velvetLog("Invalid insert length: %lli\n", (long long) insertLength[cat - 1]); #ifdef DEBUG abort(); #endif exit(1); } } else if (strncmp(arg, "-ins_length", 11) == 0) { sscanf(arg, "-ins_length%hi_sd", &short_var); cat = (Category) short_var; if (cat < 1 || cat > CATEGORIES) { velvetLog("Unknown option: %s\n", arg); #ifdef DEBUG abort(); #endif exit(1); } sscanf(argv[arg_index], "%lli", &longlong_var); std_dev[cat - 1] = (Coordinate) longlong_var; if (std_dev[cat - 1] < 0) { velvetLog("Invalid std deviation: %lli\n", (long long) std_dev[cat - 1]); #ifdef DEBUG abort(); #endif exit(1); } } else if (strcmp(arg, "-read_trkg") == 0) { readTracking = (strcmp(argv[arg_index], "yes") == 0); } else if (strcmp(arg, "-scaffolding") == 0) { scaffolding = (strcmp(argv[arg_index], "yes") == 0); } else if (strcmp(arg, "-exportFiltered") == 0) { exportFilteredNodes = (strcmp(argv[arg_index], "yes") == 0); } else if (strcmp(arg, "-amos_file") == 0) { exportAssembly = (strcmp(argv[arg_index], "yes") == 0); } else if (strcmp(arg, "-alignments") == 0) { exportAlignments = (strcmp(argv[arg_index], "yes") == 0); } else if (strcmp(arg, "-min_contig_lgth") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); minContigLength = (Coordinate) longlong_var; } else if (strcmp(arg, "-coverage_mask") == 0) { sscanf(argv[arg_index], "%lli", &longlong_var); coverageMask = (IDnum) longlong_var; } else if (strcmp(arg, "-accel_bits") == 0) { sscanf(argv[arg_index], "%hi", &accelerationBits); if (accelerationBits < 0) { velvetLog ("Illegal acceleration parameter: %s\n", argv[arg_index]); printUsage(); return -1; } } else if (strcmp(arg, "-max_branch_length") == 0) { sscanf(argv[arg_index], "%i", &arg_int); setMaxReadLength(arg_int); setLocalMaxReadLength(arg_int); } else if (strcmp(arg, "-max_divergence") == 0) { sscanf(argv[arg_index], "%lf", &arg_double); setMaxDivergence(arg_double); setLocalMaxDivergence(arg_double); } else if (strcmp(arg, "-max_gap_count") == 0) { sscanf(argv[arg_index], "%i", &arg_int); setMaxGaps(arg_int); setLocalMaxGaps(arg_int); } else if (strcmp(arg, "-min_pair_count") == 0) { sscanf(argv[arg_index], "%i", &arg_int); setUnreliableConnectionCutoff(arg_int); } else if (strcmp(arg, "-max_coverage") == 0) { sscanf(argv[arg_index], "%lf", &maxCoverageCutoff); } else if (strcmp(arg, "-long_mult_cutoff") == 0) { sscanf(argv[arg_index], "%i", &arg_int); setMultiplicityCutoff(arg_int); } else if (strcmp(arg, "-paired_exp_fraction") == 0) { sscanf(argv[arg_index], "%lf", &arg_double); setPairedExpFraction(arg_double); } else if (strcmp(arg, "-clean") == 0) { if (strcmp(argv[arg_index], "yes") == 0) clean = 1; } else if (strcmp(arg, "-very_clean") == 0) { if (strcmp(argv[arg_index], "yes") == 0) clean = 2; } else if (strcmp(arg, "-conserveLong") == 0) { if (strcmp(argv[arg_index], "yes") == 0) conserveLong = 2; } else if (strcmp(arg, "-unused_reads") == 0) { unusedReads = (strcmp(argv[arg_index], "yes") == 0); if (unusedReads) readTracking = true; } else if (strcmp(arg, "-shortMatePaired") == 0) { shadows[0] = (strcmp(argv[arg_index], "yes") == 0); } else if (strncmp(arg, "-shortMatePaired", 16) == 0) { sscanf(arg, "-shortMatePaired%hi", &short_var); cat = (Category) short_var; if (cat < 1 || cat > CATEGORIES) { velvetLog("Unknown option: %s\n", arg); #ifdef DEBUG abort(); #endif exit(1); } shadows[cat - 1] = (strcmp(argv[arg_index], "yes") == 0); } else if (strcmp(arg, "--help") == 0) { printUsage(); return 0; } else { velvetLog("Unknown option: %s;\n", arg); printUsage(); return 1; } } // Bookkeeping logInstructions(argc, argv, directory); seqReadInfo = callocOrExit(1, SequencesReader); strcpy(seqFilename, directory); // if binary CnyUnifiedSeq exists, use it. Otherwise try Sequences strcat(seqFilename, "/CnyUnifiedSeq"); if (access(seqFilename, R_OK) == 0) { seqReadInfo->m_bIsBinary = true; } else { seqReadInfo->m_bIsBinary = false; strcpy(seqFilename, directory); strcat(seqFilename, "/Sequences"); } seqReadInfo->m_seqFilename = seqFilename; strcpy(roadmapFilename, directory); strcat(roadmapFilename, "/Roadmaps"); strcpy(preGraphFilename, directory); strcat(preGraphFilename, "/PreGraph"); strcpy(connectedGraphFilename, directory); strcat(connectedGraphFilename, "/ConnectedGraph"); if (!readTracking) { strcpy(graphFilename, directory); strcat(graphFilename, "/Graph"); } else { strcpy(graphFilename, directory); strcat(graphFilename, "/Graph2"); } strcpy(lowCovContigsFilename, directory); strcat(lowCovContigsFilename, "/lowCoverageContigs.fa"); strcpy(highCovContigsFilename, directory); strcat(highCovContigsFilename, "/highCoverageContigs.fa"); // Graph uploading or creation if ((file = fopen(graphFilename, "r")) != NULL) { fclose(file); graph = importGraph(graphFilename); } else if ((file = fopen(connectedGraphFilename, "r")) != NULL) { fclose(file); if (seqReadInfo->m_bIsBinary) { sequences = importCnyReadSet(seqFilename); #if 0 // compare to velvet's version of a seq ReadSet *compareSequences = NULL; compareSeqFilename = mallocOrExit(strlen(directory) + 100, char); strcpy(compareSeqFilename, directory); strcat(compareSeqFilename, "/Sequences"); compareSequences = importReadSet(compareSeqFilename); convertSequences(compareSequences); if (sequences->readCount != compareSequences->readCount) { printf("read count mismatch\n"); exit(1); } int i; for (i = 0; i < sequences->readCount; i++) { TightString *tString = getTightStringInArray(sequences->tSequences, i); TightString *tStringCmp = getTightStringInArray(compareSequences->tSequences, i); if (getLength(tString) != getLength(tStringCmp)) { printf("sequence %d len mismatch\n", i); exit(1); } if (strcmp(readTightString(tString), readTightString(tStringCmp)) != 0) { printf("sequence %d cmp mismatch\n", i); printf("seq %s != cmp %s\n", readTightString(tString), readTightString(tStringCmp)); exit(1); } } #endif } else {