示例#1
0
文件: run2.c 项目: a1aks/velvet
/*
 * Entry point of the program that registers itself as "velvetg".
 *
 * argv[1] is used as the working directory; every following argument is
 * consumed as an "-option value" pair. Called without arguments, the
 * function prints a banner, the compile-time settings and the usage text
 * and returns 1; called with "--help" as first argument it prints the usage
 * text and returns 0.
 *
 * After parsing, the per-directory file names (Sequences / CnyUnifiedSeq,
 * Roadmaps, PreGraph, ConnectedGraph, Graph or Graph2, low/high coverage
 * contig files) are built and the function starts loading an existing
 * graph if one is found.
 *
 * NOTE: this excerpt is truncated inside the graph-loading branch, so the
 * remainder of the function (and its final return value) is not visible
 * and is deliberately left undocumented.
 */
int main(int argc, char **argv)
{
	ReadSet *sequences = NULL;
	RoadMapArray *rdmaps;
	PreGraph *preGraph;
	Graph *graph;
	char *directory, *graphFilename, *connectedGraphFilename,
	    *preGraphFilename, *seqFilename, *roadmapFilename,
	    *lowCovContigsFilename, *highCovContigsFilename;
	// Numeric options start at -1, which the parser below never produces
	// for the validated insert-length options (negative values are rejected).
	double coverageCutoff = -1;
	double longCoverageCutoff = -1;
	double maxCoverageCutoff = -1;
	double expectedCoverage = -1;
	Coordinate minContigLength = -1;
	Coordinate minContigKmerLength;
	boolean *dubious = NULL;
	Coordinate insertLength[CATEGORIES];
	Coordinate insertLengthLong = -1;
	Coordinate std_dev[CATEGORIES];
	Coordinate std_dev_long = -1;
	short int accelerationBits = 24;
	boolean readTracking = false;
	boolean exportAssembly = false;
	boolean unusedReads = false;
	boolean estimateCoverage = false;
	boolean estimateCutoff = false;
	boolean exportAlignments = false;
	FILE *file;
	int arg_index, arg_int;
	double arg_double;
	char *arg;
	ShortLength *sequenceLengths = NULL;
	Category cat;
	boolean scaffolding = true;
	int pebbleRounds = 1;
	// Scratch targets for sscanf; they are NOT initialised here.
	long long longlong_var;
	short int short_var;
	boolean exportFilteredNodes = false;
	int clean = 0;
	boolean conserveLong = false;
	boolean shadows[CATEGORIES];
	int coverageMask = 1;
	SequencesReader *seqReadInfo = NULL;

	setProgramName("velvetg");

	// Reset every per-category option to its "not provided" value.
	for (cat = 0; cat < CATEGORIES; cat++) {
		insertLength[cat] = -1;
		std_dev[cat] = -1;
		shadows[cat] = false;
	}

	// No arguments at all: print the banner, the compile-time settings and
	// the usage text, then report failure with exit status 1.
	if (argc == 1) {
		puts("velvetg - de Bruijn graph construction, error removal and repeat resolution");
		printf("Version %i.%i.%2.2i\n", VERSION_NUMBER,
		       RELEASE_NUMBER, UPDATE_NUMBER);
		puts("Copyright 2007, 2008 Daniel Zerbino ([email protected])");
		puts("This is free software; see the source for copying conditions.  There is NO");
		puts("warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.");
		puts("Compilation settings:");
		printf("CATEGORIES = %i\n", CATEGORIES);
		printf("MAXKMERLENGTH = %i\n", MAXKMERLENGTH);
		// One line per optional feature macro that was defined at build time.
#ifdef _OPENMP
		puts("OPENMP");
#endif
#ifdef LONGSEQUENCES
		puts("LONGSEQUENCES");
#endif
#ifdef BIGASSEMBLY
		puts("BIGASSEMBLY");
#endif
#ifdef COLOR
		puts("COLOR");
#endif
#ifdef DEBUG
		puts("DEBUG");
#endif
		puts("");
		printUsage();
		return 1;
	}

	// argc > 1 here, so argv[1] is safe to read.
	if (strcmp(argv[1], "--help") == 0) {
		printUsage();
		return 0;
	}

	// Path buffers: each one is sized strlen(directory) + 100 so that the
	// fixed file-name suffixes appended further down fit.
	directory = argv[1];
	graphFilename = mallocOrExit(strlen(directory) + 100, char);
	connectedGraphFilename = mallocOrExit(strlen(directory) + 100, char);
	preGraphFilename =
	    mallocOrExit(strlen(directory) + 100, char);
	roadmapFilename = mallocOrExit(strlen(directory) + 100, char);
	seqFilename = mallocOrExit(strlen(directory) + 100, char);
	lowCovContigsFilename = mallocOrExit(strlen(directory) + 100, char);
	highCovContigsFilename = mallocOrExit(strlen(directory) + 100, char);

	// Argument parsing.
	// Options are read in pairs: `arg` is the option name and, after the
	// post-increment, argv[arg_index] is its value. The loop's own
	// increment then steps past the value to the next option name.
	for (arg_index = 2; arg_index < argc; arg_index++) {
		arg = argv[arg_index++];
		// An option name with nothing after it is a fatal usage error.
		if (arg_index >= argc) {
			velvetLog("Unusual number of arguments!\n");
			printUsage();
#ifdef DEBUG 
			abort();
#endif 
			exit(1);
		}

		// NOTE(review): none of the sscanf calls below check their return
		// value. For the scratch variables (longlong_var, short_var, arg_int,
		// arg_double), which are never initialised, a value that fails to
		// parse means an indeterminate value is read afterwards.
		if (strcmp(arg, "-cov_cutoff") == 0) {
			// "auto" requests estimation instead of a fixed numeric cutoff.
			if (strcmp(argv[arg_index], "auto") == 0) {
				estimateCutoff = true;
			} else {
				sscanf(argv[arg_index], "%lf", &coverageCutoff);
			}
		} else if (strcmp(arg, "-long_cov_cutoff") == 0) {
			sscanf(argv[arg_index], "%lf", &longCoverageCutoff);
		} else if (strcmp(arg, "-exp_cov") == 0) {
			// Both "auto" and a positive explicit value switch read
			// tracking on as a side effect.
			if (strcmp(argv[arg_index], "auto") == 0) {
				estimateCoverage = true;
				readTracking = true;
			} else {
				sscanf(argv[arg_index], "%lf", &expectedCoverage);
				if (expectedCoverage > 0)
					readTracking = true;
			}
		} else if (strcmp(arg, "-ins_length") == 0) {
			// Un-numbered form: applies to the first category (index 0).
			sscanf(argv[arg_index], "%lli", &longlong_var);
			insertLength[0] = (Coordinate) longlong_var;
			if (insertLength[0] < 0) {
				velvetLog("Invalid insert length: %lli\n",
				       (long long) insertLength[0]);
#ifdef DEBUG 
				abort();
#endif 
				exit(1);
			}
		} else if (strcmp(arg, "-ins_length_sd") == 0) {
			sscanf(argv[arg_index], "%lli", &longlong_var);
			std_dev[0] = (Coordinate) longlong_var;
			if (std_dev[0] < 0) {
				velvetLog("Invalid std deviation: %lli\n",
				       (long long) std_dev[0]);
#ifdef DEBUG 
				abort();
#endif 
				exit(1);
			}
		} else if (strcmp(arg, "-ins_length_long") == 0) {
			// Unlike the per-category variants, no sign check is done here.
			sscanf(argv[arg_index], "%lli", &longlong_var);
			insertLengthLong = (Coordinate) longlong_var;
		} else if (strcmp(arg, "-ins_length_long_sd") == 0) {
			sscanf(argv[arg_index], "%lli", &longlong_var);
			std_dev_long = (Coordinate) longlong_var;
		} else if (strncmp(arg, "-ins_length", 11) == 0
			   && strchr(arg, 'd') == NULL) {
			// Numbered form "-ins_length<N>": the absence of any 'd' in the
			// option separates it from the "..._sd" variant handled next.
			// N is 1-based on the command line and stored at index N - 1.
			sscanf(arg, "-ins_length%hi", &short_var);
			cat = (Category) short_var;
			if (cat < 1 || cat > CATEGORIES) {
				velvetLog("Unknown option: %s\n", arg);
#ifdef DEBUG 
				abort();
#endif 
				exit(1);
			}
			sscanf(argv[arg_index], "%lli", &longlong_var);
			insertLength[cat - 1] = (Coordinate) longlong_var;
			if (insertLength[cat - 1] < 0) {
				velvetLog("Invalid insert length: %lli\n",
				       (long long) insertLength[cat - 1]);
#ifdef DEBUG 
				abort();
#endif 
				exit(1);
			}
		} else if (strncmp(arg, "-ins_length", 11) == 0) {
			// Remaining "-ins_length..." options are parsed as
			// "-ins_length<N>_sd"; N is again 1-based.
			sscanf(arg, "-ins_length%hi_sd", &short_var);
			cat = (Category) short_var;
			if (cat < 1 || cat > CATEGORIES) {
				velvetLog("Unknown option: %s\n", arg);
#ifdef DEBUG 
				abort();
#endif 
				exit(1);
			}
			sscanf(argv[arg_index], "%lli", &longlong_var);
			std_dev[cat - 1] = (Coordinate) longlong_var;
			if (std_dev[cat - 1] < 0) {
				velvetLog("Invalid std deviation: %lli\n",
				       (long long) std_dev[cat - 1]);
#ifdef DEBUG 
				abort();
#endif 
				exit(1);
			}
		} else if (strcmp(arg, "-read_trkg") == 0) {
			// yes/no flags: anything other than the exact string "yes"
			// is treated as false.
			readTracking =
			    (strcmp(argv[arg_index], "yes") == 0);
		} else if (strcmp(arg, "-scaffolding") == 0) {
			scaffolding =
			    (strcmp(argv[arg_index], "yes") == 0);
		} else if (strcmp(arg, "-exportFiltered") == 0) {
			exportFilteredNodes =
			    (strcmp(argv[arg_index], "yes") == 0);
		} else if (strcmp(arg, "-amos_file") == 0) {
			exportAssembly =
			    (strcmp(argv[arg_index], "yes") == 0);
		} else if (strcmp(arg, "-alignments") == 0) {
			exportAlignments =
			    (strcmp(argv[arg_index], "yes") == 0);
		} else if (strcmp(arg, "-min_contig_lgth") == 0) {
			sscanf(argv[arg_index], "%lli", &longlong_var);
			minContigLength = (Coordinate) longlong_var;
		} else if (strcmp(arg, "-coverage_mask") == 0) {
			// NOTE(review): coverageMask is declared int but the value is
			// cast to IDnum before the assignment - confirm this is intended.
			sscanf(argv[arg_index], "%lli", &longlong_var);
			coverageMask = (IDnum) longlong_var;
		} else if (strcmp(arg, "-accel_bits") == 0) {
			sscanf(argv[arg_index], "%hi", &accelerationBits);
			if (accelerationBits < 0) {
				// NOTE(review): this path returns -1 while the other
				// argument errors use exit(1) / return 1.
				velvetLog
				    ("Illegal acceleration parameter: %s\n",
				     argv[arg_index]);
				printUsage();
				return -1;
			}
		} else if (strcmp(arg, "-max_branch_length") == 0) {
			// The next few options forward the parsed value to setter
			// functions defined elsewhere (a "global" and a "local" one).
			sscanf(argv[arg_index], "%i", &arg_int);
			setMaxReadLength(arg_int);
			setLocalMaxReadLength(arg_int);
		} else if (strcmp(arg, "-max_divergence") == 0) {
			sscanf(argv[arg_index], "%lf", &arg_double);
			setMaxDivergence(arg_double);
			setLocalMaxDivergence(arg_double);
		} else if (strcmp(arg, "-max_gap_count") == 0) {
			sscanf(argv[arg_index], "%i", &arg_int);
			setMaxGaps(arg_int);
			setLocalMaxGaps(arg_int);
		} else if (strcmp(arg, "-min_pair_count") == 0) {
			sscanf(argv[arg_index], "%i", &arg_int);
			setUnreliableConnectionCutoff(arg_int);
		} else if (strcmp(arg, "-max_coverage") == 0) {
			sscanf(argv[arg_index], "%lf", &maxCoverageCutoff);
		} else if (strcmp(arg, "-long_mult_cutoff") == 0) {
			sscanf(argv[arg_index], "%i", &arg_int);
			setMultiplicityCutoff(arg_int);
		} else if (strcmp(arg, "-paired_exp_fraction") == 0) {
			sscanf(argv[arg_index], "%lf", &arg_double);
			setPairedExpFraction(arg_double);
		} else if (strcmp(arg, "-clean") == 0) {
			// "yes" selects clean level 1; any other value leaves the
			// current level untouched (it does not reset it to 0).
			if (strcmp(argv[arg_index], "yes") == 0)
				clean = 1;
		} else if (strcmp(arg, "-very_clean") == 0) {
			// "yes" selects clean level 2.
			if (strcmp(argv[arg_index], "yes") == 0)
				clean = 2;
		} else if (strcmp(arg, "-conserveLong") == 0) {
			// NOTE(review): conserveLong is declared boolean, yet 2 is
			// assigned here; this looks copied from -very_clean above.
			// Confirm whether `true` was meant.
			if (strcmp(argv[arg_index], "yes") == 0)
				conserveLong = 2;
		} else if (strcmp(arg, "-unused_reads") == 0) {
			// Requesting unused reads also forces read tracking on.
			unusedReads =
			    (strcmp(argv[arg_index], "yes") == 0);
			if (unusedReads)
				readTracking = true;
		} else if (strcmp(arg, "-shortMatePaired") == 0) {
			// Un-numbered form: applies to the first category (index 0).
			shadows[0] = (strcmp(argv[arg_index], "yes") == 0);
		} else if (strncmp(arg, "-shortMatePaired", 16) == 0) {
			// Numbered form "-shortMatePaired<N>", N being 1-based.
			sscanf(arg, "-shortMatePaired%hi", &short_var);
			cat = (Category) short_var;
			if (cat < 1 || cat > CATEGORIES) {
				velvetLog("Unknown option: %s\n", arg);
#ifdef DEBUG
				abort();
#endif
				exit(1);
			}
			shadows[cat - 1] = (strcmp(argv[arg_index], "yes") == 0);
		} else if (strcmp(arg, "--help") == 0) {
			// NOTE(review): because of the missing-value check at the top
			// of the loop, this branch is only reached when "--help" is
			// followed by at least one more argument.
			printUsage();
			return 0;
		} else {
			velvetLog("Unknown option: %s;\n", arg);
			printUsage();
			return 1;
		}
	}

	// Bookkeeping
	logInstructions(argc, argv, directory);

	seqReadInfo = callocOrExit(1, SequencesReader);
	strcpy(seqFilename, directory);
	// if binary CnyUnifiedSeq exists, use it.  Otherwise try Sequences
	strcat(seqFilename, "/CnyUnifiedSeq");
	if (access(seqFilename, R_OK) == 0) {
		seqReadInfo->m_bIsBinary = true;
	} else {
		seqReadInfo->m_bIsBinary = false;
		// Rebuild the path from scratch with the non-binary file name.
		strcpy(seqFilename, directory);
	strcat(seqFilename, "/Sequences");
	}
	// The reader keeps a pointer to seqFilename (no copy is made here).
	seqReadInfo->m_seqFilename = seqFilename;
	strcpy(roadmapFilename, directory);
	strcat(roadmapFilename, "/Roadmaps");

	strcpy(preGraphFilename, directory);
	strcat(preGraphFilename, "/PreGraph");

	strcpy(connectedGraphFilename, directory);
	strcat(connectedGraphFilename, "/ConnectedGraph");

	// The graph file is "Graph" without read tracking and "Graph2" with it.
	if (!readTracking) {
		strcpy(graphFilename, directory);
		strcat(graphFilename, "/Graph");
	} else {
		strcpy(graphFilename, directory);
		strcat(graphFilename, "/Graph2");
	}

	strcpy(lowCovContigsFilename, directory);
	strcat(lowCovContigsFilename, "/lowCoverageContigs.fa");

	strcpy(highCovContigsFilename, directory);
	strcat(highCovContigsFilename, "/highCoverageContigs.fa");

	// Graph uploading or creation.
	// Each candidate file is probed by opening it for reading and closing
	// it again straight away; the actual import is done by the helpers.
	if ((file = fopen(graphFilename, "r")) != NULL) {
		fclose(file);

		graph = importGraph(graphFilename);

	} else if ((file = fopen(connectedGraphFilename, "r")) != NULL) {
		fclose(file);
		if (seqReadInfo->m_bIsBinary) {

			sequences = importCnyReadSet(seqFilename);

			// Disabled self-check: would compare the binary read set with
			// the one imported from the "Sequences" file.
			// NOTE(review): compareSeqFilename is not declared among the
			// locals of this function, so this block would not compile if
			// it were enabled as-is.
#if 0
			// compare to velvet's version of a seq
			ReadSet *compareSequences = NULL;
			compareSeqFilename = mallocOrExit(strlen(directory) + 100, char);
			strcpy(compareSeqFilename, directory);
			strcat(compareSeqFilename, "/Sequences");
			compareSequences = importReadSet(compareSeqFilename);
			convertSequences(compareSequences);
			if (sequences->readCount != compareSequences->readCount) {
				printf("read count mismatch\n");
				exit(1);
			}
			int i;
			for (i = 0; i < sequences->readCount; i++) {
				TightString *tString = getTightStringInArray(sequences->tSequences, i);
				TightString *tStringCmp = getTightStringInArray(compareSequences->tSequences, i);
				if (getLength(tString) != getLength(tStringCmp)) {
					printf("sequence %d len mismatch\n", i);
					exit(1);
				}
				if (strcmp(readTightString(tString), readTightString(tStringCmp)) != 0) {
					printf("sequence %d cmp mismatch\n", i);
					printf("seq %s != cmp %s\n", readTightString(tString), readTightString(tStringCmp));
					exit(1);
				}
			}
#endif
		} else {
示例#2
0
int main(int argc, char **argv)
{
	ReadSet *allSequences = NULL;
	SplayTable *splayTable;
	int hashLength, hashLengthStep, hashLengthMax, h;
	char *directory, *filename, *seqFilename, *baseSeqName, *buf;
	char * token;
	boolean double_strand = true;
	boolean noHash = false;
	boolean multiple_kmers = false;
	char buffer[100];
	DIR *dir;

	setProgramName("velveth");

	if (argc < 4) {
		printf("velveth - simple hashing program\n");
		printf("Version %i.%i.%2.2i\n", VERSION_NUMBER,
		       RELEASE_NUMBER, UPDATE_NUMBER);
		printf("\nCopyright 2007, 2008 Daniel Zerbino ([email protected])\n");
		printf("This is free software; see the source for copying conditions.  There is NO\n");
		printf("warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\n");
		printf("Compilation settings:\n");
		printf("CATEGORIES = %i\n", CATEGORIES);
		printf("MAXKMERLENGTH = %i\n", MAXKMERLENGTH);
#ifdef _OPENMP
		puts("OPENMP");
#endif
#ifdef LONGSEQUENCES
		puts("LONGSEQUENCES");
#endif
#ifdef BIGASSEMBLY
		puts("BIGASSEMBLY");
#endif
#ifdef COLOR
		puts("COLOR");
#endif
#ifdef DEBUG
		puts("DEBUG");
#endif
		printf("\n");
		printUsage();
		return 0;
	}

	strcpy(buffer, argv[2]);
	token = strtok(buffer, ",");
	hashLength = atoi(token);
	token = strtok(NULL, ",");
	if (token == NULL) {
		multiple_kmers = false;
		hashLengthMax = hashLength + 1;
	} else {
		multiple_kmers = true;
		hashLengthMax = atoi(token);
	}
	token = strtok(NULL, ",");
	if (token == NULL) {
		hashLengthStep = 2;
	} else {
		hashLengthStep = atoi(token);
	}

	if (hashLength > MAXKMERLENGTH) {
		velvetLog
		    ("Velvet can't handle k-mers as long as %i! We'll stick to %i if you don't mind.\n",
		     hashLength, MAXKMERLENGTH);
		hashLength = MAXKMERLENGTH;
	} 
	if (hashLength <= 0) {
		velvetLog("Invalid hash length: %s\n", argv[2]);
		printUsage();
		return 0;
	} 
	if (hashLength % 2 == 0) {
		velvetLog
		    ("Velvet can't work with even length k-mers, such as %i. We'll use %i instead, if you don't mind.\n",
		     hashLength, hashLength - 1);
		hashLength--;
	} 

	if (multiple_kmers) {
		if (hashLengthMax > MAXKMERLENGTH + 1) {
			velvetLog
			    ("Velvet can't handle k-mers as long as %i! We'll stick to %i if you don't mind.\n",
			     hashLengthMax, MAXKMERLENGTH + 1);
			hashLengthMax = MAXKMERLENGTH + 1;
		} 
		if (hashLengthMax <= hashLength) {
			velvetLog("hashLengthMin < hashLengthMax is required %s", argv[2]);
			printUsage();
			return 0;
		} 

		if (hashLengthStep <= 0) {
			velvetLog("Non-positive hash length! Setting it to 2\n");
			hashLengthStep = 2;
		}
		if (hashLengthStep % 2 == 1) {
			velvetLog
			    ("Velvet can't work with an odd length k-mer step, such as %i. We'll use %i instead, if you don't mind.\n",
			     hashLengthStep, hashLengthStep + 1);
			hashLengthStep++;
		}
	}

	// check if binary sequences should be used
	int argIndex;
	for (argIndex = 3; argIndex < argc; argIndex++)
		if (strcmp(argv[argIndex], "-create_binary") == 0 || strcmp(argv[argIndex], "-reuse_binary") == 0)
			setCreateBinary(true);

	for (h = hashLength; h < hashLengthMax; h += hashLengthStep) {

		resetWordFilter(h);

		buf = mallocOrExit(2 * strlen(argv[1]) + 500, char);

		if ( multiple_kmers ) {
			sprintf(buf,"%s_%d",argv[1],h);
			directory = mallocOrExit(strlen(buf) + 100, char);
			strcpy(directory,buf);
		} else 
			directory = argv[1];

		filename = mallocOrExit(strlen(directory) + 100, char);
		seqFilename = mallocOrExit(strlen(directory) + 100, char);
		baseSeqName = mallocOrExit(100, char);

		dir = opendir(directory);

		if (dir == NULL)
			mkdir(directory, 0777);
		else {
			sprintf(buf, "%s/PreGraph", directory);
			remove(buf);
			sprintf(buf, "%s/Graph", directory);
			remove(buf);
			sprintf(buf, "%s/Graph2", directory);
			remove(buf);
			sprintf(buf, "%s/Graph3", directory);
			remove(buf);
			sprintf(buf, "%s/Graph4", directory);
			remove(buf);
		}

		logInstructions(argc, argv, directory);

		strcpy(seqFilename, directory);
		if (isCreateBinary()) {
			// use the CNY unified seq writer
			strcpy(baseSeqName, "/CnyUnifiedSeq");
			// remove other style sequences file
			sprintf(buf, "%s/Sequences", directory);
			remove(buf);
		} else {
			strcpy(baseSeqName, "/Sequences");
			// remove other style sequences file
			sprintf(buf, "%s/CnyUnifiedSeq", directory);
			remove(buf);
			sprintf(buf, "%s/CnyUnifiedSeq.names", directory);
			remove(buf);
		}
		strcat(seqFilename, baseSeqName);

		if ( h == hashLength ) {
			parseDataAndReadFiles(seqFilename, argc - 2, &(argv[2]), &double_strand, &noHash);
		} else {
			sprintf(buf,"rm -f %s",seqFilename);
			if (system(buf)) {
				velvetLog("Command failed!\n");
				velvetLog("%s\n", buf);
#ifdef DEBUG
				abort();
#endif
				exit(1);
			}
			if (argv[1][0] == '/')
				sprintf(buf,"ln -s %s_%d%s %s",argv[1],hashLength,baseSeqName,seqFilename);
			else
				sprintf(buf,"ln -s `pwd`/%s_%d%s %s",argv[1],hashLength,baseSeqName,seqFilename);
			if (system(buf)) {
				velvetLog("Command failed!\n");
				velvetLog("%s\n", buf);
#ifdef DEBUG
				abort();
#endif
				exit(1);
			}
		}

		if (noHash)
			continue;

		splayTable = newSplayTable(h, double_strand);
		if (isCreateBinary()) {
			allSequences = importCnyReadSet(seqFilename);
		} else {
			allSequences = importReadSet(seqFilename);
		}
		velvetLog("%li sequences in total.\n", (long) allSequences->readCount);

		strcpy(filename, directory);
		strcat(filename, "/Roadmaps");
		inputSequenceArrayIntoSplayTableAndArchive(allSequences,
							   splayTable, filename, seqFilename);

		destroySplayTable(splayTable);
		if (dir)
			closedir(dir);
		if (directory != argv[1])
			free(directory);
		free(filename);
		free(seqFilename);
		free(baseSeqName);
		free(buf);
		if (allSequences) {
			destroyReadSet(allSequences);
		}
	}