int main(int argc, char *argv[]) { if(argc < 3) { fprintf (stderr,"%s: [d]isk [s]treaming of [k]-mers (constant-memory k-mer counting)\n",argv[0]); fprintf (stderr,"usage:\n"); fprintf (stderr," %s input_file kmer_size [-t min_abundance] [-m max_memory] [-d max_disk_space] [-o out_prefix] [-histo]\n",argv[0]); fprintf (stderr,"details:\n [-t min_abundance] filters out k-mers seen ( < min_abundance ) times, default: 1 (all kmers are returned)\n [-m max_memory] is in MB, default: min(total system memory / 2, 5 GB) \n [-d max_disk_space] is in MB, default: min(available disk space / 2, reads file size)\n [-o out_prefix] saves results in [out_prefix].solid_kmers. default out_prefix = basename(input_file)\n [-histo] outputs histogram of kmers abundance\n [-rev] outputs only one of forward or reverse complement k-mers\n Input file can be fasta, fastq, gzipped or not, or a file containing a list of file names.\n"); #ifdef SVN_REV fprintf(stderr,"Running dsk version %s\n",STR(SVN_REV)); #endif return 0; } // reads file Bank *Reads = new Bank(argv[1]); if (argv[2][0] == '-') { printf("please specify a k value\n"); exit(1); } /* Changes by Raunaq * Code addition for taking in multiple values of k. The file containing should have values of k sorted in decreasing order * */ int *Kmerlist = loadKmers(argv[2]); // kmer size //fprintf(stderr,"Smallest kmer is %d \n",smallestKmer); sizeKmer = Kmerlist[0]; if (sizeKmer>(int)(sizeof(kmer_type)*4)) { printf("Max kmer size on this compiled version is %lu\n",sizeof(kmer_type)*4); exit(1); } kmerMask=(((kmer_type)1)<<(sizeKmer*2))-1; // default solidity nks = 1; // default max memory max_memory = 5*1024; #ifndef OSX struct sysinfo info; sysinfo(&info); int total_ram = (int)(((double)info.totalram*(double)info.mem_unit)/1024/1024); printf("Total RAM: %d MB\n",total_ram); #else int total_ram = 128*1024; #endif // default prefix is the reads file basename char *reads_path=strdup(argv[1]); string reads_name(basename(reads_path)); // posix basename() may alter reads_path free(reads_path); int lastindex = reads_name.find_last_of("."); strcpy(prefix,reads_name.substr(0, lastindex).c_str()); for (int n_a = 3; n_a < argc ; n_a++) { if (strcmp(argv[n_a],"-t")==0) nks = atoi(argv[n_a+1]); if (strcmp(argv[n_a],"-o")==0) strcpy(prefix,argv[n_a+1]); } int verbose = 0; bool reverse = false; max_disk_space = 0; output_histo =false; // parse the remaining arguments: these will override the default max memory / max disk for (int n_a = 3; n_a < argc ; n_a++) { if (strcmp(argv[n_a],"-m")==0) max_memory = atoi(argv[n_a+1]); if (strcmp(argv[n_a],"-d")==0) max_disk_space = atoi(argv[n_a+1]); if (strcmp(argv[n_a],"-v")==0) verbose = 1; if (strcmp(argv[n_a],"-vv")==0) verbose = 2; if (strcmp(argv[n_a],"-histo")==0) output_histo =true; if (strcmp(argv[n_a],"-rev")==0) reverse = true; } if (max_memory > total_ram) { printf("Maximum memory (%d MB), exceeds total RAM (%d MB). Setting maximum memory to %d MB.\n",max_memory,total_ram,total_ram/2); max_memory = total_ram/2; } STARTWALL(0); sorting_count(Reads,prefix,max_memory,max_disk_space,true,verbose,reverse); STOPWALL(0,"Total"); delete Reads; return 0; }
int main(int argc, char *argv[]) { if(argc < 6) { fprintf (stderr,"usage:\n"); fprintf (stderr," %s input_file kmer_size min_abundance estimated_genome_size prefix\n",argv[0]); fprintf (stderr,"hints:\n min_abundance ~ 3\n estimated_genome_size is in bp, does not need to be accurate, only controls memory usage\n prefix is any name you want the results to start with\n"); return 1; } bool FOUR_BLOOM_VERSION = true; // shortcuts to go directly to assembly using serialized bloom and serialized hash int START_FROM_SOLID_KMERS=0; // if = 0, construct the fasta file of solid kmers, if = 1, start directly from that file int LOAD_FALSE_POSITIVE_KMERS=0; // if = 0, construct the fasta file of false positive kmers (debloom), if = 1, load that file into the hashtable int NO_FALSE_POSITIVES_AT_ALL=0; // if = 0, normal behavior, if = 1, don't load false positives (will be a probabilistic de bruijn graph) int max_disk_space = 0;// let dsk decide for (int n_a = 6; n_a < argc ; n_a++) { if (strcmp(argv[n_a],"--original") == 0) FOUR_BLOOM_VERSION = false; if (strcmp(argv[n_a],"--dont-count")==0) START_FROM_SOLID_KMERS = 1; if (strcmp(argv[n_a],"--dont-debloom")==0) LOAD_FALSE_POSITIVE_KMERS = 1; if (strcmp(argv[n_a],"--just-assemble")==0) { START_FROM_SOLID_KMERS = 1; LOAD_FALSE_POSITIVE_KMERS = 1; } if (strcmp(argv[n_a],"--titus-mode")==0) NO_FALSE_POSITIVES_AT_ALL = 1; if (strcmp(argv[n_a],"-d")==0) max_disk_space = atoi(argv[n_a+1]); if (strcmp(argv[n_a],"-maxc")==0) max_couv = atoi(argv[n_a+1]); if (strcmp(argv[n_a],"--le-changement")==0) {printf("c'est maintenant!\n");exit(0);} } // kmer size sizeKmer=27; // let's make it even for now, because i havnt thought of how to handle palindromes (dont want to stop on them) if(argc >= 3) { sizeKmer = atoi(argv[2]); if (sizeKmer%2==0) { sizeKmer-=1; printf("Need odd kmer size to avoid palindromes. I've set kmer size to %d.\n",sizeKmer); } if (sizeKmer>((int)sizeof(kmer_type)*4)) { printf("Max kmer size on this compiled version is %lu\n",sizeof(kmer_type)*4); exit(1); } } if (sizeKmer == (int)(sizeof(kmer_type)*4)) kmerMask = -1; else kmerMask=(((kmer_type)1)<<(sizeKmer*2))-1; double lg2 = log(2); if (sizeKmer > 128) { FOUR_BLOOM_VERSION = false; printf("Reverted to single Bloom filter implementation for k>128\n"); } if (!FOUR_BLOOM_VERSION) NBITS_PER_KMER = log(16*sizeKmer*(lg2*lg2))/(lg2*lg2); // needed to process argv[5] else NBITS_PER_KMER = rvalues[sizeKmer][1]; // solidity nks =NNKS; if(argc >= 4) { nks = atoi(argv[3]); if (nks==0) nks=1; // min abundance can't be 0 } if(argc >= 5) { genome_size = atoll(argv[4]); // int estimated_bloom_size = max( (int)ceilf(log2f(genome_size * NBITS_PER_KMER )), 1); uint64_t estimated_bloom_size = (uint64_t) (genome_size * NBITS_PER_KMER); uint64_t estimated_nb_FP = (uint64_t)(genome_size * 4 * powf(0.6,11)); // just indicative //max_memory = max( (1LL << estimated_bloom_size)/8LL /1024LL/1024LL, 1LL ); max_memory = max((int64_t) estimated_bloom_size/8LL /1024LL/1024LL,1LL); printf("estimated values: nbits Bloom %lli, nb FP %lld, max memory %i MB\n",estimated_bloom_size,estimated_nb_FP,max_memory); } // output prefix if(argc >= 6) { strcpy(prefix,argv[5]); } fprintf (stderr,"taille cell %lu \n", sizeof(cell<kmer_type>)); STARTWALL(0); Bank *Reads = new Bank(argv[1]); // counter kmers, write solid kmers to disk if (!START_FROM_SOLID_KMERS) { int verbose = 0; bool write_count = false; bool skip_binary_conversion = false; sorting_count(Reads,prefix,max_memory,max_disk_space,write_count,verbose, skip_binary_conversion); } // debloom, write false positives to disk, insert them into false_positives if (! LOAD_FALSE_POSITIVE_KMERS) { debloom(order, max_memory); } bloo1 = bloom_create_bloo1((BloomCpt *)NULL, false); if (! NO_FALSE_POSITIVES_AT_ALL) { // load false positives from disk into false_positives if (!FOUR_BLOOM_VERSION) false_positives = load_false_positives(); else false_positives = load_false_positives_cascading4(); } else { // titus mode: no FP's false_positives = dummy_false_positives(); } // return 1; assemble(); STOPWALL(0,"Total"); delete Reads; return 0; }