void bloom_count(Bank *Reads, unsigned long max_memory)
{
#define NBITS_BLOOMCPT 23 // 33: 4GB (4 bits/elem) // size of the bloom counter table used to count kmers

    fprintf(stderr,"nbits bloom counter: %i \n",NBITS_BLOOMCPT);

    BloomCpt3 * bloocpt  = new BloomCpt3(NBITS_BLOOMCPT);
    BloomCpt3 * bloocpt2 = new BloomCpt3(NBITS_BLOOMCPT);
    bloocpt->setSeed( 0x4909FEA3A68CC6A7LL);
    bloocpt2->setSeed(0x0CD5DA28467C5492LL);
    // bloocpt->set_number_of_hash_func(4);
    // bloocpt2->set_number_of_hash_func(6);

    /////////////////////////////////// first pass: count kmers with the Bloom counter
    bloom_pass_reads(Reads,bloocpt, (BloomCpt * ) NULL, (char*)"%cFirst pass %lld");

    fprintf (stderr,"\n ____________ Second bloom counter _________\n");
    bloom_pass_reads(Reads,bloocpt2, bloocpt, (char*)"%cSecond pass %lld");

    STARTWALL(count);
    fprintf(stderr,"\n------------------ second pass bloom counter \n\n");
    delete bloocpt;

    ////////////////////////////////////// exact kmer count with hash table partitioning,
    // also creates the solid kmers file and fills bloo1
    exact_kmer_count(Reads,bloocpt2,max_memory);
    //////////////////////////////////////

    STOPWALL(count,"Counted kmers");
    fprintf(stderr,"\n------------------ Counted kmers and kept those with abundance >=%i \n\n",nks);

    //////////////////////////////////////////////////// end of bloom insert
    //delete bloocpt2;
}
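/*
 * Illustrative sketch (not part of the build): the general idea behind a
 * counting Bloom filter such as the BloomCpt3 used above. This is a minimal
 * stand-alone example with saturating 4-bit counters and two hash functions;
 * the mixers, sizes and storage layout below are assumptions for illustration
 * only, not the real BloomCpt3 implementation. Requires <cstdint>, <vector>,
 * <algorithm>.
 */
#if 0
struct ToyCountingBloom {
    std::vector<uint8_t> counters;   // one 4-bit counter stored per byte, for simplicity
    uint64_t mask;
    explicit ToyCountingBloom(int nbits) : counters(1ULL << nbits, 0), mask((1ULL << nbits) - 1) {}

    // two cheap mixers standing in for the real hash functions (hypothetical)
    uint64_t h1(uint64_t x) const { x *= 0x9E3779B97F4A7C15ULL; return (x ^ (x >> 31)) & mask; }
    uint64_t h2(uint64_t x) const { x *= 0xC2B2AE3D27D4EB4FULL; return (x ^ (x >> 29)) & mask; }

    void increment(uint64_t kmer) {  // saturating increment at 15 (4-bit counter)
        for (uint64_t p : {h1(kmer), h2(kmer)})
            if (counters[p] < 15) counters[p]++;
    }
    uint8_t estimated_count(uint64_t kmer) const { // never underestimates, may overestimate
        return std::min(counters[h1(kmer)], counters[h2(kmer)]);
    }
};
#endif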
int debloom(int order, int max_memory)
{
    // read bloo1 from disk dump
    Bloom *bloo1 = bloom_create_bloo1((BloomCpt *)NULL);

    STARTWALL(pos);

    FILE * debloom_file   = fopen(return_file_name("debloom"),"wb+");
    FILE * debloom_file_2 = fopen(return_file_name("debloom2"),"wb+");
    FILE * F_tmp;

    F_debloom_read  = debloom_file;
    F_debloom_write = debloom_file_2;

    BinaryBank *SolidKmers = new BinaryBank(return_file_name(solid_kmers_file),sizeof(kmer_type),0);

    uint64_t cc=0;
    kmer_type new_graine, kmer;
    int nt;

    uint64_t NbSolidKmer = 0;
    // write all positive extensions in a disk file
    while (SolidKmers->read_element(&kmer))
    {
        // 8 right extensions (4F and 4R); left extensions are redundant by reverse complementation
        for(nt=0; nt<4; nt++)
        {
            int strand;
            for (strand = 0; strand < 2 ; strand++)
            {
                int current_strand = strand;
                new_graine = next_kmer(kmer,nt, &current_strand);

                if(bloo1->contains(new_graine)) // extension is positive
                {
                    // maybe do more lax deblooming; if it's a dead-end, it's no big deal, don't pass it to the false positive test
                    // what would have been needed if i decided to enable order>0 (but actually this won't happen):
                    //  - better estimate of structure size in the presence of order>0 deblooming

                    if (order == 1) // this case just detects tips
                    {
                        printf("ORDER==1");
                        bool is_linked = false;
                        for(int tip_nt=0; tip_nt<4; tip_nt++)
                        {
                            int new_strand = current_strand;
                            kmer_type kmer_after_possible_tip = next_kmer(new_graine,tip_nt, &new_strand);
                            if(bloo1->contains(kmer_after_possible_tip))
                            {
                                is_linked = true;
                                break;
                            }
                        }
                        if (!is_linked)
                            continue; // it's a tip, because it's linked to nothing
                    }

                    if (order > 1) // general case. should work for order = 1, but i coded an optimized version above
                    {
                        printf("ORDER>1");
                        Frontline frontline( new_graine, current_strand, bloo1, NULL, NULL, NULL);
                        while (frontline.depth < order)
                        {
                            frontline.go_next_depth();
                            if (frontline.size() == 0)
                                break;
                            // don't allow a breadth too large anyway
                            if (frontline.size() > 10)
                                break;
                        }
                        if (frontline.size() == 0)
                            continue; // it's a dead end
                    }

                    if (!fwrite(&new_graine, sizeof(new_graine), 1, debloom_file))
                    {
                        printf("error: can't fwrite (disk full?)\n");
                        exit(1);
                    }
                    cc++;
                }
            }
        }
        NbSolidKmer++;
        if ((NbSolidKmer%table_print_frequency)==0)
            fprintf (stderr,"%c Writing positive Bloom Kmers %lld",13,NbSolidKmer);
    }
    nbkmers_solid = NbSolidKmer; // GUS: it's global now
    fprintf(stderr,"\n%lli kmers written\n",cc);

    STOPWALL(pos,"Write all positive kmers");

    STARTWALL(deb);

    double bl1tai = (double)bloo1->tai;
    delete bloo1;

    // now that bloo1 is deleted, initialize hasht1
    int NBITS_HT = max( (int)ceilf(log2f((0.1*max_memory*1024L*1024L)/sizeof(cell_ptr_t))), 1); // set hasht1 cells to occupy 0.1 * [as much memory as possible]
    hasht1 = new Hash16(NBITS_HT);

    ////////////////////////////////////////////////////////////////
    // find false positives, with hash table partitioning
    uint64_t max_kmer_per_part = (uint64_t) (0.8*max_memory*1024LL*1024LL /sizeof(cell<kmer_type>)); // adjust hash table size accordingly

    printf("%d partitions will be needed\n",(int)(nbkmers_solid/max_kmer_per_part));

    NbSolidKmer = 0;
    int numpart = 0;
    SolidKmers->rewind_all();

    // deblooming:
    // read the list of (non-redundant) solid kmers and load it, in chunks, into a hash table
    // at each pass, check all the positive extensions and keep those which are not indicated, by the current chunk, as solid kmers
    // at the end, only the positive extensions which are not solid are kept
    while (SolidKmers->read_element(&kmer))
    {
        hasht1->add(kmer);

        NbSolidKmer++;
        if ((NbSolidKmer%table_print_frequency)==0)
            fprintf (stderr,"%cBuild Hash table %lld",13,NbSolidKmer);

        if(hasht1->nb_elem > max_kmer_per_part) // end of partition, find false positives
        {
            fprintf(stderr,"End of debloom partition %lli / %lld \n",hasht1->nb_elem,max_kmer_per_part);

            end_debloom_partition(false);

            // swap file pointers
            F_tmp = F_debloom_read;
            F_debloom_read = F_debloom_write;
            F_debloom_write = F_tmp;
            // end of writing files

            // reset hash table
            hasht1->empty_all();

            fprintf(stderr,"\n%lli false positives written , partition %i \n",n_false_positives,numpart);

            numpart++;
        } // end of partition
    }

    fprintf(stderr,"Nb kmers stored in the bloom table %lld\n",nbkmers_solid);

    // last partition, will write all the FP's to the good file
    end_debloom_partition(true);

    fprintf(stderr,"Total nb false positives stored in the Debloom hashtable %lli \n",n_false_positives);

    delete hasht1;

    STOPWALL(deb,"Debloom");

    // GUS: will use to output summary later
    b1_size = (uint64_t) bl1tai;

    fclose(debloom_file);
    fclose(debloom_file_2);

    SolidKmers->close();

    return 1;
}
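/*
 * Illustrative sketch (not part of the build): the set operation that
 * debloom() performs out-of-core. Each chunk of solid k-mers loaded into the
 * hash table filters the candidate extensions that the Bloom filter reported
 * as present; whatever survives every chunk is a false positive. The
 * in-memory version below assumes everything fits in RAM and uses
 * std::unordered_set instead of Hash16; it only shows the logic. Requires
 * <cstdint>, <vector>, <unordered_set>.
 */
#if 0
static std::vector<uint64_t> naive_debloom(const std::vector<uint64_t> &positive_extensions,
                                           const std::vector<uint64_t> &solid_kmers)
{
    std::unordered_set<uint64_t> solid(solid_kmers.begin(), solid_kmers.end());
    std::vector<uint64_t> false_positives;
    for (uint64_t ext : positive_extensions)
        if (solid.find(ext) == solid.end()) // positive in the Bloom filter but not solid
            false_positives.push_back(ext);
    return false_positives;
}
#endif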
int main(int argc, char *argv[])
{
    if(argc < 3)
    {
        fprintf (stderr,"%s: [d]isk [s]treaming of [k]-mers (constant-memory k-mer counting)\n",argv[0]);
        fprintf (stderr,"usage:\n");
        fprintf (stderr," %s input_file kmer_size [-t min_abundance] [-m max_memory] [-d max_disk_space] [-o out_prefix] [-histo]\n",argv[0]);
        fprintf (stderr,"details:\n [-t min_abundance] filters out k-mers seen ( < min_abundance ) times, default: 1 (all kmers are returned)\n [-m max_memory] is in MB, default: min(total system memory / 2, 5 GB) \n [-d max_disk_space] is in MB, default: min(available disk space / 2, reads file size)\n [-o out_prefix] saves results in [out_prefix].solid_kmers. default out_prefix = basename(input_file)\n [-histo] outputs histogram of kmer abundance\n [-rev] outputs only one of forward or reverse complement k-mers\n Input file can be fasta, fastq, gzipped or not, or a file containing a list of file names.\n");
#ifdef SVN_REV
        fprintf(stderr,"Running dsk version %s\n",STR(SVN_REV));
#endif
        return 0;
    }

    // reads file
    Bank *Reads = new Bank(argv[1]);

    if (argv[2][0] == '-')
    {
        printf("please specify a k value\n");
        exit(1);
    }

    /* Changes by Raunaq:
     * read multiple values of k from a file; the file should contain the values of k sorted in decreasing order
     */
    int *Kmerlist = loadKmers(argv[2]);

    // kmer size
    //fprintf(stderr,"Smallest kmer is %d \n",smallestKmer);
    sizeKmer = Kmerlist[0];
    if (sizeKmer > (int)(sizeof(kmer_type)*4))
    {
        printf("Max kmer size on this compiled version is %lu\n",sizeof(kmer_type)*4);
        exit(1);
    }
    kmerMask = (((kmer_type)1)<<(sizeKmer*2))-1;

    // default solidity
    nks = 1;

    // default max memory
    max_memory = 5*1024;
#ifndef OSX
    struct sysinfo info;
    sysinfo(&info);
    int total_ram = (int)(((double)info.totalram*(double)info.mem_unit)/1024/1024);
    printf("Total RAM: %d MB\n",total_ram);
#else
    int total_ram = 128*1024;
#endif

    // default prefix is the reads file basename
    char *reads_path = strdup(argv[1]);
    string reads_name(basename(reads_path)); // posix basename() may alter reads_path
    free(reads_path);
    int lastindex = reads_name.find_last_of(".");
    strcpy(prefix,reads_name.substr(0, lastindex).c_str());

    for (int n_a = 3; n_a < argc ; n_a++)
    {
        if (strcmp(argv[n_a],"-t")==0)
            nks = atoi(argv[n_a+1]);
        if (strcmp(argv[n_a],"-o")==0)
            strcpy(prefix,argv[n_a+1]);
    }

    int verbose = 0;
    bool reverse = false;
    max_disk_space = 0;
    output_histo = false;

    // parse the remaining arguments: these will override the default max memory / max disk
    for (int n_a = 3; n_a < argc ; n_a++)
    {
        if (strcmp(argv[n_a],"-m")==0)
            max_memory = atoi(argv[n_a+1]);
        if (strcmp(argv[n_a],"-d")==0)
            max_disk_space = atoi(argv[n_a+1]);
        if (strcmp(argv[n_a],"-v")==0)
            verbose = 1;
        if (strcmp(argv[n_a],"-vv")==0)
            verbose = 2;
        if (strcmp(argv[n_a],"-histo")==0)
            output_histo = true;
        if (strcmp(argv[n_a],"-rev")==0)
            reverse = true;
    }

    if (max_memory > total_ram)
    {
        printf("Maximum memory (%d MB) exceeds total RAM (%d MB). Setting maximum memory to %d MB.\n",max_memory,total_ram,total_ram/2);
        max_memory = total_ram/2;
    }

    STARTWALL(0);

    sorting_count(Reads,prefix,max_memory,max_disk_space,true,verbose,reverse);

    STOPWALL(0,"Total");

    delete Reads;
    return 0;
}
// main k-mer counting function, shared between minia and dsk
// verbose == 0 : stderr progress bar
// verbose >= 1 : print basic status
// verbose >= 2 : print extra partition information
// write_count == true: include kmer count in the results file, in that form:
// - save the kmer count for each kmer in the resulting binary file
// - the very first four bytes of the result file are the kmer length
void sorting_count(Bank *Sequences, char *prefix, int max_memory, int max_disk_space, bool write_count, int verbose)
{
    // create a temp dir from the prefix
    char temp_dir[1024];
    sprintf(temp_dir,"%s_temp",prefix);

    // clear the temp folder (needs to be done before estimating disk space)
    DIR* dp;
    struct dirent* ep;
    char p_buf[512] = {0};
    dp = opendir(temp_dir);
    while ( (dp != NULL) && ((ep = readdir(dp)) != NULL))
    {
        sprintf(p_buf, "%s/%s", temp_dir, ep->d_name);
        remove(p_buf);
    }
    if(dp != NULL)
        closedir(dp);

    if (max_disk_space == 0)
    {
        // default max disk space
        struct statvfs buffer;
        char current_path[1000];
        getcwd(current_path,sizeof(current_path));
        // int ret =
        statvfs(current_path, &buffer);
        int available = (int)(((double)buffer.f_bavail * (double)buffer.f_bsize) / 1024 / 1024);
        uint32_t tt_new_temp = (uint32_t) (((double)Sequences->filesizes)/(1024*1024));
        printf("Available disk space in %s: %d %u %llu MB\n",current_path,available,tt_new_temp,Sequences->filesizes); // not working in osx
        max_disk_space = min((uint32_t)available/2, tt_new_temp);
    }
    if (max_disk_space <= 0) // still 0?
        max_disk_space = 10000; // = default for osx

    // estimate number of iterations
    // TODO check whether the multiplication with totalKmers is actually required; it may just increase the number of partitions for no reason
    //uint64_t volume = totalKmers*Sequences->estimate_kmers_volume(smallestKmer);
    // Since there are totalKmers kinds of kmers, an upper bound can be estimated using the smallest kmer size. Added by Raunaq
    uint64_t volume = Sequences->estimate_kmers_volume(smallestKmer);
    uint32_t nb_passes = ( volume / max_disk_space ) + 1;
    int passes_hash;
    int nb_threads = 1;

#if OMP
    use_compressed_reads = true;
    nb_threads = 8;
    max_memory /= nb_threads;
    max_memory = max (max_memory,1);
#endif

    // temp bugfix: don't use compressed reads for long reads
    if (Sequences->estimate_max_readlen() > 1000000)
        use_compressed_reads = false;

    uint64_t volume_per_pass, volume_per_partition;
    uint32_t nb_partitions;
    int partitions_hash;

    // loop to lower the number of partitions below the maximum number of simultaneously open files
    do
    {
        volume_per_pass = volume / nb_passes;
        nb_partitions = ( volume_per_pass * totalKmers / max_memory ) + 1;
        //printf("volume per pass and total volume %llu %llu \n",volume_per_pass,(unsigned long long)volume);

        // if partitions are hashed instead of sorted, adjust for load factor
        // (as in the worst case, all kmers in the partition are distinct and the partition may be slightly bigger due to hash-repartition)
        if (use_hashing)
        {
            nb_partitions = (uint32_t) ceil((float) nb_partitions / load_factor);
            nb_partitions = ((nb_partitions * OAHash::size_entry()) + sizeof(key_type)-1) / sizeof(key_type); // also adjust for hash overhead
        }

        struct rlimit lim;
        int max_open_files = 1000;
        int err = getrlimit(RLIMIT_NOFILE, &lim);
        if (err == 0)
            max_open_files = lim.rlim_cur / 2;
        if (nb_partitions >= max_open_files)
            nb_passes++;
        else
            break;
    }
    while (1);

    volume_per_partition = volume_per_pass/nb_partitions;

    passes_hash = ceil(log(nb_passes)/log(4));
    partitions_hash = ceil(log(nb_partitions)/log(4));
    int size_for_reestimation = ceil((passes_hash + partitions_hash)*1.8);

    double * lmer_counts = (double * ) malloc(sizeof(long)*pow(4,size_for_reestimation));
    long * lmers_for_hash = (long * ) malloc(sizeof(long)*pow(4,size_for_reestimation));
    int * partitions_for_lmers = (int * ) malloc(sizeof(int)*pow(4,size_for_reestimation));

    Sequences->count_kmers_for_small_value(size_for_reestimation,lmer_counts);
    int temp_partition = reestimate_partitions(size_for_reestimation,volume_per_partition,lmer_counts,lmers_for_hash,partitions_for_lmers);

    unordered_map<long,int> part_hash;
    int total_lmers = pow(4,size_for_reestimation);
    for(int it=0; it<total_lmers; it++)
    {
        pair<long,int> temp_pair(lmers_for_hash[it],partitions_for_lmers[it]);
        part_hash.insert(temp_pair); // add element to the hash
    }

    //uint64_t up_passes_size = volume_per_pass;
    do
    {
        // recompute the number of partitions based on the updated partition estimate
        nb_partitions = ceil(temp_partition*1.0/nb_passes);
        struct rlimit lim;
        int max_open_files = 1000;
        int err = getrlimit(RLIMIT_NOFILE, &lim);
        if (err == 0)
            max_open_files = lim.rlim_cur / 2;
        if (nb_partitions >= max_open_files)
            nb_passes++;
        else
            break;
    }
    while(1);

    printf("no of partitions before %lu and after %d passes %lu \n",nb_partitions*nb_passes,temp_partition,nb_passes);

    uint64_t total_IO = volume * 2LL * 1024LL*1024LL; // in bytes  //  + nb_passes * ( volume / (sizeof(kmer_type)*4) ) ; // in bytes
    uint64_t temp_IO = 0;

    BinaryBankConcurrent * redundant_partitions_file[nb_partitions];
    char redundant_filename[nb_partitions][256];

    kmer_type kmer;
    int max_read_length = KMERSBUFFER_MAX_READLEN;
    kmer_type * kmer_table_seq = (kmer_type * ) malloc(sizeof(kmer_type)*max_read_length);
    kmer_type * kmer_length_table_seq = (kmer_type * ) malloc(sizeof(kmer_type)*max_read_length);

    BinaryReads * binread = NULL;
    if(use_compressed_reads)
        binread = new BinaryReads(return_file_name(binary_read_file),true);

    fprintf(stderr,"Sequentially counting ~%llu MB of kmers with %d partition(s) and %d passes using %d thread(s), ~%d MB of memory and ~%d MB of disk space\n",
            (unsigned long long)volume, nb_partitions, nb_passes, nb_threads, max_memory * nb_threads, max_disk_space);

    STARTWALL(count);

    mkdir(temp_dir, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);

    // open totalKmers files to store counts of totalKmers different k's
    BinaryBankConcurrent * SolidKmers[totalKmers];
    for (int s=0; s<totalKmers; s++)
    {
        char temp[1024];
        sprintf(temp,"%s.%d",return_file_name(solid_kmers_file),Kmerlist[s]);
        uint64_t exp = (((uint64_t)1)<<(Kmerlist[s]*2))-1;
        SolidKmers[s] = new BinaryBankConcurrent(temp,sizeof(kmer),true,nb_threads);
        //printf("kmer is %d exp is %llu \n",Kmerlist[s],exp);
        //BinaryBankConcurrent * SolidKmers = new BinaryBankConcurrent(return_file_name(solid_kmers_file),sizeof(kmer),true,nb_threads);

        if (write_count)
        {
            // write the k-mer nbits as the first 4 bytes, and the actual k-mer size as the next 4 bytes
            uint32_t kmer_nbits = sizeof(kmer) * 8;
            SolidKmers[s]->write_buffered(&kmer_nbits, 4, 0);
            SolidKmers[s]->write_buffered(&Kmerlist[s], 4, 0);
            SolidKmers[s]->flush(0);
        }
    }

    int64_t estimated_NbReads = Sequences->estimate_nb_reads();
    char * rseq;
    int readlen;

    int64_t NbSolid = 0;
    int64_t * NbSolid_omp = (int64_t *) calloc(nb_threads,sizeof(int64_t));

    //long total_kmers_per_partition[nb_partitions]; // guillaume probably commented it because updating this variable would require synchronization
    long distinct_kmers_per_partition[nb_partitions];
    uint64_t * histo_count = (uint64_t *) calloc(10001,sizeof(uint64_t));

#if OMP
    uint64_t ** histo_count_omp = (uint64_t **) calloc(nb_threads,sizeof(uint64_t *));
    for(int ii=0; ii<nb_threads; ii++)
    {
        histo_count_omp[ii] = (uint64_t *) calloc(10001,sizeof(uint64_t));
    }
#endif

    // start by converting the file to binary format
    if(use_compressed_reads)
    {
        char * pt_begin;
        int idx = 0;
        int64_t NbRead = 0;
        Progress progress_conversion;
        // progress_conversion.timer_mode=1; // to switch to timer mode (show elapsed and estimated remaining time)
        progress_conversion.init(estimated_NbReads,"First step: Converting input file into Binary format");

        Sequences->rewind_all();
        while(1)
        {
            if(! Sequences->get_next_seq(&rseq,&readlen)) break; // read original fasta file
            if(readlen > max_read_length) // realloc kmer_table_seq if needed
            {
                max_read_length = 2*readlen;
                kmer_table_seq = (kmer_type * ) realloc(kmer_table_seq,sizeof(kmer_type)*max_read_length);
                kmer_length_table_seq = (kmer_type * ) realloc(kmer_length_table_seq,sizeof(kmer_type)*max_read_length);
            }

            pt_begin = rseq;

            //should be ok
            while (pt_begin < (rseq + readlen))
            {
                idx = 0; // start a new read

                //skip N's
                while (*pt_begin =='N' && pt_begin < (rseq + readlen))
                {
                    pt_begin ++;
                }
                // go to the next N or the end of the seq
                while ( (pt_begin[idx] !='N') && ((pt_begin + idx) < (rseq + readlen)) )
                {
                    idx++;
                }

                // we have a seq beginning at pt_begin of size idx, without any N; it will be treated as a read:
                binread->write_read(pt_begin,idx);
                revcomp_sequence(pt_begin,idx);     // reverse complement the string
                binread->write_read(pt_begin,idx);  // write the reverse complement string
                revcomp_sequence(pt_begin,idx);     // restore the string
                pt_begin += idx;
            }

            // binread->write_read(rseq,readlen);

            NbRead++;
            if ((NbRead%10000)==0)
            {
                progress_conversion.inc(10000);
            }
        }
        //printf("Number of reads converted to binary %d \n",NbRead);
        progress_conversion.finish();
        binread->close();
    }
    // end of conversion

    if (clear_cache)
    {
#ifdef OSX
        system("purge");
#else
        system("echo 3 > /proc/sys/vm/drop_caches");
#endif
    }

#if SINGLE_BAR
    Progress progress;
    char message[1000];
    sprintf(message,"Counting kmers");
    progress.timer_mode = 1;
    if (verbose == 0)
        progress.init(total_IO,message);
#endif

    //use_compressed_reads=false; // for testing compute_kmer_from_one_seq

    // how many times we will traverse the whole reads file (has an influence on temp disk space)
    uint64_t iter_partition = 0;
    for (uint32_t current_pass = 0; current_pass < nb_passes; current_pass ++)
    {
        // stop computing if all partitions are done. Added by Raunaq
        if (iter_partition == temp_partition) break;

        if(use_compressed_reads) // open binary reads for reading
            binread->open(false);

        STARTWALL(debpass);
        STARTWALL(debw);

        int initial_value = current_pass*nb_partitions;
        for (uint32_t p=0; p<nb_partitions; p++)
        {
            sprintf(redundant_filename[p],"%s/partition%d.redundant_kmers",temp_dir,p);
            redundant_partitions_file[p] = new BinaryBankConcurrent(redundant_filename[p],sizeof(kmer_type),true, nb_threads);
            distinct_kmers_per_partition[p] = 0;
        }
        int final_value = ((current_pass+1)*nb_partitions)-1;
        printf("Storing k-mers in partition files between %d and %d \n",initial_value,final_value);

        Sequences->rewind_all();
#if !SINGLE_BAR
        Progress progress;
        progress.timer_mode = 1; // to switch to timer mode (show elapsed and estimated remaining time)
        char message[1000];
        sprintf(message,"Pass %d/%d, Step 1: partitioning",current_pass+1,nb_passes);
        if (verbose == 0)
            progress.init(estimated_NbReads,message);
#endif

        //current_pass> 0 &&
#if OMP
#pragma omp parallel if(use_compressed_reads) num_threads(nb_threads)
#endif
        {
            int64_t nbkmers_written = 0;
            int tid = 0;
            int64_t NbRead = 0;
            int64_t nread = 0;
            int64_t tempread = 0;
            long it_zero_wrt = 0;
#if OMP
            tid = omp_get_thread_num();
#endif
            int nreads_in_buffer = 1000;
            KmersBuffer * kbuff = NULL;
            if(use_compressed_reads)
            {
                kbuff = new KmersBuffer(binread, 1000000, nreads_in_buffer); // buffer size (in nb of kmers), seq per task; the buffer is per thread
                kbuff->binary_read_file = binread->binary_read_file;
            }

            kmer_type * kmer_table;
            kmer_type * kmer_length_info; // Added by Raunaq, to store the length of the read into the partition file
            while(1)
            {
                // read the fasta file
                if(use_compressed_reads) // && current_pass>0
                {
                    nread = kbuff->readkmers();
                    if( nread < 0) break;
                    NbRead += nread;
                    tempread += nread;
                }
                else
                {
                    if(! Sequences->get_next_seq(&rseq,&readlen)) break; // read original fasta file
                    if(readlen > max_read_length) // realloc kmer_table_seq if needed
                    {
                        max_read_length = 2*readlen;
                        kmer_table_seq = (kmer_type * ) realloc(kmer_table_seq,sizeof(kmer_type)*max_read_length);
                        kmer_length_table_seq = (kmer_type * ) realloc(kmer_length_table_seq,sizeof(kmer_type)*max_read_length);
                    }
                }

                // if(use_compressed_reads ) // write compressed read file at first pass //&& current_pass==0
                //     binread->write_read(rseq,readlen);

                int i;
                int nbkmers = readlen-sizeKmer+1;

                if( use_compressed_reads) //current_pass >0 &&
                {
                    nbkmers = kbuff->nkmers;
                    kmer_table = kbuff->kmers_buffer;
                    kmer_length_info = kbuff->kmer_length;
                }
                else // old-fashioned way
                {
                    compute_kmer_table_from_one_seq(readlen,rseq,kmer_table_seq,kmer_length_table_seq,Kmerlist[totalKmers-1]); // Added by Raunaq for computing kmers for all values of k
                    nbkmers = readlen-Kmerlist[totalKmers-1]+1;
                    kmer_table = kmer_table_seq;
                    kmer_length_info = kmer_length_table_seq;
                    NbRead++;
                    //printf("Number of kmers read from seq %d \n",nbkmers);
                }

                nbkmers_written = 0;
                char temp_kmer[256];
                int zero;
                // process the kmers stored in the buffer kmer_table
                for (i=0; i<nbkmers; i++)
                {
                    kmer_type lkmer;
                    kmer_type lkmer_length;

                    // kmer = extractKmerFromRead(rseq,i,&graine,&graine_revcomp);
                    lkmer = kmer_table[i];
                    lkmer_length = kmer_length_info[i];
                    // zero = code2seq(lkmer,temp_kmer);

                    // compute which partition this kmer falls into
                    long pass_lkmer = code2first_n_nucleotide(lkmer,size_for_reestimation);
                    unordered_map<long,int>::const_iterator got = part_hash.find(pass_lkmer);
                    int p;
                    if(got == part_hash.end())
                        continue;
                    else
                        p = got->second;

                    // check if this kmer should be included in the current pass
                    if(!(p >= initial_value && p <= final_value)) continue;

                    /*
#ifdef _ttmath
                    (reduced_kmer % nb_partitions).ToInt(p);
#else
                    p = reduced_kmer % nb_partitions;
#endif
                    */
                    p = p - current_pass*nb_partitions;

                    nbkmers_written++;

                    redundant_partitions_file[p]->write_element_buffered(&lkmer,tid); // save this kmer to the right partition file
                    redundant_partitions_file[p]->write_buffered(&lkmer_length,sizeof(lkmer_length),tid,false); // save the kmer length next to the kmer in the same partition file
                    // total_kmers_per_partition[p]++; // guillaume probably commented it because updating this variable would require synchronization
                }
                //NbRead++;
#if SINGLE_BAR
                if(verbose==0)
                {
                    if (nb_threads == 1)
                        progress.inc(nbkmers_written * sizeof(kmer_type));
                    else
                        progress.inc(nbkmers_written * sizeof(kmer_type),tid);
                }
#endif
                // if ((NbRead%10000)==0)
                if(tempread > 10000)
                {
                    tempread -= 10000;
                    if (verbose)
                        fprintf (stderr,"%cPass %d/%d, loop through reads to separate (redundant) kmers into partitions, processed %lluM reads out of %lluM",13,current_pass+1,nb_passes,(unsigned long long)(NbRead/1000/1000),(unsigned long long)(estimated_NbReads/1000/1000));
#if !SINGLE_BAR
                    else
                        if (nb_threads == 1)
                            progress.set(NbRead);
                        else
                            progress.inc(10000,tid);
#endif
                }
            } // end while

            // printf("Count of zero in write is %lu \n",it_zero_wrt);
            if(use_compressed_reads)
                delete kbuff;
        } // end OMP

#if !SINGLE_BAR
        if (verbose == 0)
        {
            if (nb_threads == 1)
                progress.finish();
            else
                progress.finish_threaded(); // here only one thread
            sprintf(message,"Pass %d/%d, Step 2: computing kmer count per partition",current_pass+1,nb_passes);
            progress.init(nb_partitions+1,message);
        }
#endif

        if (verbose) fprintf(stderr,"\n");

        if (verbose >= 2)
        {
            STOPWALL(debw,"Writing redundant kmers");
        }
        STARTWALL(debtri);

        for (uint32_t p=0; p<nb_partitions; p++)
        {
            redundant_partitions_file[p]->close();
            redundant_partitions_file[p]->open(false);
        }

        // for better timing: clear the file cache, since the partitions may still be in memory; that's unfair to low-memory machines
        if (clear_cache)
        {
#ifdef OSX
            system("purge");
#else
            system("echo 3 > /proc/sys/vm/drop_caches");
#endif
        }

        // quick and dirty parallelization with omp, testing
        // todo if we want omp and histo: separate histo_count tab per thread that needs to be merged at the end
        // TODO to guillaume: remove that todo above, because it is done, right?
        kmer_type lkmer,lkmer_length,lkmer_temp,exp;
        long it_zero = 0;
        OAHash * hash;
        int p,s;
#if OMP
        //omp_set_numthreads(2); //num_threads(2)
        //if(!output_histo) num_threads(nb_threads)
#pragma omp parallel for private (p,s,lkmer,lkmer_length,hash,lkmer_temp,exp) num_threads(nb_threads)
#endif
        // load and sort each partition to output solid kmers
        for ( p=0; p<nb_partitions; p++)
        {
            char temp_kmer[256]; // bug check code
            int zero;
            kmer_type lkmer_revcomp; // to store revcomps

            // compute whether hashing should be used or not for this partition
            bool use_hashing_for_this_partition = use_hashing;
            if(hybrid_mode)
            {
                if( (redundant_partitions_file[p]->nb_elements()*sizeof(kmer_type)) < (max_memory*1024LL*1024LL) ) // maintain totalKmers hashes for each partition file
                    use_hashing_for_this_partition = false;
                else
                    use_hashing_for_this_partition = true;
            }
            int tid = 0;
            //int s;
#if OMP
            tid = omp_get_thread_num();
#endif
            //use_hashing_for_this_partition = false; // to check the vector part of the code
            if (use_hashing_for_this_partition)
            {
                // hash the partition and save to the solid file
                hash = new OAHash(max_memory*1024LL*1024LL/2); // one hash to store all types of k-mer lengths
                uint64_t nkmers_read = 0;

                redundant_partitions_file[p]->read_element_buffered(&lkmer_length);
                while (redundant_partitions_file[p]->read_element_buffered(&lkmer))
                {
                    if(lkmer_length == Kmerlist[0]) // only add the largest k-mer
                        hash->increment(lkmer,convert_to_int(lkmer_length));
                    else
                    {
                        unordered_map<int,int>::const_iterator got = kmerlength_map.find(convert_to_int(lkmer_length));
                        exp = (((kmer_type)1)<<(got->second*2))-1;
                        lkmer_temp = lkmer & exp;
                        hash->increment(lkmer_temp,got->second);
                    }
                    if(!redundant_partitions_file[p]->read_element_buffered(&lkmer_length))
                        break;

                    nkmers_read++;
#if SINGLE_BAR
                    if(verbose==0 && nkmers_read==10000)
                    {
                        if (nb_threads == 1)
                            progress.inc(nkmers_read*sizeof(kmer_type));
                        else
                            progress.inc(nkmers_read*sizeof(kmer_type),tid);
                        nkmers_read = 0;
                    }
#endif
                }

                if (verbose >= 2)
                    printf("Pass %d/%d partition %d/%d hash load factor: %0.3f\n",current_pass+1,nb_passes,p+1,nb_partitions,hash->load_factor());

                for( s=0; s<totalKmers; s++)
                {
                    OAHash * temp_ = new OAHash(max_memory*1024LL*1024LL/2);
                    hash->start_iterator();
                    while (hash->next_iterator())
                    {
                        uint_abundance_t abundance = hash->iterator->value;
                        uint_abundance_t abund_tid = (current_pass+1)*100+p;
                        if(output_histo)
                        {
                            uint_abundance_t saturated_abundance;
                            saturated_abundance = (abundance >= 10000) ? 10000 : abundance;
#if OMP
                            histo_count_omp[tid][saturated_abundance]++;
#else
                            histo_count[saturated_abundance]++;
#endif
                        }
                        int length_kmer = hash->iterator->length;
                        lkmer = hash->iterator->key;
                        if (abundance >= nks && abundance <= max_couv && length_kmer == Kmerlist[s])
                        {
                            // write only if lkmer is the smaller of itself and its reverse complement
                            lkmer_revcomp = revcomp(lkmer,length_kmer);
                            if(lkmer < lkmer_revcomp)
                            {
                                SolidKmers[s]->write_element_buffered(&(hash->iterator->key),tid);
                                NbSolid_omp[tid]++;
                                if (write_count)
                                    SolidKmers[s]->write_buffered(&abundance, sizeof(abundance),tid, false);
                            }
                        }
                        distinct_kmers_per_partition[p]++;
                        if(s != totalKmers-1)
                        {
                            if(length_kmer == Kmerlist[s])
                            {
                                exp = (((kmer_type)1)<<(Kmerlist[s+1]*2))-1;
                                lkmer_temp = lkmer & exp;
                                temp_->increment_by_value(lkmer_temp,abundance,Kmerlist[s+1]);
                            }
                            else
                            {
                                temp_->increment_by_value(lkmer,abundance,length_kmer);
                            }
                        }
                    }
                    hash->~OAHash();
                    hash = temp_;
                }
                hash->~OAHash();
                //printf("All hashes closed and destroyed \n");
            }
            else
            {
                // this part does it in a slower fashion:
                // sort the partition and save to the solid file
                //vector < kmer_type > kmers;
                vector < kmer_type > kmers[totalKmers];
                uint64_t nkmers_read = 0;
                //int s=0;
                redundant_partitions_file[p]->read_element_buffered(&lkmer_length);
                while (redundant_partitions_file[p]->read_element_buffered (&lkmer))
                {
                    for(s=0; s<totalKmers; s++)
                    {
                        //kmer_type lkmer_temp;
                        //kmer_type exp;
                        if(lkmer_length < Kmerlist[s]) continue;
                        if(s==0)
                            kmers[s].push_back (lkmer);
                        else
                        {
                            exp = (((kmer_type)1)<<(Kmerlist[s]*2))-1;
                            lkmer_temp = lkmer & exp; // converting the kmer to its smaller equivalent in binary
                            kmers[s].push_back (lkmer_temp);
                        }
                    }
                    nkmers_read++;
                    if(!redundant_partitions_file[p]->read_element_buffered(&lkmer_length))
                        break; // added to get the next kmer length
#if SINGLE_BAR
                    if(verbose==0 && nkmers_read==10000)
                    {
                        if (nb_threads == 1)
                            progress.inc(nkmers_read*sizeof(kmer_type));
                        else
                            progress.inc(nkmers_read*sizeof(kmer_type),tid);
                        nkmers_read = 0;
                    }
#endif
                }

                for(s=0; s<totalKmers; s++)
                {
                    sort (kmers[s].begin (), kmers[s].end ());

                    kmer_type previous_kmer = *(kmers[s].begin ());
                    uint_abundance_t abundance = 0;
                    for (vector < kmer_type >::iterator it = kmers[s].begin (); it != kmers[s].end (); it++)
                    {
                        kmer_type current_kmer = *it;
                        if (current_kmer == previous_kmer)
                            abundance++;
                        else
                        {
                            if(output_histo)
                            {
                                uint_abundance_t saturated_abundance;
                                saturated_abundance = (abundance >= 10000) ? 10000 : abundance;
#if OMP
                                histo_count_omp[tid][saturated_abundance]++;
#else
                                histo_count[saturated_abundance]++;
#endif
                            }
                            if (abundance >= nks && abundance <= max_couv)
                            {
                                NbSolid_omp[tid]++;
                                SolidKmers[s]->write_element_buffered(&previous_kmer,tid);
                                if (write_count)
                                    SolidKmers[s]->write_buffered(&abundance, sizeof(abundance),tid, false);
                            }
                            abundance = 1;
                            distinct_kmers_per_partition[p]++;
                        }
                        previous_kmer = current_kmer;
                    }

                    // last kmer
                    distinct_kmers_per_partition[p]++;
                    if(output_histo)
                    {
                        uint_abundance_t saturated_abundance;
                        saturated_abundance = (abundance >= 10000) ? 10000 : abundance;
#if OMP
                        histo_count_omp[tid][saturated_abundance]++;
#else
                        histo_count[saturated_abundance]++;
#endif
                    }
                    if (abundance >= nks && abundance <= max_couv)
                    {
                        NbSolid_omp[tid]++;
                        SolidKmers[s]->write_element_buffered(&previous_kmer,tid);
                        if (write_count)
                            SolidKmers[s]->write_buffered(&abundance, sizeof(abundance),tid, false);
                    }
                }
            }
            //printf("Done writing kmers for all K \n");

            if (verbose >= 1)
                fprintf(stderr,"%cPass %d/%d, loaded and sorted partition %d/%d, found %lld solid kmers so far",13,current_pass+1,nb_passes,p+1,nb_partitions,(long long)(NbSolid_omp[tid]));
            //printf("Done writing kmers for all K %d check 1 \n",p);
            if (verbose >= 2)
                printf("\nPass %d/%d partition %d/%d %ld distinct kmers\n",current_pass+1,nb_passes,p+1,nb_partitions,/*total_kmers_per_partition[p],*/distinct_kmers_per_partition[p]);

#if !SINGLE_BAR
            if (verbose == 0 && nb_threads==1)
                progress.inc(1);
            else if (verbose == 0 && nb_threads>1)
                progress.inc(1,tid);
#endif
            //if(redundant_partitions_file[p]->find_error()) {
            //    printf("Error in the binary file \n");
            //}
            redundant_partitions_file[p]->close();
            remove(redundant_filename[p]);
        } // end for partitions

#if OMP
        // merge histo
        if(output_histo)
        {
            for (int cc=1; cc<10001; cc++)
            {
                uint64_t sum_omp = 0;
                for(int ii=0; ii<nb_threads; ii++)
                {
                    sum_omp += histo_count_omp[ii][cc];
                }
                histo_count[cc] = sum_omp;
            }
        }
#endif

#if !SINGLE_BAR
        if (verbose == 0 && nb_threads == 1)
            progress.finish();
        else if (verbose == 0 && nb_threads > 1)
            progress.finish_threaded();
#endif

        if (verbose) fprintf(stderr,"\n");

        if (verbose >= 2)
        {
            STOPWALL(debtri,"Reading and sorting partitions");
            STOPWALL(debpass,"Pass total");
        }
        //printf("Done writing kmers for all K check 4 \n");
        if(use_compressed_reads)
            binread->close();

        // delete partition files
        for (uint32_t p=0; p<nb_partitions; p++)
        {
            delete redundant_partitions_file[p];
        }
    }
    //printf("Done writing kmers for all K check 5 \n");

    // single bar
#if SINGLE_BAR
    if (verbose == 0 && nb_threads == 1)
        progress.finish();
    else if (verbose == 0 && nb_threads > 1)
        progress.finish_threaded();
#endif

    if(output_histo)
    {
        FILE * histo_file = fopen(return_file_name(histo_file_name),"w");
        for (int cc=1; cc<10001; cc++)
        {
            fprintf(histo_file,"%i\t%llu\n",cc,(unsigned long long)(histo_count[cc]));
        }
        fclose(histo_file);
    }
    free(histo_count);

    NbSolid = NbSolid_omp[0];
#if OMP
    NbSolid = 0;
    for(int ii=0; ii<nb_threads; ii++)
    {
        NbSolid += NbSolid_omp[ii];
    }
#endif

    for ( int s=0; s<totalKmers; s++)
        SolidKmers[s]->close();
    printf("\nSaved %lld solid kmers\n",(long long)NbSolid);
    rmdir(temp_dir);

    STOPWALL(count,"Counted kmers");
    fprintf(stderr,"\n------------------ Counted kmers and kept those with abundance >=%i \n",nks);
}
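/*
 * Illustrative sketch (not part of the build): the pass/partition arithmetic
 * used at the top of sorting_count(), stripped of the multi-k and hashing
 * adjustments. Given an estimated k-mer volume (MB), a disk budget and a
 * memory budget, each pass must fit on disk and each partition must fit in
 * RAM, and the partition count is kept below the open-file limit by adding
 * passes. The function name and the non-zero-budget assumption are for
 * illustration only. Requires <cstdint>.
 */
#if 0
static void plan_passes_and_partitions(uint64_t volume_MB, uint64_t max_disk_MB,
                                       uint64_t max_memory_MB, uint64_t max_open_files,
                                       uint64_t &nb_passes, uint64_t &nb_partitions)
{
    nb_passes = volume_MB / max_disk_MB + 1;                  // each pass writes at most max_disk_MB to disk
    for (;;)
    {
        uint64_t volume_per_pass = volume_MB / nb_passes;
        nb_partitions = volume_per_pass / max_memory_MB + 1;  // each partition is sorted/hashed in RAM
        if (nb_partitions < max_open_files)                   // all partition files are open at once
            break;
        nb_passes++;                                          // too many files: spread the volume over more passes
    }
}
#endif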
inline void assemble()
{
    //////-------------------------------------------------------------------------------------------
    fprintf (stderr,"______________________________________________________ \n");
    fprintf (stderr,"___________ Assemble from bloom filter _______________ \n");
    fprintf (stderr,"______________________________________________________ \n\n");
    //////-------------------------------------------------------------------------------------------

    long long len_left = 0;
    long long len_right = 0;
    long long contig_len = 0;
    long long maxlen = 10000000;

    char *left_traversal  = (char *) malloc(maxlen*sizeof(char));
    char *right_traversal = (char *) malloc(maxlen*sizeof(char));
    char *contig          = (char *) malloc(2*(maxlen+sizeKmer)*sizeof(char));
    kmer_type kmer;

    long long nbContig = 0;
    long long nbSmallContig = 0;
    long long totalnt = 0;
    long long max_contig_len = 0;
    long long mlenleft = 0, mlenright = 0;
    int64_t NbBranchingKmer = 0;
    char kmer_seq[sizeKmer+1];
    FILE * file_assembly = fopen(return_file_name(assembly_file),"w+");

    BinaryBank *SolidKmers = new BinaryBank(return_file_name(solid_kmers_file),sizeof(kmer_type),0);

    STARTWALL(assembly);

    char *assemble_only_one_region = NULL; // debugging: set to an ASCII kmer to activate, NULL to deactivate
    bool LOAD_BRANCHING_KMERS = false;     // debugging
    bool DUMP_BRANCHING_KMERS = false;

    BranchingTerminator *terminator;

    if (LOAD_BRANCHING_KMERS)
    {
        BinaryBank *BranchingKmers = new BinaryBank(return_file_name(branching_kmers_file),sizeof(kmer_type),false);
        terminator = new BranchingTerminator(BranchingKmers,SolidKmers, bloo1,false_positives);
        BranchingKmers->close();
    }
    else
        terminator = new BranchingTerminator(SolidKmers,genome_size, bloo1,false_positives);

    if (DUMP_BRANCHING_KMERS)
    {
        BinaryBank *BranchingKmers = new BinaryBank(return_file_name(branching_kmers_file),sizeof(kmer_type),true);
        terminator->dump_branching_kmers(BranchingKmers);
        BranchingKmers->close();
    }

#ifdef UNITIG
    SimplePathsTraversal *traversal = new SimplePathsTraversal(bloo1,false_positives,terminator);
    fprintf (stderr,"_________________Assembling in Unitig mode ..._____________________ \n\n");
#else
    MonumentTraversal *traversal = new MonumentTraversal(bloo1,false_positives,terminator);
#endif
    //RandomBranchingTraversal *traversal = new RandomBranchingTraversal(bloo1,false_positives,terminator);
    traversal->set_maxlen(maxlen);
    traversal->set_max_depth(500);
    traversal->set_max_breadth(20);

    while (terminator->next(&kmer))
    {
        // keep looping while a starting kmer is available from this kmer
        // everything will be marked during the traversals
        kmer_type starting_kmer;
#ifdef UNITIG
        while (traversal->get_new_starting_node_improved(kmer,starting_kmer))
#else
        while (traversal->find_starting_kmer(kmer,starting_kmer))
#endif
        {
            code2seq(starting_kmer,kmer_seq); // convert the starting kmer to a nucleotide seq
            traversal->revert_stats();        // reset stats to the last commit (discard stats from find_starting_kmer / small contigs)

            if (assemble_only_one_region != NULL)
            {
                kmer_type dummy;
                starting_kmer = extractKmerFromRead(assemble_only_one_region,0,&kmer,&dummy,false);
            }

            // right extension
            len_right = traversal->traverse(starting_kmer,right_traversal,0);
            mlenright = max(len_right,mlenright);

            // left extension, equivalent to the right extension of the revcomp
            len_left = traversal->traverse(starting_kmer,left_traversal,1);
            mlenleft = max(len_left,mlenleft);

            // form the contig
            revcomp_sequence(left_traversal,len_left);
            strcpy(contig,left_traversal);   // contig = revcomp(left_traversal)
            strcat(contig,kmer_seq);         //        + starting_kmer
            strcat(contig,right_traversal);  //        + right_traversal
            contig_len = len_left+len_right+sizeKmer;

            // save the contig
            if(contig_len >= MIN_CONTIG_SIZE)
            {
                max_contig_len = max(max_contig_len,contig_len);
                fprintf(file_assembly,">%lli__len__%lli \n",nbContig,contig_len);
                fprintf(file_assembly,"%s\n",contig);
                nbContig++;
                totalnt += contig_len;
                traversal->commit_stats();
            }
            else
            {
                traversal->revert_stats();
                nbSmallContig++;
            }
            if (assemble_only_one_region != NULL)
                break;
        }

        NbBranchingKmer++;
        if ((NbBranchingKmer%300)==0)
            fprintf (stderr,"%cLooping through branching kmer n° %lld / %lld  total nt %lld ",13,(long long int) NbBranchingKmer,(long long int) terminator->nb_branching_kmers, (long long int)totalnt );

        if (nbContig > 0 && assemble_only_one_region != NULL)
            break;
    }
    fclose(file_assembly);

    fprintf (stderr,"\n Total nt assembled %lli  nbContig %lli\n",totalnt,nbContig);
    fprintf (stderr," Max contig len %lli (debug: max len left %lli, max len right %lli)\n",max_contig_len,mlenleft,mlenright);
    fprintf (stderr,"\n Debug traversal stats: %ld ends of contigs (%lld unsaved small contigs), among them:\n",traversal->final_stats.ended_traversals,nbSmallContig);
    fprintf (stderr," %ld couldn't validate consensuses\n",traversal->final_stats.couldnt_validate_consensuses);
    fprintf (stderr," %ld large bubble breadth, %ld large bubble depth, %ld marked kmer, %ld no extension\n",traversal->final_stats.couldnt_traverse_bubble_breadth,traversal->final_stats.couldnt_traverse_bubble_depth,traversal->final_stats.couldnt_because_marked_kmer,traversal->final_stats.couldnt_find_extension);
    fprintf (stderr," %ld in-branching large depth, %ld in-branching large breadth, %ld in-branching other\n",traversal->final_stats.couldnt_inbranching_depth,traversal->final_stats.couldnt_inbranching_breadth,traversal->final_stats.couldnt_inbranching_other);

    STOPWALL(assembly,"Assembly");

    free(left_traversal);
    free(right_traversal);
    free(contig);
    SolidKmers->close();
}
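/*
 * Illustrative sketch (not part of the build): how assemble() stitches a
 * contig together from the two traversals. The left extension is traversed on
 * the reverse strand, so it is reverse-complemented before being prepended to
 * the starting k-mer and the right extension. std::string is used here
 * instead of the raw char buffers, and reverse_complement() is a hypothetical
 * stand-in for revcomp_sequence(). Requires <string>, <algorithm>.
 */
#if 0
static std::string reverse_complement(std::string s)
{
    std::reverse(s.begin(), s.end());
    for (char &c : s)
        switch (c) { case 'A': c='T'; break; case 'T': c='A'; break;
                     case 'C': c='G'; break; case 'G': c='C'; break; }
    return s;
}

static std::string build_contig(const std::string &left_traversal,
                                const std::string &starting_kmer,
                                const std::string &right_traversal)
{
    // contig = revcomp(left) + starting_kmer + right, so contig_len = len_left + k + len_right
    return reverse_complement(left_traversal) + starting_kmer + right_traversal;
}
#endif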
int main(int argc, char *argv[])
{
    if(argc < 6)
    {
        fprintf (stderr,"usage:\n");
        fprintf (stderr," %s input_file kmer_size min_abundance estimated_genome_size prefix\n",argv[0]);
        fprintf (stderr,"hints:\n min_abundance ~ 3\n estimated_genome_size is in bp, does not need to be accurate, only controls memory usage\n prefix is any name you want the results to start with\n");
        return 1;
    }

    bool FOUR_BLOOM_VERSION = true;

    // shortcuts to go directly to assembly using the serialized bloom and serialized hash
    int START_FROM_SOLID_KMERS = 0;    // if = 0, construct the fasta file of solid kmers; if = 1, start directly from that file
    int LOAD_FALSE_POSITIVE_KMERS = 0; // if = 0, construct the fasta file of false positive kmers (debloom); if = 1, load that file into the hashtable
    int NO_FALSE_POSITIVES_AT_ALL = 0; // if = 0, normal behavior; if = 1, don't load false positives (will be a probabilistic de bruijn graph)
    int max_disk_space = 0;            // let dsk decide

    for (int n_a = 6; n_a < argc ; n_a++)
    {
        if (strcmp(argv[n_a],"--original") == 0)
            FOUR_BLOOM_VERSION = false;

        if (strcmp(argv[n_a],"--dont-count")==0)
            START_FROM_SOLID_KMERS = 1;

        if (strcmp(argv[n_a],"--dont-debloom")==0)
            LOAD_FALSE_POSITIVE_KMERS = 1;

        if (strcmp(argv[n_a],"--just-assemble")==0)
        {
            START_FROM_SOLID_KMERS = 1;
            LOAD_FALSE_POSITIVE_KMERS = 1;
        }

        if (strcmp(argv[n_a],"--titus-mode")==0)
            NO_FALSE_POSITIVES_AT_ALL = 1;

        if (strcmp(argv[n_a],"-d")==0)
            max_disk_space = atoi(argv[n_a+1]);

        if (strcmp(argv[n_a],"-maxc")==0)
            max_couv = atoi(argv[n_a+1]);

        if (strcmp(argv[n_a],"--le-changement")==0)
            {printf("c'est maintenant!\n"); exit(0);}
    }

    // kmer size
    sizeKmer = 27; // let's keep it odd for now, because i haven't thought of how to handle palindromes (don't want to stop on them)
    if(argc >= 3)
    {
        sizeKmer = atoi(argv[2]);
        if (sizeKmer%2==0)
        {
            sizeKmer -= 1;
            printf("Need odd kmer size to avoid palindromes. I've set kmer size to %d.\n",sizeKmer);
        }
        if (sizeKmer > ((int)sizeof(kmer_type)*4))
        {
            printf("Max kmer size on this compiled version is %lu\n",sizeof(kmer_type)*4);
            exit(1);
        }
    }

    if (sizeKmer == (int)(sizeof(kmer_type)*4))
        kmerMask = -1;
    else
        kmerMask = (((kmer_type)1)<<(sizeKmer*2))-1;

    double lg2 = log(2);

    if (sizeKmer > 128)
    {
        FOUR_BLOOM_VERSION = false;
        printf("Reverted to single Bloom filter implementation for k>128\n");
    }

    if (!FOUR_BLOOM_VERSION)
        NBITS_PER_KMER = log(16*sizeKmer*(lg2*lg2))/(lg2*lg2); // needed to process argv[5]
    else
        NBITS_PER_KMER = rvalues[sizeKmer][1];

    // solidity
    nks = NNKS;
    if(argc >= 4)
    {
        nks = atoi(argv[3]);
        if (nks==0) nks=1; // min abundance can't be 0
    }

    if(argc >= 5)
    {
        genome_size = atoll(argv[4]);
        // int estimated_bloom_size = max( (int)ceilf(log2f(genome_size * NBITS_PER_KMER )), 1);
        uint64_t estimated_bloom_size = (uint64_t) (genome_size * NBITS_PER_KMER);

        uint64_t estimated_nb_FP = (uint64_t)(genome_size * 4 * powf(0.6,11)); // just indicative

        //max_memory = max( (1LL << estimated_bloom_size)/8LL /1024LL/1024LL, 1LL );
        max_memory = max((int64_t) estimated_bloom_size/8LL /1024LL/1024LL, 1LL);

        printf("estimated values: nbits Bloom %lli, nb FP %lld, max memory %i MB\n",estimated_bloom_size,estimated_nb_FP,max_memory);
    }

    // output prefix
    if(argc >= 6)
    {
        strcpy(prefix,argv[5]);
    }

    fprintf (stderr,"cell size %lu \n", sizeof(cell<kmer_type>));

    STARTWALL(0);

    Bank *Reads = new Bank(argv[1]);

    // count kmers, write solid kmers to disk
    if (!START_FROM_SOLID_KMERS)
    {
        int verbose = 0;
        bool write_count = false;
        bool skip_binary_conversion = false;

        sorting_count(Reads,prefix,max_memory,max_disk_space,write_count,verbose, skip_binary_conversion);
    }

    // debloom, write false positives to disk, insert them into false_positives
    if (! LOAD_FALSE_POSITIVE_KMERS)
    {
        debloom(order, max_memory);
    }

    bloo1 = bloom_create_bloo1((BloomCpt *)NULL, false);

    if (! NO_FALSE_POSITIVES_AT_ALL)
    {
        // load false positives from disk into false_positives
        if (!FOUR_BLOOM_VERSION)
            false_positives = load_false_positives();
        else
            false_positives = load_false_positives_cascading4();
    }
    else
    {
        // titus mode: no FP's
        false_positives = dummy_false_positives();
    }

    //return 1;
    assemble();

    STOPWALL(0,"Total");

    delete Reads;
    return 0;
}