uint64_t extrapolate_distinct_kmers_wrapped(unsigned long nbytes_memory, Bank *Reads) { unsigned long size_linearCounter = nbytes_memory * 8L; // alloc 8 bits * nbytes for counting LinearCounter *linearCounter = new LinearCounter(size_linearCounter); int stops = 100000; // variant of bloom_pass_reads int64_t NbRead = 0; int64_t NbInsertedKmers = 0; Reads->rewind_all(); char * rseq; long i; kmer_type kmer, graine, graine_revcomp; long nb_distinct_kmers = 0; long previous_nb_distinct_kmers = 0; uint64_t estimated_nb_reads = Reads->estimate_nb_reads(); bool stop = false; while (Reads->get_next_seq(&rseq,&readlen)) { if (stop) break; for (i=0; i<readlen-sizeKmer+1; i++) { kmer = extractKmerFromRead(rseq,i,&graine,&graine_revcomp); linearCounter->add(kmer); NbInsertedKmers++; if (NbInsertedKmers % stops == 0 && NbRead != 0) { previous_nb_distinct_kmers = nb_distinct_kmers; nb_distinct_kmers = linearCounter->count()*estimated_nb_reads/NbRead; //printf("estimated now: %ld\n",nb_distinct_kmers); // the following condition will grossly over-estimate the number of distinct kmers // I expect the correct result to be in the same order of magnitude // 5% error if (abs((int)(nb_distinct_kmers-previous_nb_distinct_kmers)) < previous_nb_distinct_kmers/20) stop = true; if (!linearCounter->is_accurate()) stop = true; } } NbRead++; if ((NbRead%10000)==0) fprintf (stderr,(char*)"%cExtrapolating number of distinct kmers %lld",13,NbRead); } if (!linearCounter->is_accurate()) { printf("Inaccurate estimation, restarting with %d MB RAM\n",(2*nbytes_memory)/1024/1024); delete linearCounter; return extrapolate_distinct_kmers_wrapped(2*nbytes_memory, Reads); } nb_distinct_kmers = linearCounter->count()*estimated_nb_reads/NbRead; // this is a very rough estimation printf("Linear estimation: ~%ld M distinct kmers are in the reads\n",nb_distinct_kmers/1000000L); delete linearCounter; return nb_distinct_kmers; }