コード例 #1
0
ファイル: Utils.cpp プロジェクト: goshng/cocoa
/**
 * Estimate the number of distinct k-mers in a read bank by extrapolation.
 *
 * K-mers extracted from the reads are added to a LinearCounter created with
 * a size of nbytes_memory * 8. Every `stops` inserted k-mers the current
 * count is scaled by estimated_nb_reads / NbRead. Reading stops once two
 * successive extrapolations differ by less than 5%, or once the counter
 * reports that it is no longer accurate. In the latter case the whole
 * procedure restarts (recursively) with twice the memory.
 *
 * Note: `readlen` and `sizeKmer` are not declared in this function; they are
 * taken from the enclosing scope.
 *
 * @param nbytes_memory  memory budget in bytes used to size the counter
 * @param Reads          read bank; it is rewound before being scanned
 * @return the extrapolated number of distinct k-mers (a very rough estimate),
 *         or 0 when the bank yields no read at all
 */
uint64_t extrapolate_distinct_kmers_wrapped(unsigned long nbytes_memory, Bank *Reads)
{
    unsigned long size_linearCounter = nbytes_memory * 8L; // alloc 8 bits * nbytes for counting
    LinearCounter *linearCounter = new LinearCounter(size_linearCounter);
    const int stops = 100000; // re-evaluate the extrapolation every `stops` inserted k-mers

    // variant of bloom_pass_reads

    int64_t NbRead = 0;
    int64_t NbInsertedKmers = 0;
    Reads->rewind_all();
    char * rseq;
    long i;
    kmer_type kmer, graine, graine_revcomp;

    long nb_distinct_kmers = 0;
    long previous_nb_distinct_kmers = 0;
    uint64_t estimated_nb_reads = Reads->estimate_nb_reads();
    bool stop = false;

    while (Reads->get_next_seq(&rseq,&readlen))
    {
        if (stop)
            break;

        for (i=0; i<readlen-sizeKmer+1; i++)
        {
            kmer = extractKmerFromRead(rseq,i,&graine,&graine_revcomp);

            linearCounter->add(kmer);
            NbInsertedKmers++;

            // NbRead != 0 guards the division below while the first read is being processed
            if (NbInsertedKmers % stops == 0 && NbRead != 0)
            {
                previous_nb_distinct_kmers = nb_distinct_kmers;
                nb_distinct_kmers = linearCounter->count()*estimated_nb_reads/NbRead;

                // the following condition will grossly over-estimate the number of distinct kmers
                // I expect the correct result to be in the same order of magnitude
                // 5% error
                // The difference is kept in a long: truncating it to int could wrap
                // on large data sets and make the test succeed by accident.
                long delta = nb_distinct_kmers - previous_nb_distinct_kmers;
                if (delta < 0)
                    delta = -delta;
                if (delta < previous_nb_distinct_kmers/20)
                    stop = true;

                if (!linearCounter->is_accurate())
                    stop = true;
            }
        }
        NbRead++;
        if ((NbRead%10000)==0)
            fprintf(stderr,"%cExtrapolating number of distinct kmers %lld",13,(long long)NbRead);
    }

    // An empty bank would otherwise divide by zero in the final extrapolation
    // (and could restart forever if the counter reported itself inaccurate).
    if (NbRead == 0)
    {
        fprintf(stderr,"No reads found, cannot extrapolate the number of distinct kmers\n");
        delete linearCounter;
        return 0;
    }

    if (!linearCounter->is_accurate())
    {
        printf("Inaccurate estimation, restarting with %lu MB RAM\n",(2*nbytes_memory)/1024/1024);
        delete linearCounter;
        return extrapolate_distinct_kmers_wrapped(2*nbytes_memory, Reads);
    }
    nb_distinct_kmers = linearCounter->count()*estimated_nb_reads/NbRead; // this is a very rough estimation

    printf("Linear estimation: ~%ld M distinct kmers are in the reads\n",nb_distinct_kmers/1000000L);
    delete linearCounter;
    return nb_distinct_kmers;
}