Пример #1
0
int debloom(int order, int max_memory)
{
    // read bloo1 from disk dump
    Bloom *bloo1 = bloom_create_bloo1((BloomCpt *)NULL);

    STARTWALL(pos);

    FILE * debloom_file = fopen(return_file_name("debloom"),"wb+");
    FILE * debloom_file_2 = fopen(return_file_name("debloom2"),"wb+");
    FILE * F_tmp;
    
    F_debloom_read = debloom_file;
    F_debloom_write = debloom_file_2;
	
    BinaryBank *SolidKmers = new BinaryBank(return_file_name(solid_kmers_file),sizeof(kmer_type),0);
    
    uint64_t cc=0;
    kmer_type new_graine, kmer;
    int nt;
   
    uint64_t NbSolidKmer =0;
    // write all positive extensions in disk file
    while (SolidKmers->read_element(&kmer))
    {

        //8 right extensions   (4F and 4R); left extensions are redundant by revcomplementation
        for(nt=0; nt<4; nt++) 
        {
            int strand;
            for (strand = 0; strand < 2 ; strand++)
            {
                int current_strand = strand;
                new_graine = next_kmer(kmer,nt, &current_strand);

                if(bloo1->contains(new_graine)){   // extension is positive

                    // maybe do more lax deblooming; if it's a dead-end, it's no big deal, don't pass it to the false positive test
                    // what would have been needed if i decided to enable order>0 (but actually this won't happen): 
                    //  - better estimate of structure size in the presence of order>0 deblooming  
                    if (order == 1)  // this case just detects tips
                    {printf("ORDER==1");
                        bool is_linked = false;
                        for(int tip_nt=0; tip_nt<4; tip_nt++) 
                        {
                            int new_strand = current_strand;
                            kmer_type kmer_after_possible_tip = next_kmer(new_graine,tip_nt, &new_strand);
                            if(bloo1->contains(kmer_after_possible_tip))
                            {
                                is_linked = true;
                                break;
                            }
                        }
                        if (!is_linked)
                            continue; // it's a tip, because it's linked to nothing
                    }
    
                    if (order > 1) // general case. should work for order = 1, but i coded an optimized version above
                    { printf("ORDER>1");
                        Frontline frontline( new_graine, current_strand, bloo1, NULL, NULL, NULL);
                        while (frontline.depth < order)
                        {
                            frontline.go_next_depth();
                            if (frontline.size() == 0)
                                break;
                            // don't allow a breadth too large anywqy
                            if (frontline.size()> 10)
                                break;
                        }
                        if (frontline.size() == 0)
                            continue; // it's a deadend
                    }

                    if (!fwrite(&new_graine, sizeof(new_graine), 1, debloom_file))
                    {
                        printf("error: can't fwrite (disk full?)\n");
                        exit(1);
                    }
                    cc++;
                }

            }
        }
        NbSolidKmer++;
        if ((NbSolidKmer%table_print_frequency)==0) fprintf (stderr,"%c Writing positive Bloom Kmers %lld",13,NbSolidKmer);
    }
    nbkmers_solid =  NbSolidKmer; // GUS: it's global now

    fprintf(stderr,"\n%lli kmers written\n",cc);

    STOPWALL(pos,"Write all positive kmers");

    STARTWALL(deb);

    double bl1tai =  (double)bloo1->tai ;
    delete bloo1;

    // now that bloo1 is deleted, initialize hasht1
    int NBITS_HT = max( (int)ceilf(log2f((0.1*max_memory*1024L*1024L)/sizeof(cell_ptr_t))), 1); // set hasht1 cells to occupy 0.1 * [as much mem as poss]
    hasht1 =new Hash16(NBITS_HT); 
    
    ////////////////////////////////////////////////////////////////   --find false positive, with hash table partitioning
    uint64_t max_kmer_per_part = (uint64_t) (0.8*max_memory*1024LL*1024LL /sizeof(cell<kmer_type>));
    //adapter taille ht en fonction
    

    printf("%d partitions will be needed\n",(int)(nbkmers_solid/max_kmer_per_part));

    NbSolidKmer =0;
    int numpart = 0;
    SolidKmers->rewind_all();

    // deblooming:
    // read the list of (non-redundant) solid kmers and load it, in chunks, into a hash table
    // at each pass, check all the positive extensions and keep those which are not indicated, by the current chunk, as solid kmers
    // at the end, only the positive extensions which are not solid are kept
    while (SolidKmers->read_element(&kmer))
    {
        hasht1->add(kmer);

        NbSolidKmer++;
        if ((NbSolidKmer%table_print_frequency)==0) fprintf (stderr,"%cBuild Hash table %lld",13,NbSolidKmer);

        if(hasht1->nb_elem >max_kmer_per_part) //end partition,  find false positives
        {
            fprintf(stderr,"End of debloom partition  %lli / %lld \n",hasht1->nb_elem,max_kmer_per_part);

            end_debloom_partition(false);

            //swap file pointers
            F_tmp = F_debloom_read;
            F_debloom_read = F_debloom_write;
            F_debloom_write = F_tmp;
            /////////end write files

            //reset hash table
            hasht1->empty_all();

            fprintf(stderr,"\n%lli false positives written , partition %i \n",n_false_positives,numpart);

            numpart++;
        } ///end partition


    }
    fprintf(stderr,"Nb kmers stored in the bloom table %lld\n",nbkmers_solid);


    ///////////////////////// last partition, will write all the FP's to the good file

    end_debloom_partition(true); 

    /////////end write files


    fprintf(stderr,"Total nb false positives stored in the Debloom hashtable %lli \n",n_false_positives);

    delete hasht1;


    STOPWALL(deb,"Debloom");
 
    // GUS: will use to output summary later
    b1_size = (uint64_t) bl1tai;
  
    fclose(debloom_file);
    fclose(debloom_file_2);
    SolidKmers->close();


    return 1;

}
Пример #2
0
Bloom *bloom_create_bloo1(T *bloom_counter)
{
    return bloom_create_bloo1(bloom_counter, false);
}
int main(int argc, char *argv[])
{
    
    if(argc <  6)
    {
        fprintf (stderr,"usage:\n");
        fprintf (stderr," %s input_file kmer_size min_abundance estimated_genome_size prefix\n",argv[0]);
        fprintf (stderr,"hints:\n min_abundance ~ 3\n estimated_genome_size is in bp, does not need to be accurate, only controls memory usage\n prefix is any name you want the results to start with\n");

        return 1;
    }

    bool FOUR_BLOOM_VERSION = true;

     // shortcuts to go directly to assembly using serialized bloom and serialized hash
    int START_FROM_SOLID_KMERS=0; // if = 0, construct the fasta file of solid kmers, if = 1, start directly from that file 
    int LOAD_FALSE_POSITIVE_KMERS=0; // if = 0, construct the fasta file of false positive kmers (debloom), if = 1, load that file into the hashtable
    int NO_FALSE_POSITIVES_AT_ALL=0; // if = 0, normal behavior, if = 1, don't load false positives (will be a probabilistic de bruijn graph)
    int max_disk_space = 0;// let dsk decide
    for (int n_a = 6; n_a < argc ; n_a++)
    {
        if (strcmp(argv[n_a],"--original") == 0)
    	    FOUR_BLOOM_VERSION = false;

        if (strcmp(argv[n_a],"--dont-count")==0)
            START_FROM_SOLID_KMERS = 1;

        if (strcmp(argv[n_a],"--dont-debloom")==0)
            LOAD_FALSE_POSITIVE_KMERS = 1;

        if (strcmp(argv[n_a],"--just-assemble")==0)
        {
            START_FROM_SOLID_KMERS = 1;
            LOAD_FALSE_POSITIVE_KMERS = 1;
        }

        if (strcmp(argv[n_a],"--titus-mode")==0)
            NO_FALSE_POSITIVES_AT_ALL = 1;
        
        
        if (strcmp(argv[n_a],"-d")==0)
            max_disk_space = atoi(argv[n_a+1]);
        
        
        if (strcmp(argv[n_a],"-maxc")==0)
	    max_couv = atoi(argv[n_a+1]);
        
        if (strcmp(argv[n_a],"--le-changement")==0)
            {printf("c'est maintenant!\n");exit(0);}
    }


    // kmer size
    sizeKmer=27; // let's make it even for now, because i havnt thought of how to handle palindromes (dont want to stop on them)
    if(argc >=  3)
    {
        sizeKmer = atoi(argv[2]);
        if (sizeKmer%2==0)
        {
            sizeKmer-=1;
            printf("Need odd kmer size to avoid palindromes. I've set kmer size to %d.\n",sizeKmer);
        }
        if (sizeKmer>((int)sizeof(kmer_type)*4))
        {
            printf("Max kmer size on this compiled version is %lu\n",sizeof(kmer_type)*4);
            exit(1);
        }
    }

    if (sizeKmer == (int)(sizeof(kmer_type)*4))
        kmerMask = -1;
    else
        kmerMask=(((kmer_type)1)<<(sizeKmer*2))-1;

    double lg2 = log(2);
   
    if (sizeKmer > 128)
    {
        FOUR_BLOOM_VERSION = false;
        printf("Reverted to single Bloom filter implementation for k>128\n");
    }

    if (!FOUR_BLOOM_VERSION) 
      NBITS_PER_KMER = log(16*sizeKmer*(lg2*lg2))/(lg2*lg2); // needed to process argv[5]
    else 
      NBITS_PER_KMER = rvalues[sizeKmer][1];

    // solidity 
    nks =NNKS;
    if(argc >=  4)
    {
        nks = atoi(argv[3]);
        if (nks==0) nks=1; // min abundance can't be 0
    }


   if(argc >=  5)
    {
       genome_size  = atoll(argv[4]);
      // int estimated_bloom_size = max( (int)ceilf(log2f(genome_size * NBITS_PER_KMER )), 1);
        uint64_t estimated_bloom_size = (uint64_t) (genome_size * NBITS_PER_KMER);

       uint64_t estimated_nb_FP =  (uint64_t)(genome_size * 4 * powf(0.6,11)); // just indicative
    
       //max_memory = max( (1LL << estimated_bloom_size)/8LL /1024LL/1024LL, 1LL );
        max_memory =  max((int64_t) estimated_bloom_size/8LL /1024LL/1024LL,1LL);

      printf("estimated values: nbits Bloom %lli, nb FP %lld, max memory %i MB\n",estimated_bloom_size,estimated_nb_FP,max_memory);

    }

    // output prefix
    if(argc >=  6)
    {
        strcpy(prefix,argv[5]);
    }

   


    fprintf (stderr,"taille cell %lu \n", sizeof(cell<kmer_type>));

    STARTWALL(0);

    Bank *Reads = new Bank(argv[1]);
    
    // counter kmers, write solid kmers to disk
    if (!START_FROM_SOLID_KMERS)
    {
        int verbose = 0;
        bool write_count = false;
        bool skip_binary_conversion = false;

        sorting_count(Reads,prefix,max_memory,max_disk_space,write_count,verbose, skip_binary_conversion);
    }

    // debloom, write false positives to disk, insert them into false_positives
    if (! LOAD_FALSE_POSITIVE_KMERS)
    {
        debloom(order, max_memory);
    }
    
    bloo1 = bloom_create_bloo1((BloomCpt *)NULL, false);

    if (! NO_FALSE_POSITIVES_AT_ALL)
    {
        // load false positives from disk into false_positives
        if (!FOUR_BLOOM_VERSION) 
            false_positives = load_false_positives();
	else
	    false_positives = load_false_positives_cascading4();
    }
    else
    {
        // titus mode: no FP's
        false_positives = dummy_false_positives();
    }

    //  return 1;
    assemble(); 

    STOPWALL(0,"Total");

    delete Reads;
    return 0;
}