Beispiel #1
0
// Rebuild kmer_links from the node sequences stored in `linear_seqs_name`.
//
// For every node, the (k-1)-mer found at offset 0 and the one found at offset
// (length - (k-1)) are each registered in kmer_links together with the node id,
// the side (LEFT / RIGHT) and a strand: FW when the value returned by
// extractKmerFromRead equals the forward k-mer it reports, RC otherwise.
// Node ids are handed out sequentially, starting at first_id_els.node.
//
// sizeKmer is temporarily decremented while the file is scanned and restored
// before returning.
void GraphOutput::load_nodes_extremities(string linear_seqs_name)
{
    // drop whatever links were stored by a previous call
    kmer_links.clear();

    Bank *node_bank = new Bank((char *)linear_seqs_name.c_str());

    char *node_seq;
    int node_len;

    // node extremities overlap by (k-1)-mers, so (k-1)-mers are what gets extracted
    sizeKmer--;

    for (long node_id = first_id_els.node; node_bank->get_next_seq(&node_seq, &node_len); node_id++)
    {
        kmer_type head, tail, head_fw, head_rc, tail_fw, tail_rc;

        head = extractKmerFromRead(node_seq, 0, &head_fw, &head_rc, false);
        tail = extractKmerFromRead(node_seq, node_len - sizeKmer, &tail_fw, &tail_rc, false);

        Strand head_strand = RC;
        if (head == head_fw)
            head_strand = FW;

        Strand tail_strand = RC;
        if (tail == tail_fw)
            tail_strand = FW;

        kmer_links[head].insert(node_strand(node_id, head_strand, LEFT));
        kmer_links[tail].insert(node_strand(node_id, tail_strand, RIGHT));
    }

    node_bank->close();
    delete node_bank;

    // put k back to its real value
    sizeKmer++;
}
Beispiel #2
0
// T can be Bloom, BloomCpt, BloomCpt3 or LinearCounter (just needs to support add(kmer_type) and possibly contains(kmer_type))
// U can be BloomCpt or BloomCpt3
//
// Scan every read of `Sequences` (after rewinding it) and add each of its
// k-mers to `bloom_to_insert`.
//
// Parameters:
//   Sequences        bank of reads; rewound before the scan.
//   bloom_to_insert  structure receiving the k-mers through add().
//   bloom_counter    optional filter: when non-NULL, a k-mer is inserted only
//                    if bloom_counter->contains_n_occ(kmer, nks) is true.
//   stderr_message   printf-style format used for the progress line printed
//                    every 10000 reads; it receives the character code 13 and
//                    the number of reads processed so far.
//                    NOTE(review): the format is expected to consume them as
//                    "%c" and "%lld" -- confirm against the callers.
//
// NOTE(review): T, U and readlen are not declared inside this block; they are
// assumed to be provided by the enclosing declarations -- confirm.
void bloom_pass_reads(Bank *Sequences, T *bloom_to_insert, U *bloom_counter, char *stderr_message)
{
    int64_t NbRead = 0;
    int64_t NbInsertedKmers = 0;
    char * rseq;
    long i;
    kmer_type kmer, graine, graine_revcomp;

    Sequences->rewind_all();

    while (Sequences->get_next_seq(&rseq,&readlen))
    {
        for (i=0; i<readlen-sizeKmer+1; i++)
        {
            kmer = extractKmerFromRead(rseq,i,&graine,&graine_revcomp);

            // discard kmers which are not solid
            if (bloom_counter != NULL && ! bloom_counter->contains_n_occ(kmer,nks))
                continue;

            bloom_to_insert->add(kmer);
            NbInsertedKmers++;
        }

        NbRead++;
        // int64_t is not guaranteed to be `long long`; cast explicitly, as done
        // for the summary line below, so the variadic argument has a known type
        if ((NbRead%10000)==0) fprintf (stderr,stderr_message,13,(long long)NbRead);
    }
    fprintf (stderr,"\nInserted %lld %s kmers in the bloom structure.\n",(long long)NbInsertedKmers,"(redundant)");

}
Beispiel #3
0
// Load the false positive k-mers file into a freshly allocated FPSet.
//
// The file named by return_file_name(false_positive_kmers_file) is opened as a
// Bank; countFP() provides the number of entries used to size the FPSet, then
// the first k-mer of every sequence is inserted. The set is finalized, a size
// summary is printed, and the set is returned (the caller owns it).
//
// NOTE(review): the second pass over the bank relies on countFP() leaving the
// bank positioned at its beginning -- confirm.
Set *load_false_positives() 
{
    int64_t NbInsertedKmers = 0;
    char * rseq;
    int readlen;
    kmer_type kmer, graine, graine_revcomp;

    Bank *FalsePositives = new Bank(return_file_name(false_positive_kmers_file));

    // alloc false positives with just the right estimated size
    uint64_t nbFP = countFP(FalsePositives);

    FPSet *fp = new FPSet(nbFP);

    while (FalsePositives->get_next_seq(&rseq,&readlen))
    {
        kmer = extractKmerFromRead(rseq,0,&graine,&graine_revcomp);

        fp->insert(kmer);

        NbInsertedKmers++;

        // explicit cast: int64_t is not necessarily `long long`, which is what %lld expects
        if ((NbInsertedKmers%table_print_frequency)==0) fprintf (stderr,"%cInsert false positive Kmers in hash table %lld",13,(long long)NbInsertedKmers);
    }
    fp->finalize(); // always call this when finishing to create a FPSet

    // the bank is only needed while filling the set: release the file and the object
    FalsePositives->close();
    delete FalsePositives;

    fprintf (stderr,"\nInserted %lld false positive kmers in the hash structure.\n\n",(long long)NbInsertedKmers);

    print_size_summary(fp);

    return fp;
}
Beispiel #4
0
 // Write the graph (edges, then node) for every sequence of `linear_seqs_name`.
 //
 // For each node, the (k-1)-mers at both extremities are looked up in
 // kmer_links and one edge is printed per compatible entry:
 //   - from the left extremity:  "RF" towards a LEFT entry on the other strand,
 //                               "RR" towards a RIGHT entry on the same strand;
 //   - from the right extremity: "FF" towards a LEFT entry on the same strand,
 //                               "FR" towards a RIGHT entry on the other strand.
 // A node whose length equals one (k-1)-mer is never linked to itself.
 // Node and edge ids start from first_id_els; the ids reached after the last
 // element are returned so that the caller knows where numbering stopped.
 id_els GraphOutput::construct_graph(string linear_seqs_name)
{
    Bank *node_bank = new Bank((char *)linear_seqs_name.c_str());
    id_els ids = first_id_els; // running ids of the next node / edge to print

    char *node_seq;
    int node_len;

    node_bank->rewind_all();

    // node extremities overlap by (k-1)-mers, so (k-1)-mers are what gets extracted
    sizeKmer--;

    // only out-edges are printed for a node; its in-edges are the out-edges of its neighbours
    while (node_bank->get_next_seq(&node_seq, &node_len))
    {
        kmer_type head, tail, head_fw, head_rc, tail_fw, tail_rc;

        head = extractKmerFromRead(node_seq, 0, &head_fw, &head_rc, false);
        tail = extractKmerFromRead(node_seq, node_len - sizeKmer, &tail_fw, &tail_rc, false);

        Strand head_strand = (head == head_fw) ? FW : RC;
        Strand tail_strand = (tail == tail_fw) ? FW : RC;

        // true when the whole node is a single (k-1)-mer: such a node would otherwise loop on itself
        const bool single_kmer_node = (node_len == sizeKmer);

        set<node_strand>::iterator link;

        // edges leaving through the left extremity (reverse-complement extensions)
        for (link = kmer_links[head].begin(); link != kmer_links[head].end(); link++)
        {
            long neighbour = link->node;

            if (single_kmer_node && neighbour == ids.node)
                continue;

            const bool on_left = (link->left_or_right == LEFT);
            const bool same_strand = (link->strand == head_strand);

            // valid pairs: LEFT entry on the other strand, or RIGHT entry on the same strand
            if (on_left == same_strand)
                continue;

            string label = on_left ? "RF" : "RR";

            print_edge(ids.edge, ids.node, neighbour, label);
            ids.edge++;
        }

        // edges leaving through the right extremity
        for (link = kmer_links[tail].begin(); link != kmer_links[tail].end(); link++)
        {
            long neighbour = link->node;

            if (single_kmer_node && neighbour == ids.node)
                continue;

            const bool on_left = (link->left_or_right == LEFT);
            const bool same_strand = (link->strand == tail_strand);

            // valid pairs: LEFT entry on the same strand, or RIGHT entry on the other strand
            if (on_left != same_strand)
                continue;

            string label = on_left ? "FF" : "FR";

            print_edge(ids.edge, ids.node, neighbour, label);
            ids.edge++;
        }

        // the node itself
        print_node(ids.node, node_seq);

        ids.node++;
    }

    // put k back to its real value
    sizeKmer++;

    node_bank->close();
    delete node_bank;

    return ids;
}
Beispiel #5
0
// Estimate the number of distinct k-mers in `Reads` with a LinearCounter of
// nbytes_memory * 8 bits.
//
// K-mers are added read after read. Every `stops` inserted k-mers the current
// count is extrapolated to the whole bank (count * estimated number of reads /
// reads seen so far); scanning stops once two successive extrapolations differ
// by less than 5%, or as soon as the counter reports it is no longer accurate.
// In the latter case the whole procedure restarts with twice the memory.
//
// Parameters:
//   nbytes_memory  size of the counter, in bytes.
//   Reads          bank of reads; rewound before the scan.
// Returns the extrapolated number of distinct k-mers (0 when the bank yields
// no read at all).
//
// NOTE(review): readlen is not declared inside this block; it is assumed to be
// provided by the enclosing file -- confirm.
uint64_t extrapolate_distinct_kmers_wrapped(unsigned long nbytes_memory, Bank *Reads)
{
    unsigned long size_linearCounter = nbytes_memory * 8L; // alloc 8 bits * nbytes for counting
    LinearCounter *linearCounter = new LinearCounter(size_linearCounter);
    int stops = 100000;

    // variant of bloom_pass_reads

    int64_t NbRead = 0;
    int64_t NbInsertedKmers = 0;
    Reads->rewind_all();
    char * rseq;
    long i;
    kmer_type kmer, graine, graine_revcomp;

    long nb_distinct_kmers = 0;
    long previous_nb_distinct_kmers = 0;
    uint64_t estimated_nb_reads = Reads->estimate_nb_reads();
    bool stop = false;

    while (Reads->get_next_seq(&rseq,&readlen))
    {
        if (stop)
            break;

        for (i=0; i<readlen-sizeKmer+1; i++)
        {
            kmer = extractKmerFromRead(rseq,i,&graine,&graine_revcomp);

            linearCounter->add(kmer);
            NbInsertedKmers++;

            if (NbInsertedKmers % stops == 0 && NbRead != 0)
            {
                previous_nb_distinct_kmers = nb_distinct_kmers;
                nb_distinct_kmers = linearCounter->count()*estimated_nb_reads/NbRead;

                // the following condition will grossly over-estimate the number of distinct kmers
                // I expect the correct result to be in the same order of magnitude
                // 5% error
                // the difference is kept as a long: truncating it to int would
                // give a meaningless value once estimates exceed the int range
                long delta = nb_distinct_kmers - previous_nb_distinct_kmers;
                if (delta < 0)
                    delta = -delta;
                if (delta < previous_nb_distinct_kmers/20)
                    stop = true;

                if (!linearCounter->is_accurate())
                    stop = true;
            }
        }
        NbRead++;
        if ((NbRead%10000)==0) fprintf (stderr,"%cExtrapolating number of distinct kmers %lld",13,(long long)NbRead);
    }

    if (NbRead == 0)
    {
        // nothing was read: there is nothing to extrapolate from, and the
        // division by NbRead below would be a division by zero
        delete linearCounter;
        printf("Linear estimation: ~%ld M distinct kmers are in the reads\n",0L);
        return 0;
    }

    if (!linearCounter->is_accurate())
    {
        // %lu: the argument is an unsigned long
        printf("Inaccurate estimation, restarting with %lu MB RAM\n",(2*nbytes_memory)/1024/1024);
        delete linearCounter;
        return extrapolate_distinct_kmers_wrapped(2*nbytes_memory, Reads);
    }
    nb_distinct_kmers = linearCounter->count()*estimated_nb_reads/NbRead; // this is a very rough estimation

    printf("Linear estimation: ~%ld M distinct kmers are in the reads\n",nb_distinct_kmers/1000000L);
    delete linearCounter;
    return nb_distinct_kmers;
}
Beispiel #6
0
// Partitioned exact k-mer counting, using `bloom_counter` as a solidity filter.
//
// Every k-mer of every read is tested with bloom_counter->contains_n_occ(kmer, nks);
// those that pass are added to a Hash16 table. Whenever the table holds more
// than max_kmer_per_part elements the partition is flushed (plain dump for the
// first partition, end_kmer_count_partition(false, ...) for the following
// ones), the read/write count files are swapped and the table is emptied.
// After the last read, end_kmer_count_partition(true, ...) handles the
// remaining content.
//
// Parameters:
//   Sequences      bank of reads; rewound before the scan.
//   bloom_counter  filter queried through contains_n_occ().
//   max_memory     memory budget; multiplied by 1024*1024 to size the table
//                  and the partitions.
//
// NOTE(review): T and readlen are not declared inside this block; they are
// assumed to be provided by the enclosing declarations -- confirm.
void exact_kmer_count(Bank *Sequences, T *bloom_counter, unsigned long max_memory)
{
    // the two count files alternate between the "read" and "write" roles
    F_kmercpt_read  = fopen("kmer_count","wb+");
    F_kmercpt_write = fopen("kmer_count2","wb+");

    Sequences->rewind_all();

    unsigned int max_kmer_per_part = max_memory*1024LL*1024LL /sizeof(cell<kmer_type>);

    int partition_index = 0;
    char *read_seq;
    int64_t reads_seen = 0;
    int64_t inserted_total = 0;
    int64_t inserted_unique = 0;
    kmer_type kmer, graine, graine_revcomp;

    // that code makes the table occupy full memory. should probably be reduced (but we're deprecating that procedure, right?)
    int table_bits = max( (int)ceilf(log2f((max_memory*1024L*1024L)/sizeof(cell_ptr_t))), 1);
    Hash16 *table = new Hash16(table_bits);

    // partitioned exact kmer counting based on Bloom filter for solidity:
    // the bloom filter enables membership test for a set S of supposedly solid kmers (contains false positives)
    // read the (redundant) kmers from the reads, and load only those in S, in chunks, into a hash table
    // at each pass, update a file containing the true count of non-redundant supposedly solid kmers (S)
    // at the end, analyze the file to keep only those with true count >= solid
    while (Sequences->get_next_seq(&read_seq,&readlen))
    {
        for (long pos = 0; pos < readlen-sizeKmer+1; pos++)
        {
            kmer = extractKmerFromRead(read_seq,pos,&graine,&graine_revcomp);

            // only kmers accepted by the filter are counted
            if (bloom_counter->contains_n_occ(kmer,nks))
            {
                inserted_unique += table->add(kmer);
                inserted_total++;

                if (table->nb_elem > max_kmer_per_part)
                {
                    // the partition is full: flush it
                    fprintf(stderr,"End of Kmer count partition  %lli / %i \n",(long long)(table->nb_elem),max_kmer_per_part);

                    if (partition_index != 0)
                        end_kmer_count_partition(false,table);
                    else
                        table->dump(F_kmercpt_write);

                    // what was just written becomes the input of the next partition
                    FILE *previous_write = F_kmercpt_write;
                    F_kmercpt_write = F_kmercpt_read;
                    F_kmercpt_read = previous_write;

                    table->empty_all();

                    partition_index++;
                }
            }
        }

        reads_seen++;
        if ((reads_seen%10000)==0) fprintf (stderr,"%cLoop through reads for exact kmer count %lld",13,(long long)reads_seen);
    }

    fprintf (stderr," \nTotal Inserted in hash (ie output of Bloom)  unique %lli   /  %lli  redundants \n",(long long)inserted_unique,(long long)inserted_total);

    // last partition
    end_kmer_count_partition(true,table);
    delete table;
}
Beispiel #7
0
// Build the cascading false-positive structure (Bloom filters B2, B3, B4 and
// the exact set T4) and return it.
//
// Steps, as performed below:
//   1. B2 receives the first k-mer of every sequence of the false positives file.
//   2. Every solid k-mer found in B2 is written to the "t2_kmers" file and added to B3.
//   3. Every false positive found in B3 is added to B4.
//   4. The k-mers of the "t2_kmers" file found in B4 are counted, then inserted
//      in an FPSet of exactly that size (T4).
// Progress is reported on stderr, per-step totals on stdout, and a size summary
// is printed before returning. The caller owns the returned structure.
//
// The process exits with status 1 if the "t2_kmers" file cannot be opened or
// written.
Set *load_false_positives_cascading4()
{
  int64_t NbInsertedKmers;
  char * rseq;
  int readlen;
  kmer_type kmer, graine, graine_revcomp;


  // **** Initialize B2, B3, B4 and T4 ****
  Bank *FalsePositives = new Bank(return_file_name(false_positive_kmers_file));
  uint64_t nbFP = countFP(FalsePositives);

  FPSetCascading4 *fp = new FPSetCascading4;

  fp->bloom2 = new Bloom((uint64_t)(nbFP * NBITS_PER_KMER));
  fp->bloom2->set_number_of_hash_func((int)floorf(0.7*NBITS_PER_KMER));

  uint64_t estimated_T2_size = max((int)ceilf(nbkmers_solid * (double)powf((double)0.62, (double)NBITS_PER_KMER)), 1);
  uint64_t estimated_T3_size = max((int)ceilf(nbFP          * (double)powf((double)0.62, (double)NBITS_PER_KMER)) ,1);

  fp->bloom3 = new Bloom((uint64_t)(estimated_T2_size * NBITS_PER_KMER));
  fp->bloom3->set_number_of_hash_func((int)floorf(0.7*NBITS_PER_KMER));

  fp->bloom4 = new Bloom((uint64_t)(estimated_T3_size * NBITS_PER_KMER));
  fp->bloom4->set_number_of_hash_func((int)floorf(0.7*NBITS_PER_KMER));


  // **** Insert the false positives in B2 ****
  // (all counters printed with %lld / %llu below are cast explicitly, since
  // int64_t / uint64_t are not necessarily the `long long` types)
  NbInsertedKmers = 0;
  while (FalsePositives->get_next_seq(&rseq,&readlen))
  {
    kmer = extractKmerFromRead(rseq,0,&graine,&graine_revcomp);

    fp->bloom2->add(kmer);

    NbInsertedKmers++;
    if ((NbInsertedKmers%table_print_frequency)==0)
      fprintf (stderr,"%cInsert false positive B2 %lld",13,(long long)NbInsertedKmers);
  }
  fprintf (stderr,"%cInsert false positive B2 %lld", 13,(long long)NbInsertedKmers);
  FalsePositives->close();
  delete FalsePositives; // a new Bank is created for the B4 pass

  printf("\nInserted %lld (estimated, %lld) kmers in B2.\n", (long long)NbInsertedKmers, (long long)nbFP);


  //  **** Insert false positives in B3 and write T2
  // 64-bit counter: it is printed with %lld, which an int would not match
  int64_t addKmers = 0;
  NbInsertedKmers = 0;
  FILE *T2_file = fopen(return_file_name("t2_kmers"), "w+"); // We will read this file later, when filling T4
  if (T2_file == NULL)
  {
    printf("error: can't open the t2_kmers file\n");
    exit(1);
  }
  // NOTE(review): this BinaryBank is closed below but never deleted -- confirm
  // whether deleting it after close() is safe before adding it
  BinaryBank *SolidKmers = new BinaryBank(return_file_name(solid_kmers_file),sizeof(kmer),0);
  while(SolidKmers->read_element(&kmer))
  {
    if (fp->bloom2->contains(kmer))
    {
      if (!fwrite(&kmer, sizeof(kmer), 1, T2_file))
      {
        printf("error: can't fwrite (disk full?)\n");
        exit(1);
      }

      fp->bloom3->add(kmer);
      addKmers++;
    }

    NbInsertedKmers++;
    if ((NbInsertedKmers% table_print_frequency)==0)
      fprintf (stderr,"%cInsert false positive B3 %lld",13,(long long)NbInsertedKmers);
  }
  fprintf (stderr,"%cInsert false positive B3 %lld",13,(long long)NbInsertedKmers);
  SolidKmers->close();

  printf("\nInserted %lld (estimated, %llu) kmers in B3.\n", (long long)addKmers, (unsigned long long)estimated_T2_size);


  // **** Insert false positives in B4 (we could write T3, but it's not necessary)
  FalsePositives = new Bank(return_file_name(false_positive_kmers_file));
  NbInsertedKmers = 0;
  addKmers = 0;
  while (FalsePositives->get_next_seq(&rseq,&readlen))
  {
    kmer = extractKmerFromRead(rseq,0,&graine,&graine_revcomp);

    if (fp->bloom3->contains(kmer))
    {
      fp->bloom4->add(kmer);
      addKmers++;
    }

    NbInsertedKmers++;
    if ((NbInsertedKmers%table_print_frequency)==0)
      fprintf (stderr,"%cInsert false positive B4 %lld",13,(long long)NbInsertedKmers);
  }
  fprintf (stderr,"%cInsert false positive B4 %lld", 13,(long long)NbInsertedKmers);
  FalsePositives->close();
  delete FalsePositives;

  printf("\nInserted %lld (estimated, %lld) kmers in B4.\n", (long long)addKmers, (long long)estimated_T3_size);


  // **** Count and insert false positives in T4
  // first pass over T2: count, so that the FPSet is allocated with the exact size
  rewind(T2_file);
  addKmers = 0;
  while (fread(&kmer, sizeof(kmer), 1, T2_file))
    if (fp->bloom4->contains(kmer))
      addKmers++;

  fp->false_positives = new FPSet((uint64_t)addKmers);

  // second pass over T2: actual insertion
  rewind(T2_file);
  addKmers = 0;
  NbInsertedKmers = 0;
  while (fread(&kmer, sizeof(kmer), 1, T2_file))
  {
    if (fp->bloom4->contains(kmer))
    {
      fp->false_positives->insert(kmer);
      addKmers++;
    }

    NbInsertedKmers++;
    if ((NbInsertedKmers%table_print_frequency)==0)
      fprintf (stderr,"%cInsert false positive T4 %lld",13,(long long)NbInsertedKmers);
  }
  fp->false_positives->finalize();
  fprintf (stderr,"%cInsert false positive T4 %lld", 13,(long long)NbInsertedKmers);
  fclose(T2_file);

  printf("\nInserted %lld (estimated, %lld) kmers in T4.\n\n", (long long)addKmers, (long long)fp->false_positives->capacity());

  print_size_summary(fp);

  return fp;
}
// Assemble contigs and write them to the assembly file.
//
// Outline of what the function does:
//   - allocates the traversal buffers and opens the assembly output file and
//     the solid k-mers bank;
//   - builds a BranchingTerminator and a traversal object (SimplePathsTraversal
//     when UNITIG is defined, MonumentTraversal otherwise);
//   - for every k-mer returned by terminator->next(), repeatedly asks the
//     traversal for a starting k-mer, extends it in both directions, and
//     writes the resulting contig when it is at least MIN_CONTIG_SIZE long;
//   - prints summary statistics on stderr, frees the buffers and closes the
//     solid k-mers bank.
//
// NOTE(review): the result of fopen() is not checked, and terminator, traversal
// and SolidKmers are not deleted in this function.
inline void assemble()
{

    //////-------------------------------------------------------------------------------------------
    fprintf (stderr,"______________________________________________________ \n");
    fprintf (stderr,"___________ Assemble from bloom filter _______________ \n");
    fprintf (stderr,"______________________________________________________ \n\n");

    //////-------------------------------------------------------------------------------------------


    long long len_left = 0;
    long long len_right = 0;
    long long contig_len =0;
    // size of each traversal buffer; also handed to the traversal through set_maxlen() below
    long long maxlen=10000000;

    char *left_traversal  = (char *) malloc(maxlen*sizeof(char));
    char *right_traversal = (char *) malloc(maxlen*sizeof(char));
    // sized to hold both traversals plus the starting k-mer
    char *contig          = (char *) malloc(2*(maxlen+sizeKmer)*sizeof(char));
    kmer_type kmer;

    // counters used for the progress line and the final summary
    long long nbContig =0;
    long long nbSmallContig =0;
    long long totalnt=0;
    long long max_contig_len=0;
    long long mlenleft=0,mlenright=0;
    int64_t NbBranchingKmer=0;
    // receives the nucleotide string of the starting k-mer (filled by code2seq)
    char kmer_seq[sizeKmer+1];
    FILE * file_assembly = fopen(return_file_name(assembly_file),"w+");

    BinaryBank *SolidKmers = new BinaryBank(return_file_name(solid_kmers_file),sizeof(kmer_type),0);

    STARTWALL(assembly);

    char *assemble_only_one_region = NULL; // debugging, set to a ASCII kmer to activate, NULL to deactivate
    bool LOAD_BRANCHING_KMERS=false; // debugging
    bool DUMP_BRANCHING_KMERS=false;
   
    BranchingTerminator *terminator;

    // with the flags above left to false, the terminator is built from the solid
    // k-mers (else branch) and nothing is dumped
    if (LOAD_BRANCHING_KMERS)
    {
        BinaryBank *BranchingKmers = new BinaryBank(return_file_name(branching_kmers_file),sizeof(kmer_type),false);
        terminator = new BranchingTerminator(BranchingKmers,SolidKmers, bloo1,false_positives);
        BranchingKmers->close();
    }
    else
        terminator = new BranchingTerminator(SolidKmers,genome_size, bloo1,false_positives);

    if (DUMP_BRANCHING_KMERS)
    {
        BinaryBank *BranchingKmers = new BinaryBank(return_file_name(branching_kmers_file),sizeof(kmer_type),true);
        terminator->dump_branching_kmers(BranchingKmers);
        BranchingKmers->close();
    }

    // the traversal strategy is selected at compile time
#ifdef UNITIG
    SimplePathsTraversal *traversal = new SimplePathsTraversal(bloo1,false_positives,terminator);
    fprintf (stderr,"_________________Assembling in Unitig mode ..._____________________ \n\n");
#else
    MonumentTraversal *traversal = new MonumentTraversal(bloo1,false_positives,terminator);
#endif
    //RandomBranchingTraversal *traversal = new RandomBranchingTraversal(bloo1,false_positives,terminator);
    traversal->set_maxlen(maxlen);
    traversal->set_max_depth(500);
    traversal->set_max_breadth(20);
    
    // outer loop: one iteration per k-mer provided by the terminator
    while (terminator->next(&kmer))
    {
        // keep looping while a starting kmer is available from this kmer
        // everything will be marked during the traversal()'s
		kmer_type starting_kmer;
#ifdef UNITIG
        while (traversal->get_new_starting_node_improved(kmer,starting_kmer))
#else
        while (traversal->find_starting_kmer(kmer,starting_kmer))
#endif
		{
		    code2seq(starting_kmer,kmer_seq); // convert starting kmer to nucleotide seq
            traversal->revert_stats(); // set stats from the last commit (discard stats from find_starting_kmer / small contigs)

            // debugging only: override the starting k-mer with the one given as ASCII
            // (note that kmer_seq was already computed from the previous starting k-mer)
            if (assemble_only_one_region != NULL)
            {
                kmer_type dummy;
                starting_kmer = extractKmerFromRead(assemble_only_one_region,0,&kmer,&dummy,false);
            }

            // right extension
            len_right = traversal->traverse(starting_kmer,right_traversal,0);
            mlenright= max(len_right,mlenright);

            // left extension, is equivalent to right extension of the revcomp
            len_left = traversal->traverse(starting_kmer,left_traversal,1);
            mlenleft= max(len_left,mlenleft);

            // form the contig
            // NOTE(review): strcpy/strcat rely on both traversal buffers being
            // NUL-terminated by traverse() -- confirm
            revcomp_sequence(left_traversal,len_left);
            strcpy(contig,left_traversal); // contig = revcomp(left_traversal)
	        strcat(contig,kmer_seq);//               + starting_kmer
            strcat(contig,right_traversal);//           + right_traversal

            contig_len=len_left+len_right+sizeKmer;

            // save the contig
            if(contig_len >= MIN_CONTIG_SIZE)
            {
                // record written as a header line ">index__len__length" followed by the sequence
                max_contig_len = max(max_contig_len,contig_len);
                fprintf(file_assembly,">%lli__len__%lli \n",nbContig,contig_len);
                fprintf(file_assembly,"%s\n",contig);
                nbContig++;
                totalnt+=contig_len;
                traversal->commit_stats();
            }
            else
            {
                // too short: the contig is not written and its stats are discarded
                traversal->revert_stats();
                nbSmallContig++;
            }
            if (assemble_only_one_region != NULL)
                break;
        }
    
        // progress line, refreshed every 300 k-mers taken from the terminator
        NbBranchingKmer++;
        if ((NbBranchingKmer%300)==0) fprintf (stderr,"%cLooping through branching kmer n° %lld / %lld  total nt   %lld   ",13,(long long int) NbBranchingKmer,(long long int) terminator->nb_branching_kmers, (long long int)totalnt );

        // debugging only: stop as soon as one contig was produced for the requested region
        if (nbContig > 0 && assemble_only_one_region != NULL)
            break;

    }
    fclose(file_assembly);

    // final summary on stderr
    fprintf (stderr,"\n Total nt assembled  %lli  nbContig %lli\n",totalnt,nbContig);
    fprintf (stderr," Max contig len  %lli (debug: max len left %lli, max len right %lli)\n",max_contig_len,mlenleft,mlenright);
    fprintf (stderr,"\n Debug traversal stats: %ld ends of contigs (%lld unsaved small contigs), among them:\n",traversal->final_stats.ended_traversals,nbSmallContig);
    fprintf (stderr," %ld couldn't validate consensuses\n",traversal->final_stats.couldnt_validate_consensuses);
    fprintf (stderr," %ld large bubble breadth, %ld large bubble depth, %ld marked kmer, %ld no extension\n",traversal->final_stats.couldnt_traverse_bubble_breadth,traversal->final_stats.couldnt_traverse_bubble_depth,traversal->final_stats.couldnt_because_marked_kmer,traversal->final_stats.couldnt_find_extension);
    fprintf (stderr," %ld in-branchin large depth, %ld in-branching large breadth, %ld in-branching other\n",traversal->final_stats.couldnt_inbranching_depth,traversal->final_stats.couldnt_inbranching_breadth,traversal->final_stats.couldnt_inbranching_other);
    
    STOPWALL(assembly,"Assembly");

    free(left_traversal);
    free(right_traversal);
    free(contig);
    SolidKmers->close();
}