예제 #1
0
void GraphOutput::load_nodes_extremities(string linear_seqs_name)
{
  kmer_links.clear(); // PIERRE: reset previous stored kmer links

    Bank *Nodes = new Bank((char *)linear_seqs_name.c_str());
    long nb_nodes = first_id_els.node; //PIERRE;
    char * rseq;
    int readlen;

    sizeKmer--; // nodes extremities overlap by (k-1)-mers, so let's extract (k-1)-mers

    while (Nodes->get_next_seq(&rseq,&readlen))
    {
        kmer_type left_kmer, right_kmer, left_kmer_fw, left_kmer_rc, right_kmer_fw, right_kmer_rc;
        left_kmer = extractKmerFromRead(rseq,0,&left_kmer_fw,&left_kmer_rc, false);
        right_kmer = extractKmerFromRead(rseq,readlen-sizeKmer,&right_kmer_fw,&right_kmer_rc, false);
        Strand left_strand = (left_kmer == left_kmer_fw)?FW:RC;
        Strand right_strand = (right_kmer == right_kmer_fw)?FW:RC;

        kmer_links[left_kmer].insert(node_strand(nb_nodes, left_strand, LEFT));
        kmer_links[right_kmer].insert(node_strand(nb_nodes, right_strand, RIGHT));

        nb_nodes++;
    }
    Nodes->close();
    delete Nodes;

    sizeKmer++; // make sure to restore k
}
예제 #2
0
Set *load_false_positives() 
{
    int64_t NbInsertedKmers = 0;
    char * rseq;
    int readlen;
    kmer_type kmer, graine, graine_revcomp;

    Bank *FalsePositives = new Bank(return_file_name(false_positive_kmers_file));


    // alloc false positives with the just the right estimated size

    uint64_t nbFP = countFP(FalsePositives);

    FPSet *fp = new FPSet(nbFP);
    
    while (FalsePositives->get_next_seq(&rseq,&readlen))
    {
        kmer = extractKmerFromRead(rseq,0,&graine,&graine_revcomp);
                
        fp->insert(kmer);

        NbInsertedKmers++;

        if ((NbInsertedKmers%table_print_frequency)==0) fprintf (stderr,(char*)"%cInsert false positive Kmers in hash table %lld",13,NbInsertedKmers);
    }
    fp->finalize(); // always call this when finishing to create a FPSet

    fprintf (stderr,"\nInserted %lld false positive kmers in the hash structure.\n\n",NbInsertedKmers);

    print_size_summary(fp);

    return fp;
}
예제 #3
0
 id_els GraphOutput::construct_graph(string linear_seqs_name) // PIERRE: i need to know the last nb_nodes
{
    Bank *Nodes = new Bank((char *)linear_seqs_name.c_str());
    id_els nb_els = first_id_els; //Alexan: stucture for print id elements in graph output

    char * rseq;
    int readlen;

    Nodes->rewind_all();

    sizeKmer--; // nodes extremities overlap by (k-1)-mers, so let's extract (k-1)-mers

    // for each node, output all the out-edges (in-edges will correspond to out-edges of neighbors)
    while (Nodes->get_next_seq(&rseq,&readlen))
    {
	
        kmer_type left_kmer, right_kmer, left_kmer_fw, left_kmer_rc, right_kmer_fw, right_kmer_rc;
        set<node_strand>::iterator it;

        left_kmer = extractKmerFromRead(rseq,0,&left_kmer_fw,&left_kmer_rc, false);
        right_kmer = extractKmerFromRead(rseq,readlen-sizeKmer,&right_kmer_fw,&right_kmer_rc, false);
        Strand left_strand = (left_kmer == left_kmer_fw)?FW:RC;
        Strand right_strand = (right_kmer == right_kmer_fw)?FW:RC;


        // left edges (are revcomp extensions)
        for (it = kmer_links[left_kmer].begin(); it != kmer_links[left_kmer].end(); it++)
        {
            long cur_node = it->node;
            Strand cur_strand = it->strand;
            LeftOrRight cur_left_or_right = it->left_or_right;

            if (cur_node ==nb_els.node) // prevent self loops on same kmer
                 if (readlen == sizeKmer)
                    continue;
            
            string label = "R";

            if (cur_left_or_right == LEFT)
            {
                if (cur_strand != left_strand)
                    label+=(string)"F";
                else
                    continue;
            }
            else
            {
                if (cur_strand == left_strand)
                    label+=(string)"R";
                else
                    continue;
            }


            print_edge(nb_els.edge, nb_els.node,cur_node,label);
	        nb_els.edge++; 
        }

        // right edges
        for (it = kmer_links[right_kmer].begin(); it != kmer_links[right_kmer].end(); it++)
        {
            long cur_node = it->node;
            Strand cur_strand = it->strand;
            LeftOrRight cur_left_or_right = it->left_or_right;

            if (cur_node == nb_els.node) // prevent self loops on same kmer
                 if (readlen == sizeKmer)
                    continue;
           
            string label = "F";

            if (cur_left_or_right == LEFT)
            {
                if (cur_strand == right_strand)
                    label+=(string)"F";
                else
                    continue;
            }
            else
            {
                if (cur_strand != right_strand)
                    label+=(string)"R";
                else
                    continue;
            }

            print_edge(nb_els.edge, nb_els.node,cur_node,label);
	        nb_els.edge++;
        }

        //nodes
        print_node(nb_els.node, rseq);   

        nb_els.node++;
    }

    sizeKmer++; // make sure to restore k
    Nodes->close();
    delete Nodes;
    return nb_els;
}
예제 #4
0
Set *load_false_positives_cascading4()
{
  int64_t NbInsertedKmers;
  char * rseq;
  int readlen;
  kmer_type kmer, graine, graine_revcomp;

  
  // **** Initialize B2, B3, B4 and T4 ****
  Bank *FalsePositives = new Bank(return_file_name(false_positive_kmers_file));
  uint64_t nbFP = countFP(FalsePositives);
  
  FPSetCascading4 *fp = new FPSetCascading4;
  
  fp->bloom2 = new Bloom((uint64_t)(nbFP * NBITS_PER_KMER));
  fp->bloom2->set_number_of_hash_func((int)floorf(0.7*NBITS_PER_KMER));

  uint64_t estimated_T2_size = max((int)ceilf(nbkmers_solid * (double)powf((double)0.62, (double)NBITS_PER_KMER)), 1);
  uint64_t estimated_T3_size = max((int)ceilf(nbFP          * (double)powf((double)0.62, (double)NBITS_PER_KMER)) ,1);

  fp->bloom3 = new Bloom((uint64_t)(estimated_T2_size * NBITS_PER_KMER));
  fp->bloom3->set_number_of_hash_func((int)floorf(0.7*NBITS_PER_KMER));

  fp->bloom4 = new Bloom((uint64_t)(estimated_T3_size * NBITS_PER_KMER));
  fp->bloom4->set_number_of_hash_func((int)floorf(0.7*NBITS_PER_KMER));


  // **** Insert the false positives in B2 ****
  NbInsertedKmers = 0;
  while (FalsePositives->get_next_seq(&rseq,&readlen))
  {
    kmer = extractKmerFromRead(rseq,0,&graine,&graine_revcomp);
    
    fp->bloom2->add(kmer);
    
    NbInsertedKmers++;
    if ((NbInsertedKmers%table_print_frequency)==0)
      fprintf (stderr,"%cInsert false positive B2 %lld",13,NbInsertedKmers);
  }
  fprintf (stderr,"%cInsert false positive B2 %lld", 13,NbInsertedKmers);
  FalsePositives->close();

  printf("\nInserted %lld (estimated, %lld) kmers in B2.\n", NbInsertedKmers, nbFP);


  //  **** Insert false positives in B3 and write T2 
  int addKmers = 0;
  NbInsertedKmers = 0;
  FILE *T2_file = fopen(return_file_name("t2_kmers"), "w+"); // We will read this file later, when filling T4 
  BinaryBank *SolidKmers = new BinaryBank(return_file_name(solid_kmers_file),sizeof(kmer),0);
  while(SolidKmers->read_element(&kmer))
  {
    if (fp->bloom2->contains(kmer))
    {
      if (!fwrite(&kmer, sizeof(kmer), 1, T2_file))
      {
	printf("error: can't fwrite (disk full?)\n");
	exit(1);
      }

      fp->bloom3->add(kmer);
      addKmers++;
    }

    NbInsertedKmers++;
    if ((NbInsertedKmers% table_print_frequency)==0)
      fprintf (stderr,(char*)"%cInsert false positive B3 %lld",13,NbInsertedKmers);
  }
  fprintf (stderr,(char*)"%cInsert false positive B3 %lld",13,NbInsertedKmers);
  SolidKmers->close();

  printf("\nInserted %lld (estimated, %llu) kmers in B3.\n", addKmers, estimated_T2_size);

  
  // **** Insert false positives in B4 (we could write T3, but it's not necessary)
  FalsePositives = new Bank(return_file_name(false_positive_kmers_file));
  NbInsertedKmers = 0;
  addKmers = 0;
  while (FalsePositives->get_next_seq(&rseq,&readlen))
  {
    kmer = extractKmerFromRead(rseq,0,&graine,&graine_revcomp);
    
    if (fp->bloom3->contains(kmer))
    {
      fp->bloom4->add(kmer);
      addKmers++;
    }

    NbInsertedKmers++;
    if ((NbInsertedKmers%table_print_frequency)==0)
      fprintf (stderr,"%cInsert false positive B4 %lld",13,NbInsertedKmers);
  }
  fprintf (stderr,"%cInsert false positive B4 %lld", 13,NbInsertedKmers);
  FalsePositives->close();

  printf("\nInserted %lld (estimated, %lld) kmers in B4.\n", addKmers, estimated_T3_size);
  

  // **** Count and insert false positives in T4
  rewind(T2_file);
  addKmers = 0;
  while (fread(&kmer, sizeof(kmer), 1, T2_file))
    if (fp->bloom4->contains(kmer))
      addKmers++;

  fp->false_positives = new FPSet(addKmers);
  rewind(T2_file);
  addKmers = 0;
  NbInsertedKmers = 0;
  while (fread(&kmer, sizeof(kmer), 1, T2_file))
  {  
    if (fp->bloom4->contains(kmer))
    {
      fp->false_positives->insert(kmer);
      addKmers++;
    }

    NbInsertedKmers++;
    if ((NbInsertedKmers%table_print_frequency)==0)
      fprintf (stderr,"%cInsert false positive T4 %lld",13,NbInsertedKmers);
  }
  fp->false_positives->finalize();
  fprintf (stderr,"%cInsert false positive T4 %lld", 13,NbInsertedKmers);
  fclose(T2_file);

  printf("\nInserted %lld (estimated, %lld) kmers in T4.\n\n", addKmers, (uint64_t)fp->false_positives->capacity());
  
  print_size_summary(fp);

  return fp;
}