/**
 * Rebuild kmer_links from the sequences stored in a node file.
 *
 * Every sequence read from the bank is treated as one graph node, numbered
 * consecutively starting at first_id_els.node. For each node, the (k-1)-mer
 * extracted at its first position and the one extracted at its last possible
 * position are registered in kmer_links together with the node id, the strand
 * on which the k-mer was seen, and the side (LEFT / RIGHT) it came from.
 *
 * The global sizeKmer is decremented for the duration of the scan and put
 * back to its original value before returning.
 *
 * @param linear_seqs_name  path of the file holding the node sequences
 */
void GraphOutput::load_nodes_extremities(string linear_seqs_name)
{
    // Forget any links recorded by a previous call.
    kmer_links.clear();

    Bank *node_bank = new Bank((char *)linear_seqs_name.c_str());
    long node_id = first_id_els.node;

    char *seq;
    int seq_len;

    // Node extremities overlap by (k-1)-mers, so extraction is done with k-1.
    sizeKmer--;

    while (node_bank->get_next_seq(&seq, &seq_len))
    {
        kmer_type head_kmer, head_fw, head_rc;
        kmer_type tail_kmer, tail_fw, tail_rc;

        head_kmer = extractKmerFromRead(seq, 0, &head_fw, &head_rc, false);
        tail_kmer = extractKmerFromRead(seq, seq_len - sizeKmer, &tail_fw, &tail_rc, false);

        // The strand is FW when the returned k-mer is the forward one, RC otherwise.
        Strand head_strand;
        if (head_kmer == head_fw)
            head_strand = FW;
        else
            head_strand = RC;

        Strand tail_strand;
        if (tail_kmer == tail_fw)
            tail_strand = FW;
        else
            tail_strand = RC;

        kmer_links[head_kmer].insert(node_strand(node_id, head_strand, LEFT));
        kmer_links[tail_kmer].insert(node_strand(node_id, tail_strand, RIGHT));

        node_id++;
    }

    node_bank->close();
    delete node_bank;

    // Put k back to its original value.
    sizeKmer++;
}
/**
 * Load the false-positive k-mers from disk into a freshly allocated FPSet.
 *
 * The bank named by false_positive_kmers_file is opened, its size is obtained
 * through countFP() and used as the FPSet capacity, then every sequence of the
 * bank is converted to a k-mer (extracted at position 0) and inserted.
 * Progress is written to stderr every table_print_frequency insertions.
 *
 * The bank is closed and freed before returning; ownership of the returned
 * set passes to the caller.
 *
 * @return the finalized set of false-positive k-mers
 */
Set *load_false_positives()
{
    int64_t NbInsertedKmers = 0;
    char *rseq;
    int readlen;
    kmer_type kmer, graine, graine_revcomp;

    Bank *FalsePositives = new Bank(return_file_name(false_positive_kmers_file));

    // Allocate the set with the size reported by countFP().
    // NOTE(review): the loop below reads the bank from its start, so countFP()
    // presumably leaves the bank rewound -- confirm.
    uint64_t nbFP = countFP(FalsePositives);
    FPSet *fp = new FPSet(nbFP);

    while (FalsePositives->get_next_seq(&rseq, &readlen))
    {
        kmer = extractKmerFromRead(rseq, 0, &graine, &graine_revcomp);

        fp->insert(kmer);
        NbInsertedKmers++;

        // 13 is '\r': the progress line overwrites itself on the terminal.
        if ((NbInsertedKmers % table_print_frequency) == 0)
            fprintf(stderr, "%cInsert false positive Kmers in hash table %lld", 13, (long long)NbInsertedKmers);
    }
    fp->finalize(); // always call this when finishing to create a FPSet

    // The bank is no longer needed: release its file handle and memory
    // (previously leaked).
    FalsePositives->close();
    delete FalsePositives;

    fprintf(stderr, "\nInserted %lld false positive kmers in the hash structure.\n\n", (long long)NbInsertedKmers);

    print_size_summary(fp);

    return fp;
}
/**
 * Emit the nodes and edges of the graph described by a node file.
 *
 * The file is scanned once. For each sequence (one node), the (k-1)-mers at
 * its two extremities are looked up in kmer_links and one edge is printed for
 * every compatible (node, strand, side) entry found there; the node itself is
 * printed afterwards. Only out-edges are written for a node: its in-edges are
 * the out-edges of its neighbours.
 *
 * Edge labels are two letters:
 *   - left extremity  : "RF" (other node's LEFT side, opposite strand)
 *                       "RR" (other node's RIGHT side, same strand)
 *   - right extremity : "FF" (other node's LEFT side, same strand)
 *                       "FR" (other node's RIGHT side, opposite strand)
 * All other combinations produce no edge.
 *
 * The global sizeKmer is decremented during the scan and restored afterwards.
 *
 * @param linear_seqs_name  path of the file holding the node sequences
 * @return the node / edge counters after the last element written, so that a
 *         caller can know the last ids used
 */
id_els GraphOutput::construct_graph(string linear_seqs_name)
{
    Bank *node_bank = new Bank((char *)linear_seqs_name.c_str());

    // Running ids of the elements written to the graph output.
    id_els ids = first_id_els;

    char *seq;
    int seq_len;

    node_bank->rewind_all();

    // Node extremities overlap by (k-1)-mers, so extraction is done with k-1.
    sizeKmer--;

    while (node_bank->get_next_seq(&seq, &seq_len))
    {
        kmer_type head_kmer, head_fw, head_rc;
        kmer_type tail_kmer, tail_fw, tail_rc;

        head_kmer = extractKmerFromRead(seq, 0, &head_fw, &head_rc, false);
        tail_kmer = extractKmerFromRead(seq, seq_len - sizeKmer, &tail_fw, &tail_rc, false);

        Strand head_strand = (head_kmer == head_fw) ? FW : RC;
        Strand tail_strand = (tail_kmer == tail_fw) ? FW : RC;

        // ---- edges leaving through the left extremity (revcomp extensions) ----
        set<node_strand> &head_links = kmer_links[head_kmer];
        for (set<node_strand>::iterator link = head_links.begin(); link != head_links.end(); ++link)
        {
            long other_node = link->node;

            // A node whose whole sequence is a single (k-1)-mer must not link to itself.
            if (other_node == ids.node && seq_len == sizeKmer)
                continue;

            bool same_strand = (link->strand == head_strand);
            string label;
            if (link->left_or_right == LEFT)
            {
                if (same_strand)
                    continue;
                label = "RF";
            }
            else
            {
                if (!same_strand)
                    continue;
                label = "RR";
            }

            print_edge(ids.edge, ids.node, other_node, label);
            ids.edge++;
        }

        // ---- edges leaving through the right extremity ----
        set<node_strand> &tail_links = kmer_links[tail_kmer];
        for (set<node_strand>::iterator link = tail_links.begin(); link != tail_links.end(); ++link)
        {
            long other_node = link->node;

            // Same self-loop guard as for the left extremity.
            if (other_node == ids.node && seq_len == sizeKmer)
                continue;

            bool same_strand = (link->strand == tail_strand);
            string label;
            if (link->left_or_right == LEFT)
            {
                if (!same_strand)
                    continue;
                label = "FF";
            }
            else
            {
                if (same_strand)
                    continue;
                label = "FR";
            }

            print_edge(ids.edge, ids.node, other_node, label);
            ids.edge++;
        }

        // ---- the node itself ----
        print_node(ids.node, seq);
        ids.node++;
    }

    // Put k back to its original value.
    sizeKmer++;

    node_bank->close();
    delete node_bank;

    return ids;
}
/**
 * Build the four-level cascading false-positive structure (B2, B3, B4, T4).
 *
 * Stages, in order:
 *   1. B2: every k-mer of the false-positive bank is added to bloom2.
 *   2. B3 / T2: every solid k-mer that bloom2 reports as present is added to
 *      bloom3 and appended to the temporary file "t2_kmers".
 *   3. B4: every false-positive k-mer that bloom3 reports as present is added
 *      to bloom4.
 *   4. T4: every k-mer of the T2 file that bloom4 reports as present is
 *      inserted into the exact set fp->false_positives. The T2 file is read
 *      twice: once to count, once to insert.
 *
 * Progress is written to stderr every table_print_frequency elements and a
 * summary of each stage is written to stdout. The process exits with status 1
 * if the T2 file cannot be opened or written.
 *
 * @return the populated FPSetCascading4; ownership passes to the caller
 */
Set *load_false_positives_cascading4()
{
    int64_t NbInsertedKmers;
    char *rseq;
    int readlen;
    kmer_type kmer, graine, graine_revcomp;

    // **** Initialize B2, B3, B4 and T4 ****
    Bank *FalsePositives = new Bank(return_file_name(false_positive_kmers_file));
    uint64_t nbFP = countFP(FalsePositives);

    FPSetCascading4 *fp = new FPSetCascading4;

    fp->bloom2 = new Bloom((uint64_t)(nbFP * NBITS_PER_KMER));
    fp->bloom2->set_number_of_hash_func((int)floorf(0.7*NBITS_PER_KMER));

    // Expected sizes of T2 and T3; each is at least 1 so the Bloom filters
    // built from them are never sized 0.
    uint64_t estimated_T2_size = max((int)ceilf(nbkmers_solid * (double)powf((double)0.62, (double)NBITS_PER_KMER)), 1);
    uint64_t estimated_T3_size = max((int)ceilf(nbFP * (double)powf((double)0.62, (double)NBITS_PER_KMER)), 1);

    fp->bloom3 = new Bloom((uint64_t)(estimated_T2_size * NBITS_PER_KMER));
    fp->bloom3->set_number_of_hash_func((int)floorf(0.7*NBITS_PER_KMER));

    fp->bloom4 = new Bloom((uint64_t)(estimated_T3_size * NBITS_PER_KMER));
    fp->bloom4->set_number_of_hash_func((int)floorf(0.7*NBITS_PER_KMER));

    // **** Insert the false positives in B2 ****
    NbInsertedKmers = 0;
    while (FalsePositives->get_next_seq(&rseq, &readlen))
    {
        kmer = extractKmerFromRead(rseq, 0, &graine, &graine_revcomp);

        fp->bloom2->add(kmer);

        NbInsertedKmers++;
        // 13 is '\r': the progress line overwrites itself on the terminal.
        if ((NbInsertedKmers % table_print_frequency) == 0)
            fprintf(stderr, "%cInsert false positive B2 %lld", 13, (long long)NbInsertedKmers);
    }
    fprintf(stderr, "%cInsert false positive B2 %lld", 13, (long long)NbInsertedKmers);
    FalsePositives->close();
    // Free the bank: the pointer is reassigned for the B4 pass below, so
    // without this the first instance was leaked.
    delete FalsePositives;
    FalsePositives = NULL;

    printf("\nInserted %lld (estimated, %llu) kmers in B2.\n", (long long)NbInsertedKmers, (unsigned long long)nbFP);

    // **** Insert false positives in B3 and write T2
    // 64-bit counter: it is printed with %lld and may exceed the range of int.
    int64_t addKmers = 0;
    NbInsertedKmers = 0;

    FILE *T2_file = fopen(return_file_name("t2_kmers"), "w+"); // We will read this file later, when filling T4
    if (T2_file == NULL)
    {
        printf("error: can't open the T2 kmers file for writing\n");
        exit(1);
    }

    BinaryBank *SolidKmers = new BinaryBank(return_file_name(solid_kmers_file), sizeof(kmer), 0);
    while (SolidKmers->read_element(&kmer))
    {
        if (fp->bloom2->contains(kmer))
        {
            if (!fwrite(&kmer, sizeof(kmer), 1, T2_file))
            {
                printf("error: can't fwrite (disk full?)\n");
                exit(1);
            }

            fp->bloom3->add(kmer);
            addKmers++;
        }

        NbInsertedKmers++;
        if ((NbInsertedKmers % table_print_frequency) == 0)
            fprintf(stderr, "%cInsert false positive B3 %lld", 13, (long long)NbInsertedKmers);
    }
    fprintf(stderr, "%cInsert false positive B3 %lld", 13, (long long)NbInsertedKmers);
    SolidKmers->close();
    // NOTE(review): SolidKmers is never deleted. Whether BinaryBank may be
    // destroyed right after close() is not visible from here -- confirm and
    // add `delete SolidKmers;` if it is safe.

    printf("\nInserted %lld (estimated, %llu) kmers in B3.\n", (long long)addKmers, (unsigned long long)estimated_T2_size);

    // **** Insert false positives in B4 (we could write T3, but it's not necessary)
    FalsePositives = new Bank(return_file_name(false_positive_kmers_file));
    NbInsertedKmers = 0;
    addKmers = 0;
    while (FalsePositives->get_next_seq(&rseq, &readlen))
    {
        kmer = extractKmerFromRead(rseq, 0, &graine, &graine_revcomp);

        if (fp->bloom3->contains(kmer))
        {
            fp->bloom4->add(kmer);
            addKmers++;
        }

        NbInsertedKmers++;
        if ((NbInsertedKmers % table_print_frequency) == 0)
            fprintf(stderr, "%cInsert false positive B4 %lld", 13, (long long)NbInsertedKmers);
    }
    fprintf(stderr, "%cInsert false positive B4 %lld", 13, (long long)NbInsertedKmers);
    FalsePositives->close();
    delete FalsePositives;
    FalsePositives = NULL;

    printf("\nInserted %lld (estimated, %llu) kmers in B4.\n", (long long)addKmers, (unsigned long long)estimated_T3_size);

    // **** Count and insert false positives in T4
    // First pass over T2: count the entries that will go into T4 so the set
    // can be allocated with that size.
    rewind(T2_file);
    addKmers = 0;
    while (fread(&kmer, sizeof(kmer), 1, T2_file))
        if (fp->bloom4->contains(kmer))
            addKmers++;

    fp->false_positives = new FPSet(addKmers);

    // Second pass over T2: actually insert them.
    rewind(T2_file);
    addKmers = 0;
    NbInsertedKmers = 0;
    while (fread(&kmer, sizeof(kmer), 1, T2_file))
    {
        if (fp->bloom4->contains(kmer))
        {
            fp->false_positives->insert(kmer);
            addKmers++;
        }

        NbInsertedKmers++;
        if ((NbInsertedKmers % table_print_frequency) == 0)
            fprintf(stderr, "%cInsert false positive T4 %lld", 13, (long long)NbInsertedKmers);
    }
    fp->false_positives->finalize();
    fprintf(stderr, "%cInsert false positive T4 %lld", 13, (long long)NbInsertedKmers);
    fclose(T2_file);

    printf("\nInserted %lld (estimated, %llu) kmers in T4.\n\n", (long long)addKmers, (unsigned long long)fp->false_positives->capacity());

    print_size_summary(fp);

    return fp;
}