//in last partition : create solid kmers file, and load solid kmers in bloo1 bloom void end_kmer_count_partition(bool last_partition, Hash16 *hasht1) { int value; int cptk=0; int64_t nso=0; /////////////////////////begin write files rewind (F_kmercpt_read); rewind (F_kmercpt_write); #ifndef MINGW ftruncate(fileno(F_kmercpt_write), 0); //erase previous file #else // tempfix? fileno is not accepted by mingw fclose(F_kmercpt_write); F_kmercpt_write = fopen("kmer_count2","wb+"); #endif BinaryBank * SolidKmers = NULL; kmer_type graine; if (last_partition) SolidKmers = new BinaryBank(return_file_name(solid_kmers_file),sizeof(kmer_type),1); while(fread(&graine, sizeof(graine),1, F_kmercpt_read)){ fread(&cptk, sizeof(cptk), 1, F_kmercpt_read); hasht1->remove(graine,&value); // if graine is present, get value of graine and remove graine, else value=0 cptk += value; fwrite(&graine, sizeof(graine), 1, F_kmercpt_write); fwrite(&cptk, sizeof(cptk), 1, F_kmercpt_write); if (last_partition && cptk >= nks) // if last partition, also need to search for solid kmers in remaining of hasht1, so this is not enough: { SolidKmers->write_element(&graine); nso++; } } hasht1->dump(F_kmercpt_write); // dump remaining of hasht1 if (last_partition) { nso+=hasht1->getsolids(NULL,SolidKmers,nks); // get remaining solids of hasht1 fprintf(stderr,"nsolid kmers = %lli \n",(long long)nso); SolidKmers->close(); #ifndef MINGW ftruncate(fileno(F_kmercpt_read), 0); //erase previous file #else // tempfix? fileno is not accepted by mingw fclose(F_kmercpt_read); F_kmercpt_read = fopen("kmer_count2","wb+"); #endif } }
// Creates the main Bloom filter (bloo1) and fills it.
//
// Sizing:
//  - from_dump with a known nsolids: the size is derived from nsolids
//    (nsolids is stored in a config file; the number of solid kmers cannot be
//    computed precisely from the bloom file, imprecision of 0-7);
//  - otherwise the solid kmers file is opened to get the true number of solid
//    kmers, in order to precisely size the bloom filter.
//
// Filling:
//  - from_dump: the filter is loaded from the bloom dump file;
//  - otherwise: solid kmers are inserted by bloom_pass_reads_binary().
//
// bloom_counter: forwarded to bloom_pass_reads_binary() when not loading a dump.
// Returns the newly allocated Bloom filter.
Bloom *bloom_create_bloo1(T *bloom_counter, bool from_dump)
{
    // NULL unless the solid kmers file had to be opened for sizing; this lets
    // the bank be closed on every path (it used to stay open when from_dump
    // was set but nsolids was 0).
    BinaryBank *SolidKmers = NULL;

    if (from_dump && nsolids) // from dump and known number of solid kmers
    {
        estimated_BL1 = max( (int)ceilf(log2f(nsolids*NBITS_PER_KMER)), 1);
        estimated_BL1_freesize = (uint64_t)(nsolids*NBITS_PER_KMER);
    }
    else
    {
        // get true number of solid kmers, in order to precisely size the bloom filter
        SolidKmers = new BinaryBank(return_file_name(solid_kmers_file),sizeof(kmer_type),0);
        estimated_BL1 = max( (int)ceilf(log2f(SolidKmers->nb_elements()*NBITS_PER_KMER)), 1);
        estimated_BL1_freesize = (uint64_t)(SolidKmers->nb_elements()*NBITS_PER_KMER);
        // explicit cast: the argument must be a long long to match %lli
        printf("nelem %lli nbits %g \n",(long long)SolidKmers->nb_elements(),NBITS_PER_KMER);
    }

    printf("freesize %lli estimated_BL1_freesize  %0.1f MB of memory for the main Bloom structure (%g bits/kmer)\n",(long long)estimated_BL1_freesize,(estimated_BL1_freesize)/1024.0/1024.0/8.0,NBITS_PER_KMER);

    Bloom *bloo1;
#if CUSTOMSIZE
    bloo1 = new Bloom((uint64_t)estimated_BL1_freesize);
#else
    bloo1 = new Bloom(estimated_BL1);
#endif

    bloo1->set_number_of_hash_func((int)floorf(0.7*NBITS_PER_KMER));

    if (from_dump)
    {
        bloo1->load(return_file_name(bloom_file)); // just load the dump
    }
    else
    {
        // use the method reading SolidKmers binary file, was useful when
        // varying Bloom size (!= dumped size)
        bloom_pass_reads_binary(bloo1, bloom_counter, (char*)"%cInsert solid Kmers in Bloom %lld");
    }

    if (SolidKmers != NULL)
        SolidKmers->close();

    return bloo1;
}
// Reads every kmer of the solid kmers file and adds it to bloom_to_insert.
//
// bloom_to_insert: structure receiving the kmers through its add() method.
// bloom_counter:   not used by this function.
// stderr_message:  printf-style format used for the progress line; it is given
//                  a character (13, carriage return) and a long long count.
void bloom_pass_reads_binary(T *bloom_to_insert, BloomCpt *bloom_counter, char *stderr_message)
{
    fprintf(stderr,"binary pass \n");

    // read solid kmers from disk
    kmer_type current;
    BinaryBank *solid_bank = new BinaryBank(return_file_name(solid_kmers_file), sizeof(current), 0);

    // one counter is enough: each kmer read is also inserted
    int64_t inserted = 0;

    for (;;)
    {
        if (!solid_bank->read_element(&current))
            break;

        bloom_to_insert->add(current);
        ++inserted;

        // progress report every 10000 kmers
        if (inserted % 10000 == 0)
            fprintf(stderr, stderr_message, 13, (long long)inserted);
    }

    fprintf (stderr,"\nInserted %lld %s kmers in the bloom structure.\n",(long long)inserted,"solid");
    solid_bank->close();
}
int debloom(int order, int max_memory) { // read bloo1 from disk dump Bloom *bloo1 = bloom_create_bloo1((BloomCpt *)NULL); STARTWALL(pos); FILE * debloom_file = fopen(return_file_name("debloom"),"wb+"); FILE * debloom_file_2 = fopen(return_file_name("debloom2"),"wb+"); FILE * F_tmp; F_debloom_read = debloom_file; F_debloom_write = debloom_file_2; BinaryBank *SolidKmers = new BinaryBank(return_file_name(solid_kmers_file),sizeof(kmer_type),0); uint64_t cc=0; kmer_type new_graine, kmer; int nt; uint64_t NbSolidKmer =0; // write all positive extensions in disk file while (SolidKmers->read_element(&kmer)) { //8 right extensions (4F and 4R); left extensions are redundant by revcomplementation for(nt=0; nt<4; nt++) { int strand; for (strand = 0; strand < 2 ; strand++) { int current_strand = strand; new_graine = next_kmer(kmer,nt, ¤t_strand); if(bloo1->contains(new_graine)){ // extension is positive // maybe do more lax deblooming; if it's a dead-end, it's no big deal, don't pass it to the false positive test // what would have been needed if i decided to enable order>0 (but actually this won't happen): // - better estimate of structure size in the presence of order>0 deblooming if (order == 1) // this case just detects tips {printf("ORDER==1"); bool is_linked = false; for(int tip_nt=0; tip_nt<4; tip_nt++) { int new_strand = current_strand; kmer_type kmer_after_possible_tip = next_kmer(new_graine,tip_nt, &new_strand); if(bloo1->contains(kmer_after_possible_tip)) { is_linked = true; break; } } if (!is_linked) continue; // it's a tip, because it's linked to nothing } if (order > 1) // general case. 
should work for order = 1, but i coded an optimized version above { printf("ORDER>1"); Frontline frontline( new_graine, current_strand, bloo1, NULL, NULL, NULL); while (frontline.depth < order) { frontline.go_next_depth(); if (frontline.size() == 0) break; // don't allow a breadth too large anywqy if (frontline.size()> 10) break; } if (frontline.size() == 0) continue; // it's a deadend } if (!fwrite(&new_graine, sizeof(new_graine), 1, debloom_file)) { printf("error: can't fwrite (disk full?)\n"); exit(1); } cc++; } } } NbSolidKmer++; if ((NbSolidKmer%table_print_frequency)==0) fprintf (stderr,"%c Writing positive Bloom Kmers %lld",13,NbSolidKmer); } nbkmers_solid = NbSolidKmer; // GUS: it's global now fprintf(stderr,"\n%lli kmers written\n",cc); STOPWALL(pos,"Write all positive kmers"); STARTWALL(deb); double bl1tai = (double)bloo1->tai ; delete bloo1; // now that bloo1 is deleted, initialize hasht1 int NBITS_HT = max( (int)ceilf(log2f((0.1*max_memory*1024L*1024L)/sizeof(cell_ptr_t))), 1); // set hasht1 cells to occupy 0.1 * [as much mem as poss] hasht1 =new Hash16(NBITS_HT); //////////////////////////////////////////////////////////////// --find false positive, with hash table partitioning uint64_t max_kmer_per_part = (uint64_t) (0.8*max_memory*1024LL*1024LL /sizeof(cell<kmer_type>)); //adapter taille ht en fonction printf("%d partitions will be needed\n",(int)(nbkmers_solid/max_kmer_per_part)); NbSolidKmer =0; int numpart = 0; SolidKmers->rewind_all(); // deblooming: // read the list of (non-redundant) solid kmers and load it, in chunks, into a hash table // at each pass, check all the positive extensions and keep those which are not indicated, by the current chunk, as solid kmers // at the end, only the positive extensions which are not solid are kept while (SolidKmers->read_element(&kmer)) { hasht1->add(kmer); NbSolidKmer++; if ((NbSolidKmer%table_print_frequency)==0) fprintf (stderr,"%cBuild Hash table %lld",13,NbSolidKmer); if(hasht1->nb_elem 
>max_kmer_per_part) //end partition, find false positives { fprintf(stderr,"End of debloom partition %lli / %lld \n",hasht1->nb_elem,max_kmer_per_part); end_debloom_partition(false); //swap file pointers F_tmp = F_debloom_read; F_debloom_read = F_debloom_write; F_debloom_write = F_tmp; /////////end write files //reset hash table hasht1->empty_all(); fprintf(stderr,"\n%lli false positives written , partition %i \n",n_false_positives,numpart); numpart++; } ///end partition } fprintf(stderr,"Nb kmers stored in the bloom table %lld\n",nbkmers_solid); ///////////////////////// last partition, will write all the FP's to the good file end_debloom_partition(true); /////////end write files fprintf(stderr,"Total nb false positives stored in the Debloom hashtable %lli \n",n_false_positives); delete hasht1; STOPWALL(deb,"Debloom"); // GUS: will use to output summary later b1_size = (uint64_t) bl1tai; fclose(debloom_file); fclose(debloom_file_2); SolidKmers->close(); return 1; }
// Builds the cascading false-positive structure (Bloom filters B2, B3, B4 and
// the exact set T4) and returns it.
//
//  - B2 receives every kmer of the false positives file;
//  - B3 receives the solid kmers that B2 reports as present; those kmers are
//    also written to the temporary "t2_kmers" file (T2);
//  - B4 receives the false positives that B3 reports as present;
//  - T4 (fp->false_positives) receives the T2 kmers that B4 reports as present.
//
// Exits the process if the T2 file cannot be opened or written.
Set *load_false_positives_cascading4()
{
    int64_t NbInsertedKmers;
    char * rseq;
    int readlen;
    kmer_type kmer, graine, graine_revcomp;

    // **** Initialize B2, B3, B4 and T4 ****
    Bank *FalsePositives = new Bank(return_file_name(false_positive_kmers_file));
    uint64_t nbFP = countFP(FalsePositives);

    FPSetCascading4 *fp = new FPSetCascading4;

    fp->bloom2 = new Bloom((uint64_t)(nbFP * NBITS_PER_KMER));
    fp->bloom2->set_number_of_hash_func((int)floorf(0.7*NBITS_PER_KMER));

    uint64_t estimated_T2_size = max((int)ceilf(nbkmers_solid * (double)powf((double)0.62, (double)NBITS_PER_KMER)), 1);
    uint64_t estimated_T3_size = max((int)ceilf(nbFP * (double)powf((double)0.62, (double)NBITS_PER_KMER)) ,1);

    fp->bloom3 = new Bloom((uint64_t)(estimated_T2_size * NBITS_PER_KMER));
    fp->bloom3->set_number_of_hash_func((int)floorf(0.7*NBITS_PER_KMER));

    fp->bloom4 = new Bloom((uint64_t)(estimated_T3_size * NBITS_PER_KMER));
    fp->bloom4->set_number_of_hash_func((int)floorf(0.7*NBITS_PER_KMER));

    // **** Insert the false positives in B2 ****
    NbInsertedKmers = 0;
    while (FalsePositives->get_next_seq(&rseq,&readlen))
    {
        kmer = extractKmerFromRead(rseq,0,&graine,&graine_revcomp);

        fp->bloom2->add(kmer);

        NbInsertedKmers++;
        if ((NbInsertedKmers%table_print_frequency)==0)
            fprintf (stderr,"%cInsert false positive B2 %lld",13,(long long)NbInsertedKmers);
    }
    fprintf (stderr,"%cInsert false positive B2 %lld", 13,(long long)NbInsertedKmers);
    FalsePositives->close();

    printf("\nInserted %lld (estimated, %lld) kmers in B2.\n", (long long)NbInsertedKmers, (long long)nbFP);

    // **** Insert false positives in B3 and write T2
    // Wide counter: it is printed with %lld and counts kmers, so an int was
    // both a format mismatch and an overflow risk.
    long long addKmers = 0;
    NbInsertedKmers = 0;

    // We will read this file later, when filling T4.
    // It holds raw kmers, hence binary mode.
    FILE *T2_file = fopen(return_file_name("t2_kmers"), "wb+");
    if (T2_file == NULL)
    {
        printf("error: can't open t2_kmers file\n");
        exit(1);
    }

    BinaryBank *SolidKmers = new BinaryBank(return_file_name(solid_kmers_file),sizeof(kmer),0);
    while(SolidKmers->read_element(&kmer))
    {
        if (fp->bloom2->contains(kmer))
        {
            if (!fwrite(&kmer, sizeof(kmer), 1, T2_file))
            {
                printf("error: can't fwrite (disk full?)\n");
                exit(1);
            }

            fp->bloom3->add(kmer);
            addKmers++;
        }

        NbInsertedKmers++;
        if ((NbInsertedKmers% table_print_frequency)==0)
            fprintf (stderr,(char*)"%cInsert false positive B3 %lld",13,(long long)NbInsertedKmers);
    }
    fprintf (stderr,(char*)"%cInsert false positive B3 %lld",13,(long long)NbInsertedKmers);
    SolidKmers->close();

    printf("\nInserted %lld (estimated, %llu) kmers in B3.\n", addKmers, (unsigned long long)estimated_T2_size);

    // **** Insert false positives in B4 (we could write T3, but it's not necessary)
    FalsePositives = new Bank(return_file_name(false_positive_kmers_file));
    NbInsertedKmers = 0;
    addKmers = 0;
    while (FalsePositives->get_next_seq(&rseq,&readlen))
    {
        kmer = extractKmerFromRead(rseq,0,&graine,&graine_revcomp);

        if (fp->bloom3->contains(kmer))
        {
            fp->bloom4->add(kmer);
            addKmers++;
        }

        NbInsertedKmers++;
        if ((NbInsertedKmers%table_print_frequency)==0)
            fprintf (stderr,"%cInsert false positive B4 %lld",13,(long long)NbInsertedKmers);
    }
    fprintf (stderr,"%cInsert false positive B4 %lld", 13,(long long)NbInsertedKmers);
    FalsePositives->close();

    printf("\nInserted %lld (estimated, %lld) kmers in B4.\n", addKmers, (long long)estimated_T3_size);

    // **** Count and insert false positives in T4
    // first pass over T2: count, so that the set can be created with its size
    rewind(T2_file);
    addKmers = 0;
    while (fread(&kmer, sizeof(kmer), 1, T2_file))
        if (fp->bloom4->contains(kmer))
            addKmers++;

    fp->false_positives = new FPSet(addKmers);

    // second pass over T2: insert
    rewind(T2_file);
    addKmers = 0;
    NbInsertedKmers = 0;
    while (fread(&kmer, sizeof(kmer), 1, T2_file))
    {
        if (fp->bloom4->contains(kmer))
        {
            fp->false_positives->insert(kmer);
            addKmers++;
        }

        NbInsertedKmers++;
        if ((NbInsertedKmers%table_print_frequency)==0)
            fprintf (stderr,"%cInsert false positive T4 %lld",13,(long long)NbInsertedKmers);
    }
    fp->false_positives->finalize();
    fprintf (stderr,"%cInsert false positive T4 %lld", 13,(long long)NbInsertedKmers);
    fclose(T2_file);

    printf("\nInserted %lld (estimated, %lld) kmers in T4.\n\n", addKmers, (long long)fp->false_positives->capacity());

    print_size_summary(fp);

    return fp;
}
inline void assemble() { //////------------------------------------------------------------------------------------------- fprintf (stderr,"______________________________________________________ \n"); fprintf (stderr,"___________ Assemble from bloom filter _______________ \n"); fprintf (stderr,"______________________________________________________ \n\n"); //////------------------------------------------------------------------------------------------- long long len_left = 0; long long len_right = 0; long long contig_len =0; long long maxlen=10000000; char *left_traversal = (char *) malloc(maxlen*sizeof(char)); char *right_traversal = (char *) malloc(maxlen*sizeof(char)); KmerColour *left_colour_traversal = (KmerColour *) malloc(maxlen*sizeof(KmerColour)); KmerColour *right_colour_traversal = (KmerColour *) malloc(maxlen*sizeof(KmerColour)); char *contig = (char *) malloc(2*(maxlen+sizeKmer)*sizeof(char)); KmerColour *contig_colour = (KmerColour *) malloc(2*(maxlen+sizeKmer)*sizeof(KmerColour)); kmer_type kmer; long long nbContig =0; long long totalnt=0; long long max_contig_len=0; long long mlenleft=0,mlenright=0; int64_t NbBranchingKmer=0; char kmer_seq[sizeKmer+1]; FILE * file_assembly = fopen(return_file_name(assembly_file),"w+"); FILE * file_colour_assembly = fopen(return_file_name(assembly_colour_file),"w+"); BinaryBank *SolidKmers = new BinaryBank(return_file_name(solid_kmers_file),sizeof(kmer_type),0); BinaryBank *solid_kmers_colour = new BinaryBank(return_file_name(solid_kmers_colour_file), kSizeOfKmerType+kSizeOfKmerColour, 0); char colour_seq[1000]; STARTWALL(assembly); char *assemble_only_one_region = NULL; // debugging, set to a ASCII kmer to activate, NULL to desactivate bool LOAD_BRANCHING_KMERS=false; // debugging bool DUMP_BRANCHING_KMERS=false; BranchingTerminator *terminator; if (LOAD_BRANCHING_KMERS) {printf("LOA:%d\n",LOAD_BRANCHING_KMERS); BinaryBank *BranchingKmers = new 
BinaryBank(return_file_name(branching_kmers_file),sizeof(kmer_type),false); terminator = new BranchingTerminator(BranchingKmers,SolidKmers, bloo1,false_positives); BranchingKmers->close(); } else terminator = new BranchingTerminator(SolidKmers,genome_size, bloo1,false_positives); if (DUMP_BRANCHING_KMERS) {printf("DUMP:%d\n",DUMP_BRANCHING_KMERS); BinaryBank *BranchingKmers = new BinaryBank(return_file_name(branching_kmers_file),sizeof(kmer_type),true); terminator->dump_branching_kmers(BranchingKmers); BranchingKmers->close(); } printf("Check boolean:%i\t%i\n", LOAD_BRANCHING_KMERS, DUMP_BRANCHING_KMERS); #ifdef UNITIG SimplePathsTraversal *traversal = new SimplePathsTraversal(bloo1,false_positives,terminator); fprintf (stderr,"_________________Assembling in Unitig mode ..._____________________ \n\n"); #else MonumentTraversal *traversal = new MonumentTraversal(bloo1,false_positives,terminator); #endif //RandomBranchingTraversal *traversal = new RandomBranchingTraversal(bloo1,false_positives,terminator); traversal->set_maxlen(maxlen); traversal->set_max_depth(500); traversal->set_max_breadth(20); traversal->SetSolidKmersColour(solid_kmers_colour, max_memory); while (terminator->next(&kmer)) { // keep looping while a starting kmer is available from this kmer // everything will be marked during the traversal()'s kmer_type starting_kmer; code2seq(kmer,kmer_seq); // convert // printf("StartWhile, init Kmer:%li\t%s\n",kmer, kmer_seq);// Varified! 
kmer's matched seq from the original creation while (traversal->find_starting_kmer(kmer,starting_kmer)) // while (traversal->find_starting_kmer_inside_simple_path(kmer,starting_kmer)) { code2seq(starting_kmer,kmer_seq); // convert starting kmer to nucleotide seq KmerColour kmer_colour = traversal->GetColour(starting_kmer); // printf("Starting_kmer:%lu %s",starting_kmer, kmer_seq); if (assemble_only_one_region != NULL) { kmer_type dummy; starting_kmer = extractKmerFromRead(assemble_only_one_region,0,&kmer,&dummy,false); } // right extension // len_right = traversal->traverse(starting_kmer, right_traversal, 0); len_right = traversal->traverse_colour(starting_kmer, right_traversal, right_colour_traversal, 0); mlenright= max(len_right,mlenright); int debug=1; if(debug>1){ printf("RightSeq:%lld\t%s\n", len_right, right_traversal); // printf("RightColour:"); // for (int i = 0; i < len_right; ++i) { // printf("%u ",right_colour_traversal[i]); // } kmer_colour_pattern_string(right_colour_traversal, len_right, colour_seq); printf("RightColour:%s\n",colour_seq); } // left extension, is equivalent to right extension of the revcomp // len_left = traversal->traverse(starting_kmer, left_traversal, 1); len_left = traversal->traverse_colour(starting_kmer, left_traversal, left_colour_traversal, 1); mlenleft= max(len_left,mlenleft); // form the contig // printf("before Rev:%s\n",left_traversal); revcomp_sequence(left_traversal,len_left); KmerColourUtil::rev_colour(left_colour_traversal, len_left); // printf("after Rev:%s\n",left_traversal); strcpy(contig,left_traversal); // contig = revcomp(left_traversal) strcat(contig,kmer_seq);// + starting_kmer strcat(contig,right_traversal);// + right_traversal contig_len=len_left+len_right+sizeKmer; int colour_len = 0; KmerColour sep_colour = kErrorCode+1;// output with %x, so anything greater than 100; colour_len = KmerColourUtil::append_colour(left_colour_traversal, len_left, contig_colour, colour_len); if(debug){ 
KmerColourUtil::append_colour(&sep_colour, 1, contig_colour, colour_len); } // memset(contig_colour+pt_len, (int) kmer_colour, kSizeOfKmerColour*sizeKmer); // pt_len += sizeKmer; KmerColourUtil::append_colour(&kmer_colour, 1, contig_colour, colour_len); if(debug){ KmerColourUtil::append_colour(&sep_colour, 1, contig_colour, colour_len); } // memcpy(contig_colour+colour_len, right_colour_traversal, len_right); // colour_len += len_right; KmerColourUtil::append_colour(right_colour_traversal, len_right, contig_colour, colour_len); if(debug>1){ printf("LeftSeq:%lld\t%s\n", len_left, left_traversal); // printf("LeftColour:"); // for (int i = 0; i < len_left; ++i) { // printf("%u ",left_colour_traversal[i]); // } // printf("\n"); kmer_colour_pattern_string(left_colour_traversal, len_left, colour_seq); printf("LeftColour:%s\n",colour_seq); printf("Kmer:%s\n",kmer_seq); printf("KmerColour:%u\n",kmer_colour); printf("Contig:%lld\t%s\n",contig_len ,contig); // printf("Colour:"); // for (int i = 0; i < pt_len; ++i) { // printf("%x", contig_colour[i]); // } // printf("Colour:%d\t%s\n\n",pt_len+len_right ,contig_colour); } std::string report("==========Summary==========\n"); // KmerColourUtil::summary(report, contig_colour, colour_len); // KmerColourUtil::colour_table(report, contig_colour, colour_len, max_colour_count); // printf("%s", report.data()); KmerColourSummary kcs(contig_colour, colour_len, max_colour_count); kcs.summary_colour_code(report); kcs.summary_colour_count(report); kcs.summary_stat(report); kcs.colour_table(report); // printf("%s", report.data()); // delete &kcs; // delete &kcs; // printf("================END======================\n\n\n"); // save the contig if(contig_len >= MIN_CONTIG_SIZE)//TODO: add colour info here { max_contig_len = max(max_contig_len,contig_len); fprintf(file_assembly,">%lli__len__%lli \n",nbContig,contig_len); fprintf(file_assembly,"%s\n",contig); fprintf(file_colour_assembly,">%lli__len__%lli \n",nbContig,contig_len); 
fprintf(file_colour_assembly,"%s\n",contig); //// fprintf(file_colour_assembly,"%s\n",contig_colour); // for (int i = 0; i < colour_len; ++i) { // fprintf(file_colour_assembly, "%d", all_colour[i]); // } fprintf(file_colour_assembly,"%s\n",report.data()); nbContig++; totalnt+=contig_len; } if (assemble_only_one_region != NULL) break; // printf("Memory: %zu %zu\n", getPeakRSS(), getCurrentRSS() ); //exit(-1); } // printf("Done while look is assemble()\n"); //fclose(file_assembly); //fclose(file_colour_assembly); //exit(-2); NbBranchingKmer++; if ((NbBranchingKmer%300)==0) fprintf (stderr,"%cLooping through branching kmer n° %" PRId64 "/ %" PRId64 " total nt %lli" ,13,NbBranchingKmer,terminator->nb_branching_kmers,totalnt ); if (nbContig > 0 && assemble_only_one_region != NULL) break; } fclose(file_assembly); fprintf (stderr,"\n Total nt assembled %lli nbContig %lli\n",totalnt,nbContig); fprintf (stderr,"\n Max contig len %lli (debug: max len left %lli, max len right %lli)\n",max_contig_len,mlenleft,mlenright); STOPWALL(assembly,"Assembly"); free(left_traversal); free(right_traversal); free(contig); SolidKmers->close(); solid_kmers_colour->close(); // delete SolidKmers; // delete solid_kmers_colour; // delete terminator; delete traversal; // printf("Memory: %zu %zu\n", getPeakRSS(), getCurrentRSS() ); printf("===========DONE=========EXIT========\n"); //exit(-9); }
inline void assemble() { //////------------------------------------------------------------------------------------------- fprintf (stderr,"______________________________________________________ \n"); fprintf (stderr,"___________ Assemble from bloom filter _______________ \n"); fprintf (stderr,"______________________________________________________ \n\n"); //////------------------------------------------------------------------------------------------- long long len_left = 0; long long len_right = 0; long long contig_len =0; long long maxlen=10000000; char *left_traversal = (char *) malloc(maxlen*sizeof(char)); char *right_traversal = (char *) malloc(maxlen*sizeof(char)); char *contig = (char *) malloc(2*(maxlen+sizeKmer)*sizeof(char)); kmer_type kmer; long long nbContig =0; long long nbSmallContig =0; long long totalnt=0; long long max_contig_len=0; long long mlenleft=0,mlenright=0; int64_t NbBranchingKmer=0; char kmer_seq[sizeKmer+1]; FILE * file_assembly = fopen(return_file_name(assembly_file),"w+"); BinaryBank *SolidKmers = new BinaryBank(return_file_name(solid_kmers_file),sizeof(kmer_type),0); STARTWALL(assembly); char *assemble_only_one_region = NULL; // debugging, set to a ASCII kmer to activate, NULL to desactivate bool LOAD_BRANCHING_KMERS=false; // debugging bool DUMP_BRANCHING_KMERS=false; BranchingTerminator *terminator; if (LOAD_BRANCHING_KMERS) { BinaryBank *BranchingKmers = new BinaryBank(return_file_name(branching_kmers_file),sizeof(kmer_type),false); terminator = new BranchingTerminator(BranchingKmers,SolidKmers, bloo1,false_positives); BranchingKmers->close(); } else terminator = new BranchingTerminator(SolidKmers,genome_size, bloo1,false_positives); if (DUMP_BRANCHING_KMERS) { BinaryBank *BranchingKmers = new BinaryBank(return_file_name(branching_kmers_file),sizeof(kmer_type),true); terminator->dump_branching_kmers(BranchingKmers); BranchingKmers->close(); } #ifdef UNITIG SimplePathsTraversal *traversal = new 
SimplePathsTraversal(bloo1,false_positives,terminator); fprintf (stderr,"_________________Assembling in Unitig mode ..._____________________ \n\n"); #else MonumentTraversal *traversal = new MonumentTraversal(bloo1,false_positives,terminator); #endif //RandomBranchingTraversal *traversal = new RandomBranchingTraversal(bloo1,false_positives,terminator); traversal->set_maxlen(maxlen); traversal->set_max_depth(500); traversal->set_max_breadth(20); while (terminator->next(&kmer)) { // keep looping while a starting kmer is available from this kmer // everything will be marked during the traversal()'s kmer_type starting_kmer; #ifdef UNITIG while (traversal->get_new_starting_node_improved(kmer,starting_kmer)) #else while (traversal->find_starting_kmer(kmer,starting_kmer)) #endif { code2seq(starting_kmer,kmer_seq); // convert starting kmer to nucleotide seq traversal->revert_stats(); // set stats from the last commit (discard stats from find_starting_kmer / small contigs) if (assemble_only_one_region != NULL) { kmer_type dummy; starting_kmer = extractKmerFromRead(assemble_only_one_region,0,&kmer,&dummy,false); } // right extension len_right = traversal->traverse(starting_kmer,right_traversal,0); mlenright= max(len_right,mlenright); // left extension, is equivalent to right extension of the revcomp len_left = traversal->traverse(starting_kmer,left_traversal,1); mlenleft= max(len_left,mlenleft); // form the contig revcomp_sequence(left_traversal,len_left); strcpy(contig,left_traversal); // contig = revcomp(left_traversal) strcat(contig,kmer_seq);// + starting_kmer strcat(contig,right_traversal);// + right_traversal contig_len=len_left+len_right+sizeKmer; // save the contig if(contig_len >= MIN_CONTIG_SIZE) { max_contig_len = max(max_contig_len,contig_len); fprintf(file_assembly,">%lli__len__%lli \n",nbContig,contig_len); fprintf(file_assembly,"%s\n",contig); nbContig++; totalnt+=contig_len; traversal->commit_stats(); } else { traversal->revert_stats(); nbSmallContig++; } if 
(assemble_only_one_region != NULL) break; } NbBranchingKmer++; if ((NbBranchingKmer%300)==0) fprintf (stderr,"%cLooping through branching kmer n° %lld / %lld total nt %lld ",13,(long long int) NbBranchingKmer,(long long int) terminator->nb_branching_kmers, (long long int)totalnt ); if (nbContig > 0 && assemble_only_one_region != NULL) break; } fclose(file_assembly); fprintf (stderr,"\n Total nt assembled %lli nbContig %lli\n",totalnt,nbContig); fprintf (stderr," Max contig len %lli (debug: max len left %lli, max len right %lli)\n",max_contig_len,mlenleft,mlenright); fprintf (stderr,"\n Debug traversal stats: %ld ends of contigs (%lld unsaved small contigs), among them:\n",traversal->final_stats.ended_traversals,nbSmallContig); fprintf (stderr," %ld couldn't validate consensuses\n",traversal->final_stats.couldnt_validate_consensuses); fprintf (stderr," %ld large bubble breadth, %ld large bubble depth, %ld marked kmer, %ld no extension\n",traversal->final_stats.couldnt_traverse_bubble_breadth,traversal->final_stats.couldnt_traverse_bubble_depth,traversal->final_stats.couldnt_because_marked_kmer,traversal->final_stats.couldnt_find_extension); fprintf (stderr," %ld in-branchin large depth, %ld in-branching large breadth, %ld in-branching other\n",traversal->final_stats.couldnt_inbranching_depth,traversal->final_stats.couldnt_inbranching_breadth,traversal->final_stats.couldnt_inbranching_other); STOPWALL(assembly,"Assembly"); free(left_traversal); free(right_traversal); free(contig); SolidKmers->close(); }