inline void assemble() { //////------------------------------------------------------------------------------------------- fprintf (stderr,"______________________________________________________ \n"); fprintf (stderr,"___________ Assemble from bloom filter _______________ \n"); fprintf (stderr,"______________________________________________________ \n\n"); //////------------------------------------------------------------------------------------------- long long len_left = 0; long long len_right = 0; long long contig_len =0; long long maxlen=10000000; char *left_traversal = (char *) malloc(maxlen*sizeof(char)); char *right_traversal = (char *) malloc(maxlen*sizeof(char)); KmerColour *left_colour_traversal = (KmerColour *) malloc(maxlen*sizeof(KmerColour)); KmerColour *right_colour_traversal = (KmerColour *) malloc(maxlen*sizeof(KmerColour)); char *contig = (char *) malloc(2*(maxlen+sizeKmer)*sizeof(char)); KmerColour *contig_colour = (KmerColour *) malloc(2*(maxlen+sizeKmer)*sizeof(KmerColour)); kmer_type kmer; long long nbContig =0; long long totalnt=0; long long max_contig_len=0; long long mlenleft=0,mlenright=0; int64_t NbBranchingKmer=0; char kmer_seq[sizeKmer+1]; FILE * file_assembly = fopen(return_file_name(assembly_file),"w+"); FILE * file_colour_assembly = fopen(return_file_name(assembly_colour_file),"w+"); BinaryBank *SolidKmers = new BinaryBank(return_file_name(solid_kmers_file),sizeof(kmer_type),0); BinaryBank *solid_kmers_colour = new BinaryBank(return_file_name(solid_kmers_colour_file), kSizeOfKmerType+kSizeOfKmerColour, 0); char colour_seq[1000]; STARTWALL(assembly); char *assemble_only_one_region = NULL; // debugging, set to a ASCII kmer to activate, NULL to desactivate bool LOAD_BRANCHING_KMERS=false; // debugging bool DUMP_BRANCHING_KMERS=false; BranchingTerminator *terminator; if (LOAD_BRANCHING_KMERS) {printf("LOA:%d\n",LOAD_BRANCHING_KMERS); BinaryBank *BranchingKmers = new BinaryBank(return_file_name(branching_kmers_file),sizeof(kmer_type),false); terminator = new BranchingTerminator(BranchingKmers,SolidKmers, bloo1,false_positives); BranchingKmers->close(); } else terminator = new BranchingTerminator(SolidKmers,genome_size, bloo1,false_positives); if (DUMP_BRANCHING_KMERS) {printf("DUMP:%d\n",DUMP_BRANCHING_KMERS); BinaryBank *BranchingKmers = new BinaryBank(return_file_name(branching_kmers_file),sizeof(kmer_type),true); terminator->dump_branching_kmers(BranchingKmers); BranchingKmers->close(); } printf("Check boolean:%i\t%i\n", LOAD_BRANCHING_KMERS, DUMP_BRANCHING_KMERS); #ifdef UNITIG SimplePathsTraversal *traversal = new SimplePathsTraversal(bloo1,false_positives,terminator); fprintf (stderr,"_________________Assembling in Unitig mode ..._____________________ \n\n"); #else MonumentTraversal *traversal = new MonumentTraversal(bloo1,false_positives,terminator); #endif //RandomBranchingTraversal *traversal = new RandomBranchingTraversal(bloo1,false_positives,terminator); traversal->set_maxlen(maxlen); traversal->set_max_depth(500); traversal->set_max_breadth(20); traversal->SetSolidKmersColour(solid_kmers_colour, max_memory); while (terminator->next(&kmer)) { // keep looping while a starting kmer is available from this kmer // everything will be marked during the traversal()'s kmer_type starting_kmer; code2seq(kmer,kmer_seq); // convert // printf("StartWhile, init Kmer:%li\t%s\n",kmer, kmer_seq);// Varified! kmer's matched seq from the original creation while (traversal->find_starting_kmer(kmer,starting_kmer)) // while (traversal->find_starting_kmer_inside_simple_path(kmer,starting_kmer)) { code2seq(starting_kmer,kmer_seq); // convert starting kmer to nucleotide seq KmerColour kmer_colour = traversal->GetColour(starting_kmer); // printf("Starting_kmer:%lu %s",starting_kmer, kmer_seq); if (assemble_only_one_region != NULL) { kmer_type dummy; starting_kmer = extractKmerFromRead(assemble_only_one_region,0,&kmer,&dummy,false); } // right extension // len_right = traversal->traverse(starting_kmer, right_traversal, 0); len_right = traversal->traverse_colour(starting_kmer, right_traversal, right_colour_traversal, 0); mlenright= max(len_right,mlenright); int debug=1; if(debug>1){ printf("RightSeq:%lld\t%s\n", len_right, right_traversal); // printf("RightColour:"); // for (int i = 0; i < len_right; ++i) { // printf("%u ",right_colour_traversal[i]); // } kmer_colour_pattern_string(right_colour_traversal, len_right, colour_seq); printf("RightColour:%s\n",colour_seq); } // left extension, is equivalent to right extension of the revcomp // len_left = traversal->traverse(starting_kmer, left_traversal, 1); len_left = traversal->traverse_colour(starting_kmer, left_traversal, left_colour_traversal, 1); mlenleft= max(len_left,mlenleft); // form the contig // printf("before Rev:%s\n",left_traversal); revcomp_sequence(left_traversal,len_left); KmerColourUtil::rev_colour(left_colour_traversal, len_left); // printf("after Rev:%s\n",left_traversal); strcpy(contig,left_traversal); // contig = revcomp(left_traversal) strcat(contig,kmer_seq);// + starting_kmer strcat(contig,right_traversal);// + right_traversal contig_len=len_left+len_right+sizeKmer; int colour_len = 0; KmerColour sep_colour = kErrorCode+1;// output with %x, so anything greater than 100; colour_len = KmerColourUtil::append_colour(left_colour_traversal, len_left, contig_colour, colour_len); if(debug){ KmerColourUtil::append_colour(&sep_colour, 1, contig_colour, colour_len); } // memset(contig_colour+pt_len, (int) kmer_colour, kSizeOfKmerColour*sizeKmer); // pt_len += sizeKmer; KmerColourUtil::append_colour(&kmer_colour, 1, contig_colour, colour_len); if(debug){ KmerColourUtil::append_colour(&sep_colour, 1, contig_colour, colour_len); } // memcpy(contig_colour+colour_len, right_colour_traversal, len_right); // colour_len += len_right; KmerColourUtil::append_colour(right_colour_traversal, len_right, contig_colour, colour_len); if(debug>1){ printf("LeftSeq:%lld\t%s\n", len_left, left_traversal); // printf("LeftColour:"); // for (int i = 0; i < len_left; ++i) { // printf("%u ",left_colour_traversal[i]); // } // printf("\n"); kmer_colour_pattern_string(left_colour_traversal, len_left, colour_seq); printf("LeftColour:%s\n",colour_seq); printf("Kmer:%s\n",kmer_seq); printf("KmerColour:%u\n",kmer_colour); printf("Contig:%lld\t%s\n",contig_len ,contig); // printf("Colour:"); // for (int i = 0; i < pt_len; ++i) { // printf("%x", contig_colour[i]); // } // printf("Colour:%d\t%s\n\n",pt_len+len_right ,contig_colour); } std::string report("==========Summary==========\n"); // KmerColourUtil::summary(report, contig_colour, colour_len); // KmerColourUtil::colour_table(report, contig_colour, colour_len, max_colour_count); // printf("%s", report.data()); KmerColourSummary kcs(contig_colour, colour_len, max_colour_count); kcs.summary_colour_code(report); kcs.summary_colour_count(report); kcs.summary_stat(report); kcs.colour_table(report); // printf("%s", report.data()); // delete &kcs; // delete &kcs; // printf("================END======================\n\n\n"); // save the contig if(contig_len >= MIN_CONTIG_SIZE)//TODO: add colour info here { max_contig_len = max(max_contig_len,contig_len); fprintf(file_assembly,">%lli__len__%lli \n",nbContig,contig_len); fprintf(file_assembly,"%s\n",contig); fprintf(file_colour_assembly,">%lli__len__%lli \n",nbContig,contig_len); fprintf(file_colour_assembly,"%s\n",contig); //// fprintf(file_colour_assembly,"%s\n",contig_colour); // for (int i = 0; i < colour_len; ++i) { // fprintf(file_colour_assembly, "%d", all_colour[i]); // } fprintf(file_colour_assembly,"%s\n",report.data()); nbContig++; totalnt+=contig_len; } if (assemble_only_one_region != NULL) break; // printf("Memory: %zu %zu\n", getPeakRSS(), getCurrentRSS() ); //exit(-1); } // printf("Done while look is assemble()\n"); //fclose(file_assembly); //fclose(file_colour_assembly); //exit(-2); NbBranchingKmer++; if ((NbBranchingKmer%300)==0) fprintf (stderr,"%cLooping through branching kmer n° %" PRId64 "/ %" PRId64 " total nt %lli" ,13,NbBranchingKmer,terminator->nb_branching_kmers,totalnt ); if (nbContig > 0 && assemble_only_one_region != NULL) break; } fclose(file_assembly); fprintf (stderr,"\n Total nt assembled %lli nbContig %lli\n",totalnt,nbContig); fprintf (stderr,"\n Max contig len %lli (debug: max len left %lli, max len right %lli)\n",max_contig_len,mlenleft,mlenright); STOPWALL(assembly,"Assembly"); free(left_traversal); free(right_traversal); free(contig); SolidKmers->close(); solid_kmers_colour->close(); // delete SolidKmers; // delete solid_kmers_colour; // delete terminator; delete traversal; // printf("Memory: %zu %zu\n", getPeakRSS(), getCurrentRSS() ); printf("===========DONE=========EXIT========\n"); //exit(-9); }
inline void assemble() { //////------------------------------------------------------------------------------------------- fprintf (stderr,"______________________________________________________ \n"); fprintf (stderr,"___________ Assemble from bloom filter _______________ \n"); fprintf (stderr,"______________________________________________________ \n\n"); //////------------------------------------------------------------------------------------------- long long len_left = 0; long long len_right = 0; long long contig_len =0; long long maxlen=10000000; char *left_traversal = (char *) malloc(maxlen*sizeof(char)); char *right_traversal = (char *) malloc(maxlen*sizeof(char)); char *contig = (char *) malloc(2*(maxlen+sizeKmer)*sizeof(char)); kmer_type kmer; long long nbContig =0; long long nbSmallContig =0; long long totalnt=0; long long max_contig_len=0; long long mlenleft=0,mlenright=0; int64_t NbBranchingKmer=0; char kmer_seq[sizeKmer+1]; FILE * file_assembly = fopen(return_file_name(assembly_file),"w+"); BinaryBank *SolidKmers = new BinaryBank(return_file_name(solid_kmers_file),sizeof(kmer_type),0); STARTWALL(assembly); char *assemble_only_one_region = NULL; // debugging, set to a ASCII kmer to activate, NULL to desactivate bool LOAD_BRANCHING_KMERS=false; // debugging bool DUMP_BRANCHING_KMERS=false; BranchingTerminator *terminator; if (LOAD_BRANCHING_KMERS) { BinaryBank *BranchingKmers = new BinaryBank(return_file_name(branching_kmers_file),sizeof(kmer_type),false); terminator = new BranchingTerminator(BranchingKmers,SolidKmers, bloo1,false_positives); BranchingKmers->close(); } else terminator = new BranchingTerminator(SolidKmers,genome_size, bloo1,false_positives); if (DUMP_BRANCHING_KMERS) { BinaryBank *BranchingKmers = new BinaryBank(return_file_name(branching_kmers_file),sizeof(kmer_type),true); terminator->dump_branching_kmers(BranchingKmers); BranchingKmers->close(); } #ifdef UNITIG SimplePathsTraversal *traversal = new SimplePathsTraversal(bloo1,false_positives,terminator); fprintf (stderr,"_________________Assembling in Unitig mode ..._____________________ \n\n"); #else MonumentTraversal *traversal = new MonumentTraversal(bloo1,false_positives,terminator); #endif //RandomBranchingTraversal *traversal = new RandomBranchingTraversal(bloo1,false_positives,terminator); traversal->set_maxlen(maxlen); traversal->set_max_depth(500); traversal->set_max_breadth(20); while (terminator->next(&kmer)) { // keep looping while a starting kmer is available from this kmer // everything will be marked during the traversal()'s kmer_type starting_kmer; #ifdef UNITIG while (traversal->get_new_starting_node_improved(kmer,starting_kmer)) #else while (traversal->find_starting_kmer(kmer,starting_kmer)) #endif { code2seq(starting_kmer,kmer_seq); // convert starting kmer to nucleotide seq traversal->revert_stats(); // set stats from the last commit (discard stats from find_starting_kmer / small contigs) if (assemble_only_one_region != NULL) { kmer_type dummy; starting_kmer = extractKmerFromRead(assemble_only_one_region,0,&kmer,&dummy,false); } // right extension len_right = traversal->traverse(starting_kmer,right_traversal,0); mlenright= max(len_right,mlenright); // left extension, is equivalent to right extension of the revcomp len_left = traversal->traverse(starting_kmer,left_traversal,1); mlenleft= max(len_left,mlenleft); // form the contig revcomp_sequence(left_traversal,len_left); strcpy(contig,left_traversal); // contig = revcomp(left_traversal) strcat(contig,kmer_seq);// + starting_kmer strcat(contig,right_traversal);// + right_traversal contig_len=len_left+len_right+sizeKmer; // save the contig if(contig_len >= MIN_CONTIG_SIZE) { max_contig_len = max(max_contig_len,contig_len); fprintf(file_assembly,">%lli__len__%lli \n",nbContig,contig_len); fprintf(file_assembly,"%s\n",contig); nbContig++; totalnt+=contig_len; traversal->commit_stats(); } else { traversal->revert_stats(); nbSmallContig++; } if (assemble_only_one_region != NULL) break; } NbBranchingKmer++; if ((NbBranchingKmer%300)==0) fprintf (stderr,"%cLooping through branching kmer n° %lld / %lld total nt %lld ",13,(long long int) NbBranchingKmer,(long long int) terminator->nb_branching_kmers, (long long int)totalnt ); if (nbContig > 0 && assemble_only_one_region != NULL) break; } fclose(file_assembly); fprintf (stderr,"\n Total nt assembled %lli nbContig %lli\n",totalnt,nbContig); fprintf (stderr," Max contig len %lli (debug: max len left %lli, max len right %lli)\n",max_contig_len,mlenleft,mlenright); fprintf (stderr,"\n Debug traversal stats: %ld ends of contigs (%lld unsaved small contigs), among them:\n",traversal->final_stats.ended_traversals,nbSmallContig); fprintf (stderr," %ld couldn't validate consensuses\n",traversal->final_stats.couldnt_validate_consensuses); fprintf (stderr," %ld large bubble breadth, %ld large bubble depth, %ld marked kmer, %ld no extension\n",traversal->final_stats.couldnt_traverse_bubble_breadth,traversal->final_stats.couldnt_traverse_bubble_depth,traversal->final_stats.couldnt_because_marked_kmer,traversal->final_stats.couldnt_find_extension); fprintf (stderr," %ld in-branchin large depth, %ld in-branching large breadth, %ld in-branching other\n",traversal->final_stats.couldnt_inbranching_depth,traversal->final_stats.couldnt_inbranching_breadth,traversal->final_stats.couldnt_inbranching_other); STOPWALL(assembly,"Assembly"); free(left_traversal); free(right_traversal); free(contig); SolidKmers->close(); }