Exemplo n.º 1
0
inline void assemble()
{

    //////-------------------------------------------------------------------------------------------
    fprintf (stderr,"______________________________________________________ \n");
    fprintf (stderr,"___________ Assemble from bloom filter _______________ \n");
    fprintf (stderr,"______________________________________________________ \n\n");

    //////-------------------------------------------------------------------------------------------


    long long len_left = 0;
    long long len_right = 0;
    long long contig_len =0;
    long long maxlen=10000000;

    char *left_traversal  = (char *) malloc(maxlen*sizeof(char));
    char *right_traversal = (char *) malloc(maxlen*sizeof(char));
    KmerColour *left_colour_traversal  = (KmerColour *) malloc(maxlen*sizeof(KmerColour));
    KmerColour *right_colour_traversal = (KmerColour *) malloc(maxlen*sizeof(KmerColour));

    char *contig          = (char *) malloc(2*(maxlen+sizeKmer)*sizeof(char));
    KmerColour *contig_colour   = (KmerColour *) malloc(2*(maxlen+sizeKmer)*sizeof(KmerColour));

    kmer_type kmer;

    long long nbContig =0;
    long long totalnt=0;
    long long max_contig_len=0;
    long long mlenleft=0,mlenright=0;
    int64_t NbBranchingKmer=0;
    char kmer_seq[sizeKmer+1];
    FILE * file_assembly = fopen(return_file_name(assembly_file),"w+");
    FILE * file_colour_assembly = fopen(return_file_name(assembly_colour_file),"w+");
    BinaryBank *SolidKmers = new BinaryBank(return_file_name(solid_kmers_file),sizeof(kmer_type),0);
    BinaryBank *solid_kmers_colour = new BinaryBank(return_file_name(solid_kmers_colour_file), kSizeOfKmerType+kSizeOfKmerColour, 0);

    char colour_seq[1000];

    STARTWALL(assembly);

    char *assemble_only_one_region = NULL; // debugging, set to a ASCII kmer to activate, NULL to desactivate
    bool LOAD_BRANCHING_KMERS=false; // debugging
    bool DUMP_BRANCHING_KMERS=false;
   
    BranchingTerminator *terminator;

    if (LOAD_BRANCHING_KMERS)
    {printf("LOA:%d\n",LOAD_BRANCHING_KMERS);
        BinaryBank *BranchingKmers = new BinaryBank(return_file_name(branching_kmers_file),sizeof(kmer_type),false);
        terminator = new BranchingTerminator(BranchingKmers,SolidKmers, bloo1,false_positives);
        BranchingKmers->close();
    }
    else
        terminator = new BranchingTerminator(SolidKmers,genome_size, bloo1,false_positives);

    if (DUMP_BRANCHING_KMERS)
    {printf("DUMP:%d\n",DUMP_BRANCHING_KMERS);
        BinaryBank *BranchingKmers = new BinaryBank(return_file_name(branching_kmers_file),sizeof(kmer_type),true);
        terminator->dump_branching_kmers(BranchingKmers);
        BranchingKmers->close();
    }
    printf("Check boolean:%i\t%i\n", LOAD_BRANCHING_KMERS, DUMP_BRANCHING_KMERS);

#ifdef UNITIG
    SimplePathsTraversal *traversal = new SimplePathsTraversal(bloo1,false_positives,terminator);
    fprintf (stderr,"_________________Assembling in Unitig mode ..._____________________ \n\n");
#else
    MonumentTraversal *traversal = new MonumentTraversal(bloo1,false_positives,terminator);
#endif
    //RandomBranchingTraversal *traversal = new RandomBranchingTraversal(bloo1,false_positives,terminator);
    traversal->set_maxlen(maxlen);
    traversal->set_max_depth(500);
    traversal->set_max_breadth(20);
    traversal->SetSolidKmersColour(solid_kmers_colour, max_memory);

    while (terminator->next(&kmer))
    {
        // keep looping while a starting kmer is available from this kmer
		// everything will be marked during the traversal()'s
		kmer_type starting_kmer;
		code2seq(kmer,kmer_seq); // convert
//		printf("StartWhile, init Kmer:%li\t%s\n",kmer, kmer_seq);// Varified! kmer's matched seq from the original creation
		while (traversal->find_starting_kmer(kmer,starting_kmer))
//		while (traversal->find_starting_kmer_inside_simple_path(kmer,starting_kmer))
		{
		    code2seq(starting_kmer,kmer_seq); // convert starting kmer to nucleotide seq
		    KmerColour kmer_colour = traversal->GetColour(starting_kmer);

//		    printf("Starting_kmer:%lu %s",starting_kmer, kmer_seq);
            if (assemble_only_one_region != NULL)
            {
                kmer_type dummy;
                starting_kmer = extractKmerFromRead(assemble_only_one_region,0,&kmer,&dummy,false);
            }

            // right extension
//            len_right = traversal->traverse(starting_kmer, right_traversal, 0);
			len_right = traversal->traverse_colour(starting_kmer, right_traversal, right_colour_traversal, 0);
            mlenright= max(len_right,mlenright);
            int debug=1;
            if(debug>1){
            	printf("RightSeq:%lld\t%s\n", len_right, right_traversal);
//            	printf("RightColour:");
//            	for (int i = 0; i < len_right; ++i) {
//            		printf("%u ",right_colour_traversal[i]);
//				}
            	kmer_colour_pattern_string(right_colour_traversal, len_right, colour_seq);
				printf("RightColour:%s\n",colour_seq);
            }

            // left extension, is equivalent to right extension of the revcomp
//            len_left = traversal->traverse(starting_kmer, left_traversal, 1);
            len_left = traversal->traverse_colour(starting_kmer, left_traversal,
            										left_colour_traversal, 1);
            mlenleft= max(len_left,mlenleft);

            // form the contig

//            printf("before Rev:%s\n",left_traversal);
            revcomp_sequence(left_traversal,len_left);
            KmerColourUtil::rev_colour(left_colour_traversal, len_left);

//            printf("after Rev:%s\n",left_traversal);
            strcpy(contig,left_traversal); // contig = revcomp(left_traversal)
	        strcat(contig,kmer_seq);//               + starting_kmer
            strcat(contig,right_traversal);//           + right_traversal
			contig_len=len_left+len_right+sizeKmer;


            int colour_len = 0;
            KmerColour sep_colour = kErrorCode+1;// output with %x, so anything greater than 100;
			colour_len = KmerColourUtil::append_colour(left_colour_traversal, len_left,
					contig_colour, colour_len);
			if(debug){
				KmerColourUtil::append_colour(&sep_colour, 1, contig_colour,
						colour_len);
			}
//            memset(contig_colour+pt_len, (int) kmer_colour, kSizeOfKmerColour*sizeKmer);
//            pt_len += sizeKmer;

			KmerColourUtil::append_colour(&kmer_colour, 1, contig_colour,
					colour_len);

			if(debug){
				KmerColourUtil::append_colour(&sep_colour, 1, contig_colour,
										colour_len);
			}

//            memcpy(contig_colour+colour_len, right_colour_traversal, len_right);
//			colour_len += len_right;
			KmerColourUtil::append_colour(right_colour_traversal, len_right,
					contig_colour, colour_len);


            if(debug>1){
            	printf("LeftSeq:%lld\t%s\n", len_left, left_traversal);
//            	printf("LeftColour:");
//				for (int i = 0; i < len_left; ++i) {
//					printf("%u ",left_colour_traversal[i]);
//				}
//				printf("\n");
            	kmer_colour_pattern_string(left_colour_traversal, len_left, colour_seq);
				printf("LeftColour:%s\n",colour_seq);
				printf("Kmer:%s\n",kmer_seq);
				printf("KmerColour:%u\n",kmer_colour);
				printf("Contig:%lld\t%s\n",contig_len ,contig);
//				printf("Colour:");
//				for (int i = 0; i < pt_len; ++i) {
//					printf("%x", contig_colour[i]);
//				}

//				printf("Colour:%d\t%s\n\n",pt_len+len_right ,contig_colour);


            }

            std::string report("==========Summary==========\n");
//			KmerColourUtil::summary(report, contig_colour, colour_len);
//			KmerColourUtil::colour_table(report, contig_colour, colour_len, max_colour_count);
//			printf("%s", report.data());

			KmerColourSummary kcs(contig_colour, colour_len, max_colour_count);
			kcs.summary_colour_code(report);
			kcs.summary_colour_count(report);
			kcs.summary_stat(report);
			kcs.colour_table(report);
//			printf("%s", report.data());
//			delete &kcs;
//			delete &kcs;
//			printf("================END======================\n\n\n");
			// save the contig
            if(contig_len >= MIN_CONTIG_SIZE)//TODO: add colour info here
            {
                max_contig_len = max(max_contig_len,contig_len);
                fprintf(file_assembly,">%lli__len__%lli \n",nbContig,contig_len);
                fprintf(file_assembly,"%s\n",contig);

                fprintf(file_colour_assembly,">%lli__len__%lli \n",nbContig,contig_len);
				fprintf(file_colour_assembly,"%s\n",contig);
////				fprintf(file_colour_assembly,"%s\n",contig_colour);
//				for (int i = 0; i < colour_len; ++i) {
//					fprintf(file_colour_assembly, "%d", all_colour[i]);
//				}
				fprintf(file_colour_assembly,"%s\n",report.data());
                nbContig++;
                totalnt+=contig_len;
            }
            if (assemble_only_one_region != NULL)
                break;
//            printf("Memory: %zu %zu\n", getPeakRSS(), getCurrentRSS() );
//exit(-1);
        }
//		printf("Done while look is assemble()\n");
//fclose(file_assembly);
//fclose(file_colour_assembly);
//exit(-2);
        NbBranchingKmer++;
        if ((NbBranchingKmer%300)==0) fprintf (stderr,"%cLooping through branching kmer n° %" PRId64 "/ %" PRId64 " total nt   %lli" ,13,NbBranchingKmer,terminator->nb_branching_kmers,totalnt );

        if (nbContig > 0 && assemble_only_one_region != NULL)
            break;

    }
    fclose(file_assembly);

    fprintf (stderr,"\n Total nt assembled  %lli  nbContig %lli\n",totalnt,nbContig);
    fprintf (stderr,"\n Max contig len  %lli (debug: max len left %lli, max len right %lli)\n",max_contig_len,mlenleft,mlenright);
    
    STOPWALL(assembly,"Assembly");

    free(left_traversal);
    free(right_traversal);
    free(contig);
    SolidKmers->close();
    solid_kmers_colour->close();
//    delete SolidKmers;
//    delete solid_kmers_colour;
//    delete terminator;
    delete traversal;
//	printf("Memory: %zu %zu\n", getPeakRSS(), getCurrentRSS() );
    printf("===========DONE=========EXIT========\n");
//exit(-9);
}
inline void assemble()
{

    //////-------------------------------------------------------------------------------------------
    fprintf (stderr,"______________________________________________________ \n");
    fprintf (stderr,"___________ Assemble from bloom filter _______________ \n");
    fprintf (stderr,"______________________________________________________ \n\n");

    //////-------------------------------------------------------------------------------------------


    long long len_left = 0;
    long long len_right = 0;
    long long contig_len =0;
    long long maxlen=10000000;

    char *left_traversal  = (char *) malloc(maxlen*sizeof(char));
    char *right_traversal = (char *) malloc(maxlen*sizeof(char));
    char *contig          = (char *) malloc(2*(maxlen+sizeKmer)*sizeof(char));
    kmer_type kmer;

    long long nbContig =0;
    long long nbSmallContig =0;
    long long totalnt=0;
    long long max_contig_len=0;
    long long mlenleft=0,mlenright=0;
    int64_t NbBranchingKmer=0;
    char kmer_seq[sizeKmer+1];
    FILE * file_assembly = fopen(return_file_name(assembly_file),"w+");

    BinaryBank *SolidKmers = new BinaryBank(return_file_name(solid_kmers_file),sizeof(kmer_type),0);

    STARTWALL(assembly);

    char *assemble_only_one_region = NULL; // debugging, set to a ASCII kmer to activate, NULL to desactivate
    bool LOAD_BRANCHING_KMERS=false; // debugging
    bool DUMP_BRANCHING_KMERS=false;
   
    BranchingTerminator *terminator;

    if (LOAD_BRANCHING_KMERS)
    {
        BinaryBank *BranchingKmers = new BinaryBank(return_file_name(branching_kmers_file),sizeof(kmer_type),false);
        terminator = new BranchingTerminator(BranchingKmers,SolidKmers, bloo1,false_positives);
        BranchingKmers->close();
    }
    else
        terminator = new BranchingTerminator(SolidKmers,genome_size, bloo1,false_positives);

    if (DUMP_BRANCHING_KMERS)
    {
        BinaryBank *BranchingKmers = new BinaryBank(return_file_name(branching_kmers_file),sizeof(kmer_type),true);
        terminator->dump_branching_kmers(BranchingKmers);
        BranchingKmers->close();
    }

#ifdef UNITIG
    SimplePathsTraversal *traversal = new SimplePathsTraversal(bloo1,false_positives,terminator);
    fprintf (stderr,"_________________Assembling in Unitig mode ..._____________________ \n\n");
#else
    MonumentTraversal *traversal = new MonumentTraversal(bloo1,false_positives,terminator);
#endif
    //RandomBranchingTraversal *traversal = new RandomBranchingTraversal(bloo1,false_positives,terminator);
    traversal->set_maxlen(maxlen);
    traversal->set_max_depth(500);
    traversal->set_max_breadth(20);
    
    while (terminator->next(&kmer))
    {
        // keep looping while a starting kmer is available from this kmer
		// everything will be marked during the traversal()'s
		kmer_type starting_kmer;
#ifdef UNITIG
        while (traversal->get_new_starting_node_improved(kmer,starting_kmer))
#else
        while (traversal->find_starting_kmer(kmer,starting_kmer))
#endif
		{
		    code2seq(starting_kmer,kmer_seq); // convert starting kmer to nucleotide seq
            traversal->revert_stats(); // set stats from the last commit (discard stats from find_starting_kmer / small contigs)

            if (assemble_only_one_region != NULL)
            {
                kmer_type dummy;
                starting_kmer = extractKmerFromRead(assemble_only_one_region,0,&kmer,&dummy,false);
            }

            // right extension
            len_right = traversal->traverse(starting_kmer,right_traversal,0);
            mlenright= max(len_right,mlenright);

            // left extension, is equivalent to right extension of the revcomp
            len_left = traversal->traverse(starting_kmer,left_traversal,1);
            mlenleft= max(len_left,mlenleft);

            // form the contig
            revcomp_sequence(left_traversal,len_left);
            strcpy(contig,left_traversal); // contig = revcomp(left_traversal)
	        strcat(contig,kmer_seq);//               + starting_kmer
            strcat(contig,right_traversal);//           + right_traversal

            contig_len=len_left+len_right+sizeKmer;

            // save the contig
            if(contig_len >= MIN_CONTIG_SIZE)
            {
                max_contig_len = max(max_contig_len,contig_len);
                fprintf(file_assembly,">%lli__len__%lli \n",nbContig,contig_len);
                fprintf(file_assembly,"%s\n",contig);
                nbContig++;
                totalnt+=contig_len;
                traversal->commit_stats();
            }
            else
            {
                traversal->revert_stats();
                nbSmallContig++;
            }
            if (assemble_only_one_region != NULL)
                break;
        }
    
        NbBranchingKmer++;
        if ((NbBranchingKmer%300)==0) fprintf (stderr,"%cLooping through branching kmer n° %lld / %lld  total nt   %lld   ",13,(long long int) NbBranchingKmer,(long long int) terminator->nb_branching_kmers, (long long int)totalnt );

        if (nbContig > 0 && assemble_only_one_region != NULL)
            break;

    }
    fclose(file_assembly);

    fprintf (stderr,"\n Total nt assembled  %lli  nbContig %lli\n",totalnt,nbContig);
    fprintf (stderr," Max contig len  %lli (debug: max len left %lli, max len right %lli)\n",max_contig_len,mlenleft,mlenright);
    fprintf (stderr,"\n Debug traversal stats: %ld ends of contigs (%lld unsaved small contigs), among them:\n",traversal->final_stats.ended_traversals,nbSmallContig);
    fprintf (stderr," %ld couldn't validate consensuses\n",traversal->final_stats.couldnt_validate_consensuses);
    fprintf (stderr," %ld large bubble breadth, %ld large bubble depth, %ld marked kmer, %ld no extension\n",traversal->final_stats.couldnt_traverse_bubble_breadth,traversal->final_stats.couldnt_traverse_bubble_depth,traversal->final_stats.couldnt_because_marked_kmer,traversal->final_stats.couldnt_find_extension);
    fprintf (stderr," %ld in-branchin large depth, %ld in-branching large breadth, %ld in-branching other\n",traversal->final_stats.couldnt_inbranching_depth,traversal->final_stats.couldnt_inbranching_breadth,traversal->final_stats.couldnt_inbranching_other);
    
    STOPWALL(assembly,"Assembly");

    free(left_traversal);
    free(right_traversal);
    free(contig);
    SolidKmers->close();
}