Пример #1
0
static 	void print_supernode(dBNode * node, void * arg) {
    
    perfect_path_print_supernodes_args * args = (perfect_path_print_supernodes_args * ) arg;
    
    args->count_kmers++;
    if (db_node_check_flag_visited(node) == false && db_node_check_flag_not_pruned(node)) {
        
        perfect_path_get_path(node, undefined,
                              &db_node_action_set_flag_visited,
                              args->db_graph, args->path);
        
        if (path_is_singleton(args->singleton_length, args->path)) {
            args->count_sing++;
        }else if(path_is_repetitive(args->graph_cov, args->path)){
            args->count_rep++;
        }else{
            if (args->fout_cov != NULL) {
                path_to_coverage(args->path, args->fout_cov);
            }
            
            path_to_fasta(args->path, args->fout);            
            
            path_counts_add(args->path, &args->counts);
            args->count_nodes++;
            path_increase_id(args->path);
        }
        
        
    }
}
Пример #2
0
void metacortex_find_subgraphs(dBGraph* graph, char* consensus_contigs_filename, int min_subgraph_kmers)
{
    SubGraphInfo* sub_graphs;
    FILE* fp;
    Path *path_fwd = path_new(MAX_EXPLORE_PATH_LENGTH, graph->kmer_size);
    Path *path_rev = path_new(MAX_EXPLORE_PATH_LENGTH, graph->kmer_size);
    Path *final_path = path_new(MAX_EXPLORE_PATH_LENGTH, graph->kmer_size);
    char seq[256];
    char analysis_filename[strlen(consensus_contigs_filename) + 10];
    long int total_nodes = 0;
    int n_seeds = 0;
    int i;
    
    sprintf(analysis_filename, "%s.analysis", consensus_contigs_filename);
    log_and_screen_printf("Running metacortex subgraph analysis...\n");
    log_and_screen_printf("          Contig file: %s\n", consensus_contigs_filename);
    log_and_screen_printf("        Analysis file: %s\n", analysis_filename);
    log_and_screen_printf("Minimum subgraph size: %i\n", min_subgraph_kmers);
    
    /* Initialise temporaray path array buffers */
    path_array_initialise_buffers(graph->kmer_size);
    
    /* Create a list of subgraphs */
    log_and_screen_printf("Allocating %d Mb to store subgraph information (max %d seeds)...\n", ((MAX_SEEDS * sizeof(SubGraphInfo)) / 1024) / 1024, MAX_SEEDS);
    sub_graphs = calloc(MAX_SEEDS, sizeof(SubGraphInfo));
    if (!sub_graphs) {
        log_and_screen_printf("ERROR: Can't get memory for subgraphs\n");
        exit(-1);
    }

    /* Open the analysis file */
    fp = fopen(analysis_filename, "w");
    if (!fp) {
        log_and_screen_printf("ERROR: Can't open analysis file.\n");
        exit(-1);
    }
        
    /* For each node, if it's not pruned or visited, try and grow a graph */
    void explore_node(dBNode * node) {
        if (node == NULL) {
            log_and_screen_printf("Error: NULL node passed to explore_node.\n");
            exit(-1);
        }
        
        if (db_node_check_for_any_flag(node, PRUNED | VISITED) == false) {
            int nodes_in_graph;
            
            /* Grow graph from this node, returning the 'best' (highest coverage) node to store as seed point */
            nodes_in_graph = grow_graph_from_node(node, &(sub_graphs[n_seeds].seed_node), graph);
            total_nodes += nodes_in_graph;
            
            if (sub_graphs[n_seeds].seed_node == NULL) {
                printf("ERROR: Seed node is NULL, nodes in graph is %d\n", nodes_in_graph);
            } else {
                /* Write data to analysis file */
                binary_kmer_to_seq(&(node->kmer), graph->kmer_size, seq);            
                fprintf(fp, "%i\t%i\t%ld\t%s\t", n_seeds, nodes_in_graph, total_nodes, seq);
                binary_kmer_to_seq(&(sub_graphs[n_seeds].seed_node->kmer), graph->kmer_size, seq);
                fprintf(fp, "%s\n", seq);

                /* Store nodes in this subgraph */
                sub_graphs[n_seeds].graph_size = nodes_in_graph;
                n_seeds++;
                
                /* Check we've not run out of seed storage - in future, this should dynamically allocate */
                if (n_seeds == MAX_SEEDS) {
                    log_and_screen_printf("Error: MAX_SEEDS exceeded. Quitting.\n");
                    exit(-1);
                }
            }
        }
    }
    
    /* Traverse each node... */
    log_and_screen_printf("Finding subgraphs...\n");
    hash_table_traverse(&explore_node, graph);
    log_and_screen_printf("Finished. Total: %ld\n", total_nodes);
    fclose(fp);    
    
    /* Open consensus contigs file */
    fp = fopen(consensus_contigs_filename, "w");
    if (!fp) {
        log_and_screen_printf("ERROR: Can't open contig file.\n");
        exit(-1);
    }
    
    /* Now go through all the seed points and generate the consensus contigs by walking forward and backward from the seed */
    db_graph_reset_flags(graph);    
    log_and_screen_printf("Outputting contigs...\n");
	log_progress_bar(0);
	long long one_percent = n_seeds/100;
    int percent;
    
    if (one_percent < 1) {
        one_percent = 1;
    }
    
    for (i=0; i<n_seeds; i++) {
        if (i % one_percent == 0) {
            percent = (100 * i) / n_seeds;
            log_progress_bar(percent);
        } 
        
        //log_printf("Graph %i\n", i);           
        if (sub_graphs[i].graph_size >= min_subgraph_kmers) {            
            binary_kmer_to_seq(&(sub_graphs[i].seed_node->kmer), graph->kmer_size, seq);
            coverage_walk_get_path(sub_graphs[i].seed_node, forward, NULL, graph, path_fwd);
            coverage_walk_get_path(sub_graphs[i].seed_node, reverse, NULL, graph, path_rev);
            path_reverse(path_fwd, final_path);
            path_append(final_path, path_rev);
            final_path->id = i;
            path_to_fasta(final_path, fp);
            //log_printf("  Seed %s\tFwd path length %i\tRev path length %i\tFinal path length %i\n", seq, path_fwd->length, path_rev->length, final_path->length);
            path_reset(path_fwd);
            perfect_path_get_path(sub_graphs[i].seed_node, forward, &db_node_action_do_nothing, graph, path_fwd);
            //log_printf("\t\tPerfect path fwd length %i\n", path_fwd->length);
            path_reset(path_rev);
            path_reset(final_path);
        } else {
            log_printf("  Number of nodes (%i} too small. Not outputting contig.\n", sub_graphs[i].graph_size);
        }
        
    }
	log_progress_bar(100);
	printf("\n");
    log_and_screen_printf("Finished contig output.\n");    
    fclose(fp);
    
    free(sub_graphs);
}