示例#1
0
/*		COMPARE_CHROMA
Perform Dynamic Programming to find optimal alignment
*/
int Scorealign::compare_chroma()
{
    float *path;
    
    /* Allocate the distance matrix */
    path = (float *) calloc(file0_frames * file1_frames, sizeof(float));
    
    /* skip over initial silence in signals */
    if (ignore_silence) {
        first_x = frames_of_init_silence(chrom_energy0, file0_frames);
        last_x = last_non_silent_frame(chrom_energy0, file0_frames);
        first_y = frames_of_init_silence(chrom_energy1, file1_frames);
        last_y = last_non_silent_frame(chrom_energy1, file1_frames);
    } else {
        first_x = 0;
        last_x = file0_frames - 1;
        first_y = 0;
        last_y = file1_frames - 1;
    }

    if (last_x - first_x <= 0 || last_y - first_y <= 0) {
        return SA_TOOSHORT;
    }

    /* Initialize first row and column */
    if (verbose) printf("Performing DP\n"); 
    PATH(first_x, first_y) = gen_dist(first_x, first_y);
    for (int x = first_x + 1; x <= last_x; x++)
        PATH(x, first_y) = gen_dist(x, first_y) + PATH(x - 1, first_y);
    for (int y = 1; y <= last_y; y++)
        PATH(first_x, y) = gen_dist(first_x, y) + PATH(first_x, y - 1);

#if DEBUG_LOG
    fprintf(dbf, "DISTANCE MATRIX ***************************\n");
#endif
    /* Perform DP for the rest of the matrix */
    for (int x = first_x + 1; x <= last_x; x++) {
        for (int y = first_y + 1; y <= last_y; y++) {
            PATH(x, y) = gen_dist(x, y) +
                    float(min3(PATH(x-1, y-1), PATH(x-1, y), PATH(x, y-1))); 
#if DEBUG_LOG
            fprintf(dbf, "(%d %d %g) ", x, y, gen_dist(x, y), PATH(x, y));
#endif
        }
#if DEBUG_LOG
        fprintf(dbf, "\n");
#endif
        // report progress for each file0_frame (column)
        // This is not quite right if we are ignoring silence because
        // then only a sub-matrix is computed.
        if (progress && !progress->set_matrix_progress(file1_frames)) 
            return SA_CANCEL;
    }
#if DEBUG_LOG
    fprintf(dbf, "END OF DISTANCE MATRIX ********************\n");
#endif

    if (verbose) printf("Completed Dynamic Programming.\n");
    
    
    //x and y are the ending points, it can end at either the end of midi, 
    // or end of audio or both
    pathx = ALLOC(short, (file0_frames + file1_frames));
    pathy = ALLOC(short, (file0_frames + file1_frames));
	
    assert(pathx != NULL);
    assert(pathy != NULL);
	 
    // map from file0 time to file1 time
    time_map = ALLOC(float, file0_frames);
    smooth_time_map = ALLOC(float, file0_frames);
	
    int x = last_x;
    int y = last_y;

    if (!force_final_alignment) {
#if DEBUG_LOG
        fprintf(dbf, "\nOptimal Path: ");
#endif
        // find end point, the lowest cost matrix value at one of the
        // sequence endings
        float min_cost = 1.0E10;
        for (int i = first_x; i <= last_x; i++) {
            if (PATH(i, last_y) <= min_cost) {
                min_cost = PATH(i, last_y);
                x = i;
                y = last_y;
            }
        }
        for (int j = first_y; j <= last_y; j++) {
            if (PATH(last_x, j) <= min_cost) {
                min_cost = PATH(last_x, j);
                x = last_x;
                y = j;
            }
        }
#if DEBUG_LOG
        fprintf(dbf, "Min cost at %d %d\n\nPATH:\n", x, y);
#endif
    }

    while ((x != first_x) || (y != first_y)) {
        path_step(x, y);

        /* Check for the optimal path backwards*/
        if (x > first_x && y > first_y && PATH(x-1, y-1) <= PATH(x-1, y) &&
            PATH(x-1, y-1) <= PATH(x, y-1)) {
            x--;
            y--;
        } else if (x > first_x && y > first_y && PATH(x-1, y) <= PATH(x, y-1)) {
            x--;
        } else if (y > first_y) {
            y--;
        } else if (x > first_x) {
            x--;
        }
    }
    path_step(x, y);
    path_reverse();
    free(path);
    return SA_SUCCESS; // success
}
示例#2
0
/*
 * Find subgraphs in the graph and write one consensus contig per
 * sufficiently large subgraph.
 *
 * Pass 1: traverse every node with hash_table_traverse(); for each node
 * not flagged PRUNED or VISITED, grow a subgraph from it, remember the
 * seed node returned by grow_graph_from_node() and write one line to
 * "<consensus_contigs_filename>.analysis".
 * Pass 2: reset the graph flags and, for each seed whose subgraph has at
 * least min_subgraph_kmers nodes, walk forward and reverse from the seed
 * and write the joined path to consensus_contigs_filename as FASTA.
 *
 * graph                      - graph to analyse
 * consensus_contigs_filename - output contig file; also the prefix of
 *                              the analysis file name
 * min_subgraph_kmers         - smaller subgraphs are logged and skipped
 *
 * Calls exit(-1) on allocation failure, when either output file cannot
 * be opened, on a NULL node, or when MAX_SEEDS seeds have been recorded.
 *
 * NOTE(review): path_fwd, path_rev and final_path are not released in
 * this function - confirm whether the Path API provides a destructor
 * that should be called here.
 */
void metacortex_find_subgraphs(dBGraph* graph, char* consensus_contigs_filename, int min_subgraph_kmers)
{
    SubGraphInfo* sub_graphs;
    FILE* fp;
    Path *path_fwd = path_new(MAX_EXPLORE_PATH_LENGTH, graph->kmer_size);
    Path *path_rev = path_new(MAX_EXPLORE_PATH_LENGTH, graph->kmer_size);
    Path *final_path = path_new(MAX_EXPLORE_PATH_LENGTH, graph->kmer_size);
    char seq[256];
    /* room for the name, the 9-character ".analysis" suffix and the NUL */
    char analysis_filename[strlen(consensus_contigs_filename) + 10];
    long int total_nodes = 0;
    int n_seeds = 0;
    int i;
    
    sprintf(analysis_filename, "%s.analysis", consensus_contigs_filename);
    log_and_screen_printf("Running metacortex subgraph analysis...\n");
    log_and_screen_printf("          Contig file: %s\n", consensus_contigs_filename);
    log_and_screen_printf("        Analysis file: %s\n", analysis_filename);
    log_and_screen_printf("Minimum subgraph size: %i\n", min_subgraph_kmers);
    
    /* Initialise temporary path array buffers */
    path_array_initialise_buffers(graph->kmer_size);
    
    /* Create a list of subgraphs */
    /* The size expression has type size_t, so cast it to match the %d conversion. */
    log_and_screen_printf("Allocating %d Mb to store subgraph information (max %d seeds)...\n", (int) (((MAX_SEEDS * sizeof(SubGraphInfo)) / 1024) / 1024), MAX_SEEDS);
    sub_graphs = calloc(MAX_SEEDS, sizeof(SubGraphInfo));
    if (!sub_graphs) {
        log_and_screen_printf("ERROR: Can't get memory for subgraphs\n");
        exit(-1);
    }

    /* Open the analysis file */
    fp = fopen(analysis_filename, "w");
    if (!fp) {
        log_and_screen_printf("ERROR: Can't open analysis file.\n");
        exit(-1);
    }
        
    /* For each node, if it's not pruned or visited, try and grow a graph.
     * This is a nested function (GNU C extension) so that it can update
     * sub_graphs, n_seeds, total_nodes, seq and fp of the enclosing call. */
    void explore_node(dBNode * node) {
        if (node == NULL) {
            log_and_screen_printf("Error: NULL node passed to explore_node.\n");
            exit(-1);
        }
        
        if (db_node_check_for_any_flag(node, PRUNED | VISITED) == false) {
            int nodes_in_graph;
            
            /* Grow graph from this node, returning the 'best' (highest coverage) node to store as seed point */
            nodes_in_graph = grow_graph_from_node(node, &(sub_graphs[n_seeds].seed_node), graph);
            total_nodes += nodes_in_graph;
            
            if (sub_graphs[n_seeds].seed_node == NULL) {
                printf("ERROR: Seed node is NULL, nodes in graph is %d\n", nodes_in_graph);
            } else {
                /* Write data to analysis file: seed index, subgraph size,
                 * running node total, starting kmer, then seed kmer */
                binary_kmer_to_seq(&(node->kmer), graph->kmer_size, seq);            
                fprintf(fp, "%i\t%i\t%ld\t%s\t", n_seeds, nodes_in_graph, total_nodes, seq);
                binary_kmer_to_seq(&(sub_graphs[n_seeds].seed_node->kmer), graph->kmer_size, seq);
                fprintf(fp, "%s\n", seq);

                /* Store nodes in this subgraph */
                sub_graphs[n_seeds].graph_size = nodes_in_graph;
                n_seeds++;
                
                /* Check we've not run out of seed storage - in future, this should dynamically allocate */
                if (n_seeds == MAX_SEEDS) {
                    log_and_screen_printf("Error: MAX_SEEDS exceeded. Quitting.\n");
                    exit(-1);
                }
            }
        }
    }
    
    /* Traverse each node... */
    log_and_screen_printf("Finding subgraphs...\n");
    hash_table_traverse(&explore_node, graph);
    log_and_screen_printf("Finished. Total: %ld\n", total_nodes);
    fclose(fp);    
    
    /* Open consensus contigs file */
    fp = fopen(consensus_contigs_filename, "w");
    if (!fp) {
        log_and_screen_printf("ERROR: Can't open contig file.\n");
        exit(-1);
    }
    
    /* Now go through all the seed points and generate the consensus contigs by walking forward and backward from the seed */
    db_graph_reset_flags(graph);    
    log_and_screen_printf("Outputting contigs...\n");
    log_progress_bar(0);
    long long one_percent = n_seeds/100;
    int percent;
    
    /* Avoid a modulo by zero below when there are fewer than 100 seeds */
    if (one_percent < 1) {
        one_percent = 1;
    }
    
    for (i=0; i<n_seeds; i++) {
        if (i % one_percent == 0) {
            /* multiply in long long: 100 * i can overflow int for large seed counts */
            percent = (int) ((100LL * i) / n_seeds);
            log_progress_bar(percent);
        } 
        
        //log_printf("Graph %i\n", i);           
        if (sub_graphs[i].graph_size >= min_subgraph_kmers) {            
            binary_kmer_to_seq(&(sub_graphs[i].seed_node->kmer), graph->kmer_size, seq);
            coverage_walk_get_path(sub_graphs[i].seed_node, forward, NULL, graph, path_fwd);
            coverage_walk_get_path(sub_graphs[i].seed_node, reverse, NULL, graph, path_rev);
            /* Join the two walks: reversed forward path followed by the reverse path */
            path_reverse(path_fwd, final_path);
            path_append(final_path, path_rev);
            final_path->id = i;
            path_to_fasta(final_path, fp);
            //log_printf("  Seed %s\tFwd path length %i\tRev path length %i\tFinal path length %i\n", seq, path_fwd->length, path_rev->length, final_path->length);
            path_reset(path_fwd);
            /* NOTE(review): the result of this walk is only referenced by the
             * commented-out log line below - confirm whether it is still needed. */
            perfect_path_get_path(sub_graphs[i].seed_node, forward, &db_node_action_do_nothing, graph, path_fwd);
            //log_printf("\t\tPerfect path fwd length %i\n", path_fwd->length);
            path_reset(path_rev);
            path_reset(final_path);
        } else {
            log_printf("  Number of nodes (%i) too small. Not outputting contig.\n", sub_graphs[i].graph_size);
        }
        
    }
    log_progress_bar(100);
    printf("\n");
    log_and_screen_printf("Finished contig output.\n");    
    fclose(fp);
    
    free(sub_graphs);
}