/*
 * Work out in which orientation kmer `k` is stored in node `e`.
 *
 * k         - kmer to compare against the node's kmer
 * e         - node whose stored kmer is inspected
 * kmer_size - kmer length, forwarded to the reverse-complement and
 *             kmer-to-string helpers
 *
 * Returns `forward` when `k` equals the node's kmer and `reverse` when the
 * reverse complement of `k` equals it. If neither matches, a diagnostic is
 * printed and the process is terminated with exit(1).
 */
Orientation db_node_get_orientation(BinaryKmer * k, Element * e, short kmer_size)
{
  if (binary_kmer_comparison_operator(e->kmer, *k) == true) {
    return forward;
  }

  BinaryKmer tmp_kmer;

  if (binary_kmer_comparison_operator(e->kmer,
                                      *(binary_kmer_reverse_complement(k, kmer_size, &tmp_kmer))) == true) {
    return reverse;
  }

  printf("programming error - you have called  db_node_get_orientation with a kmer that is neither equal to the kmer in this node, nor its rev comp\n");

  /* The sequences are printed with %s, so each buffer needs room for
     kmer_size characters plus the terminating NUL. */
  char tmpseq1[kmer_size + 1];
  char tmpseq2[kmer_size + 1];

  printf("Arg 1 Kmer is %s and Arg 2 node kmer is %s\n",
         binary_kmer_to_seq(k, kmer_size, tmpseq1),
         binary_kmer_to_seq(&(e->kmer), kmer_size, tmpseq2));
  exit(1);
}
dBNode * db_graph_get_next_node(dBNode * current_node, Orientation current_orientation, Orientation * next_orientation, Nucleotide edge, Nucleotide * reverse_edge,dBGraph * db_graph){ BinaryKmer local_copy_of_kmer; binary_kmer_assignment_operator(local_copy_of_kmer, current_node->kmer); BinaryKmer tmp_kmer; dBNode * next_node=NULL; // after the following line tmp_kmer and rev_kmer are pointing to the same B Kmer BinaryKmer* rev_kmer = binary_kmer_reverse_complement(&local_copy_of_kmer,db_graph->kmer_size, &tmp_kmer); if (current_orientation == reverse){ *reverse_edge = binary_kmer_get_last_nucleotide(&local_copy_of_kmer); binary_kmer_assignment_operator(local_copy_of_kmer,*rev_kmer); } else{ *reverse_edge = binary_kmer_get_last_nucleotide(rev_kmer); } binary_kmer_left_shift_one_base_and_insert_new_base_at_right_end(&local_copy_of_kmer, edge, db_graph->kmer_size); //get node from table next_node = hash_table_find(element_get_key(&local_copy_of_kmer,db_graph->kmer_size, &tmp_kmer),db_graph); if (next_node != NULL){ *next_orientation = db_node_get_orientation(&local_copy_of_kmer,next_node,db_graph->kmer_size); } else { // debug char tmpzamseq[db_graph->kmer_size+1]; warn("Cannot find %s so get a NULL node\n", binary_kmer_to_seq(&tmp_kmer, db_graph->kmer_size, tmpzamseq)); } return next_node; }
/*
 * Find the subgraphs of `graph`, write a per-subgraph summary to
 * "<consensus_contigs_filename>.analysis", then write one consensus contig
 * per sufficiently large subgraph to consensus_contigs_filename.
 *
 * graph                      - graph to analyse; node flags are reset
 *                              between the analysis and the contig output
 * consensus_contigs_filename - output file for contigs; also the stem of
 *                              the analysis file name
 * min_subgraph_kmers         - subgraphs with fewer nodes than this are
 *                              logged and skipped
 *
 * Any allocation or file-open failure, a NULL node handed to the traversal
 * callback, or reaching MAX_SEEDS subgraphs terminates the process with
 * exit(-1).
 */
void metacortex_find_subgraphs(dBGraph* graph, char* consensus_contigs_filename, int min_subgraph_kmers)
{
    SubGraphInfo* sub_graphs;
    FILE* fp;
    Path *path_fwd = path_new(MAX_EXPLORE_PATH_LENGTH, graph->kmer_size);
    Path *path_rev = path_new(MAX_EXPLORE_PATH_LENGTH, graph->kmer_size);
    Path *final_path = path_new(MAX_EXPLORE_PATH_LENGTH, graph->kmer_size);
    char seq[256];
    /* ".analysis" is 9 characters, plus 1 for the terminating NUL */
    char analysis_filename[strlen(consensus_contigs_filename) + 10];
    long int total_nodes = 0;
    int n_seeds = 0;
    int i;

    snprintf(analysis_filename, sizeof(analysis_filename), "%s.analysis", consensus_contigs_filename);

    log_and_screen_printf("Running metacortex subgraph analysis...\n");
    log_and_screen_printf(" Contig file: %s\n", consensus_contigs_filename);
    log_and_screen_printf(" Analysis file: %s\n", analysis_filename);
    log_and_screen_printf("Minimum subgraph size: %i\n", min_subgraph_kmers);

    /* Initialise temporary path array buffers */
    path_array_initialise_buffers(graph->kmer_size);

    /* Create a list of subgraphs.
       The size expression has type size_t, so it is cast to match %d. */
    log_and_screen_printf("Allocating %d Mb to store subgraph information (max %d seeds)...\n",
                          (int)(((MAX_SEEDS * sizeof(SubGraphInfo)) / 1024) / 1024), MAX_SEEDS);
    sub_graphs = calloc(MAX_SEEDS, sizeof(SubGraphInfo));
    if (!sub_graphs) {
        log_and_screen_printf("ERROR: Can't get memory for subgraphs\n");
        exit(-1);
    }

    /* Open the analysis file */
    fp = fopen(analysis_filename, "w");
    if (!fp) {
        log_and_screen_printf("ERROR: Can't open analysis file.\n");
        exit(-1);
    }

    /* For each node, if it's not pruned or visited, try and grow a graph.
       This is a nested function (a GCC extension) so that it can update
       sub_graphs, n_seeds and total_nodes and write to fp directly. */
    void explore_node(dBNode * node) {
        if (node == NULL) {
            log_and_screen_printf("Error: NULL node passed to explore_node.\n");
            exit(-1);
        }

        if (db_node_check_for_any_flag(node, PRUNED | VISITED) == false) {
            int nodes_in_graph;

            /* Grow graph from this node, returning the 'best' (highest coverage) node to store as seed point */
            nodes_in_graph = grow_graph_from_node(node, &(sub_graphs[n_seeds].seed_node), graph);
            total_nodes += nodes_in_graph;

            if (sub_graphs[n_seeds].seed_node == NULL) {
                printf("ERROR: Seed node is NULL, nodes in graph is %d\n", nodes_in_graph);
            } else {
                /* Write data to analysis file: index, size, running total,
                   starting kmer, then the seed kmer */
                binary_kmer_to_seq(&(node->kmer), graph->kmer_size, seq);
                fprintf(fp, "%i\t%i\t%ld\t%s\t", n_seeds, nodes_in_graph, total_nodes, seq);
                binary_kmer_to_seq(&(sub_graphs[n_seeds].seed_node->kmer), graph->kmer_size, seq);
                fprintf(fp, "%s\n", seq);

                /* Store nodes in this subgraph */
                sub_graphs[n_seeds].graph_size = nodes_in_graph;
                n_seeds++;

                /* Check we've not run out of seed storage - in future, this should dynamically allocate */
                if (n_seeds == MAX_SEEDS) {
                    log_and_screen_printf("Error: MAX_SEEDS exceeded. Quitting.\n");
                    exit(-1);
                }
            }
        }
    }

    /* Traverse each node... */
    log_and_screen_printf("Finding subgraphs...\n");
    hash_table_traverse(&explore_node, graph);
    log_and_screen_printf("Finished. Total: %ld\n", total_nodes);
    fclose(fp);

    /* Open consensus contigs file */
    fp = fopen(consensus_contigs_filename, "w");
    if (!fp) {
        log_and_screen_printf("ERROR: Can't open contig file.\n");
        exit(-1);
    }

    /* Now go through all the seed points and generate the consensus contigs by walking forward and backward from the seed */
    db_graph_reset_flags(graph);

    log_and_screen_printf("Outputting contigs...\n");
    log_progress_bar(0);

    /* Update the progress bar roughly once per percent; never use a step
       of zero, which would make the modulo below divide by zero. */
    long long one_percent = n_seeds/100;
    int percent;
    if (one_percent < 1) {
        one_percent = 1;
    }

    for (i=0; i<n_seeds; i++) {
        if (i % one_percent == 0) {
            percent = (100 * i) / n_seeds;
            log_progress_bar(percent);
        }

        if (sub_graphs[i].graph_size >= min_subgraph_kmers) {
            binary_kmer_to_seq(&(sub_graphs[i].seed_node->kmer), graph->kmer_size, seq);
            coverage_walk_get_path(sub_graphs[i].seed_node, forward, NULL, graph, path_fwd);
            coverage_walk_get_path(sub_graphs[i].seed_node, reverse, NULL, graph, path_rev);

            /* Reverse the forward walk into final_path, then append the reverse walk */
            path_reverse(path_fwd, final_path);
            path_append(final_path, path_rev);
            final_path->id = i;
            path_to_fasta(final_path, fp);

            path_reset(path_fwd);
            /* NOTE(review): the result of this walk is not used here, and
               path_fwd is not reset again before the next iteration's
               coverage walk - confirm whether this call is still needed. */
            perfect_path_get_path(sub_graphs[i].seed_node, forward, &db_node_action_do_nothing, graph, path_fwd);

            path_reset(path_rev);
            path_reset(final_path);
        } else {
            log_printf(" Number of nodes (%i) too small. Not outputting contig.\n", sub_graphs[i].graph_size);
        }
    }
    log_progress_bar(100);
    printf("\n");
    log_and_screen_printf("Finished contig output.\n");

    fclose(fp);
    free(sub_graphs);
    /* NOTE(review): path_fwd, path_rev and final_path are never released
       in this function - confirm whether the Path API provides a
       destructor that should be called here. */
}
int main(int argc, char** argv) { char* filepath; if(argc < 2) { print_usage(); } else if(argc > 2) { print_info = 0; print_kmers = 0; parse_kmers = 0; int i; for(i = 1; i < argc-1; i++) { if(strcasecmp(argv[i], "--print_info") == 0) { print_info = 1; } else if(strcasecmp(argv[i], "--print_kmers") == 0) { print_kmers = 1; } else if(strcasecmp(argv[i], "--parse_kmers") == 0) { print_info = 1; parse_kmers = 1; } else print_usage(); } } filepath = argv[argc-1]; if(print_info) printf("Loading file: %s\n", filepath); file_size = get_file_size(filepath); FILE* fh = fopen(filepath, "r"); if(fh == NULL) { report_error("cannot open file '%s'\n", filepath); exit(EXIT_FAILURE); } if(file_size != -1 && print_info) { char str[31]; bytes_to_str(file_size, 0, str); printf("File size: %s\n", str); } buffer = buffer_new(BUFFER_SIZE); /* // Check sizes printf("-- Datatypes --\n"); printf("int: %i\n", (int)sizeof(int)); printf("long: %i\n", (int)sizeof(long)); printf("long long: %i\n", (int)sizeof(long long)); printf("double: %i\n", (int)sizeof(double)); printf("long double: %i\n", (int)sizeof(long double)); */ if(print_info) printf("----\n"); unsigned int i; // Read magic word at the start of header char magic_word[7]; magic_word[6] = '\0'; my_fread(fh, magic_word, strlen("CORTEX"), "Magic word"); if(strcmp(magic_word, "CORTEX") != 0) { fprintf(stderr, "Magic word doesn't match 'CORTEX' (start)\n"); exit(EXIT_FAILURE); } // Read version number my_fread(fh, &version, sizeof(uint32_t), "binary version"); my_fread(fh, &kmer_size, sizeof(uint32_t), "kmer size"); my_fread(fh, &num_of_bitfields, sizeof(uint32_t), "number of bitfields"); my_fread(fh, &num_of_colours, sizeof(uint32_t), "number of colours"); if(print_info) { printf("binary version: %i\n", (int)version); printf("kmer size: %i\n", (int)kmer_size); printf("bitfields: %i\n", (int)num_of_bitfields); printf("colours: %i\n", (int)num_of_colours); } if(version >= 7) { my_fread(fh, &expected_num_of_kmers, sizeof(uint64_t), "number of 
kmers"); my_fread(fh, &num_of_shades, sizeof(uint32_t), "number of shades"); if(print_info) { char tmp[256]; printf("kmers: %s\n", ulong_to_str(expected_num_of_kmers,tmp)); printf("shades: %i\n", (int)num_of_shades); } } // Checks if(version > 7 || version < 4) report_error("Sorry, we only support binary versions 4, 5, 6 & 7\n"); if(kmer_size % 2 == 0) report_error("kmer size is not an odd number\n"); if(kmer_size < 3) report_error("kmer size is less than three\n"); if(num_of_bitfields * 32 < kmer_size) report_error("Not enough bitfields for kmer size\n"); if((num_of_bitfields-1)*32 >= kmer_size) report_error("using more than the minimum number of bitfields\n"); if(num_of_colours == 0) report_error("number of colours is zero\n"); if(num_of_shades != 0 && (num_of_shades & (num_of_shades-1))) report_error("number of shades is not a power of 2\n"); // // Read array of mean read lengths per colour uint32_t *mean_read_lens_per_colour = malloc(num_of_colours*sizeof(uint32_t)); my_fread(fh, mean_read_lens_per_colour, sizeof(uint32_t) * num_of_colours, "mean read length for each colour"); // Read array of total seq loaded per colour uint64_t *total_seq_loaded_per_colour = malloc(num_of_colours*sizeof(uint64_t)); my_fread(fh, total_seq_loaded_per_colour, sizeof(uint64_t) * num_of_colours, "total sequance loaded for each colour"); for(i = 0; i < num_of_colours; i++) { sum_of_seq_loaded += total_seq_loaded_per_colour[i]; } if(version >= 6) { sample_names = malloc(sizeof(char*) * num_of_colours); for(i = 0; i < num_of_colours; i++) { uint32_t str_length; my_fread(fh, &str_length, sizeof(uint32_t), "sample name length"); if(str_length == 0) { sample_names[i] = NULL; } else { sample_names[i] = (char*)malloc((str_length+1) * sizeof(char)); my_fread(fh, sample_names[i], str_length, "sample name"); sample_names[i][str_length] = '\0'; // Check sample length is as long as we were told size_t sample_name_len = strlen(sample_names[i]); if(sample_name_len != str_length) { // Premature 
\0 in string report_warning("Sample %i name has length %lu but is only %lu chars " "long (premature '\\0')\n", i, str_length, sample_name_len); } } } seq_error_rates = malloc(sizeof(long double) * num_of_colours); my_fread(fh, seq_error_rates, sizeof(long double) * num_of_colours, "seq error rates"); cleaning_infos = malloc(sizeof(CleaningInfo) * num_of_colours); for(i = 0; i < num_of_colours; i++) { my_fread(fh, &(cleaning_infos[i].tip_cleaning), 1, "tip cleaning"); my_fread(fh, &(cleaning_infos[i].remove_low_covg_supernodes), 1, "remove low covg supernodes"); my_fread(fh, &(cleaning_infos[i].remove_low_covg_kmers), 1, "remove low covg kmers"); my_fread(fh, &(cleaning_infos[i].cleaned_against_graph), 1, "cleaned against graph"); my_fread(fh, &(cleaning_infos[i].remove_low_covg_supernodes_thresh), sizeof(int32_t), "remove low covg supernode threshold"); my_fread(fh, &(cleaning_infos[i].remove_low_covg_kmers_thresh), sizeof(int32_t), "remove low covg kmer threshold"); if(version > 6) { if(cleaning_infos[i].remove_low_covg_supernodes_thresh < 0) { report_warning("Binary header gives sample %i a cleaning threshold of " "%i for supernodes (should be >= 0)\n", i, cleaning_infos[i].remove_low_covg_supernodes_thresh); } if(cleaning_infos[i].remove_low_covg_kmers_thresh < 0) { report_warning("Binary header gives sample %i a cleaning threshold of " "%i for kmers (should be >= 0)\n", i, cleaning_infos[i].remove_low_covg_kmers_thresh); } } if(!cleaning_infos[i].remove_low_covg_supernodes && cleaning_infos[i].remove_low_covg_supernodes_thresh > 0) { report_warning("Binary header gives sample %i a cleaning threshold of " "%i for supernodes when no cleaning was performed\n", i, cleaning_infos[i].remove_low_covg_supernodes_thresh); } if(!cleaning_infos[i].remove_low_covg_kmers && cleaning_infos[i].remove_low_covg_kmers_thresh > 0) { report_warning("Binary header gives sample %i a cleaning threshold of " "%i for kmers when no cleaning was performed\n", i, 
cleaning_infos[i].remove_low_covg_kmers_thresh); } uint32_t name_length; my_fread(fh, &name_length, sizeof(uint32_t), "graph name length"); if(name_length == 0) { cleaning_infos[i].name_of_graph_clean_against = NULL; } else { cleaning_infos[i].name_of_graph_clean_against = (char*)malloc((name_length + 1) * sizeof(char)); my_fread(fh, cleaning_infos[i].name_of_graph_clean_against, name_length, "graph name length"); cleaning_infos[i].name_of_graph_clean_against[name_length] = '\0'; // Check sample length is as long as we were told size_t cleaned_name_len = strlen(cleaning_infos[i].name_of_graph_clean_against); if(cleaned_name_len != name_length) { // Premature \0 in string report_warning("Sample [%i] cleaned-against-name has length %u but is " "only %u chars long (premature '\\0')\n", i, name_length, cleaned_name_len); } } } } // Print colour info if(print_info) { for(i = 0; i < num_of_colours; i++) { printf("-- Colour %i --\n", i); if(version >= 6) { // Version 6 only output printf(" sample name: '%s'\n", sample_names[i]); } char tmp[32]; printf(" mean read length: %u\n", (unsigned int)mean_read_lens_per_colour[i]); printf(" total sequence loaded: %s\n", ulong_to_str(total_seq_loaded_per_colour[i], tmp)); if(version >= 6) { // Version 6 only output printf(" sequence error rate: %Lf\n", seq_error_rates[i]); printf(" tip clipping: %s\n", (cleaning_infos[i].tip_cleaning == 0 ? "no" : "yes")); printf(" remove low coverage supernodes: %s [threshold: %i]\n", cleaning_infos[i].remove_low_covg_supernodes ? "yes" : "no", cleaning_infos[i].remove_low_covg_supernodes_thresh); printf(" remove low coverage kmers: %s [threshold: %i]\n", cleaning_infos[i].remove_low_covg_kmers ? "yes" : "no", cleaning_infos[i].remove_low_covg_kmers_thresh); printf(" cleaned against graph: %s [against: '%s']\n", cleaning_infos[i].cleaned_against_graph ? "yes" : "no", (cleaning_infos[i].name_of_graph_clean_against == NULL ? 
"" : cleaning_infos[i].name_of_graph_clean_against)); } } printf("--\n"); } // Read magic word at the end of header my_fread(fh, magic_word, strlen("CORTEX"), "magic word (end)"); if(strcmp(magic_word, "CORTEX") != 0) { report_error("magic word doesn't match 'CORTEX' (end): '%s'\n", magic_word); exit(EXIT_FAILURE); } // Calculate number of kmers if(version < 7 && file_size != -1) { size_t bytes_remaining = file_size - num_bytes_read; size_t num_bytes_per_kmer = sizeof(uint64_t) * num_of_bitfields + sizeof(uint32_t) * num_of_colours + sizeof(uint8_t) * num_of_colours; expected_num_of_kmers = bytes_remaining / num_bytes_per_kmer; size_t excess = bytes_remaining - (expected_num_of_kmers * num_bytes_per_kmer); if(excess > 0) { report_error("Excess bytes. Bytes:\n file size: %lu;\n for kmers: %lu;" "\n num kmers: %lu;\n per kmer: %lu;\n excess: %lu\n", file_size, bytes_remaining, expected_num_of_kmers, num_bytes_per_kmer, excess); } } if(print_info) { char num_str[50]; printf("Expected number of kmers: %s\n", ulong_to_str(expected_num_of_kmers, num_str)); printf("----\n"); } // Finished parsing header if(!parse_kmers && !print_kmers) { print_kmer_stats(); fclose(fh); exit(EXIT_SUCCESS); } shade_bytes = num_of_shades >> 3; size_t shade_array_bytes = shade_bytes * num_of_colours; // Kmer data uint64_t* kmer = malloc(sizeof(uint64_t) * num_of_bitfields); uint32_t* covgs = malloc(sizeof(uint32_t) * num_of_colours); uint8_t* edges = malloc(sizeof(uint8_t) * num_of_colours); uint8_t* shade_data = malloc(shade_array_bytes); uint8_t* shend_data = malloc(shade_array_bytes); if(kmer == NULL || covgs == NULL || edges == NULL || shade_data == NULL || shend_data == NULL) { report_error("Out of memory"); exit(EXIT_SUCCESS); } // Convert values to strings char* seq = malloc(sizeof(char) * kmer_size); char kmer_colour_edge_str[9]; // Check top word of each kmer int bits_in_top_word = 2 * (kmer_size % 32); uint64_t top_word_mask = (~(uint64_t)0) << bits_in_top_word; size_t 
num_bytes_per_bkmer = sizeof(uint64_t)*num_of_bitfields; // Read kmer in bytes so we can see if there are extra bytes at the end of // the file size_t bytes_read; // while((bytes_read = fread(kmer, 1, num_bytes_per_bkmer, fh)) > 0) while((bytes_read = fread_buf(fh, kmer, num_bytes_per_bkmer, buffer)) > 0) { if(bytes_read != num_bytes_per_bkmer) { report_error("unusual extra bytes [%i] at the end of the file\n", (int)bytes_read); break; } num_bytes_read += bytes_read; my_fread(fh, covgs, sizeof(uint32_t) * num_of_colours, "kmer covg"); my_fread(fh, edges, sizeof(uint8_t) * num_of_colours, "kmer edges"); if(version >= 7) { uint8_t *shades = shade_data, *shends = shend_data; for(i = 0; i < num_of_colours; i++) { my_fread(fh, shades, sizeof(uint8_t) * shade_bytes, "shades"); my_fread(fh, shends, sizeof(uint8_t) * shade_bytes, "shade ends"); shades += shade_bytes; shends += shade_bytes; } } // // Kmer checks // // Check top bits of kmer if(kmer[0] & top_word_mask) { if(num_of_oversized_kmers == 0) { report_error("oversized kmer [index: %lu]\n", num_of_kmers_read); for(i = 0; i < num_of_bitfields; i++) { fprintf(stderr, " word %i: ", i); print_binary(stderr, kmer[i]); fprintf(stderr, "\n"); } } num_of_oversized_kmers++; } // Check for all-zeros (i.e. all As kmer: AAAAAA) uint64_t kmer_words_or = 0; for(i = 0; i < num_of_bitfields; i++) kmer_words_or |= kmer[i]; if(kmer_words_or == 0) { if(num_of_all_zero_kmers == 1) { report_error("more than one all 'A's kmers seen [index: %lu]\n", num_of_kmers_read); } num_of_all_zero_kmers++; } // Check covg is 0 for all colours for(i = 0; i < num_of_colours && covgs[i] == 0; i++); if(i == num_of_colours) { if(num_of_zero_covg_kmers == 0) { report_warning("a kmer has zero coverage in all colours [index: %lu]\n", num_of_kmers_read); } num_of_zero_covg_kmers++; } // Print? 
if(print_kmers) { binary_kmer_to_seq(kmer, seq, kmer_size, num_of_bitfields); printf("%s", seq); // Print coverages for(i = 0; i < num_of_colours; i++) printf(" %li", (unsigned long)covgs[i]); // Print edges for(i = 0; i < num_of_colours; i++) printf(" %s", get_edges_str(edges[i], kmer_colour_edge_str)); if(version >= 7 && num_of_shades > 0) { for(i = 0; i < num_of_colours; i++) { putc(' ', stdout); print_colour_shades(shade_data + i*shade_bytes, shend_data + i*shade_bytes); } } putc('\n', stdout); } num_of_kmers_read++; for(i = 0; i < num_of_colours; i++) sum_of_covgs_read += covgs[i]; } if(num_of_kmers_read != expected_num_of_kmers) { report_error("Expected %lu kmers, read %lu\n", expected_num_of_kmers, num_of_kmers_read); } if(print_kmers && print_info) printf("----\n"); // check for various reading errors if(errno != 0) { report_error("errno set [%i]\n", (int)errno); } int err; if((err = ferror(fh)) != 0) { report_error("occurred after file reading [%i]\n", err); } // For testing output //num_of_bitfields = 2; //num_of_kmers_read = 3600000000; //num_of_kmers_read = 12345; //num_of_kmers_read = 3581787; //num_of_kmers_read = 0; print_kmer_stats(); fclose(fh); free(kmer); free(covgs); free(edges); free(shade_data); free(shend_data); buffer_free(buffer); if((print_kmers || parse_kmers) && print_info) { printf("----\n"); if(num_warnings > 0 || num_errors > 0) printf("Warnings: %u; Errors: %u\n", num_warnings, num_errors); if(num_errors == 0) printf(num_warnings ? "Binary may be ok\n" : "Binary is valid\n"); } exit(EXIT_SUCCESS); }