static void my_fread(FILE *fh, void *ptr, int size, const char* entry_name) { int read = fread_buf(fh, ptr, size, buffer); // int read = fread(ptr, 1, size, fh); if(read != size) { report_error("Couldn't read '%s': expected %li; recieved: %li; (fatal)\n", entry_name, (long)size, (long)read); if(print_kmers) printf("----\n"); print_kmer_stats(); exit(EXIT_FAILURE); } num_bytes_read += read; }
int main(int argc, char **argv) { setlocale (LC_ALL, ""); log_and_screen_printf("\nkmer_contamination.\n\n"); log_and_screen_printf(SVN_VERSION); log_and_screen_printf(SVN_COMMIT_DATE); log_and_screen_printf("Compiled on %s at %s \n\n", __DATE__, __TIME__); KmerStatsCmdLine cmd_line = parse_cmdline(argc, argv, sizeof(Element)); //log_and_screen_printf("Parsed options\n"); KmerHash * kmer_hash = load_kmer_table(cmd_line); log_and_screen_printf("Kmers readed\n"); load_reads_coverage_table(cmd_line, kmer_hash); print_kmer_stats(&cmd_line, kmer_hash); print_contaminated_kmers_histogram(&cmd_line, kmer_hash); log_and_screen_printf("\nDONE"); return 0; }
int main(int argc, char** argv) { char* filepath; if(argc < 2) { print_usage(); } else if(argc > 2) { print_info = 0; print_kmers = 0; parse_kmers = 0; int i; for(i = 1; i < argc-1; i++) { if(strcasecmp(argv[i], "--print_info") == 0) { print_info = 1; } else if(strcasecmp(argv[i], "--print_kmers") == 0) { print_kmers = 1; } else if(strcasecmp(argv[i], "--parse_kmers") == 0) { print_info = 1; parse_kmers = 1; } else print_usage(); } } filepath = argv[argc-1]; if(print_info) printf("Loading file: %s\n", filepath); file_size = get_file_size(filepath); FILE* fh = fopen(filepath, "r"); if(fh == NULL) { report_error("cannot open file '%s'\n", filepath); exit(EXIT_FAILURE); } if(file_size != -1 && print_info) { char str[31]; bytes_to_str(file_size, 0, str); printf("File size: %s\n", str); } buffer = buffer_new(BUFFER_SIZE); /* // Check sizes printf("-- Datatypes --\n"); printf("int: %i\n", (int)sizeof(int)); printf("long: %i\n", (int)sizeof(long)); printf("long long: %i\n", (int)sizeof(long long)); printf("double: %i\n", (int)sizeof(double)); printf("long double: %i\n", (int)sizeof(long double)); */ if(print_info) printf("----\n"); unsigned int i; // Read magic word at the start of header char magic_word[7]; magic_word[6] = '\0'; my_fread(fh, magic_word, strlen("CORTEX"), "Magic word"); if(strcmp(magic_word, "CORTEX") != 0) { fprintf(stderr, "Magic word doesn't match 'CORTEX' (start)\n"); exit(EXIT_FAILURE); } // Read version number my_fread(fh, &version, sizeof(uint32_t), "binary version"); my_fread(fh, &kmer_size, sizeof(uint32_t), "kmer size"); my_fread(fh, &num_of_bitfields, sizeof(uint32_t), "number of bitfields"); my_fread(fh, &num_of_colours, sizeof(uint32_t), "number of colours"); if(print_info) { printf("binary version: %i\n", (int)version); printf("kmer size: %i\n", (int)kmer_size); printf("bitfields: %i\n", (int)num_of_bitfields); printf("colours: %i\n", (int)num_of_colours); } if(version >= 7) { my_fread(fh, &expected_num_of_kmers, sizeof(uint64_t), "number of kmers"); my_fread(fh, &num_of_shades, sizeof(uint32_t), "number of shades"); if(print_info) { char tmp[256]; printf("kmers: %s\n", ulong_to_str(expected_num_of_kmers,tmp)); printf("shades: %i\n", (int)num_of_shades); } } // Checks if(version > 7 || version < 4) report_error("Sorry, we only support binary versions 4, 5, 6 & 7\n"); if(kmer_size % 2 == 0) report_error("kmer size is not an odd number\n"); if(kmer_size < 3) report_error("kmer size is less than three\n"); if(num_of_bitfields * 32 < kmer_size) report_error("Not enough bitfields for kmer size\n"); if((num_of_bitfields-1)*32 >= kmer_size) report_error("using more than the minimum number of bitfields\n"); if(num_of_colours == 0) report_error("number of colours is zero\n"); if(num_of_shades != 0 && (num_of_shades & (num_of_shades-1))) report_error("number of shades is not a power of 2\n"); // // Read array of mean read lengths per colour uint32_t *mean_read_lens_per_colour = malloc(num_of_colours*sizeof(uint32_t)); my_fread(fh, mean_read_lens_per_colour, sizeof(uint32_t) * num_of_colours, "mean read length for each colour"); // Read array of total seq loaded per colour uint64_t *total_seq_loaded_per_colour = malloc(num_of_colours*sizeof(uint64_t)); my_fread(fh, total_seq_loaded_per_colour, sizeof(uint64_t) * num_of_colours, "total sequance loaded for each colour"); for(i = 0; i < num_of_colours; i++) { sum_of_seq_loaded += total_seq_loaded_per_colour[i]; } if(version >= 6) { sample_names = malloc(sizeof(char*) * num_of_colours); for(i = 0; i < num_of_colours; i++) { uint32_t str_length; my_fread(fh, &str_length, sizeof(uint32_t), "sample name length"); if(str_length == 0) { sample_names[i] = NULL; } else { sample_names[i] = (char*)malloc((str_length+1) * sizeof(char)); my_fread(fh, sample_names[i], str_length, "sample name"); sample_names[i][str_length] = '\0'; // Check sample length is as long as we were told size_t sample_name_len = strlen(sample_names[i]); if(sample_name_len != str_length) { // Premature \0 in string report_warning("Sample %i name has length %lu but is only %lu chars " "long (premature '\\0')\n", i, str_length, sample_name_len); } } } seq_error_rates = malloc(sizeof(long double) * num_of_colours); my_fread(fh, seq_error_rates, sizeof(long double) * num_of_colours, "seq error rates"); cleaning_infos = malloc(sizeof(CleaningInfo) * num_of_colours); for(i = 0; i < num_of_colours; i++) { my_fread(fh, &(cleaning_infos[i].tip_cleaning), 1, "tip cleaning"); my_fread(fh, &(cleaning_infos[i].remove_low_covg_supernodes), 1, "remove low covg supernodes"); my_fread(fh, &(cleaning_infos[i].remove_low_covg_kmers), 1, "remove low covg kmers"); my_fread(fh, &(cleaning_infos[i].cleaned_against_graph), 1, "cleaned against graph"); my_fread(fh, &(cleaning_infos[i].remove_low_covg_supernodes_thresh), sizeof(int32_t), "remove low covg supernode threshold"); my_fread(fh, &(cleaning_infos[i].remove_low_covg_kmers_thresh), sizeof(int32_t), "remove low covg kmer threshold"); if(version > 6) { if(cleaning_infos[i].remove_low_covg_supernodes_thresh < 0) { report_warning("Binary header gives sample %i a cleaning threshold of " "%i for supernodes (should be >= 0)\n", i, cleaning_infos[i].remove_low_covg_supernodes_thresh); } if(cleaning_infos[i].remove_low_covg_kmers_thresh < 0) { report_warning("Binary header gives sample %i a cleaning threshold of " "%i for kmers (should be >= 0)\n", i, cleaning_infos[i].remove_low_covg_kmers_thresh); } } if(!cleaning_infos[i].remove_low_covg_supernodes && cleaning_infos[i].remove_low_covg_supernodes_thresh > 0) { report_warning("Binary header gives sample %i a cleaning threshold of " "%i for supernodes when no cleaning was performed\n", i, cleaning_infos[i].remove_low_covg_supernodes_thresh); } if(!cleaning_infos[i].remove_low_covg_kmers && cleaning_infos[i].remove_low_covg_kmers_thresh > 0) { report_warning("Binary header gives sample %i a cleaning threshold of " "%i for kmers when no cleaning was performed\n", i, cleaning_infos[i].remove_low_covg_kmers_thresh); } uint32_t name_length; my_fread(fh, &name_length, sizeof(uint32_t), "graph name length"); if(name_length == 0) { cleaning_infos[i].name_of_graph_clean_against = NULL; } else { cleaning_infos[i].name_of_graph_clean_against = (char*)malloc((name_length + 1) * sizeof(char)); my_fread(fh, cleaning_infos[i].name_of_graph_clean_against, name_length, "graph name length"); cleaning_infos[i].name_of_graph_clean_against[name_length] = '\0'; // Check sample length is as long as we were told size_t cleaned_name_len = strlen(cleaning_infos[i].name_of_graph_clean_against); if(cleaned_name_len != name_length) { // Premature \0 in string report_warning("Sample [%i] cleaned-against-name has length %u but is " "only %u chars long (premature '\\0')\n", i, name_length, cleaned_name_len); } } } } // Print colour info if(print_info) { for(i = 0; i < num_of_colours; i++) { printf("-- Colour %i --\n", i); if(version >= 6) { // Version 6 only output printf(" sample name: '%s'\n", sample_names[i]); } char tmp[32]; printf(" mean read length: %u\n", (unsigned int)mean_read_lens_per_colour[i]); printf(" total sequence loaded: %s\n", ulong_to_str(total_seq_loaded_per_colour[i], tmp)); if(version >= 6) { // Version 6 only output printf(" sequence error rate: %Lf\n", seq_error_rates[i]); printf(" tip clipping: %s\n", (cleaning_infos[i].tip_cleaning == 0 ? "no" : "yes")); printf(" remove low coverage supernodes: %s [threshold: %i]\n", cleaning_infos[i].remove_low_covg_supernodes ? "yes" : "no", cleaning_infos[i].remove_low_covg_supernodes_thresh); printf(" remove low coverage kmers: %s [threshold: %i]\n", cleaning_infos[i].remove_low_covg_kmers ? "yes" : "no", cleaning_infos[i].remove_low_covg_kmers_thresh); printf(" cleaned against graph: %s [against: '%s']\n", cleaning_infos[i].cleaned_against_graph ? "yes" : "no", (cleaning_infos[i].name_of_graph_clean_against == NULL ? "" : cleaning_infos[i].name_of_graph_clean_against)); } } printf("--\n"); } // Read magic word at the end of header my_fread(fh, magic_word, strlen("CORTEX"), "magic word (end)"); if(strcmp(magic_word, "CORTEX") != 0) { report_error("magic word doesn't match 'CORTEX' (end): '%s'\n", magic_word); exit(EXIT_FAILURE); } // Calculate number of kmers if(version < 7 && file_size != -1) { size_t bytes_remaining = file_size - num_bytes_read; size_t num_bytes_per_kmer = sizeof(uint64_t) * num_of_bitfields + sizeof(uint32_t) * num_of_colours + sizeof(uint8_t) * num_of_colours; expected_num_of_kmers = bytes_remaining / num_bytes_per_kmer; size_t excess = bytes_remaining - (expected_num_of_kmers * num_bytes_per_kmer); if(excess > 0) { report_error("Excess bytes. Bytes:\n file size: %lu;\n for kmers: %lu;" "\n num kmers: %lu;\n per kmer: %lu;\n excess: %lu\n", file_size, bytes_remaining, expected_num_of_kmers, num_bytes_per_kmer, excess); } } if(print_info) { char num_str[50]; printf("Expected number of kmers: %s\n", ulong_to_str(expected_num_of_kmers, num_str)); printf("----\n"); } // Finished parsing header if(!parse_kmers && !print_kmers) { print_kmer_stats(); fclose(fh); exit(EXIT_SUCCESS); } shade_bytes = num_of_shades >> 3; size_t shade_array_bytes = shade_bytes * num_of_colours; // Kmer data uint64_t* kmer = malloc(sizeof(uint64_t) * num_of_bitfields); uint32_t* covgs = malloc(sizeof(uint32_t) * num_of_colours); uint8_t* edges = malloc(sizeof(uint8_t) * num_of_colours); uint8_t* shade_data = malloc(shade_array_bytes); uint8_t* shend_data = malloc(shade_array_bytes); if(kmer == NULL || covgs == NULL || edges == NULL || shade_data == NULL || shend_data == NULL) { report_error("Out of memory"); exit(EXIT_SUCCESS); } // Convert values to strings char* seq = malloc(sizeof(char) * kmer_size); char kmer_colour_edge_str[9]; // Check top word of each kmer int bits_in_top_word = 2 * (kmer_size % 32); uint64_t top_word_mask = (~(uint64_t)0) << bits_in_top_word; size_t num_bytes_per_bkmer = sizeof(uint64_t)*num_of_bitfields; // Read kmer in bytes so we can see if there are extra bytes at the end of // the file size_t bytes_read; // while((bytes_read = fread(kmer, 1, num_bytes_per_bkmer, fh)) > 0) while((bytes_read = fread_buf(fh, kmer, num_bytes_per_bkmer, buffer)) > 0) { if(bytes_read != num_bytes_per_bkmer) { report_error("unusual extra bytes [%i] at the end of the file\n", (int)bytes_read); break; } num_bytes_read += bytes_read; my_fread(fh, covgs, sizeof(uint32_t) * num_of_colours, "kmer covg"); my_fread(fh, edges, sizeof(uint8_t) * num_of_colours, "kmer edges"); if(version >= 7) { uint8_t *shades = shade_data, *shends = shend_data; for(i = 0; i < num_of_colours; i++) { my_fread(fh, shades, sizeof(uint8_t) * shade_bytes, "shades"); my_fread(fh, shends, sizeof(uint8_t) * shade_bytes, "shade ends"); shades += shade_bytes; shends += shade_bytes; } } // // Kmer checks // // Check top bits of kmer if(kmer[0] & top_word_mask) { if(num_of_oversized_kmers == 0) { report_error("oversized kmer [index: %lu]\n", num_of_kmers_read); for(i = 0; i < num_of_bitfields; i++) { fprintf(stderr, " word %i: ", i); print_binary(stderr, kmer[i]); fprintf(stderr, "\n"); } } num_of_oversized_kmers++; } // Check for all-zeros (i.e. all As kmer: AAAAAA) uint64_t kmer_words_or = 0; for(i = 0; i < num_of_bitfields; i++) kmer_words_or |= kmer[i]; if(kmer_words_or == 0) { if(num_of_all_zero_kmers == 1) { report_error("more than one all 'A's kmers seen [index: %lu]\n", num_of_kmers_read); } num_of_all_zero_kmers++; } // Check covg is 0 for all colours for(i = 0; i < num_of_colours && covgs[i] == 0; i++); if(i == num_of_colours) { if(num_of_zero_covg_kmers == 0) { report_warning("a kmer has zero coverage in all colours [index: %lu]\n", num_of_kmers_read); } num_of_zero_covg_kmers++; } // Print? if(print_kmers) { binary_kmer_to_seq(kmer, seq, kmer_size, num_of_bitfields); printf("%s", seq); // Print coverages for(i = 0; i < num_of_colours; i++) printf(" %li", (unsigned long)covgs[i]); // Print edges for(i = 0; i < num_of_colours; i++) printf(" %s", get_edges_str(edges[i], kmer_colour_edge_str)); if(version >= 7 && num_of_shades > 0) { for(i = 0; i < num_of_colours; i++) { putc(' ', stdout); print_colour_shades(shade_data + i*shade_bytes, shend_data + i*shade_bytes); } } putc('\n', stdout); } num_of_kmers_read++; for(i = 0; i < num_of_colours; i++) sum_of_covgs_read += covgs[i]; } if(num_of_kmers_read != expected_num_of_kmers) { report_error("Expected %lu kmers, read %lu\n", expected_num_of_kmers, num_of_kmers_read); } if(print_kmers && print_info) printf("----\n"); // check for various reading errors if(errno != 0) { report_error("errno set [%i]\n", (int)errno); } int err; if((err = ferror(fh)) != 0) { report_error("occurred after file reading [%i]\n", err); } // For testing output //num_of_bitfields = 2; //num_of_kmers_read = 3600000000; //num_of_kmers_read = 12345; //num_of_kmers_read = 3581787; //num_of_kmers_read = 0; print_kmer_stats(); fclose(fh); free(kmer); free(covgs); free(edges); free(shade_data); free(shend_data); buffer_free(buffer); if((print_kmers || parse_kmers) && print_info) { printf("----\n"); if(num_warnings > 0 || num_errors > 0) printf("Warnings: %u; Errors: %u\n", num_warnings, num_errors); if(num_errors == 0) printf(num_warnings ? "Binary may be ok\n" : "Binary is valid\n"); } exit(EXIT_SUCCESS); }