/*
 * Walk the whole graph and print every perfect path (supernode) to <filename>.
 * If with_coverages is set, per-node coverages are written to "<filename>_cov".
 * One argument struct is prepared per hash-table traversal thread; the actual
 * printing callback is print_supernode.
 *
 * FIX: the original freed args[i] and then dereferenced it to destroy the
 * path (use-after-free); it also never checked the fopen() results before
 * handing the streams to the workers.
 */
void perfect_path_print_paths(char *filename, int max_length, int singleton_length, boolean with_coverages, dBGraph * db_graph)
{
    FILE *fout = NULL;
    FILE *fout_cov = NULL;
    int i;

    fout = fopen(filename, "w");
    if (fout == NULL) {
        log_and_screen_printf("Unable to open file: %s\n", filename);
        exit(-1);
    }

    if (with_coverages) {
        char filename_cov[strlen(filename) + 10];
        sprintf(filename_cov, "%s_cov", filename);
        fout_cov = fopen(filename_cov, "w");
        if (fout_cov == NULL) {
            log_and_screen_printf("Unable to open file: %s\n", filename_cov);
            exit(-1);
        }
    }

    /* File-scope cap on path length used by the traversal callback. */
    limit = max_length;

    /* One args struct per traversal thread. */
    perfect_path_print_supernodes_args **args =
        calloc(db_graph->number_of_threads, sizeof(perfect_path_print_supernodes_args *));
    if (args == NULL) {
        log_and_screen_printf("Unable to allocate traversal arguments\n");
        exit(-1);
    }
    for (i = 0; i < db_graph->number_of_threads; i++) {
        args[i] = calloc(1, sizeof(perfect_path_print_supernodes_args));
        if (args[i] == NULL) {
            log_and_screen_printf("Unable to allocate traversal arguments\n");
            exit(-1);
        }
        args[i]->db_graph = db_graph;
        args[i]->path = path_new(max_length, db_graph->kmer_size);
        args[i]->fout = fout;
        args[i]->fout_cov = fout_cov; /* TODO: make the printing function thread safe */
    }

    double graph_cov = db_graph_get_average_coverage(db_graph);
    log_and_screen_printf("Average coverage: %5.2f \n", graph_cov);

    hash_table_traverse_with_args(&print_supernode, (void **) args, db_graph);

    /* TODO: at some point we can make this multithreading */
    log_and_screen_printf("%'d nodes visited [%'qd singletons, %'qd repetitive]\n",
                          args[0]->count_nodes, args[0]->count_sing, args[0]->count_rep);
    path_counts_print_and_log(&args[0]->counts);

    /* Destroy the path BEFORE freeing the struct that owns it. */
    for (i = 0; i < db_graph->number_of_threads; i++) {
        path_destroy(args[i]->path);
        free(args[i]);
    }
    free(args);

    fclose(fout);
    if (with_coverages) {
        fclose(fout_cov);
    }
}
/*
 * Append one sequence record to <filename> in the requested format
 * (FASTA or FASTQ). Exits on unknown formats or if the file cannot be
 * opened. When compiled with LOCKING, an exclusive advisory write lock
 * is taken around the write so concurrent processes do not interleave
 * records.
 */
void append_sequence(char * filename, Sequence * seq, FileFormat format)
{
    FILE *out = fopen(filename, "a");
    if (out == NULL) {
        log_and_screen_printf("Unable to open file: %s\n", filename);
        exit(-1);
    }

#ifdef LOCKING
    /* Whole-file exclusive lock; retry a bounded number of times. */
    struct flock lock;
    lock.l_len = 0;
    lock.l_start = 0;
    lock.l_whence = SEEK_SET;
    lock.l_type = F_WRLCK;
    lock.l_pid = getpid();
    short retries = 10;
    while (fcntl(fileno(out), F_SETLKW, &lock) == -1 && retries-- > 0) {
        print_error_no();
        //sleep(rand()%10);//Sleep from 0 to 10 seconds...
    }
#endif

    switch (format) {
    case FASTA:
        sequence_print_fasta(out, seq);
        break;
    case FASTQ:
        sequence_print_fastq(out, seq);
        break;
    default:
        fprintf(stderr, "Format not implemented for writing\n");
        assert(false);
        exit(-1);
        break;
    }

#ifdef LOCKING
    /* Release the advisory lock before closing. */
    lock.l_type = F_UNLCK;
    if (out != NULL && fcntl(fileno(out), F_SETLKW, &lock) == -1) {
        log_and_screen_printf("PID:%d ERROR: Unable to unlock!\n", lock.l_pid);
        print_error_no();
    }
#endif

    if (out != NULL) {
        fclose(out);
    }
}
/**
 * Dumps the hash table to a binary file. The header stores the struct
 * sizes (HashTable, Element), the format version and the table geometry,
 * so the reader (hash_table_read_dumped_memory) can validate that the
 * memory layout matches before loading the raw data.
 *
 * FIX: the original never checked the fopen() result, so a bad path led
 * to fwrite on a NULL stream (undefined behavior).
 */
void hash_table_dump_memory(char * filename, HashTable * hash)
{
    FILE * fp = fopen(filename, "wb");
    if (fp == NULL) {
        log_and_screen_printf("Unable to open file: %s\n", filename);
        exit(-1);
    }

    char * magic = MAGIC_TEXT;
    int size_ht = sizeof(HashTable);
    int size_e = sizeof(Element);
    int magic_size = strlen(magic);
    short version = HASH_VERSION;
    long long number_buckets = hash->number_buckets;
    int bucket_size = hash->bucket_size;
    long long hash_size = number_buckets * bucket_size;

    /* Header: enough information to rebuild and validate the hash table. */
    fwrite(magic, sizeof(char), magic_size, fp);
    fwrite(&version, sizeof(short), 1, fp);
    fwrite(&size_ht, sizeof(int), 1, fp);
    fwrite(&size_e, sizeof(int), 1, fp);
    fwrite(&hash->kmer_size, sizeof(short), 1, fp);
    fwrite(&number_buckets, sizeof(long long), 1, fp);
    fwrite(&bucket_size, sizeof(int), 1, fp);
    fwrite(&hash->max_rehash_tries, sizeof(int), 1, fp);
    fwrite(&hash->unique_kmers, sizeof(long long), 1, fp);

    /* The actual data of the hash table. We are storing everything. */
    fwrite(hash->table, size_e, hash_size, fp);
    fwrite(hash->next_element, sizeof(int), number_buckets, fp);
    fwrite(hash->collisions, sizeof(long long), hash->max_rehash_tries, fp);

    log_and_screen_printf("Hash dumped to : %s \n", filename);
    hash_table_print_stats(hash);
    fclose(fp);
}
int main( int argc, char ** argv){ log_and_screen_printf("Demultiplexer\n\n"); log_and_screen_printf(SVN_VERSION); log_and_screen_printf(SVN_COMMIT_DATE); log_and_screen_printf("Compiled on %s at %s \n\n", __DATE__, __TIME__); if (argc < 2) { print_help(); } log_write_timestamp(1); if (argc == 0) { print_help(); } DemultiplexerCmdLine cmd = parse_args(argc, argv); FILE * in = stdin; if (cmd.input_reads != NULL) { in = fopen(cmd.input_reads, "r"); if(in == NULL){ log_and_screen_printf("Unable to open file %s\n", cmd.input_reads); exit(-1); } } Sequence * seq = sequence_new(cmd.max_read_length, cmd.max_name_length, 33); seq->header = new_sequence_header(CASAVA_1_8); header_function * f = (header_function *) seq->header; char * index = f->get_index(seq); size_t prefix_length = strlen(cmd.output_folder); char * output_file = calloc(prefix_length + MAX_FIELD_SIZE + 1, sizeof(char *)); char * index_pointer = output_file + prefix_length; strcpy(output_file, cmd.output_folder); printf("prefix: %s\n", output_file); while (read_sequence_from_fastq(in, seq, cmd.max_read_length)) { strcpy(index_pointer, index); // printf("index: %s\n new output: %s\n", index, output_file); append_sequence(output_file, seq, FASTQ); } if (in != stdin) { fclose(in); } return 0; }
/*
 * Build the reference kmer hash: allocate a new table sized from the
 * command line and populate it from the reference kmers binary file.
 * Every entry loaded from the reference is expected to be unique.
 * Returns the populated hash (caller owns it).
 */
static KmerHash * load_kmer_table(KmerStatsCmdLine cmd_line)
{
    //TODO: Use a special format that contains the memory requirements.
    log_and_screen_printf("\nHash table from file: %s\n", cmd_line.reference_kmers);

    KmerHash * reference_hash = hash_table_new(cmd_line.number_of_buckets_bits,
                                               cmd_line.bucket_size,
                                               25,
                                               cmd_line.kmer_size);

    log_and_screen_printf("\nReading kmers file: %s\n", cmd_line.reference_kmers);
    fflush(stdout);

    boolean expect_unique = true;
    load_binary_from_filename_into_kmers_hash(cmd_line.reference_kmers,
                                              reference_hash,
                                              KMER_HASH_REFERENCE_INDEX,
                                              expect_unique);

    log_and_screen_printf("\nRead of file complete. Total kmers: %'lld\n",
                          hash_table_get_unique_kmers(reference_hash));
    hash_table_print_stats(reference_hash);
    return reference_hash;
}
/*
 * Allocate a sequence-header parser for the given header type.
 * Only CASAVA 1.8 headers are currently supported; any other type
 * logs an error and trips an assertion (returns NULL when asserts
 * are compiled out).
 */
void * new_sequence_header(sequence_header_type header_type)
{
    if (header_type == CASAVA_1_8) {
        return new_casava_sequence_header_1_8();
    }

    /* UNKNOWN_HEADER, UNKNOWN_HEADER_LAST and anything else. */
    log_and_screen_printf("Unknown format. \n");
    assert(false);
    return NULL;
}
/*
 * Load the sample data into the kmer hash, updating coverage counts in
 * the sample colour. Two input modes: a pre-built kmers binary (KMERS
 * format) is merged directly; any other format is treated as reads and
 * streamed through the kmer file reader. Returns the number of kmers
 * loaded.
 */
static long long load_reads_coverage_table(KmerStatsCmdLine cmd_line, KmerHash * kmer_hash)
{
    log_and_screen_printf("\nLoading sample from: %s\n", cmd_line.input_filename);

    long long loaded_kmers = 0;

    if (cmd_line.format == KMERS) {
        /* Pre-counted kmers binary: merge coverages straight in. */
        loaded_kmers = load_kmers_binary_from_filename_update_coverage(cmd_line.input_filename,
                                                                       kmer_hash,
                                                                       KMER_HASH_SAMPLE_INDEX);
        return loaded_kmers;
    }

    /* Reads file: configure the inner (per-read) reader first... */
    KmerFileReaderInnerArgs inner;
    inner.format = cmd_line.format;
    inner.kmer_size = kmer_hash->kmer_size;
    inner.max_read_length = 1000;
    inner.new_entry = true;

    /* ...then the outer reader arguments. */
    KmerFileReaderArgs reader;
    reader.bad_reads = 0;
    reader.colour = KMER_HASH_SAMPLE_INDEX;
    reader.fastq_ascii_offset = cmd_line.quality_score_offset;
    reader.filename = cmd_line.input_filename;
    reader.quality_cut_off = cmd_line.quality_score_threshold;
    reader.inner_args = &inner;
    reader.insert = false;
    reader.max_read_length = 1000;
    reader.maximum_ocupancy = 75;
    reader.KmerHash = kmer_hash;

    loaded_kmers = load_seq_into_kmers_hash(&reader);
    log_and_screen_printf("Loaded %'lld kmers (bad reads %'lld)", loaded_kmers, reader.bad_reads);
    hash_table_print_stats(kmer_hash);

    return loaded_kmers;
}
int main(int argc, char **argv) { setlocale (LC_ALL, ""); log_and_screen_printf("\nkmer_contamination.\n\n"); log_and_screen_printf(SVN_VERSION); log_and_screen_printf(SVN_COMMIT_DATE); log_and_screen_printf("Compiled on %s at %s \n\n", __DATE__, __TIME__); KmerStatsCmdLine cmd_line = parse_cmdline(argc, argv, sizeof(Element)); //log_and_screen_printf("Parsed options\n"); KmerHash * kmer_hash = load_kmer_table(cmd_line); log_and_screen_printf("Kmers readed\n"); load_reads_coverage_table(cmd_line, kmer_hash); print_kmer_stats(&cmd_line, kmer_hash); print_contaminated_kmers_histogram(&cmd_line, kmer_hash); log_and_screen_printf("\nDONE"); return 0; }
/*----------------------------------------------------------------------*
 * Function: grow_graph_from_node                                        *
 *                                                                       *
 * Purpose:  Breadth-first exploration of the subgraph connected to      *
 *           start_node, marking every node visited and tracking the    *
 *           "best" node (highest coverage, ties broken by fewest        *
 *           edges) to use as a seed for contig walking.                 *
 *                                                                       *
 * Params:   start_node - node to start exploring from                   *
 *           best_node  - out: best seed node found (start_node if the   *
 *                        subgraph is a singleton)                       *
 *           graph      - the de Bruijn graph                            *
 *                                                                       *
 * Returns:  number of previously-unvisited nodes absorbed into this     *
 *           subgraph                                                    *
 *                                                                       *
 * FIX: the best-node test compared the out-parameter pointer           *
 * (best_node == 0), which is never NULL, instead of *best_node.        *
 *----------------------------------------------------------------------*/
int grow_graph_from_node(dBNode* start_node, dBNode** best_node, dBGraph* graph)
{
    Queue* nodes_to_walk;
    dBNode* node;
    int orientation;
    int depth;
    int current_graph_size = 0;
    int best_coverage = 0;
    int best_edges = 0;

    *best_node = 0;

    /* Nucleotide iterator, used to walk all possible paths from a node.
       NOTE: nested function — GCC extension, matching existing file style. */
    void walk_if_exists(Nucleotide n) {
        /* If there is an edge in any colour for this nucleotide... */
        if (db_node_edge_exist_any_colour(node, n, orientation)) {
            /* Get first node along this edge and check we've not already visited it... */
            Orientation next_orientation;
            Nucleotide reverse_nucleotide;
            dBNode * next_node;
            next_node = db_graph_get_next_node(node, orientation, &next_orientation, n, &reverse_nucleotide, graph);
            if (!next_node) {
                log_and_screen_printf("Error: Something went wrong with db_graph_get_next_node\n");
                exit(-1);
            }

            /* If not already visited the first node, walk it... */
            if (!db_node_check_flag_visited(next_node)) {
                pathStep first_step;
                Path * new_path;
                dBNode* end_node;
                int i = 0;

                /* Follow the perfect path starting with this edge. */
                first_step.node = node;
                first_step.orientation = orientation;
                first_step.label = n;
                new_path = path_new(MAX_EXPLORE_NODES, graph->kmer_size);
                if (!new_path) {
                    log_and_screen_printf("ERROR: Not enough memory to allocate new path.\n");
                    exit(-1);
                }
                db_graph_get_perfect_path_with_first_edge_all_colours(&first_step, &db_node_action_do_nothing, new_path, graph);

                /* Add end node to list of nodes to visit. */
                end_node = new_path->nodes[new_path->length-1];
                if (!db_node_check_flag_visited(end_node)) {
                    if (!db_node_is_blunt_end_all_colours(end_node, new_path->orientations[new_path->length-1])) {
                        if (queue_push_node(nodes_to_walk, end_node, depth+1) == NULL) {
                            log_and_screen_printf("Queue too large. Ending.\n");
                            exit(1);
                        }
                    }
                }

                /* Now go through all nodes, look for best and mark all as visited. */
                for (i=0; i<new_path->length; i++) {
                    if (!db_node_check_flag_visited(new_path->nodes[i])) {
                        int this_coverage = element_get_coverage_all_colours(new_path->nodes[i]);
                        int this_edges = db_node_edges_count_all_colours(new_path->nodes[i], forward)
                                       + db_node_edges_count_all_colours(new_path->nodes[i], reverse);

                        /* FIX: test *best_node (no best yet), not the pointer itself. */
                        if ((*best_node == 0) ||
                            (this_coverage > best_coverage) ||
                            ((this_coverage == best_coverage) && (this_edges < best_edges)))
                        {
                            best_coverage = this_coverage;
                            best_edges = this_edges;
                            *best_node = new_path->nodes[i];
                        }

                        db_node_action_set_flag_visited(new_path->nodes[i]);
                        current_graph_size++;
                    }
                }

                /* Clean up. */
                path_destroy(new_path);
            }
        }
    }

    /* Start a queue of nodes to walk. */
    nodes_to_walk = queue_new(METACORTEX_QUEUE_SIZE);
    if (!nodes_to_walk) {
        log_and_screen_printf("Couldn't get memory for node queue.\n");
        exit(-1);
    }

    /* Add start node to list of nodes to visit. */
    if (queue_push_node(nodes_to_walk, start_node, 0) == NULL) {
        log_and_screen_printf("Queue too large. Ending.\n");
        exit(-1);
    }
    if (!db_node_check_flag_visited(start_node)) {
        db_node_action_set_flag_visited(start_node);
        current_graph_size++;
    }

    /* Now keep visiting nodes and walking paths. */
    while (nodes_to_walk->number_of_items > 0) {
        /* Take top node from list. */
        node = queue_pop_node(nodes_to_walk, &depth);

        /* Look at all paths out from here. */
        orientation = forward;
        nucleotide_iterator(&walk_if_exists);
        orientation = reverse;
        nucleotide_iterator(&walk_if_exists);
    }

    queue_free(nodes_to_walk);

    /* If we didn't find a best node, presumably this is a singleton? */
    if (*best_node == 0) {
        printf("Note: didn't find a best node, setting to start node\n");
        *best_node = start_node;
    }

    return current_graph_size;
}
/*
 * Find all disconnected subgraphs of the de Bruijn graph, record a seed
 * node for each in an ".analysis" file, then walk out from each seed
 * (forward + reverse coverage walks, joined) to emit one consensus contig
 * per subgraph of at least min_subgraph_kmers nodes.
 *
 * FIXES: the three working Path objects were leaked (never destroyed),
 * and the "too small" log message had a '}' where ')' was intended.
 */
void metacortex_find_subgraphs(dBGraph* graph, char* consensus_contigs_filename, int min_subgraph_kmers)
{
    SubGraphInfo* sub_graphs;
    FILE* fp;
    Path *path_fwd = path_new(MAX_EXPLORE_PATH_LENGTH, graph->kmer_size);
    Path *path_rev = path_new(MAX_EXPLORE_PATH_LENGTH, graph->kmer_size);
    Path *final_path = path_new(MAX_EXPLORE_PATH_LENGTH, graph->kmer_size);
    char seq[256];
    char analysis_filename[strlen(consensus_contigs_filename) + 10];
    long int total_nodes = 0;
    int n_seeds = 0;
    int i;

    sprintf(analysis_filename, "%s.analysis", consensus_contigs_filename);
    log_and_screen_printf("Running metacortex subgraph analysis...\n");
    log_and_screen_printf(" Contig file: %s\n", consensus_contigs_filename);
    log_and_screen_printf(" Analysis file: %s\n", analysis_filename);
    log_and_screen_printf("Minimum subgraph size: %i\n", min_subgraph_kmers);

    /* Initialise temporary path array buffers. */
    path_array_initialise_buffers(graph->kmer_size);

    /* Create a list of subgraphs. */
    log_and_screen_printf("Allocating %d Mb to store subgraph information (max %d seeds)...\n",
                          ((MAX_SEEDS * sizeof(SubGraphInfo)) / 1024) / 1024, MAX_SEEDS);
    sub_graphs = calloc(MAX_SEEDS, sizeof(SubGraphInfo));
    if (!sub_graphs) {
        log_and_screen_printf("ERROR: Can't get memory for subgraphs\n");
        exit(-1);
    }

    /* Open the analysis file. */
    fp = fopen(analysis_filename, "w");
    if (!fp) {
        log_and_screen_printf("ERROR: Can't open analysis file.\n");
        exit(-1);
    }

    /* For each node, if it's not pruned or visited, try and grow a graph.
       NOTE: nested function — GCC extension, matching existing file style. */
    void explore_node(dBNode * node) {
        if (node == NULL) {
            log_and_screen_printf("Error: NULL node passed to explore_node.\n");
            exit(-1);
        }
        if (db_node_check_for_any_flag(node, PRUNED | VISITED) == false) {
            int nodes_in_graph;

            /* Grow graph from this node, returning the 'best' (highest
               coverage) node to store as seed point. */
            nodes_in_graph = grow_graph_from_node(node, &(sub_graphs[n_seeds].seed_node), graph);
            total_nodes += nodes_in_graph;

            if (sub_graphs[n_seeds].seed_node == NULL) {
                printf("ERROR: Seed node is NULL, nodes in graph is %d\n", nodes_in_graph);
            } else {
                /* Write data to analysis file. */
                binary_kmer_to_seq(&(node->kmer), graph->kmer_size, seq);
                fprintf(fp, "%i\t%i\t%ld\t%s\t", n_seeds, nodes_in_graph, total_nodes, seq);
                binary_kmer_to_seq(&(sub_graphs[n_seeds].seed_node->kmer), graph->kmer_size, seq);
                fprintf(fp, "%s\n", seq);

                /* Store nodes in this subgraph. */
                sub_graphs[n_seeds].graph_size = nodes_in_graph;
                n_seeds++;

                /* Check we've not run out of seed storage - in future,
                   this should dynamically allocate. */
                if (n_seeds == MAX_SEEDS) {
                    log_and_screen_printf("Error: MAX_SEEDS exceeded. Quitting.\n");
                    exit(-1);
                }
            }
        }
    }

    /* Traverse each node... */
    log_and_screen_printf("Finding subgraphs...\n");
    hash_table_traverse(&explore_node, graph);
    log_and_screen_printf("Finished. Total: %ld\n", total_nodes);
    fclose(fp);

    /* Open consensus contigs file. */
    fp = fopen(consensus_contigs_filename, "w");
    if (!fp) {
        log_and_screen_printf("ERROR: Can't open contig file.\n");
        exit(-1);
    }

    /* Now go through all the seed points and generate the consensus
       contigs by walking forward and backward from the seed. */
    db_graph_reset_flags(graph);
    log_and_screen_printf("Outputting contigs...\n");
    log_progress_bar(0);
    long long one_percent = n_seeds/100;
    int percent;
    if (one_percent < 1) {
        one_percent = 1;
    }

    for (i=0; i<n_seeds; i++) {
        if (i % one_percent == 0) {
            percent = (100 * i) / n_seeds;
            log_progress_bar(percent);
        }

        if (sub_graphs[i].graph_size >= min_subgraph_kmers) {
            binary_kmer_to_seq(&(sub_graphs[i].seed_node->kmer), graph->kmer_size, seq);
            coverage_walk_get_path(sub_graphs[i].seed_node, forward, NULL, graph, path_fwd);
            coverage_walk_get_path(sub_graphs[i].seed_node, reverse, NULL, graph, path_rev);
            path_reverse(path_fwd, final_path);
            path_append(final_path, path_rev);
            final_path->id = i;
            path_to_fasta(final_path, fp);
            path_reset(path_fwd);
            perfect_path_get_path(sub_graphs[i].seed_node, forward, &db_node_action_do_nothing, graph, path_fwd);
            path_reset(path_rev);
            path_reset(final_path);
        } else {
            /* FIX: ')' instead of the original '}' typo. */
            log_printf(" Number of nodes (%i) too small. Not outputting contig.\n", sub_graphs[i].graph_size);
        }
    }
    log_progress_bar(100);
    printf("\n");
    log_and_screen_printf("Finished contig output.\n");
    fclose(fp);
    free(sub_graphs);

    /* FIX: release the working paths (original leaked all three). */
    path_destroy(path_fwd);
    path_destroy(path_rev);
    path_destroy(final_path);
}
HashTable * hash_table_read_dumped_memory(char * filename ){ HashTable * hash = calloc(1, sizeof(HashTable)); FILE * fp = fopen(filename, "rb"); int magic_size = strlen(MAGIC_TEXT); char * magic = calloc(magic_size, sizeof(char)); int size_ht; int size_e = sizeof(Element); size_t readed; short version; long long number_buckets; int bucket_size; long long hash_size; if(fp == NULL){ exit_while_reading(fp, filename); } //Header stuff, this is enough information to prepare the hash table. readed = fread(magic, sizeof(char), magic_size, fp); validate_read(readed, magic_size, fp, filename); if(strcmp(magic, MAGIC_TEXT) != 0){ log_and_screen_printf( "[hash_table_read_dumped_memory] Invalid magic number!\n"); fclose(fp); exit(-1); } //#printf("%s\n", magic); readed = fread(&version, sizeof(short), 1, fp); validate_read(readed, 1, fp, filename); //#printf("%d\n", version); if(version != HASH_VERSION){ log_and_screen_printf( "[hash_table_read_dumped_memory] Invalid version number!\n"); exit_while_reading(fp, filename); } readed = fread(&size_ht, sizeof(int), 1,fp); validate_read(readed, 1, fp, filename); //#printf("%d\n", size_ht); if(size_ht != sizeof(HashTable)){ log_and_screen_printf( "[hash_table_read_dumped_memory] Invalid size of hash table!\n"); exit_while_reading(fp, filename); } readed = fread(&size_e, sizeof(int), 1,fp); validate_read(readed, 1, fp, filename); //#printf("%d\n", size_e); if(size_e != sizeof(Element)){ log_and_screen_printf( "[hash_table_read_dumped_memory] Invalid size of element!\n"); exit_while_reading(fp, filename); } readed = fread(&hash->kmer_size, sizeof(short), 1, fp); //printf("kmer size %d\n", hash->kmer_size); validate_read(readed, 1, fp, filename); readed = fread(&number_buckets, sizeof(long long), 1, fp); validate_read(readed, 1, fp, filename); //printf("number of buckets %lld\n",number_buckets); readed = fread(&bucket_size, sizeof(int), 1, fp); validate_read(readed, 1, fp, filename); //printf("bucket size%d \n",bucket_size); readed = 
fread(&hash->max_rehash_tries, sizeof(int), 1, fp); validate_read(readed, 1, fp, filename); //printf("hash->max_rehash_tries %d\n", hash->max_rehash_tries); readed = fread(&hash->unique_kmers, sizeof(long long), 1, fp); validate_read(readed, 1, fp, filename); // printf("hash->unique_kmers %lld\n", hash->unique_kmers); hash->number_buckets = number_buckets ; hash->bucket_size = bucket_size; hash_size=number_buckets * bucket_size; //printf("Hash size %lld \n", hash_size); //Allocating the table according to the description of the file hash->table = calloc(hash_size, sizeof(Element)); hash->next_element = calloc(number_buckets, sizeof(int)); hash->collisions = calloc(number_buckets, sizeof(long long)); if(hash->table == NULL){ log_and_screen_printf( "Unable to create hash table\n "); exit_while_reading(fp, filename); } //Reading the actual data. readed = fread(hash->table, size_e, hash_size, fp); validate_read(readed, hash_size, fp, filename); readed = fread(hash->next_element, sizeof(int), number_buckets, fp); validate_read(readed, number_buckets, fp, filename); readed = fread(hash->collisions, sizeof(long long), hash->max_rehash_tries, fp); validate_read(readed, hash->max_rehash_tries, fp, filename); fclose(fp); log_and_screen_printf("Hash readed from: %s \n" , filename); hash_table_print_stats(hash); return hash; }
/*
 * Report a read failure on <filename> and terminate the program.
 *
 * FIX: callers pass a NULL stream when fopen() itself failed, and
 * fclose(NULL) is undefined behavior — guard the close.
 */
static void exit_while_reading(FILE * fp, char * filename)
{
    log_and_screen_printf("Error while reading file: %s\n", filename);
    if (fp != NULL) {
        fclose(fp);
    }
    exit(-1);
}