int ctx_correct(int argc, char **argv) { size_t i, j; struct ReadThreadCmdArgs args = READ_THREAD_CMD_ARGS_INIT; read_thread_args_alloc(&args); read_thread_args_parse(&args, argc, argv, longopts, true); GraphFileReader *gfile = &args.gfile; PathFileBuffer *pfiles = &args.pfiles; CorrectAlnInputBuffer *inputs = &args.inputs; size_t ctx_total_cols = gfile->hdr.num_of_cols; size_t ctx_num_kmers = gfile->num_of_kmers; if(args.colour > ctx_total_cols) cmd_print_usage("-c %zu is too big [> %zu]", args.colour, ctx_total_cols); size_t ctp_usedcols = 0; for(i = 0; i < pfiles->len; i++) { if(!file_filter_iscolloaded(&pfiles->data[i].fltr, args.colour)) { cmd_print_usage("Path file doesn't load into colour %zu: %s", args.colour, pfiles->data[i].fltr.orig_path.buff); } ctp_usedcols = MAX2(ctp_usedcols, path_file_usedcols(&pfiles->data[i])); } // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem; // 1 bit needed per kmer if we need to keep track of noreseed bits_per_kmer = sizeof(Edges)*8 + ctx_num_kmers + sizeof(uint64_t)*8; kmers_in_hash = cmd_get_kmers_in_hash2(args.memargs.mem_to_use, args.memargs.mem_to_use_set, args.memargs.num_kmers, args.memargs.num_kmers_set, bits_per_kmer, ctx_num_kmers, ctx_num_kmers, false, &graph_mem); // Paths memory path_mem = path_files_mem_required(pfiles->data, pfiles->len, false, false, ctp_usedcols, 0); cmd_print_mem(path_mem, "paths"); // Total memory total_mem = graph_mem + path_mem; cmd_check_mem_limit(args.memargs.mem_to_use, total_mem); // // Check we can read all output files // // Open output files SeqOutput *outputs = ctx_calloc(inputs->len, sizeof(SeqOutput)); bool output_files_exist = false; for(i = 0; i < inputs->len; i++) { CorrectAlnInput *input = &inputs->data[i]; input->crt_params.ctxcol = input->crt_params.ctpcol = args.colour; SeqOutput *output = &outputs[i]; seq_output_alloc(output); seq_output_set_paths(output, input->out_base, async_task_pe_output(&input->files)); input->output = output; // output check prints warnings and returns true if errors output_files_exist |= seq_output_files_exist_check(output); } // Abandon if some of the output files already exist if(output_files_exist) die("Output files already exist"); // Attempt to open all files for(i = 0; i < inputs->len && seq_output_open(&outputs[i]); i++) {} // Check if something went wrong - if so remove all output files if(i < inputs->len) { for(j = 0; j < i; j++) seq_output_delete(&outputs[i]); die("Couldn't open output files"); } // // Allocate memory // dBGraph db_graph; db_graph_alloc(&db_graph, gfile->hdr.kmer_size, ctx_total_cols, 1, kmers_in_hash); size_t bytes_per_col = roundup_bits2bytes(db_graph.ht.capacity); db_graph.col_edges = ctx_calloc(db_graph.ht.capacity, sizeof(Edges)); db_graph.node_in_cols = ctx_calloc(bytes_per_col * ctx_total_cols, 1); // Paths path_store_alloc(&db_graph.pstore, path_mem, false, db_graph.ht.capacity, ctp_usedcols); // // Load Graph and Path files // LoadingStats gstats = LOAD_STATS_INIT_MACRO; GraphLoadingPrefs gprefs = {.db_graph = &db_graph, .boolean_covgs = false, .must_exist_in_graph = false, .must_exist_in_edges = NULL, .empty_colours = true}; // Load graph, print stats, close file graph_load(gfile, gprefs, &gstats); hash_table_print_stats_brief(&db_graph.ht); graph_file_close(gfile); // Load path files (does nothing if num_fpiles == 0) paths_format_merge(pfiles->data, pfiles->len, false, false, args.num_of_threads, &db_graph); // // Run alignment // correct_reads(args.num_of_threads, MAX_IO_THREADS, inputs->data, inputs->len, &db_graph); // Close and free output files for(i = 0; i < inputs->len; i++) seq_output_dealloc(&outputs[i]); ctx_free(outputs); read_thread_args_dealloc(&args); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_correct(int argc, char **argv) { size_t i; struct ReadThreadCmdArgs args; read_thread_args_alloc(&args); read_thread_args_parse(&args, argc, argv, longopts, true); GraphFileReader *gfile = &args.gfile; GPathFileBuffer *gpfiles = &args.gpfiles; CorrectAlnInputBuffer *inputs = &args.inputs; // Update colours in graph file - sample in 0, all others in 1 size_t ncols = gpath_load_sample_pop(gfile, 1, gpfiles->b, gpfiles->len, args.colour); // Check for compatibility between graph files and link files graphs_gpaths_compatible(gfile, 1, gpfiles->b, gpfiles->len, 1); int64_t ctx_num_kmers = gfile->num_of_kmers; // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem; // 1 bit needed per kmer if we need to keep track of noreseed bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + (gpfiles->len > 0 ? sizeof(GPath*)*8 : 0) + ncols; // in colour kmers_in_hash = cmd_get_kmers_in_hash(args.memargs.mem_to_use, args.memargs.mem_to_use_set, args.memargs.num_kmers, args.memargs.num_kmers_set, bits_per_kmer, ctx_num_kmers, ctx_num_kmers, false, &graph_mem); // Paths memory size_t rem_mem = args.memargs.mem_to_use - MIN2(args.memargs.mem_to_use, graph_mem); path_mem = gpath_reader_mem_req(gpfiles->b, gpfiles->len, ncols, rem_mem, false, kmers_in_hash, false); cmd_print_mem(path_mem, "paths"); // Shift path store memory from graphs->paths graph_mem -= sizeof(GPath*)*kmers_in_hash; path_mem += sizeof(GPath*)*kmers_in_hash; // Total memory total_mem = graph_mem + path_mem; cmd_check_mem_limit(args.memargs.mem_to_use, total_mem); // // Check we can write all output files // // Open output files SeqOutput *outputs = ctx_calloc(inputs->len, sizeof(SeqOutput)); bool err_occurred = false; for(i = 0; i < inputs->len && !err_occurred; i++) { CorrectAlnInput *input = &inputs->b[i]; // We loaded target colour into colour zero input->crt_params.ctxcol = input->crt_params.ctpcol = 0; bool is_pe = asyncio_task_is_pe(&input->files); err_occurred = !seqout_open(&outputs[i], input->out_base, args.fmt, is_pe); input->output = &outputs[i]; } // Abandon if some of the output files already exist if(err_occurred) { for(i = 0; i < inputs->len; i++) seqout_close(&outputs[i], true); die("Error creating output files"); } // // Allocate memory // dBGraph db_graph; db_graph_alloc(&db_graph, gfile->hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL); // Create a path store that does not tracks path counts gpath_reader_alloc_gpstore(gpfiles->b, gpfiles->len, path_mem, false, &db_graph); // // Load Graph and link files // GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.empty_colours = true; // Load graph, print stats, close file graph_load(gfile, gprefs, NULL); hash_table_print_stats_brief(&db_graph.ht); graph_file_close(gfile); // Load link files for(i = 0; i < gpfiles->len; i++) { gpath_reader_load(&gpfiles->b[i], GPATH_DIE_MISSING_KMERS, &db_graph); gpath_reader_close(&gpfiles->b[i]); } // // Run alignment // correct_reads(inputs->b, inputs->len, args.dump_seq_sizes, args.dump_frag_sizes, args.fq_zero, args.append_orig_seq, args.nthreads, &db_graph); // Close and free output files for(i = 0; i < inputs->len; i++) seqout_close(&outputs[i], false); ctx_free(outputs); // Closes input files read_thread_args_dealloc(&args); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
//////////////////////////////////////////////////////////// // main //////////////////////////////////////////////////////////// int main(int argc, char **argv) { parse_command_line(argc, argv); // prepare AT and GC counts unsigned long long atgc[2] = {0}; // make trusted kmer data structure bithash *trusted = new bithash(k); // get good kmers from Hammer if (hammerf != NULL) { string hammerf_str(hammerf); if (hammerf_str.substr(hammerf_str.size()-3) == ".gz") { igzstream hammerf_in(hammerf); trusted->hammer_file_load(hammerf_in, atgc); } else { ifstream hammerf_in(hammerf); trusted->hammer_file_load(hammerf_in, atgc); } } // get kmer counts if(merf != NULL) { string merf_str(merf); if(ATcutf != NULL) { if(merf_str.substr(merf_str.size()-3) == ".gz") { igzstream mer_in(merf); trusted->tab_file_load(mer_in, load_AT_cutoffs(), atgc); } else { ifstream mer_in(merf); trusted->tab_file_load(mer_in, load_AT_cutoffs(), atgc); } } else { if(merf_str.substr(merf_str.size()-3) == ".gz") { igzstream mer_in(merf); trusted->tab_file_load(mer_in, cutoff, atgc); } else { ifstream mer_in(merf); trusted->tab_file_load(mer_in, cutoff, atgc); } } // saved bithash } else if(bithashf != NULL) { if(strcmp(bithashf,"-") == 0) { cerr << "Saved bithash cannot be piped in. Please specify file." << endl; exit(EXIT_FAILURE); } else trusted->binary_file_input(bithashf, atgc); } cout << trusted->num_kmers() << " trusted kmers" << endl; double prior_prob[4]; prior_prob[0] = (double)atgc[0] / (double)(atgc[0]+atgc[1]) / 2.0; prior_prob[1] = .5 - prior_prob[0]; prior_prob[2] = prior_prob[1]; prior_prob[3] = prior_prob[0]; //cout << "AT: " << atgc[0] << " GC: " << atgc[1] << endl; cout << "AT% = " << (2*prior_prob[0]) << endl; // make list of files vector<string> fastqfs; vector<int> pairedend_codes; parse_fastq(fastqfs, pairedend_codes); // process each file string fqf; bool zip; for(int f = 0; f < fastqfs.size(); f++) { fqf = fastqfs[f]; cout << fqf << endl; // unzip if(fqf.substr(fqf.size()-3) == ".gz") { zip = true; unzip_fastq(fqf); } else zip = false; // determine quality value scale if(Read::quality_scale == -1) guess_quality_scale(fqf); // split file vector<streampos> starts; vector<unsigned long long> counts; chunkify_fastq(fqf, starts, counts); // learn nt->nt transitions double ntnt_prob[Read::max_qual][4][4] = {0}; for(int q = 0; q < Read::max_qual; q++) for(int i = 0; i < 4; i++) for(int j = 0; j < 4; j++) if(i != j) ntnt_prob[q][i][j] = 1.0/3.0; if(!TESTING) learn_errors(fqf, trusted, starts, counts, ntnt_prob, prior_prob); // correct correct_reads(fqf, pairedend_codes[f], trusted, starts, counts, ntnt_prob, prior_prob); // combine if(pairedend_codes[f] == 0) { combine_output(fqf, string("cor"), uncorrected_out); } // combine paired end if(pairedend_codes[f] == 2) { if(!zip) { combine_output_paired(fastqfs[f-1], fqf, string("cor"), uncorrected_out); } else { combine_output_paired(fastqfs[f-1].substr(0,fastqfs[f-1].size()-3), fqf, string("cor"), uncorrected_out); } } if(zip) zip_fastq(fqf); } return 0; }