void graph_crawler_alloc(GraphCrawler *crawler, const dBGraph *db_graph)
{
  ctx_assert(db_graph->node_in_cols != NULL);
  size_t ncols = db_graph->num_of_cols;
  int *col_paths = ctx_calloc(ncols, sizeof(int));
  GCMultiColPath *multicol_paths = ctx_calloc(ncols, sizeof(GCMultiColPath));
  GCUniColPath *unicol_paths = ctx_calloc(ncols, sizeof(GCUniColPath));
  uint32_t *col_list = ctx_calloc(ncols, sizeof(uint32_t));

  GraphCrawler tmp = {.num_paths = 0,
                      .col_paths = col_paths,
                      .multicol_paths = multicol_paths,
                      .unicol_paths = unicol_paths,
                      .col_list = col_list};

  memcpy(crawler, &tmp, sizeof(GraphCrawler));

  graph_cache_alloc(&crawler->cache, db_graph);
  graph_walker_alloc(&crawler->wlk, db_graph);
  rpt_walker_alloc(&crawler->rptwlk, db_graph->ht.capacity, 22); // 4MB
}

void graph_crawler_dealloc(GraphCrawler *crawler)
{
  ctx_free(crawler->col_paths);
  ctx_free(crawler->multicol_paths);
  ctx_free(crawler->unicol_paths);
  ctx_free(crawler->col_list);
  graph_cache_dealloc(&crawler->cache);
  graph_walker_dealloc(&crawler->wlk);
  rpt_walker_dealloc(&crawler->rptwlk);
  memset(crawler, 0, sizeof(GraphCrawler)); // reset
}
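/* A minimal usage sketch of the alloc/dealloc pairing above (assumes an
 * already-loaded dBGraph `graph`; the traversal calls in the middle are
 * elided, as they are not part of this snippet): */
static void crawler_example(const dBGraph *graph)
{
  GraphCrawler crawler;
  graph_crawler_alloc(&crawler, graph);
  // ... run traversals with the crawler; it can be reused between
  //     traversals without reallocation ...
  graph_crawler_dealloc(&crawler);
}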
void gpath_store_dealloc(GPathStore *gpstore)
{
  gpath_set_dealloc(&gpstore->gpset);
  gpath_store_merge_read_write(gpstore);
  ctx_free(gpstore->paths_all);
  if(gpstore->paths_traverse != gpstore->paths_all)
    ctx_free(gpstore->paths_traverse);
  memset(gpstore, 0, sizeof(*gpstore));
}
void call_decomp_destroy(CallDecomp *dc)
{
  alignment_free(dc->aln);
  needleman_wunsch_free(dc->nw_aligner);
  ctx_free(dc->scoring);
  bcf_destroy(dc->v);
  strbuf_dealloc(&dc->sbuf);
  ctx_free(dc);
}
/**
 * Calculate cleaning threshold for supernodes from a given distribution
 * of supernode coverages
 * @param covgs histogram of supernode coverages
 */
size_t cleaning_pick_supernode_threshold(const uint64_t *covgs, size_t len,
                                         double seq_depth,
                                         const dBGraph *db_graph)
{
  ctx_assert(len > 5);
  ctx_assert(db_graph->ht.num_kmers > 0);

  size_t i, d1len = len-2, d2len = len-3, f1, f2;
  double *tmp = ctx_malloc((d1len+d2len) * sizeof(double));
  double *delta1 = tmp, *delta2 = tmp + d1len;

  // Get sequencing depth from coverage
  uint64_t covg_sum = 0, capacity = db_graph->ht.capacity * db_graph->num_of_cols;
  for(i = 0; i < capacity; i++) covg_sum += db_graph->col_covgs[i];

  double seq_depth_est = (double)covg_sum / db_graph->ht.num_kmers;

  status("[cleaning] Kmer depth before cleaning supernodes: %.2f", seq_depth_est);
  if(seq_depth <= 0) seq_depth = seq_depth_est;
  else status("[cleaning] Using sequence depth argument: %f", seq_depth);

  size_t fallback_thresh = (size_t)MAX2(1, (seq_depth+1)/2);

  // +1 to ensure covgs is never 0
  for(i = 0; i < d1len; i++)
    delta1[i] = (double)(covgs[i+1]+1) / (covgs[i+2]+1);

  if(d1len <= 2) {
    status("[cleaning] (using fallback1)");
    ctx_free(tmp);
    return fallback_thresh;
  }

  // d2len is d1len-1
  for(i = 0; i < d2len; i++)
    delta2[i] = delta1[i] / delta1[i+1];

  for(f1 = 0; f1 < d1len && delta1[f1] >= 1; f1++);
  for(f2 = 0; f2 < d2len && delta2[f2] > 1; f2++);

  ctx_free(tmp);

  if(f1 < d1len && f1 < (seq_depth*0.75)) {
    status("[cleaning] (using f1)");
    return f1+1;
  }
  else if(f2 < d2len) {
    status("[cleaning] (using f2)");
    return f2+1;
  }
  else {
    status("[cleaning] (using fallback2)");
    return fallback_thresh+1;
  }
}
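/* Toy, self-contained illustration of the delta1 heuristic above, with
 * hypothetical histogram numbers: delta1[i] stays >= 1 while the histogram
 * is still falling, so the first i where delta1[i] < 1 marks the valley
 * between the error peak and real coverage (the f1 scan above): */
#include <stdio.h>
int main(void)
{
  const unsigned covgs[] = {0, 900, 300, 80, 40, 60, 120, 200}; // toy histogram
  for(unsigned i = 0; i + 2 < sizeof(covgs)/sizeof(covgs[0]); i++) {
    double delta1 = (double)(covgs[i+1]+1) / (covgs[i+2]+1);
    printf("delta1[%u] = %.2f%s\n", i, delta1, delta1 < 1 ? "  <- valley" : "");
  }
  return 0;
}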
size_t infer_edges(size_t nthreads, bool add_all_edges, const dBGraph *db_graph)
{
  ctx_assert(db_graph->node_in_cols != NULL);
  ctx_assert(db_graph->col_edges != NULL);

  size_t i, num_nodes_modified = 0;
  status("[inferedges] Processing stream");

  InferEdgesWorker *wrkrs = ctx_calloc(nthreads, sizeof(InferEdgesWorker));

  for(i = 0; i < nthreads; i++) {
    InferEdgesWorker tmp = {.threadid = i,
                            .nthreads = nthreads,
                            .add_all_edges = add_all_edges,
                            .db_graph = db_graph,
                            .num_nodes_modified = 0};
    memcpy(&wrkrs[i], &tmp, sizeof(InferEdgesWorker));
  }

  util_run_threads(wrkrs, nthreads, sizeof(InferEdgesWorker),
                   nthreads, infer_edges_worker);

  // Sum up nodes modified
  for(i = 0; i < nthreads; i++)
    num_nodes_modified += wrkrs[i].num_nodes_modified;

  ctx_free(wrkrs);

  return num_nodes_modified;
}
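/* The scatter/gather threading idiom used here and elsewhere in this file,
 * in miniature: one worker struct per thread, each with its own counter,
 * summed after the join so no locking is needed. A hypothetical worker body
 * (the real infer_edges_worker is not shown here; util_run_threads is
 * assumed to hand each thread a pointer to its own struct): */
static void count_worker_sketch(void *arg)
{
  InferEdgesWorker *w = (InferEdgesWorker*)arg;
  // ... visit kmers where (index % w->nthreads) == w->threadid,
  //     incrementing w->num_nodes_modified as edges are inferred ...
  (void)w;
}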
/* Execute one instruction from each running context. */
void ke_run(void)
{
    struct ctx_t *ctx;
    int k = 0;

    /* Count suspended processes (used by the debug printf below) */
    for (k = 0, ctx = ke->suspended_list_head; ctx; ctx = ctx->suspended_next, k++);
    //printf ("Instruction number: %lld, suspended processes:%d, queue_size: %d\n", instr_num, k, sched_count);

    /* Run an instruction slice from every running process */
    for (ctx = ke->running_list_head; ctx; ctx = ctx->running_next) {
        int i;
        for (i = 0; i < ctx->instr_slice && ctx_get_status(ctx, ctx_running); ++i) {
            /* Service any interrupts that are due before the next instruction */
            while (interrupts_exist() && instr_num >= next_interrupt_num())
                handle_interrupt(pop_interrupt());
            ctx_execute_inst(ctx);
            instr_num++;
        }
    }

    /* Free finished contexts */
    while (ke->finished_list_head)
        ctx_free(ke->finished_list_head);

    /* Process list of suspended contexts */
    //ke_process_events();

    /* Nothing runnable: fast-forward time to the next pending interrupt */
    while (!(ke->running_list_head) && interrupts_exist()) {
        //printf ("Instruction number updated from %lld to %lld\n", instr_num, next_interrupt_num());
        instr_num = next_interrupt_num();
        handle_interrupt(pop_interrupt());
    }
}
// Merge temporary files, closes tmp files
void futil_merge_tmp_files(FILE **tmp_files, size_t num_files, FILE *fout)
{
#define TMP_BUF_SIZE (32 * ONE_MEGABYTE)

  char *data = ctx_malloc(TMP_BUF_SIZE);
  size_t i, len;
  FILE *fh;

  for(i = 0; i < num_files; i++) {
    fh = tmp_files[i];
    if(fseek(fh, 0L, SEEK_SET) != 0) die("fseek error");
    while((len = fread(data, 1, TMP_BUF_SIZE, fh)) > 0)
      if(fwrite(data, 1, len, fout) != len)
        die("write error [%s]", strerror(errno));
    if(ferror(fh)) warn("fread error: %s", strerror(errno));
    fclose(fh);
  }

  ctx_free(data);
#undef TMP_BUF_SIZE
}
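/* Calling-pattern sketch (tmpfile() is standard C; NULL checks trimmed for
 * brevity): each writer fills its own tmp file, then the files are
 * concatenated into fout in order. Note futil_merge_tmp_files() fclose()s
 * the tmp files itself. */
static void merge_example(FILE *fout)
{
  FILE *tmps[2] = {tmpfile(), tmpfile()};
  fputs("part one\n", tmps[0]);
  fputs("part two\n", tmps[1]);
  futil_merge_tmp_files(tmps, 2, fout); // fout now holds both parts in order
}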
// Remember to free the result
void futil_get_strbuf_of_dir_path(const char *path, StrBuf *dir)
{
  char *tmp = strdup(path); // dirname() may modify its argument, so copy first
  strbuf_set(dir, dirname(tmp));
  strbuf_append_char(dir, '/');
  free(tmp); // strdup() memory is malloc'd, so release with free()
}
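/* Usage sketch: given "/data/graphs/sample.ctx", dir is set to
 * "/data/graphs/". The strbuf_alloc name is assumed from the StrBuf API used
 * elsewhere in this file (strbuf_dealloc, strbuf_set, ...). */
static void dir_example(void)
{
  StrBuf dir;
  strbuf_alloc(&dir, 64);
  futil_get_strbuf_of_dir_path("/data/graphs/sample.ctx", &dir);
  // dir.b now holds "/data/graphs/"
  strbuf_dealloc(&dir);
}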
/* try to get the module's context, returns a PAM status code */
static int ctx_get(pam_handle_t *pamh, const char *username, struct pld_ctx **pctx)
{
  struct pld_ctx *ctx = NULL;
  int rc;
  /* try to get the context from PAM */
  rc = pam_get_data(pamh, PLD_CTX, (const void **)&ctx);
  if ((rc == PAM_SUCCESS) && (ctx != NULL))
  {
    /* if the user is different, clear the context */
    if ((ctx->user != NULL) && (strcmp(ctx->user, username) != 0))
      ctx_clear(ctx);
  }
  else
  {
    /* allocate a new context */
    ctx = calloc(1, sizeof(struct pld_ctx));
    if (ctx == NULL)
    {
      pam_syslog(pamh, LOG_CRIT, "calloc(): failed to allocate memory: %s", strerror(errno));
      return PAM_BUF_ERR;
    }
    ctx_clear(ctx);
    /* store the new context with the handle so PAM frees it for us */
    rc = pam_set_data(pamh, PLD_CTX, ctx, ctx_free);
    if (rc != PAM_SUCCESS)
    {
      ctx_free(pamh, ctx, 0);
      pam_syslog(pamh, LOG_ERR, "failed to store context: %s", pam_strerror(pamh, rc));
      return rc;
    }
  }
  /* return the context */
  *pctx = ctx;
  return PAM_SUCCESS;
}
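/* For reference, a minimal sketch of the cleanup callback registered with
 * pam_set_data() above. The three-argument signature is fixed by the PAM
 * API; the body here is an assumption inferred from how ctx_free is invoked
 * in ctx_get(), not the module's actual implementation: */
static void ctx_free_sketch(pam_handle_t *pamh, void *data, int error_status)
{
  struct pld_ctx *ctx = data;
  (void)pamh; (void)error_status;
  if (ctx == NULL)
    return;
  ctx_clear(ctx); /* release per-user fields such as ctx->user */
  free(ctx);
}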
static void run_exp_abc(const dBGraph *db_graph, bool prime_AB,
                        size_t nthreads, size_t num_repeats,
                        size_t max_AB_dist, bool print_failed_contigs)
{
  ExpABCWorker *wrkrs = ctx_calloc(nthreads, sizeof(ExpABCWorker));
  size_t i, j;

  if(max_AB_dist == 0) max_AB_dist = SIZE_MAX;

  for(i = 0; i < nthreads; i++) {
    wrkrs[i].colour = 0;
    wrkrs[i].nthreads = nthreads;
    wrkrs[i].db_graph = db_graph;
    wrkrs[i].prime_AB = prime_AB;
    wrkrs[i].num_limit = num_repeats / nthreads;
    wrkrs[i].max_AB_dist = max_AB_dist;
    wrkrs[i].print_failed_contigs = print_failed_contigs;
    db_node_buf_alloc(&wrkrs[i].nbuf, 1024);
    graph_walker_alloc(&wrkrs[i].gwlk, db_graph);
    rpt_walker_alloc(&wrkrs[i].rptwlk, db_graph->ht.capacity, 22); // 4MB
  }

  util_run_threads(wrkrs, nthreads, sizeof(ExpABCWorker),
                   nthreads, run_exp_abc_thread);

  // Merge results
  size_t num_tests = 0, results[NUM_RESULT_VALUES] = {0};
  size_t ab_fail_state[GRPHWLK_NUM_STATES] = {0};
  size_t bc_fail_state[GRPHWLK_NUM_STATES] = {0};

  for(i = 0; i < nthreads; i++) {
    num_tests += wrkrs[i].num_tests;
    for(j = 0; j < NUM_RESULT_VALUES; j++) results[j] += wrkrs[i].results[j];
    for(j = 0; j < GRPHWLK_NUM_STATES; j++) ab_fail_state[j] += wrkrs[i].ab_fail_state[j];
    for(j = 0; j < GRPHWLK_NUM_STATES; j++) bc_fail_state[j] += wrkrs[i].bc_fail_state[j];
    db_node_buf_dealloc(&wrkrs[i].nbuf);
    graph_walker_dealloc(&wrkrs[i].gwlk);
    rpt_walker_dealloc(&wrkrs[i].rptwlk);
  }

  // Print results
  char nrunstr[50];
  ulong_to_str(num_tests, nrunstr);
  status("Ran %s tests with %zu threads", nrunstr, nthreads);

  const char *titles[] = {"RES_ABC_SUCCESS", "RES_AB_WRONG", "RES_AB_FAILED",
                          "RES_BC_WRONG", "RES_BC_FAILED", "RES_BC_OVERSHOT",
                          "RES_LOST_IN_RPT", "RES_NO_TRAVERSAL"};

  util_print_nums(titles, results, NUM_RESULT_VALUES, 30);

  status("AB_FAILED:");
  graph_step_print_state_hist(ab_fail_state);
  status("BC_FAILED:");
  graph_step_print_state_hist(bc_fail_state);

  ctx_free(wrkrs);
}
rc_t CC KMain ( int argc, char *argv [] )
{
    Args * args;
    rc_t rc = ArgsMakeAndHandle ( &args, argc, argv, 2,
                                  MyOptions, sizeof MyOptions / sizeof ( OptDef ),
                                  XMLLogger_Args, XMLLogger_ArgsQty );

    KLogHandlerSetStdErr();
    if ( rc != 0 )
    {
        LOGERR( klogErr, rc, "error creating internal structure" );
    }
    else
    {
        ld_context lctx;
        lctx_init( &lctx );

        rc = KDirectoryNativeDir ( &lctx.wd );
        if ( rc != 0 )
        {
            LOGERR( klogErr, rc, "error creating internal structure" );
        }
        else
        {
            rc = XMLLogger_Make( &lctx.xml_logger, lctx.wd, args );
            if ( rc != 0 )
            {
                LOGERR( klogErr, rc, "error creating internal structure" );
            }
            else
            {
                context ctx;
                rc = ctx_init( args, &ctx );
                if ( rc == 0 )
                {
                    rc = pacbio_check_sourcefile( &ctx, &lctx );
                    if ( rc == 0 )
                    {
                        lctx.with_progress = ctx.with_progress;
                        ctx_show( &ctx );
                        lctx.dst_path = ctx.dst_path;
                        rc = pacbio_load( &ctx, &lctx, false, false );
                        if ( rc == 0 )
                        {
                            rc = pacbio_meta_entry( &lctx, argv[ 0 ] );
                        }
                    }
                    ctx_free( &ctx );
                }
            }
        }
        lctx_free( &lctx );
        ArgsWhack ( args );
    }
    return rc;
}
void hash_table_alloc(HashTable *ht, uint64_t req_capacity)
{
  uint64_t num_of_buckets, capacity;
  uint8_t bucket_size;

  capacity = hash_table_cap(req_capacity, &num_of_buckets, &bucket_size);
  uint_fast32_t hash_mask = (uint_fast32_t)(num_of_buckets - 1);

  size_t mem = capacity * sizeof(BinaryKmer) +
               num_of_buckets * sizeof(uint8_t[2]);

  char num_bkts_str[100], bkt_size_str[100], cap_str[100], mem_str[100];
  ulong_to_str(num_of_buckets, num_bkts_str);
  ulong_to_str(bucket_size, bkt_size_str);
  ulong_to_str(capacity, cap_str);
  bytes_to_str(mem, 1, mem_str);

  status("[hasht] Allocating table with %s entries, using %s", cap_str, mem_str);
  status("[hasht] number of buckets: %s, bucket size: %s", num_bkts_str, bkt_size_str);

  // calloc is required for buckets, to set the first element of each bucket
  // to the 0th pos
  BinaryKmer *table = ctx_malloc(capacity * sizeof(BinaryKmer));
  uint8_t (*const buckets)[2] = ctx_calloc(num_of_buckets, sizeof(uint8_t[2]));

  size_t i;
  for(i = 0; i < capacity; i++) table[i] = unset_bkmer;

  HashTable data = {.table = table,
                    .num_of_buckets = num_of_buckets,
                    .hash_mask = hash_mask,
                    .bucket_size = bucket_size,
                    .capacity = capacity,
                    .buckets = buckets,
                    .num_kmers = 0,
                    .collisions = {0},
                    .seed = rand()};

  memcpy(ht, &data, sizeof(data));
}

void hash_table_dealloc(HashTable *hash_table)
{
  ctx_free(hash_table->table);
  ctx_free(hash_table->buckets);
}
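/* Minimal alloc/dealloc pairing. The requested capacity is a hint: it is
 * rounded by hash_table_cap() to a bucket layout whose bucket count is a
 * power of two (hash_mask = num_of_buckets - 1 above relies on this). */
static void hash_table_example(void)
{
  HashTable ht;
  hash_table_alloc(&ht, 1<<20); // request ~1M entries
  // ... insert and look up kmers (not shown) ...
  hash_table_dealloc(&ht);
}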
static void supernode_cleaner_alloc(SupernodeCleaner *cl, size_t nthreads,
                                    size_t covg_threshold, size_t min_keep_tip,
                                    uint8_t *keep_flags,
                                    const dBGraph *db_graph)
{
  size_t i;
  CovgBuffer *cbufs = ctx_calloc(nthreads, sizeof(CovgBuffer));
  for(i = 0; i < nthreads; i++)
    covg_buf_alloc(&cbufs[i], 1024);

  uint64_t *covg_hist_init, *covg_hist_cleaned;
  uint64_t *mean_covg_hist_init, *mean_covg_hist_cleaned;
  uint64_t *len_hist_init, *len_hist_cleaned;

  covg_hist_init = ctx_calloc(DUMP_COVG_ARRSIZE, sizeof(uint64_t));
  covg_hist_cleaned = ctx_calloc(DUMP_COVG_ARRSIZE, sizeof(uint64_t));
  mean_covg_hist_init = ctx_calloc(DUMP_MEAN_COVG_ARRSIZE, sizeof(uint64_t));
  mean_covg_hist_cleaned = ctx_calloc(DUMP_MEAN_COVG_ARRSIZE, sizeof(uint64_t));
  len_hist_init = ctx_calloc(DUMP_LEN_ARRSIZE, sizeof(uint64_t));
  len_hist_cleaned = ctx_calloc(DUMP_LEN_ARRSIZE, sizeof(uint64_t));

  SupernodeCleaner tmp = {.nthreads = nthreads,
                          .covg_threshold = covg_threshold,
                          .min_keep_tip = min_keep_tip,
                          .cbufs = cbufs,
                          .covg_hist_init = covg_hist_init,
                          .covg_hist_cleaned = covg_hist_cleaned,
                          .covg_arrsize = DUMP_COVG_ARRSIZE,
                          .mean_covg_hist_init = mean_covg_hist_init,
                          .mean_covg_hist_cleaned = mean_covg_hist_cleaned,
                          .mean_covg_arrsize = DUMP_MEAN_COVG_ARRSIZE,
                          .len_hist_init = len_hist_init,
                          .len_hist_cleaned = len_hist_cleaned,
                          .len_arrsize = DUMP_LEN_ARRSIZE,
                          .keep_flags = keep_flags,
                          .num_tips = 0,
                          .num_low_covg_snodes = 0,
                          .num_tip_and_low_snodes = 0,
                          .num_tip_kmers = 0,
                          .num_low_covg_snode_kmers = 0,
                          .num_tip_and_low_snode_kmers = 0,
                          .db_graph = db_graph};

  memcpy(cl, &tmp, sizeof(SupernodeCleaner));
}

static void supernode_cleaner_dealloc(SupernodeCleaner *cl)
{
  size_t i;
  for(i = 0; i < cl->nthreads; i++)
    covg_buf_dealloc(&cl->cbufs[i]);
  ctx_free(cl->cbufs);
  ctx_free(cl->covg_hist_init);
  ctx_free(cl->covg_hist_cleaned);
  ctx_free(cl->mean_covg_hist_init);
  ctx_free(cl->mean_covg_hist_cleaned);
  ctx_free(cl->len_hist_init);
  ctx_free(cl->len_hist_cleaned);
  memset(cl, 0, sizeof(SupernodeCleaner));
}
void gpath_store_reset(GPathStore *gpstore)
{
  gpath_set_reset(&gpstore->gpset);
  gpstore->num_kmers_with_paths = gpstore->num_paths = gpstore->path_bytes = 0;
  memset(gpstore->paths_all, 0, gpstore->graph_capacity * sizeof(GPath*));
  if(gpstore->paths_traverse != gpstore->paths_all)
    ctx_free(gpstore->paths_traverse);
  gpstore->paths_traverse = gpstore->paths_all;
}
void acall_destroy(AlignedCall *call)
{
  size_t i;
  for(i = 0; i < call->n_lines; i++)
    strbuf_dealloc(&call->lines[i]);
  free(call->lines);
  free(call->gts);
  strbuf_dealloc(&call->info);
  ctx_free(call);
}
void gpath_store_merge_read_write(GPathStore *gpstore)
{
  if(gpstore->paths_traverse != gpstore->paths_all)
  {
    status("[GPathStore] Merging read/write GraphPath linked lists");
    ctx_free(gpstore->paths_traverse); // does nothing if NULL
    gpstore->paths_traverse = gpstore->paths_all;
  }
}
static void unitig_cleaner_alloc(UnitigCleaner *cl, size_t nthreads,
                                 size_t covg_threshold, size_t min_keep_tip,
                                 uint8_t *keep_flags,
                                 const dBGraph *db_graph)
{
  size_t i;
  CovgBuffer *cbufs = ctx_calloc(nthreads, sizeof(CovgBuffer));
  for(i = 0; i < nthreads; i++)
    covg_buf_alloc(&cbufs[i], 1024);

  uint64_t *kmer_covgs_init, *kmer_covgs_clean;
  uint64_t *unitig_covgs_init, *unitig_covg_clean;
  uint64_t *len_hist_init, *len_hist_clean;

  kmer_covgs_init = ctx_calloc(DUMP_COVG_ARRSIZE, sizeof(uint64_t));
  kmer_covgs_clean = ctx_calloc(DUMP_COVG_ARRSIZE, sizeof(uint64_t));
  unitig_covgs_init = ctx_calloc(DUMP_COVG_ARRSIZE, sizeof(uint64_t));
  unitig_covg_clean = ctx_calloc(DUMP_COVG_ARRSIZE, sizeof(uint64_t));
  len_hist_init = ctx_calloc(DUMP_LEN_ARRSIZE, sizeof(uint64_t));
  len_hist_clean = ctx_calloc(DUMP_LEN_ARRSIZE, sizeof(uint64_t));

  UnitigCleaner tmp = {.nthreads = nthreads,
                       .covg_threshold = covg_threshold,
                       .min_keep_tip = min_keep_tip,
                       .cbufs = cbufs,
                       .kmer_covgs_init = kmer_covgs_init,
                       .kmer_covgs_clean = kmer_covgs_clean,
                       .unitig_covgs_init = unitig_covgs_init,
                       .unitig_covg_clean = unitig_covg_clean,
                       .covg_arrsize = DUMP_COVG_ARRSIZE,
                       .len_hist_init = len_hist_init,
                       .len_hist_clean = len_hist_clean,
                       .len_arrsize = DUMP_LEN_ARRSIZE,
                       .keep_flags = keep_flags,
                       .num_tips = 0,
                       .num_low_covg_snodes = 0,
                       .num_tip_and_low_snodes = 0,
                       .num_tip_kmers = 0,
                       .num_low_covg_snode_kmers = 0,
                       .num_tip_and_low_snode_kmers = 0,
                       .db_graph = db_graph};

  memcpy(cl, &tmp, sizeof(UnitigCleaner));
}

static void unitig_cleaner_dealloc(UnitigCleaner *cl)
{
  size_t i;
  for(i = 0; i < cl->nthreads; i++)
    covg_buf_dealloc(&cl->cbufs[i]);
  ctx_free(cl->cbufs);
  ctx_free(cl->kmer_covgs_init);
  ctx_free(cl->kmer_covgs_clean);
  ctx_free(cl->unitig_covgs_init);
  ctx_free(cl->unitig_covg_clean);
  ctx_free(cl->len_hist_init);
  ctx_free(cl->len_hist_clean);
  memset(cl, 0, sizeof(UnitigCleaner));
}
/**
 * Save paths to a file.
 * @param gzout gzFile to write to
 * @param path path of output file
 * @param save_path_seq if true, save seq= and juncpos= for links; requires
 *                      exactly one colour in the graph
 * @param hdrs array of JSON headers of input files
 */
void gpath_save(gzFile gzout, const char *path,
                size_t nthreads, bool save_path_seq,
                const char *cmdstr, cJSON *cmdhdr,
                cJSON **hdrs, size_t nhdrs,
                const ZeroSizeBuffer *contig_hists, size_t ncols,
                dBGraph *db_graph)
{
  ctx_assert(nthreads > 0);
  ctx_assert(gpath_set_has_nseen(&db_graph->gpstore.gpset));
  ctx_assert(ncols == db_graph->gpstore.gpset.ncols);
  ctx_assert(!save_path_seq || db_graph->num_of_cols == 1); // save_path => 1 colour

  char npaths_str[50];
  ulong_to_str(db_graph->gpstore.num_paths, npaths_str);

  status("Saving %s paths to: %s", npaths_str, path);
  status(" using %zu threads", nthreads);

  // Write header
  cJSON *json = gpath_save_mkhdr(path, cmdstr, cmdhdr, hdrs, nhdrs,
                                 contig_hists, ncols, db_graph);
  json_hdr_gzprint(json, gzout);
  cJSON_Delete(json);

  // Print comments about the format
  gzputs(gzout, ctp_explanation_comment);

  // Multithreaded
  GPathSaver *wrkrs = ctx_calloc(nthreads, sizeof(GPathSaver));
  pthread_mutex_t outlock;
  size_t i;

  if(pthread_mutex_init(&outlock, NULL) != 0) die("Mutex init failed");

  for(i = 0; i < nthreads; i++) {
    wrkrs[i] = (GPathSaver){.threadid = i,
                            .nthreads = nthreads,
                            .save_seq = save_path_seq,
                            .gzout = gzout,
                            .outlock = &outlock,
                            .db_graph = db_graph};
  }

  // Iterate over kmers writing paths
  util_run_threads(wrkrs, nthreads, sizeof(*wrkrs), nthreads, gpath_save_thread);

  pthread_mutex_destroy(&outlock);
  ctx_free(wrkrs);

  status("[GPathSave] Graph paths saved to %s", path);
}
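/* The per-thread writer pattern above in miniature: N workers share one
 * output stream guarded by a single mutex. A self-contained sketch, not the
 * real gpath_save_thread (the gzFile write is shown as a comment to keep the
 * sketch free of a zlib dependency): */
#include <pthread.h>
typedef struct {
  int id;
  pthread_mutex_t *outlock; /* shared with all workers */
  /* gzFile gzout; */
} WriterSketch;

static void writer_sketch(WriterSketch *w)
{
  pthread_mutex_lock(w->outlock);
  // gzprintf(w->gzout, "...paths for kmers owned by worker %i...\n", w->id);
  pthread_mutex_unlock(w->outlock);
}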
void chrom_hash_load(char const*const* paths, size_t num_files,
                     ReadBuffer *chroms, ChromHash *genome)
{
  size_t i;
  seq_file_t **ref_files = ctx_malloc(num_files * sizeof(seq_file_t*));

  for(i = 0; i < num_files; i++)
    if((ref_files[i] = seq_open(paths[i])) == NULL)
      die("Cannot read sequence file: %s", paths[i]);

  chrom_hash_load2(ref_files, num_files, chroms, genome);
  ctx_free(ref_files);
}
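/* Usage sketch for the loader above. The init/destroy and ReadBuffer calls
 * mirror their use in ctx_calls2vcf() later in this file; treat the exact
 * pairing as an assumption: */
static void load_genome_sketch(const char *fasta_path)
{
  ReadBuffer chroms;
  read_buf_alloc(&chroms, 1024);
  ChromHash *genome = chrom_hash_init();

  const char *const paths[] = {fasta_path};
  chrom_hash_load(paths, 1, &chroms, genome);

  // ... query genome by chromosome name ...

  size_t i;
  for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]);
  read_buf_dealloc(&chroms);
  chrom_hash_destroy(genome);
}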
static void pull_out_supernodes(const char **seq, const char **ans, size_t n,
                                const dBGraph *graph)
{
  dBNodeBuffer nbuf;
  db_node_buf_alloc(&nbuf, 1024);

  // 1. Check pulling out supernodes works for iterating over the graph
  uint64_t *visited;
  visited = ctx_calloc(roundup_bits2words64(graph->ht.capacity), 8);
  HASH_ITERATE(&graph->ht, supernode_from_kmer,
               &nbuf, visited, graph, ans, n);
  ctx_free(visited);

  // 2. Check pulling out supernodes works when we iterate over inputs
  size_t i, j, len;
  dBNode node;
  char tmpstr[SNODEBUF];

  for(i = 0; i < n; i++) {
    len = strlen(seq[i]);
    for(j = 0; j+graph->kmer_size <= len; j++) {
      // Find node
      node = db_graph_find_str(graph, seq[i]+j);
      TASSERT(node.key != HASH_NOT_FOUND);

      // Fetch supernode
      db_node_buf_reset(&nbuf);
      supernode_find(node.key, &nbuf, graph);
      supernode_normalise(nbuf.b, nbuf.len, graph);

      // Compare
      TASSERT(nbuf.len < SNODEBUF);
      db_nodes_to_str(nbuf.b, nbuf.len, graph, tmpstr);
      if(strcmp(tmpstr, ans[i]) != 0)
        test_status("Got: %s from ans[i]:%s\n", tmpstr, ans[i]);
      TASSERT(strcmp(tmpstr, ans[i]) == 0);
    }
  }

  db_node_buf_dealloc(&nbuf);
}
/* Finalization */
void ke_done(void)
{
    struct ctx_t *ctx;

    /* Finish all contexts */
    for (ctx = ke->context_list_head; ctx; ctx = ctx->context_next)
        if (!ctx_get_status(ctx, ctx_finished))
            ctx_finish(ctx, 0);

    /* Free contexts */
    while (ke->context_list_head)
        ctx_free(ke->context_list_head);

    /* Finalize GPU kernel */
    gk_done();

    /* End */
    free(ke);
    isa_done();
    syscall_summary();
}
/* Execute one instruction from each running context. */
void ke_run(void)
{
    struct ctx_t *ctx;

    /* Run an instruction slice from every running process */
    for (ctx = ke->running_list_head; ctx; ctx = ctx->running_next) {
        int i;
        //printf ("out - %p\n", ctx);
        for (i = 0; i < ctx->instr_slice; ++i) {
            /* Unblock the process whose interrupt is due now */
            if ((no_instructions == get_least_interrupt_time()) && least_interrupt != NULL) {
                struct interrupt_t *curr_interrupt = get_least_interrupt();
                curr_interrupt->details->p_ctx->blocked = 0;
                printf("program with pid %d unblocked at %d\n",
                       curr_interrupt->details->p_ctx->pid, no_instructions);
                delete_interrupt(curr_interrupt);
            }
            if (!ctx->blocked) {
                ctx_execute_inst(ctx);
                no_instructions++;
                printf("%d,%d\n", ctx->pid, no_instructions);
            }
            /* Stop this slice if the context is no longer at the head of
               the running list */
            if (ctx != ke->running_list_head)
                break;
        }
    }

    /* Free finished contexts */
    while (ke->finished_list_head)
        ctx_free(ke->finished_list_head);

    /* Process list of suspended contexts */
    ke_process_events();
}
// Merge temporary files, closes tmp files
void futil_merge_tmp_files(FILE **tmp_files, size_t num_files, FILE *fout)
{
#define TMP_BUF_SIZE (1<<25) /* 32MB */

  char *data = ctx_malloc(TMP_BUF_SIZE);
  size_t i, len;
  FILE *tmp_file;

  for(i = 0; i < num_files; i++) {
    tmp_file = tmp_files[i];
    // fseek() reports failure with any non-zero return value
    if(fseek(tmp_file, 0L, SEEK_SET) != 0) die("fseek error");
    while((len = fread(data, 1, TMP_BUF_SIZE, tmp_file)) > 0)
      if(fwrite(data, 1, len, fout) != len)
        die("write error [%s]", strerror(errno));
    fclose(tmp_file);
  }

  ctx_free(data);
#undef TMP_BUF_SIZE
}
void start(struct config *config)
{
  assert(config != NULL);
  assert(config->num_threads > 0);
  assert(config->port > 0);
  assert(config->nodes != NULL);
  assert(config->num_nodes > 0);

  pthread_t threads[config->num_threads];
  struct ctx *ctxs[config->num_threads];
  int i;

  for (i = 0; i < config->num_threads; i++) {
    if ((ctxs[i] = ctx_new(config->nodes, config->num_nodes,
                           config->port, config->flush_interval)) == NULL)
      exit(1);
    pthread_create(&threads[i], NULL, &thread_start, ctxs[i]);
  }

  for (i = 0; i < config->num_threads; i++) {
    pthread_join(threads[i], NULL);
    ctx_free(ctxs[i]);
  }
}
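/* The worker passed to pthread_create() above must use the standard pthread
 * entry signature. A hypothetical sketch (the real thread_start is not shown
 * in this file, and the flush behaviour is an assumption from the
 * flush_interval parameter): */
static void *thread_start_sketch(void *arg)
{
  struct ctx *ctx = arg;
  // ... serve the configured nodes, flushing every ctx->flush_interval ...
  (void)ctx;
  return NULL;
}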
int ctx_clean(int argc, char **argv)
{
  size_t nthreads = 0, use_ncols = 0;
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_ctx_path = NULL;
  bool tip_cleaning = false, supernode_cleaning = false;
  size_t min_keep_tip = 0;
  Covg threshold = 0, fallback_thresh = 0;
  const char *len_before_path = NULL, *len_after_path = NULL;
  const char *covg_before_path = NULL, *covg_after_path = NULL;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o':
        if(out_ctx_path != NULL) cmd_print_usage(NULL);
        out_ctx_path = optarg;
        break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'N': use_ncols = cmd_uint32_nonzero(cmd, optarg); break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'T':
        cmd_check(!tip_cleaning, cmd);
        min_keep_tip = cmd_uint32_nonzero(cmd, optarg);
        tip_cleaning = true;
        break;
      case 'S':
        cmd_check(!supernode_cleaning, cmd);
        if(optarg != NULL) threshold = cmd_uint32_nonzero(cmd, optarg);
        supernode_cleaning = true;
        break;
      case 'B': cmd_check(!fallback_thresh, cmd); fallback_thresh = cmd_uint32_nonzero(cmd, optarg); break;
      case 'l': cmd_check(!len_before_path, cmd); len_before_path = optarg; break;
      case 'L': cmd_check(!len_after_path, cmd); len_after_path = optarg; break;
      case 'c': cmd_check(!covg_before_path, cmd); covg_before_path = optarg; break;
      case 'C': cmd_check(!covg_after_path, cmd); covg_after_path = optarg; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" clean -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(nthreads == 0) nthreads = DEFAULT_NTHREADS;

  if(optind >= argc) cmd_print_usage("Please give input graph files");

  // Default behaviour
  if(!tip_cleaning && !supernode_cleaning) {
    if(out_ctx_path != NULL)
      supernode_cleaning = tip_cleaning = true; // do both
    else
      warn("No cleaning being done: you did not specify --out <out.ctx>");
  }

  bool doing_cleaning = (supernode_cleaning || tip_cleaning);

  if(doing_cleaning && out_ctx_path == NULL)
    cmd_print_usage("Please specify --out <out.ctx> for cleaned graph");

  if(!doing_cleaning && (covg_after_path || len_after_path)) {
    cmd_print_usage("You gave --len-after <out> / --covg-after <out> without "
                    "any cleaning (set -s, --supernodes or -t, --tips)");
  }

  if(doing_cleaning && strcmp(out_ctx_path,"-") != 0 &&
     !futil_get_force() && futil_file_exists(out_ctx_path))
  {
    cmd_print_usage("Output file already exists: %s", out_ctx_path);
  }

  if(fallback_thresh && !supernode_cleaning)
    cmd_print_usage("-B, --fallback <T> without --supernodes");

  // Use remaining args as graph files
  char **gfile_paths = argv + optind;
  size_t i, j, num_gfiles = (size_t)(argc - optind);

  // Open graph files
  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(gfile_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  size_t kmer_size = gfiles[0].hdr.kmer_size;

  // default to one colour for now
  if(use_ncols == 0) use_ncols = 1;

  // Flatten if we don't have to remember colours / output a graph
  if(!doing_cleaning) {
    ncols = use_ncols = 1;
    for(i = 0; i < num_gfiles; i++)
      file_filter_flatten(&gfiles[i].fltr, 0);
  }

  if(ncols < use_ncols) {
    warn("I only need %zu colour%s ('--ncols %zu' ignored)",
         ncols, util_plural_str(ncols), use_ncols);
    use_ncols = ncols;
  }

  char max_kmers_str[100];
  ulong_to_str(ctx_max_kmers, max_kmers_str);
  status("%zu input graph%s, max kmers: %s, using %zu colours",
         num_gfiles, util_plural_str(num_gfiles), max_kmers_str, use_ncols);

  // If no arguments given we default to removing tips < 2*kmer_size
  if(tip_cleaning && min_keep_tip == 0)
    min_keep_tip = 2 * kmer_size;

  // Warn if any graph files already cleaned
  size_t fromcol, intocol;
  ErrorCleaning *cleaning;

  for(i = 0; i < num_gfiles; i++) {
    for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++) {
      fromcol = file_filter_fromcol(&gfiles[i].fltr, j);
      cleaning = &gfiles[i].hdr.ginfo[fromcol].cleaning;
      if(cleaning->cleaned_snodes && supernode_cleaning) {
        warn("%s:%zu already has supernode cleaning with threshold: <%zu",
             file_filter_path(&gfiles[i].fltr), fromcol,
             (size_t)cleaning->clean_snodes_thresh);
      }
      if(cleaning->cleaned_tips && tip_cleaning) {
        warn("%s:%zu already has had tip cleaned",
             file_filter_path(&gfiles[i].fltr), fromcol);
      }
    }
  }

  // Print steps
  size_t step = 0;
  status("Actions:\n");
  if(covg_before_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_before_path);
  if(len_before_path != NULL)
    status("%zu. Saving supernode length distribution to: %s", step++, len_before_path);
  if(tip_cleaning)
    status("%zu. Cleaning tips shorter than %zu nodes", step++, min_keep_tip);
  if(supernode_cleaning && threshold > 0)
    status("%zu. Cleaning supernodes with coverage < %u", step++, threshold);
  if(supernode_cleaning && threshold <= 0)
    status("%zu. Cleaning supernodes with auto-detected threshold", step++);
  if(covg_after_path != NULL)
    status("%zu. Saving kmer coverage distribution to: %s", step++, covg_after_path);
  if(len_after_path != NULL)
    status("%zu. Saving supernode length distribution to: %s", step++, len_after_path);

  //
  // Decide memory usage
  //
  bool all_colours_loaded = (ncols <= use_ncols);
  bool use_mem_limit = (memargs.mem_to_use_set && num_gfiles > 1) || !ctx_max_kmers;

  size_t kmers_in_hash, bits_per_kmer, graph_mem;
  size_t per_kmer_per_col_bits = (sizeof(BinaryKmer)+sizeof(Covg)+sizeof(Edges)) * 8;
  size_t pop_edges_per_kmer_bits = (all_colours_loaded ? 0 : sizeof(Edges) * 8);

  bits_per_kmer = per_kmer_per_col_bits * use_ncols + pop_edges_per_kmer_bits;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        use_mem_limit, &graph_mem);

  // Maximise the number of colours we load to fill the mem
  size_t max_usencols = (memargs.mem_to_use*8 - pop_edges_per_kmer_bits * kmers_in_hash) /
                        (per_kmer_per_col_bits * kmers_in_hash);
  use_ncols = MIN2(max_usencols, ncols);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Check output files are writable
  //
  futil_create_output(out_ctx_path); // Does nothing if arg is NULL
  futil_create_output(covg_before_path);
  futil_create_output(covg_after_path);
  futil_create_output(len_before_path);
  futil_create_output(len_after_path);

  // Create db_graph
  // Load as many colours as possible
  // Use an extra set of edges to take intersections
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols,
                 kmers_in_hash, DBG_ALLOC_COVGS);

  // Edges is a special case
  size_t num_edges = db_graph.ht.capacity * (use_ncols + !all_colours_loaded);
  db_graph.col_edges = ctx_calloc(num_edges, sizeof(Edges));

  // Load graph into a single colour
  LoadingStats stats = LOAD_STATS_INIT_MACRO;

  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .must_exist_in_edges = NULL,
                              .empty_colours = false};

  // Construct cleaned graph header
  GraphFileHeader outhdr;
  memset(&outhdr, 0, sizeof(GraphFileHeader));
  outhdr.version = CTX_GRAPH_FILEFORMAT;
  outhdr.kmer_size = db_graph.kmer_size;
  outhdr.num_of_cols = ncols;
  outhdr.num_of_bitfields = (db_graph.kmer_size*2+63)/64;
  graph_header_alloc(&outhdr, ncols);

  // Merge info into header
  size_t gcol = 0;
  for(i = 0; i < num_gfiles; i++) {
    for(j = 0; j < file_filter_num(&gfiles[i].fltr); j++, gcol++) {
      fromcol = file_filter_fromcol(&gfiles[i].fltr, j);
      intocol = file_filter_intocol(&gfiles[i].fltr, j);
      graph_info_merge(&outhdr.ginfo[intocol], &gfiles[i].hdr.ginfo[fromcol]);
    }
  }

  if(ncols > use_ncols) {
    graph_files_load_flat(gfiles, num_gfiles, gprefs, &stats);
  } else {
    for(i = 0; i < num_gfiles; i++)
      graph_load(&gfiles[i], gprefs, &stats);
  }

  char num_kmers_str[100];
  ulong_to_str(db_graph.ht.num_kmers, num_kmers_str);
  status("Total kmers loaded: %s\n", num_kmers_str);

  size_t initial_nkmers = db_graph.ht.num_kmers;
  hash_table_print_stats(&db_graph.ht);

  uint8_t *visited = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);
  uint8_t *keep = ctx_calloc(roundup_bits2bytes(db_graph.ht.capacity), 1);

  if((supernode_cleaning && threshold <= 0) || covg_before_path || len_before_path)
  {
    // Get coverage distribution and estimate cleaning threshold
    int est_threshold = cleaning_get_threshold(nthreads,
                                               covg_before_path,
                                               len_before_path,
                                               visited, &db_graph);

    if(est_threshold < 0) status("Cannot find recommended cleaning threshold");
    else status("Recommended cleaning threshold is: %i", est_threshold);

    // Use estimated threshold if threshold not set
    if(threshold <= 0) {
      if(fallback_thresh > 0 && est_threshold < (int)fallback_thresh) {
        status("Using fallback threshold: %u", fallback_thresh);
        threshold = fallback_thresh;
      }
      else if(est_threshold >= 0) threshold = est_threshold;
    }
  }

  // Die if we failed to find suitable cleaning threshold
  if(supernode_cleaning && threshold <= 0)
    die("Need cleaning threshold (--supernodes=<D> or --fallback <D>)");

  if(doing_cleaning) {
    // Clean graph of tips (if min_keep_tip > 0) and supernodes (if threshold > 0)
    clean_graph(nthreads, threshold, min_keep_tip,
                covg_after_path, len_after_path,
                visited, keep, &db_graph);
  }

  ctx_free(visited);
  ctx_free(keep);

  if(doing_cleaning)
  {
    // Output graph file
    Edges *intersect_edges = NULL;
    bool kmers_loaded = true;
    size_t col, thresh;

    // Set output header ginfo cleaned
    for(col = 0; col < ncols; col++) {
      cleaning = &outhdr.ginfo[col].cleaning;
      cleaning->cleaned_snodes |= supernode_cleaning;
      cleaning->cleaned_tips |= tip_cleaning;

      // if(tip_cleaning) {
      //   strbuf_append_str(&outhdr.ginfo[col].sample_name, ".tipclean");
      // }

      if(supernode_cleaning) {
        thresh = cleaning->clean_snodes_thresh;
        thresh = cleaning->cleaned_snodes ? MAX2(thresh, (uint32_t)threshold)
                                          : (uint32_t)threshold;
        cleaning->clean_snodes_thresh = thresh;

        // char name_append[200];
        // sprintf(name_append, ".supclean%zu", thresh);
        // strbuf_append_str(&outhdr.ginfo[col].sample_name, name_append);
      }
    }

    if(!all_colours_loaded) {
      // We haven't loaded all the colours
      // intersect_edges are edges to mask with
      // resets graph edges
      intersect_edges = db_graph.col_edges;
      db_graph.col_edges += db_graph.ht.capacity;
    }

    // Print stats on removed kmers
    size_t removed_nkmers = initial_nkmers - db_graph.ht.num_kmers;
    double removed_pct = (100.0 * removed_nkmers) / initial_nkmers;
    char removed_str[100], init_str[100];
    ulong_to_str(removed_nkmers, removed_str);
    ulong_to_str(initial_nkmers, init_str);
    status("Removed %s of %s (%.2f%%) kmers", removed_str, init_str, removed_pct);

    graph_files_merge(out_ctx_path, gfiles, num_gfiles,
                      kmers_loaded, all_colours_loaded,
                      intersect_edges, &outhdr, &db_graph);

    // Swap back
    if(!all_colours_loaded)
      db_graph.col_edges = intersect_edges;
  }

  ctx_check(db_graph.ht.num_kmers == hash_table_count_kmers(&db_graph.ht));

  graph_header_dealloc(&outhdr);

  for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]);
  ctx_free(gfiles);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
int main(int argc, char** argv)
{
    struct timeval start_time, end_time;
    readstat_error_t error = READSTAT_OK;
    char *input_filename = NULL;
    char *catalog_filename = NULL;
    char *output_filename = NULL;

    if (argc == 2 && (strcmp(argv[1], "-v") == 0 || strcmp(argv[1], "--version") == 0)) {
        print_version();
        return 0;
    } else if (argc == 2 && (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0)) {
        print_usage(argv[0]);
        return 0;
    }

    if (argc == 3) {
        if (!can_read(argv[1]) || !can_write(argv[2])) {
            print_usage(argv[0]);
            return 1;
        }
        input_filename = argv[1];
        output_filename = argv[2];
    } else if (argc == 4) {
        if (!can_read(argv[1]) || !is_catalog(argv[2]) || !can_write(argv[3])) {
            print_usage(argv[0]);
            return 1;
        }
        input_filename = argv[1];
        catalog_filename = argv[2];
        output_filename = argv[3];
    } else {
        print_usage(argv[0]);
        return 1;
    }

    int input_format = format(input_filename);
    int output_format = format(output_filename);

    gettimeofday(&start_time, NULL);

    int fd = open(output_filename, O_CREAT | O_WRONLY | O_EXCL, 0644);
    if (fd == -1) {
        dprintf(STDERR_FILENO, "Error opening %s for writing: %s\n",
                output_filename, strerror(errno));
        return 1;
    }

    readstat_parser_t *pass1_parser = readstat_parser_init();
    readstat_parser_t *pass2_parser = readstat_parser_init();
    readstat_writer_t *writer = readstat_writer_init();
    readstat_writer_set_file_label(writer, "Created by ReadStat <https://github.com/WizardMac/ReadStat>");

    rs_ctx_t *rs_ctx = ctx_init();
    rs_ctx->writer = writer;
    rs_ctx->out_fd = fd;
    rs_ctx->out_format = output_format;

    readstat_set_data_writer(writer, &write_data);

    // Pass 1 - Collect fweight and value labels
    readstat_set_error_handler(pass1_parser, &handle_error);
    readstat_set_info_handler(pass1_parser, &handle_info);
    readstat_set_value_label_handler(pass1_parser, &handle_value_label);
    readstat_set_fweight_handler(pass1_parser, &handle_fweight);

    if (catalog_filename) {
        error = parse_file(pass1_parser, catalog_filename, RS_FORMAT_SAS_CATALOG, rs_ctx);
    } else {
        error = parse_file(pass1_parser, input_filename, input_format, rs_ctx);
    }
    if (error != READSTAT_OK) goto cleanup;

    // Pass 2 - Parse full file
    readstat_set_error_handler(pass2_parser, &handle_error);
    readstat_set_info_handler(pass2_parser, &handle_info);
    readstat_set_variable_handler(pass2_parser, &handle_variable);
    readstat_set_value_handler(pass2_parser, &handle_value);

    error = parse_file(pass2_parser, input_filename, input_format, rs_ctx);
    if (error != READSTAT_OK) goto cleanup;

    gettimeofday(&end_time, NULL);

    dprintf(STDERR_FILENO, "Converted %ld variables and %ld rows in %.2lf seconds\n",
            rs_ctx->var_count, rs_ctx->row_count,
            (end_time.tv_sec + 1e-6 * end_time.tv_usec) -
            (start_time.tv_sec + 1e-6 * start_time.tv_usec));

cleanup:
    readstat_parser_free(pass1_parser);
    readstat_parser_free(pass2_parser);
    readstat_writer_free(writer);
    ctx_free(rs_ctx);
    close(fd);

    if (error != READSTAT_OK) {
        dprintf(STDERR_FILENO, "%s\n", readstat_error_message(error));
        unlink(output_filename);
        return 1;
    }

    return 0;
}
int ctx_thread(int argc, char **argv)
{
  struct ReadThreadCmdArgs args;
  read_thread_args_alloc(&args);
  read_thread_args_parse(&args, argc, argv, longopts, false);

  GraphFileReader *gfile = &args.gfile;
  GPathFileBuffer *gpfiles = &args.gpfiles;
  CorrectAlnInputBuffer *inputs = &args.inputs;
  size_t i;

  if(args.zero_link_counts && gpfiles->len == 0)
    cmd_print_usage("-0,--zero-paths without -p,--paths <in.ctp> has no meaning");

  // Check each path file only loads one colour
  gpaths_only_for_colour(gpfiles->b, gpfiles->len, 0);

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem, total_mem;
  size_t path_hash_mem, path_store_mem, path_mem;
  bool sep_path_list = (!args.use_new_paths && gpfiles->len > 0);

  bits_per_kmer = sizeof(BinaryKmer)*8 +
                  sizeof(Edges)*8 +
                  sizeof(GPath*)*8 +
                  2 * args.nthreads; // Have traversed

  // false -> don't use mem_to_use to decide how many kmers to store in hash
  // since we need some of that memory for storing paths
  kmers_in_hash = cmd_get_kmers_in_hash(args.memargs.mem_to_use,
                                        args.memargs.mem_to_use_set,
                                        args.memargs.num_kmers,
                                        args.memargs.num_kmers_set,
                                        bits_per_kmer,
                                        gfile->num_of_kmers,
                                        gfile->num_of_kmers,
                                        false, &graph_mem);

  // Paths memory
  size_t min_path_mem = 0;
  gpath_reader_sum_mem(gpfiles->b, gpfiles->len, 1, true, true, &min_path_mem);

  if(graph_mem + min_path_mem > args.memargs.mem_to_use) {
    char buf[50];
    die("Require at least %s memory", bytes_to_str(graph_mem+min_path_mem, 1, buf));
  }

  path_mem = args.memargs.mem_to_use - graph_mem;
  size_t pentry_hash_mem = sizeof(GPEntry)/0.7;
  size_t pentry_store_mem = sizeof(GPath) + 8 + // struct + sequence
                            1 + // in colour
                            sizeof(uint8_t) + // counts
                            sizeof(uint32_t); // kmer length

  size_t max_paths = path_mem / (pentry_store_mem + pentry_hash_mem);
  path_store_mem = max_paths * pentry_store_mem;
  path_hash_mem = max_paths * pentry_hash_mem;
  cmd_print_mem(path_hash_mem, "paths hash");
  cmd_print_mem(path_store_mem, "paths store");

  total_mem = graph_mem + path_mem;
  cmd_check_mem_limit(args.memargs.mem_to_use, total_mem);

  //
  // Open output file
  //
  gzFile gzout = futil_gzopen_create(args.out_ctp_path, "w");

  status("Creating paths file: %s", futil_outpath_str(args.out_ctp_path));

  //
  // Allocate memory
  //
  dBGraph db_graph;
  size_t kmer_size = gfile->hdr.kmer_size;
  db_graph_alloc(&db_graph, kmer_size, 1, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL);

  // Split path memory 2:1 between store and hash
  // Create a path store that tracks path counts
  gpath_store_alloc(&db_graph.gpstore,
                    db_graph.num_of_cols, db_graph.ht.capacity,
                    0, path_store_mem, true, sep_path_list);

  // Create path hash table for fast lookup
  gpath_hash_alloc(&db_graph.gphash, &db_graph.gpstore, path_hash_mem);

  if(args.use_new_paths)
    status("Using paths as they are added (risky)");
  else
    status("Not using new paths as they are added (safe)");

  //
  // Start up workers to add paths to the graph
  //
  GenPathWorker *workers;
  workers = gen_paths_workers_alloc(args.nthreads, &db_graph);

  // Set up for loading graphs
  LoadingStats gstats;
  loading_stats_init(&gstats);

  // Path statistics
  LoadingStats *load_stats = gen_paths_get_stats(workers);
  CorrectAlnStats *aln_stats = gen_paths_get_aln_stats(workers);

  // Load contig hist distribution
  for(i = 0; i < gpfiles->len; i++) {
    gpath_reader_load_contig_hist(gpfiles->b[i].json,
                                  gpfiles->b[i].fltr.path.b,
                                  file_filter_fromcol(&gpfiles->b[i].fltr, 0),
                                  &aln_stats->contig_histgrm);
  }

  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .must_exist_in_edges = NULL,
                              .empty_colours = false}; // already loaded paths

  // Load graph, print stats, close file
  graph_load(gfile, gprefs, &gstats);
  hash_table_print_stats_brief(&db_graph.ht);
  graph_file_close(gfile);

  // Load existing paths
  for(i = 0; i < gpfiles->len; i++)
    gpath_reader_load(&gpfiles->b[i], GPATH_DIE_MISSING_KMERS, &db_graph);

  // Zero link counts of already loaded links
  if(args.zero_link_counts) {
    status("Zeroing link counts for loaded links");
    gpath_set_zero_nseen(&db_graph.gpstore.gpset);
  }

  if(!args.use_new_paths)
    gpath_store_split_read_write(&db_graph.gpstore);

  // Deal with a set of files at once
  // Can have different numbers of inputs vs threads
  size_t start, end;
  for(start = 0; start < inputs->len; start += MAX_IO_THREADS) {
    end = MIN2(inputs->len, start+MAX_IO_THREADS);
    generate_paths(inputs->b+start, end-start, workers, args.nthreads);
  }

  // Print memory statistics
  gpath_hash_print_stats(&db_graph.gphash);
  gpath_store_print_stats(&db_graph.gpstore);

  correct_aln_dump_stats(aln_stats, load_stats,
                         args.dump_seq_sizes,
                         args.dump_frag_sizes,
                         db_graph.ht.num_kmers);

  // Don't need GPathHash anymore
  gpath_hash_dealloc(&db_graph.gphash);

  cJSON **hdrs = ctx_malloc(gpfiles->len * sizeof(cJSON*));
  for(i = 0; i < gpfiles->len; i++) hdrs[i] = gpfiles->b[i].json;

  size_t output_threads = MIN2(args.nthreads, MAX_IO_THREADS);

  // Generate a cJSON header for all inputs
  cJSON *thread_hdr = cJSON_CreateObject();
  cJSON *inputs_hdr = cJSON_CreateArray();
  cJSON_AddItemToObject(thread_hdr, "inputs", inputs_hdr);
  for(i = 0; i < inputs->len; i++)
    cJSON_AddItemToArray(inputs_hdr, correct_aln_input_json_hdr(&inputs->b[i]));

  // Write output file
  gpath_save(gzout, args.out_ctp_path, output_threads, true,
             "thread", thread_hdr, hdrs, gpfiles->len,
             &aln_stats->contig_histgrm, 1, &db_graph);

  gzclose(gzout);
  ctx_free(hdrs);

  // Optionally run path checks for debugging
  // gpath_checks_all_paths(&db_graph, args.nthreads);

  // ins_gap, err_gap no longer allocated after this line
  gen_paths_workers_dealloc(workers, args.nthreads);

  // Close and free input files etc.
  read_thread_args_dealloc(&args);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
int ctx_calls2vcf(int argc, char **argv)
{
  const char *in_path = NULL, *out_path = NULL, *out_type = NULL;
  // Filtering parameters
  int32_t min_mapq = -1, max_align_len = -1, max_allele_len = -1;
  // Alignment parameters
  int nwmatch = 1, nwmismatch = -2, nwgapopen = -4, nwgapextend = -1;
  // ref paths
  char const*const* ref_paths = NULL;
  size_t nref_paths = 0;
  // flank file
  const char *sam_path = NULL;

  //
  // Things we figure out by looking at the input
  //
  bool isbubble = false;
  // samples in VCF, (0 for bubble, does not include ref in breakpoint calls)
  size_t i, kmer_size, num_samples;

  //
  // Reference genome
  //
  // Hash map of chromosome name -> sequence
  ChromHash *genome;
  ReadBuffer chroms;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'O': cmd_check(!out_type, cmd); out_type = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'F': cmd_check(!sam_path, cmd); sam_path = optarg; break;
      case 'Q': cmd_check(min_mapq < 0, cmd); min_mapq = cmd_uint32(cmd, optarg); break;
      case 'A': cmd_check(max_align_len < 0, cmd); max_align_len = cmd_uint32(cmd, optarg); break;
      case 'L': cmd_check(max_allele_len < 0, cmd); max_allele_len = cmd_uint32(cmd, optarg); break;
      case 'm': nwmatch = cmd_int32(cmd, optarg); break;
      case 'M': nwmismatch = cmd_int32(cmd, optarg); break;
      case 'g': nwgapopen = cmd_int32(cmd, optarg); break;
      case 'G': nwgapextend = cmd_int32(cmd, optarg); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]);
      default: ctx_assert2(0, "shouldn't reach here: %c", c);
    }
  }

  // Defaults for unset values
  if(out_path == NULL) out_path = "-";
  if(max_align_len < 0) max_align_len = DEFAULT_MAX_ALIGN;
  if(max_allele_len < 0) max_allele_len = DEFAULT_MAX_ALLELE;

  if(optind+2 > argc)
    cmd_print_usage("Require <in.txt.gz> and at least one reference");

  in_path = argv[optind++];
  ref_paths = (char const*const*)argv + optind;
  nref_paths = argc - optind;

  // These functions call die() on error
  gzFile gzin = futil_gzopen(in_path, "r");

  // Read call file header
  cJSON *json = json_hdr_load(gzin, in_path);

  // Check we can handle the kmer size
  kmer_size = json_hdr_get_kmer_size(json, in_path);
  db_graph_check_kmer_size(kmer_size, in_path);

  // Get format (bubble or breakpoint file)
  cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, in_path);
  if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) isbubble = false;
  else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) isbubble = true;
  else die("Unknown format: '%s'", json_fmt->valuestring);

  status("Reading %s in %s format", futil_inpath_str(in_path),
         isbubble ? "bubble" : "breakpoint");

  if(isbubble) {
    // bubble specific
    if(sam_path == NULL)
      cmd_print_usage("Require -F <flanks.sam> with bubble file");
    if(min_mapq < 0) min_mapq = DEFAULT_MIN_MAPQ;
  }
  else {
    // breakpoint specific
    if(min_mapq >= 0)
      cmd_print_usage("-Q,--min-mapq <Q> only valid with bubble calls");
  }

  // Open flank file if it exists
  htsFile *samfh = NULL;
  bam_hdr_t *bam_hdr = NULL;
  bam1_t *mflank = NULL;

  if(sam_path) {
    if((samfh = hts_open(sam_path, "r")) == NULL)
      die("Cannot open SAM/BAM %s", sam_path);

    // Load BAM header
    bam_hdr = sam_hdr_read(samfh);
    if(bam_hdr == NULL) die("Cannot load BAM header: %s", sam_path);
    mflank = bam_init1();
  }

  // Output VCF has 0 samples if bubbles file, otherwise has N where N is
  // number of samples/colours in the breakpoint graph
  size_t num_graph_samples = json_hdr_get_ncols(json, in_path);
  size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, in_path);

  num_samples = 0;
  if(!isbubble) {
    // If last colour has "is_ref", drop number of samples by one
    num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1
                                                       : num_graph_samples;
  }

  //
  // Open output file
  //
  if(!out_path) out_path = "-";
  int mode = vcf_misc_get_outtype(out_type, out_path);
  futil_create_output(out_path);
  htsFile *vcffh = hts_open(out_path, modes_htslib[mode]);

  status("[calls2vcf] Reading %s call file with %zu samples",
         isbubble ? "Bubble" : "Breakpoint", num_graph_samples);
  status("[calls2vcf] %zu sample output to: %s format: %s",
         num_samples, futil_outpath_str(out_path), hsmodes_htslib[mode]);

  if(isbubble) status("[calls2vcf] min. MAPQ: %i", min_mapq);
  status("[calls2vcf] max alignment length: %i", max_align_len);
  status("[calls2vcf] max VCF allele length: %i", max_allele_len);
  status("[calls2vcf] alignment match:%i mismatch:%i gap open:%i extend:%i",
         nwmatch, nwmismatch, nwgapopen, nwgapextend);

  // Load reference genome
  read_buf_alloc(&chroms, 1024);
  genome = chrom_hash_init();
  chrom_hash_load(ref_paths, nref_paths, &chroms, genome);

  // convert to upper case
  char *s;
  for(i = 0; i < chroms.len; i++)
    for(s = chroms.b[i].seq.b; *s; s++) *s = toupper(*s);

  if(!isbubble) brkpnt_check_refs_match(json, genome, in_path);

  bcf_hdr_t *vcfhdr = make_vcf_hdr(json, in_path, !isbubble, kmer_size,
                                   ref_paths, nref_paths,
                                   chroms.b, chroms.len);

  if(bcf_hdr_write(vcffh, vcfhdr) != 0) die("Cannot write VCF header");

  AlignedCall *call = acall_init();
  CallDecomp *aligner = call_decomp_init(vcffh, vcfhdr);

  scoring_t *scoring = call_decomp_get_scoring(aligner);
  scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend,
               false, false, 0, 0, 0, 0);

  CallFileEntry centry;
  call_file_entry_alloc(&centry);

  char kmer_str[50];
  sprintf(kmer_str, ";K%zu", kmer_size);

  if(isbubble)
  {
    // Bubble calls
    DecompBubble *bubbles = decomp_bubble_init();

    // Set scoring for aligning 3' flank
    scoring = decomp_bubble_get_scoring(bubbles);
    scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend,
                 true, true, 0, 0, 0, 0);

    while(call_file_read(gzin, in_path, &centry)) {
      do {
        if(sam_read1(samfh, bam_hdr, mflank) < 0)
          die("We've run out of SAM entries!");
      } while(mflank->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY));

      // Align call
      strbuf_reset(&call->info);
      decomp_bubble_call(bubbles, genome, kmer_size, min_mapq,
                         &centry, mflank, bam_hdr, call);
      strbuf_append_str(&call->info, kmer_str);
      acall_decompose(aligner, call, max_align_len, max_allele_len);
    }

    // print bubble stats
    DecompBubbleStats *bub_stats = ctx_calloc(1, sizeof(*bub_stats));
    decomp_bubble_cpy_stats(bub_stats, bubbles);
    print_bubble_stats(bub_stats);
    ctx_free(bub_stats);

    decomp_bubble_destroy(bubbles);
  }
  else
  {
    // Breakpoint calls
    DecompBreakpoint *breakpoints = decomp_brkpt_init();

    while(call_file_read(gzin, in_path, &centry)) {
      strbuf_reset(&call->info);
      decomp_brkpt_call(breakpoints, genome, num_samples, &centry, call);
      strbuf_append_str(&call->info, kmer_str);
      acall_decompose(aligner, call, max_align_len, max_allele_len);
    }

    // print breakpoint stats
    DecompBreakpointStats *brk_stats = ctx_calloc(1, sizeof(*brk_stats));
    decomp_brkpt_cpy_stats(brk_stats, breakpoints);
    print_breakpoint_stats(brk_stats);
    ctx_free(brk_stats);

    decomp_brkpt_destroy(breakpoints);
  }

  // Print stats
  DecomposeStats *astats = ctx_calloc(1, sizeof(*astats));
  call_decomp_cpy_stats(astats, aligner);
  print_acall_stats(astats);
  ctx_free(astats);

  call_file_entry_dealloc(&centry);
  call_decomp_destroy(aligner);
  acall_destroy(call);

  // Finished - clean up
  cJSON_Delete(json);
  gzclose(gzin);
  bcf_hdr_destroy(vcfhdr);
  hts_close(vcffh);

  for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]);
  read_buf_dealloc(&chroms);
  chrom_hash_destroy(genome);

  if(sam_path) {
    hts_close(samfh);
    bam_hdr_destroy(bam_hdr);
    bam_destroy1(mflank);
  }

  return EXIT_SUCCESS;
}
int ctx_rmsubstr(int argc, char **argv)
{
  struct MemArgs memargs = MEM_ARGS_INIT;
  size_t kmer_size = 0, nthreads = 0;
  const char *output_file = NULL;
  seq_format fmt = SEQ_FMT_FASTA;
  bool invert = false;

  // Arg parsing
  char cmd[100], shortopts[100];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'o': cmd_check(!output_file, cmd); output_file = optarg; break;
      case 't': cmd_check(!nthreads, cmd); nthreads = cmd_uint32_nonzero(cmd, optarg); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'k': cmd_check(!kmer_size, cmd); kmer_size = cmd_uint32(cmd, optarg); break;
      case 'F': cmd_check(fmt==SEQ_FMT_FASTA, cmd); fmt = cmd_parse_format(cmd, optarg); break;
      case 'v': cmd_check(!invert, cmd); invert = true; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        cmd_print_usage("`"CMD" rmsubstr -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  // Defaults
  if(!nthreads) nthreads = DEFAULT_NTHREADS;
  if(!kmer_size) kmer_size = DEFAULT_KMER;

  if(!(kmer_size&1)) cmd_print_usage("Kmer size must be odd");
  if(kmer_size < MIN_KMER_SIZE) cmd_print_usage("Kmer size too small (recompile)");
  if(kmer_size > MAX_KMER_SIZE) cmd_print_usage("Kmer size too large (recompile?)");

  if(optind >= argc)
    cmd_print_usage("Please specify at least one input sequence file (.fa, .fq etc.)");

  size_t i, num_seq_files = argc - optind;
  char **seq_paths = argv + optind;
  seq_file_t **seq_files = ctx_calloc(num_seq_files, sizeof(seq_file_t*));

  for(i = 0; i < num_seq_files; i++)
    if((seq_files[i] = seq_open(seq_paths[i])) == NULL)
      die("Cannot read sequence file %s", seq_paths[i]);

  // Estimate number of bases
  // set to -1 if we cannot calc
  int64_t est_num_bases = seq_est_seq_bases(seq_files, num_seq_files);
  if(est_num_bases < 0) {
    warn("Cannot get file sizes, using pipes");
    est_num_bases = memargs.num_kmers * IDEAL_OCCUPANCY;
  }

  status("[memory] Estimated number of bases: %li", (long)est_num_bases);

  // Use file sizes to decide on memory

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem;
  bits_per_kmer = sizeof(BinaryKmer)*8 +
                  sizeof(KONodeList) + sizeof(KOccur) + // see kmer_occur.h
                  8; // 1 byte per kmer for each base to load sequence files

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        est_num_bases, est_num_bases,
                                        false, &graph_mem);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Open output file
  //
  if(output_file == NULL) output_file = "-";
  FILE *fout = futil_fopen_create(output_file, "w");

  //
  // Set up memory
  //
  dBGraph db_graph;
  db_graph_alloc(&db_graph, kmer_size, 1, 0, kmers_in_hash, DBG_ALLOC_BKTLOCKS);

  //
  // Load reference sequence into a read buffer
  //
  ReadBuffer rbuf;
  read_buf_alloc(&rbuf, 1024);
  seq_load_all_reads(seq_files, num_seq_files, &rbuf);

  // Check for reads too short
  for(i = 0; i < rbuf.len && rbuf.b[i].seq.end >= kmer_size; i++) {}
  if(i < rbuf.len)
    warn("Reads shorter than kmer size (%zu) will not be filtered", kmer_size);

  KOGraph kograph = kograph_create(rbuf.b, rbuf.len, true, 0,
                                   nthreads, &db_graph);

  size_t num_reads = rbuf.len, num_reads_printed = 0, num_bad_reads = 0;

  // Loop over reads printing those that are not substrings
  int ret;
  for(i = 0; i < rbuf.len; i++) {
    ret = _is_substr(&rbuf, i, &kograph, &db_graph);
    if(ret == -1) num_bad_reads++;
    else if((ret && invert) || (!ret && !invert)) {
      seqout_print_read(&rbuf.b[i], fmt, fout);
      num_reads_printed++;
    }
  }

  char num_reads_str[100], num_reads_printed_str[100], num_bad_reads_str[100];
  ulong_to_str(num_reads, num_reads_str);
  ulong_to_str(num_reads_printed, num_reads_printed_str);
  ulong_to_str(num_bad_reads, num_bad_reads_str);

  status("Printed %s / %s (%.1f%%) to %s",
         num_reads_printed_str, num_reads_str,
         !num_reads ? 0.0 : (100.0 * num_reads_printed) / num_reads,
         futil_outpath_str(output_file));

  if(num_bad_reads > 0) {
    status("Bad reads: %s / %s (%.1f%%) - no kmer {ACGT} of length %zu",
           num_bad_reads_str, num_reads_str,
           (100.0 * num_bad_reads) / num_reads, kmer_size);
  }

  fclose(fout);
  kograph_dealloc(&kograph);

  // Free sequence memory
  for(i = 0; i < rbuf.len; i++) seq_read_dealloc(&rbuf.b[i]);
  read_buf_dealloc(&rbuf);
  ctx_free(seq_files);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
int ctx_correct(int argc, char **argv)
{
  size_t i;
  struct ReadThreadCmdArgs args;
  read_thread_args_alloc(&args);
  read_thread_args_parse(&args, argc, argv, longopts, true);

  GraphFileReader *gfile = &args.gfile;
  GPathFileBuffer *gpfiles = &args.gpfiles;
  CorrectAlnInputBuffer *inputs = &args.inputs;

  // Update colours in graph file - sample in 0, all others in 1
  size_t ncols = gpath_load_sample_pop(gfile, 1,
                                       gpfiles->b, gpfiles->len,
                                       args.colour);

  // Check for compatibility between graph files and link files
  graphs_gpaths_compatible(gfile, 1, gpfiles->b, gpfiles->len, 1);

  int64_t ctx_num_kmers = gfile->num_of_kmers;

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem;

  // 1 bit needed per kmer if we need to keep track of noreseed
  bits_per_kmer = sizeof(BinaryKmer)*8 +
                  sizeof(Edges)*8 +
                  (gpfiles->len > 0 ? sizeof(GPath*)*8 : 0) +
                  ncols; // in colour

  kmers_in_hash = cmd_get_kmers_in_hash(args.memargs.mem_to_use,
                                        args.memargs.mem_to_use_set,
                                        args.memargs.num_kmers,
                                        args.memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_num_kmers, ctx_num_kmers,
                                        false, &graph_mem);

  // Paths memory
  size_t rem_mem = args.memargs.mem_to_use - MIN2(args.memargs.mem_to_use, graph_mem);
  path_mem = gpath_reader_mem_req(gpfiles->b, gpfiles->len, ncols, rem_mem,
                                  false, kmers_in_hash, false);

  cmd_print_mem(path_mem, "paths");

  // Shift path store memory from graphs->paths
  graph_mem -= sizeof(GPath*)*kmers_in_hash;
  path_mem  += sizeof(GPath*)*kmers_in_hash;

  // Total memory
  total_mem = graph_mem + path_mem;
  cmd_check_mem_limit(args.memargs.mem_to_use, total_mem);

  //
  // Check we can write all output files
  //
  // Open output files
  SeqOutput *outputs = ctx_calloc(inputs->len, sizeof(SeqOutput));
  bool err_occurred = false;

  for(i = 0; i < inputs->len && !err_occurred; i++)
  {
    CorrectAlnInput *input = &inputs->b[i];
    // We loaded target colour into colour zero
    input->crt_params.ctxcol = input->crt_params.ctpcol = 0;
    bool is_pe = asyncio_task_is_pe(&input->files);
    err_occurred = !seqout_open(&outputs[i], input->out_base, args.fmt, is_pe);
    input->output = &outputs[i];
  }

  // Abandon if some of the output files already exist
  if(err_occurred) {
    for(i = 0; i < inputs->len; i++)
      seqout_close(&outputs[i], true);
    die("Error creating output files");
  }

  //
  // Allocate memory
  //
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfile->hdr.kmer_size, ncols, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL);

  // Create a path store that does not track path counts
  gpath_reader_alloc_gpstore(gpfiles->b, gpfiles->len, path_mem, false, &db_graph);

  //
  // Load Graph and link files
  //
  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
  gprefs.empty_colours = true;

  // Load graph, print stats, close file
  graph_load(gfile, gprefs, NULL);
  hash_table_print_stats_brief(&db_graph.ht);
  graph_file_close(gfile);

  // Load link files
  for(i = 0; i < gpfiles->len; i++) {
    gpath_reader_load(&gpfiles->b[i], GPATH_DIE_MISSING_KMERS, &db_graph);
    gpath_reader_close(&gpfiles->b[i]);
  }

  //
  // Run alignment
  //
  correct_reads(inputs->b, inputs->len,
                args.dump_seq_sizes, args.dump_frag_sizes,
                args.fq_zero, args.append_orig_seq,
                args.nthreads, &db_graph);

  // Close and free output files
  for(i = 0; i < inputs->len; i++) seqout_close(&outputs[i], false);
  ctx_free(outputs);

  // Closes input files
  read_thread_args_dealloc(&args);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}