Esempio n. 1
0
void _construct_graph_with_paths(dBGraph *graph,
                                 size_t kmer_size, size_t ncols,
                                 char **seqs, size_t nseqs,
                                 CorrectAlnParam path_params)
{
  size_t i;
  db_graph_alloc(graph, kmer_size, ncols, ncols, 1024);

  // Graph data
  graph->bktlocks = ctx_calloc(roundup_bits2bytes(graph->ht.num_of_buckets), 1);
  graph->col_edges = ctx_calloc(graph->ht.capacity * ncols, sizeof(Edges));
  graph->col_covgs = ctx_calloc(graph->ht.capacity * ncols, sizeof(Covg));
  graph->node_in_cols = ctx_calloc(roundup_bits2bytes(graph->ht.capacity) * ncols, 1);

  // Path data
  path_store_alloc(&graph->pstore, 1024, true, graph->ht.capacity, ncols);
  graph->pstore.kmer_locks = ctx_calloc(roundup_bits2bytes(graph->ht.capacity), 1);

  // Build graph
  for(i = 0; i < nseqs; i++)
    build_graph_from_str_mt(graph, 0, seqs[i], strlen(seqs[i]));

  graph->num_of_cols_used = MAX2(graph->num_of_cols_used, 1);

  GenPathWorker *gen_path_wrkr = gen_paths_workers_alloc(1, graph, NULL);

  for(i = 0; i < nseqs; i++)
    gen_paths_from_str_mt(gen_path_wrkr, seqs[i], path_params);

  gen_paths_workers_dealloc(gen_path_wrkr, 1);
}
Esempio n. 2
0
void all_tests_add_paths_multi(dBGraph *graph, const char **seqs, size_t nseqs,
                               CorrectAlnParam params,
                               int exp_npaths, int exp_nkmers)
{
  size_t npaths = graph->gpstore.num_paths;
  size_t nkmers = graph->gpstore.num_kmers_with_paths;

  size_t i, nworkers = 1;
  GenPathWorker *wrkrs = gen_paths_workers_alloc(nworkers, graph);

  // Set up asyncio input data
  AsyncIOInput io = {.file1 = NULL, .file2 = NULL,
                     .fq_offset = 0, .interleaved = false};

  CorrectAlnInput task = {.files = io, .fq_cutoff = 0, .hp_cutoff = 0,
                          .matedir = READPAIR_FR, .crt_params = params,
                          .out_base = NULL, .output = NULL};

  AsyncIOData iodata;
  asynciodata_alloc(&iodata);
  seq_read_reset(&iodata.r2);
  iodata.fq_offset1 = iodata.fq_offset2 = 0;
  iodata.ptr = NULL;

  // Add paths
  for(i = 0; i < nseqs; i++) {
    seq_read_set(&iodata.r1, seqs[i]);
    gen_paths_worker_seq(wrkrs, &iodata, &task);
  }

  asynciodata_dealloc(&iodata);
  gen_paths_workers_dealloc(wrkrs, nworkers);

  // Check we added the right number of paths
  if(exp_npaths >= 0) {
    TASSERT2(graph->gpstore.num_paths == npaths + (size_t)exp_npaths, "%zu %zu %zu",
             (size_t)graph->gpstore.num_paths, (size_t)npaths, (size_t)exp_npaths);
  }

  if(exp_nkmers >= 0) {
    TASSERT(graph->gpstore.num_kmers_with_paths == nkmers + (size_t)exp_nkmers);
  }
}

void all_tests_add_paths(dBGraph *graph, const char *seq,
                         CorrectAlnParam params,
                         int exp_npaths, int exp_nkmers)
{
  all_tests_add_paths_multi(graph, &seq, 1, params, exp_npaths, exp_nkmers);
}

void all_tests_construct_graph(dBGraph *graph,
                               size_t kmer_size, size_t ncols,
                               const char **seqs, size_t nseqs,
                               CorrectAlnParam path_params)
{
  size_t i;
  db_graph_alloc(graph, kmer_size, ncols, ncols, 1024,
                 DBG_ALLOC_EDGES | DBG_ALLOC_COVGS |
                 DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS);

  // Path data
  gpath_store_alloc(&graph->gpstore, ncols, graph->ht.capacity,
                    0, ONE_MEGABYTE, true, false);

  // Don't use links to add new links
  gpath_store_split_read_write(&graph->gpstore);

  // Allocate path hash table just in case
  gpath_hash_alloc(&graph->gphash, &graph->gpstore, ONE_MEGABYTE);

  // Build graph
  for(i = 0; i < nseqs; i++)
    build_graph_from_str_mt(graph, 0, seqs[i], strlen(seqs[i]), false);

  gpath_store_merge_read_write(&graph->gpstore);

  graph->num_of_cols_used = MAX2(graph->num_of_cols_used, 1);

  all_tests_add_paths_multi(graph, seqs, nseqs, path_params, -1, -1);
}
Esempio n. 3
0
int ctx_thread(int argc, char **argv)
{
  struct ReadThreadCmdArgs args;
  read_thread_args_alloc(&args);
  read_thread_args_parse(&args, argc, argv, longopts, false);

  GraphFileReader *gfile = &args.gfile;
  GPathFileBuffer *gpfiles = &args.gpfiles;
  CorrectAlnInputBuffer *inputs = &args.inputs;
  size_t i;

  if(args.zero_link_counts && gpfiles->len == 0)
    cmd_print_usage("-0,--zero-paths without -p,--paths <in.ctp> has no meaning");

  // Check each path file only loads one colour
  gpaths_only_for_colour(gpfiles->b, gpfiles->len, 0);

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem, total_mem;
  size_t path_hash_mem, path_store_mem, path_mem;
  bool sep_path_list = (!args.use_new_paths && gpfiles->len > 0);

  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 + sizeof(GPath*)*8 +
                  2 * args.nthreads; // Have traversed

  // false -> don't use mem_to_use to decide how many kmers to store in hash
  // since we need some of that memory for storing paths
  kmers_in_hash = cmd_get_kmers_in_hash(args.memargs.mem_to_use,
                                        args.memargs.mem_to_use_set,
                                        args.memargs.num_kmers,
                                        args.memargs.num_kmers_set,
                                        bits_per_kmer,
                                        gfile->num_of_kmers,
                                        gfile->num_of_kmers,
                                        false, &graph_mem);

  // Paths memory
  size_t min_path_mem = 0;
  gpath_reader_sum_mem(gpfiles->b, gpfiles->len, 1, true, true, &min_path_mem);

  if(graph_mem + min_path_mem > args.memargs.mem_to_use) {
    char buf[50];
    die("Require at least %s memory", bytes_to_str(graph_mem+min_path_mem, 1, buf));
  }

  path_mem = args.memargs.mem_to_use - graph_mem;
  size_t pentry_hash_mem = sizeof(GPEntry)/0.7;
  size_t pentry_store_mem = sizeof(GPath) + 8 + // struct + sequence
                            1 + // in colour
                            sizeof(uint8_t) + // counts
                            sizeof(uint32_t); // kmer length

  size_t max_paths = path_mem / (pentry_store_mem + pentry_hash_mem);
  path_store_mem = max_paths * pentry_store_mem;
  path_hash_mem = max_paths * pentry_hash_mem;
  cmd_print_mem(path_hash_mem, "paths hash");
  cmd_print_mem(path_store_mem, "paths store");

  total_mem = graph_mem + path_mem;
  cmd_check_mem_limit(args.memargs.mem_to_use, total_mem);

  //
  // Open output file
  //
  gzFile gzout = futil_gzopen_create(args.out_ctp_path, "w");

  status("Creating paths file: %s", futil_outpath_str(args.out_ctp_path));

  //
  // Allocate memory
  //
  dBGraph db_graph;
  size_t kmer_size = gfile->hdr.kmer_size;
  db_graph_alloc(&db_graph, kmer_size, 1, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL);

  // Split path memory 2:1 between store and hash
  // Create a path store that tracks path counts
  gpath_store_alloc(&db_graph.gpstore,
                    db_graph.num_of_cols, db_graph.ht.capacity,
                    0, path_store_mem, true, sep_path_list);

  // Create path hash table for fast lookup
  gpath_hash_alloc(&db_graph.gphash, &db_graph.gpstore, path_hash_mem);

  if(args.use_new_paths) {
    status("Using paths as they are added (risky)");
  } else {
    status("Not using new paths as they are added (safe)");
  }

  //
  // Start up workers to add paths to the graph
  //
  GenPathWorker *workers;
  workers = gen_paths_workers_alloc(args.nthreads, &db_graph);

  // Setup for loading graphs graph
  LoadingStats gstats;
  loading_stats_init(&gstats);

  // Path statistics
  LoadingStats *load_stats = gen_paths_get_stats(workers);
  CorrectAlnStats *aln_stats = gen_paths_get_aln_stats(workers);

  // Load contig hist distribution
  for(i = 0; i < gpfiles->len; i++) {
    gpath_reader_load_contig_hist(gpfiles->b[i].json,
                                  gpfiles->b[i].fltr.path.b,
                                  file_filter_fromcol(&gpfiles->b[i].fltr, 0),
                                  &aln_stats->contig_histgrm);
  }

  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .must_exist_in_edges = NULL,
                              .empty_colours = false}; // already loaded paths

  // Load graph, print stats, close file
  graph_load(gfile, gprefs, &gstats);
  hash_table_print_stats_brief(&db_graph.ht);
  graph_file_close(gfile);

  // Load existing paths
  for(i = 0; i < gpfiles->len; i++)
    gpath_reader_load(&gpfiles->b[i], GPATH_DIE_MISSING_KMERS, &db_graph);

  // zero link counts of already loaded links
  if(args.zero_link_counts) {
    status("Zeroing link counts for loaded links");
    gpath_set_zero_nseen(&db_graph.gpstore.gpset);
  }

  if(!args.use_new_paths)
    gpath_store_split_read_write(&db_graph.gpstore);

  // Deal with a set of files at once
  // Can have different numbers of inputs vs threads
  size_t start, end;
  for(start = 0; start < inputs->len; start += MAX_IO_THREADS)
  {
    end = MIN2(inputs->len, start+MAX_IO_THREADS);
    generate_paths(inputs->b+start, end-start, workers, args.nthreads);
  }

  // Print memory statistics
  gpath_hash_print_stats(&db_graph.gphash);
  gpath_store_print_stats(&db_graph.gpstore);

  correct_aln_dump_stats(aln_stats, load_stats,
                         args.dump_seq_sizes,
                         args.dump_frag_sizes,
                         db_graph.ht.num_kmers);

  // Don't need GPathHash anymore
  gpath_hash_dealloc(&db_graph.gphash);

  cJSON **hdrs = ctx_malloc(gpfiles->len * sizeof(cJSON*));
  for(i = 0; i < gpfiles->len; i++) hdrs[i] = gpfiles->b[i].json;

  size_t output_threads = MIN2(args.nthreads, MAX_IO_THREADS);

  // Generate a cJSON header for all inputs
  cJSON *thread_hdr = cJSON_CreateObject();
  cJSON *inputs_hdr = cJSON_CreateArray();
  cJSON_AddItemToObject(thread_hdr, "inputs", inputs_hdr);
  for(i = 0; i < inputs->len; i++)
    cJSON_AddItemToArray(inputs_hdr, correct_aln_input_json_hdr(&inputs->b[i]));

  // Write output file
  gpath_save(gzout, args.out_ctp_path, output_threads, true,
             "thread", thread_hdr, hdrs, gpfiles->len,
             &aln_stats->contig_histgrm, 1,
             &db_graph);

  gzclose(gzout);
  ctx_free(hdrs);

  // Optionally run path checks for debugging
  // gpath_checks_all_paths(&db_graph, args.nthreads);

  // ins_gap, err_gap no longer allocated after this line
  gen_paths_workers_dealloc(workers, args.nthreads);

  // Close and free input files etc.
  read_thread_args_dealloc(&args);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
static void test_repeat_loop()
{
  TASSERT(sizeof(FollowPath) == 20);

  // Construct 1 colour graph with kmer-size=11
  dBGraph graph;
  size_t kmer_size = 11, ncols = 1;

  // Set up alignment correction params
  CorrectAlnParam params = {.ctpcol = 0, .ctxcol = 0,
                            .ins_gap_min = 0, .ins_gap_max = 0,
                            .one_way_gap_traverse = true, .use_end_check = true,
                            .max_context = 10,
                            .gap_variance = 0.1, .gap_wiggle = 5};

  // Sequence with repeat
  char seq[] = "ATTTGGAACTCCGGA"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "CGTCAGGAGCTAACT";

  char p0[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT""GATAGGGCCAGT";
  char p1[] = "GATAGGGCCAGT""GATAGGGCCAGT""CGTCAGGAGCTAACT";

  // Allocate graph, but don't add any sequence
  _construct_graph_with_paths(&graph, kmer_size, ncols, NULL, 0, params);

  GenPathWorker *gen_path_wrkr = gen_paths_workers_alloc(1, &graph, NULL);

  GraphWalker gwlk;
  RepeatWalker rptwlk;
  graph_walker_alloc(&gwlk);
  rpt_walker_alloc(&rptwlk, graph.ht.capacity, 12);

  dBNodeBuffer nbuf;
  db_node_buf_alloc(&nbuf, 1024);

  // Construct graph but no paths
  build_graph_from_str_mt(&graph, 0, seq, strlen(seq));
  TASSERT2(graph.ht.num_kmers == 15+12+15, "%zu", (size_t)graph.ht.num_kmers);

  // Find first node in sequence
  dBNode node0 = db_graph_find_str(&graph, seq);
  TASSERT(node0.key != HASH_NOT_FOUND);

  // 1) With no paths
  char ans0[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT";
  test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, 15+2, ans0);

  // 2) Add small paths - produces collapsed down seq with two copy repeat
  gen_paths_from_str_mt(gen_path_wrkr, p0, params);
  gen_paths_from_str_mt(gen_path_wrkr, p1, params);
  char ans1[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT""GATAGGGCCAGT""CGTCAGGAGCTAACT";
  test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, 15+12+12+5, ans1);

  // 3) Add long paths
  gen_paths_from_str_mt(gen_path_wrkr, seq, params);
  test_walk(&gwlk, &rptwlk, node0, &nbuf, &graph, strlen(seq)+1-kmer_size, seq);

  graph_walker_dealloc(&gwlk);
  rpt_walker_dealloc(&rptwlk);
  db_node_buf_dealloc(&nbuf);
  gen_paths_workers_dealloc(gen_path_wrkr, 1);
  db_graph_dealloc(&graph);
}

void test_repeat_walker()
{
  test_status("Testing repeat_walker.h");
  test_repeat_loop();
}