Beispiel #1
0
// Open an output for every entry of the file-level `inputs` buffer.
//
// Entry i is opened with its own out_base and fmt; whether it is opened as
// paired-end is decided by asking asyncio_task_is_pe() about files.b[i].
// Opening stops at the first failure. On failure seqout_close() is called with
// its second argument set to true on every entry of `inputs` (including those
// that were never attempted) and then die() is called.
static void inputs_attempt_open()
{
  size_t idx;
  bool all_opened = true;

  for(idx = 0; idx < inputs.len; idx++) {
    AlignReadsData *aln = &inputs.b[idx];
    bool paired = asyncio_task_is_pe(&files.b[idx]);

    if(!seqout_open(&aln->seqout, aln->out_base, aln->fmt, paired)) {
      all_opened = false;
      break;
    }
  }

  if(all_opened)
    return;

  // Something failed: close every output, then bail out
  for(idx = 0; idx < inputs.len; idx++)
    seqout_close(&inputs.b[idx].seqout, true);

  die("Error creating output files");
}
Beispiel #2
0
int ctx_correct(int argc, char **argv)
{
  size_t i;
  struct ReadThreadCmdArgs args;
  read_thread_args_alloc(&args);
  read_thread_args_parse(&args, argc, argv, longopts, true);

  GraphFileReader *gfile = &args.gfile;
  GPathFileBuffer *gpfiles = &args.gpfiles;
  CorrectAlnInputBuffer *inputs = &args.inputs;

  // Update colours in graph file - sample in 0, all others in 1
  size_t ncols = gpath_load_sample_pop(gfile, 1, gpfiles->b, gpfiles->len,
                                       args.colour);

  // Check for compatibility between graph files and link files
  graphs_gpaths_compatible(gfile, 1, gpfiles->b, gpfiles->len, 1);

  int64_t ctx_num_kmers = gfile->num_of_kmers;

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem;

  // 1 bit needed per kmer if we need to keep track of noreseed
  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 +
                  (gpfiles->len > 0 ? sizeof(GPath*)*8 : 0) +
                  ncols; // in colour

  kmers_in_hash = cmd_get_kmers_in_hash(args.memargs.mem_to_use,
                                        args.memargs.mem_to_use_set,
                                        args.memargs.num_kmers,
                                        args.memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_num_kmers, ctx_num_kmers,
                                        false, &graph_mem);

  // Paths memory
  size_t rem_mem = args.memargs.mem_to_use - MIN2(args.memargs.mem_to_use, graph_mem);
  path_mem = gpath_reader_mem_req(gpfiles->b, gpfiles->len, ncols, rem_mem, false,
                                  kmers_in_hash, false);

  cmd_print_mem(path_mem, "paths");

  // Shift path store memory from graphs->paths
  graph_mem -= sizeof(GPath*)*kmers_in_hash;
  path_mem  += sizeof(GPath*)*kmers_in_hash;

  // Total memory
  total_mem = graph_mem + path_mem;
  cmd_check_mem_limit(args.memargs.mem_to_use, total_mem);

  //
  // Check we can write all output files
  //
  // Open output files
  SeqOutput *outputs = ctx_calloc(inputs->len, sizeof(SeqOutput));
  bool err_occurred = false;

  for(i = 0; i < inputs->len && !err_occurred; i++)
  {
    CorrectAlnInput *input = &inputs->b[i];
    // We loaded target colour into colour zero
    input->crt_params.ctxcol = input->crt_params.ctpcol = 0;
    bool is_pe = asyncio_task_is_pe(&input->files);
    err_occurred = !seqout_open(&outputs[i], input->out_base, args.fmt, is_pe);
    input->output = &outputs[i];
  }

  // Abandon if some of the output files already exist
  if(err_occurred) {
    for(i = 0; i < inputs->len; i++)
      seqout_close(&outputs[i], true);
    die("Error creating output files");
  }

  //
  // Allocate memory
  //

  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfile->hdr.kmer_size, ncols, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL);

  // Create a path store that does not tracks path counts
  gpath_reader_alloc_gpstore(gpfiles->b, gpfiles->len, path_mem, false, &db_graph);

  //
  // Load Graph and link files
  //
  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
  gprefs.empty_colours = true;

  // Load graph, print stats, close file
  graph_load(gfile, gprefs, NULL);
  hash_table_print_stats_brief(&db_graph.ht);
  graph_file_close(gfile);

  // Load link files
  for(i = 0; i < gpfiles->len; i++) {
    gpath_reader_load(&gpfiles->b[i], GPATH_DIE_MISSING_KMERS, &db_graph);
    gpath_reader_close(&gpfiles->b[i]);
  }

  //
  // Run alignment
  //
  correct_reads(inputs->b, inputs->len,
                args.dump_seq_sizes, args.dump_frag_sizes,
                args.fq_zero, args.append_orig_seq,
                args.nthreads, &db_graph);

  // Close and free output files
  for(i = 0; i < inputs->len; i++)
    seqout_close(&outputs[i], false);
  ctx_free(outputs);

  // Closes input files
  read_thread_args_dealloc(&args);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Beispiel #3
0
int ctx_reads(int argc, char **argv)
{
  parse_args(argc, argv);

  //
  // Open input graphs
  //
  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t i, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  graph_files_open(gfile_paths, gfiles, num_gfiles,
                   &ctx_max_kmers, &ctx_sum_kmers);

  // Will exit and remove output files on error
  inputs_attempt_open();

  //
  // Calculate memory use
  //
  size_t kmers_in_hash, graph_mem, bits_per_kmer = sizeof(BinaryKmer)*8;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        true, &graph_mem);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Set up graph
  //
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, 1, 0, kmers_in_hash, 0);

  // Load graphs
  LoadingStats gstats = LOAD_STATS_INIT_MACRO;

  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .must_exist_in_graph = false,
                              .empty_colours = true,
                              .boolean_covgs = false};

  for(i = 0; i < num_gfiles; i++) {
    file_filter_flatten(&gfiles[i].fltr, 0);
    graph_load(&gfiles[i], gprefs, &gstats);
    graph_file_close(&gfiles[i]);
    gprefs.empty_colours = false;
  }
  ctx_free(gfiles);

  status("Printing reads that do %stouch the graph\n",
         inputs.b[0].invert ? "not " : "");

  //
  // Filter reads using async io
  //
  LoadingStats seq_stats = LOAD_STATS_INIT_MACRO;

  for(i = 0; i < inputs.len; i++) {
    inputs.b[i].stats = &seq_stats;
    inputs.b[i].db_graph = &db_graph;
  }

  // Deal with a set of files at once
  size_t start, end;
  for(start = 0; start < inputs.len; start += MAX_IO_THREADS)
  {
    // Can have different numbers of inputs vs threads
    end = MIN2(inputs.len, start+MAX_IO_THREADS);
    asyncio_run_pool(files.b+start, end-start, filter_reads, NULL, nthreads, 0);
  }

  size_t total_reads_printed = 0;
  size_t total_reads = seq_stats.num_se_reads + seq_stats.num_pe_reads;

  for(i = 0; i < inputs.len; i++)
    total_reads_printed += inputs.b[i].num_of_reads_printed;

  for(i = 0; i < inputs.len; i++) {
    seqout_close(&inputs.b[i].seqout, false);
    asyncio_task_close(&files.b[i]);
  }

  aln_reads_buf_dealloc(&inputs);
  asyncio_buf_dealloc(&files);

  status("Total printed %zu / %zu (%.2f%%) reads\n",
         total_reads_printed, total_reads,
         total_reads ? (100.0 * total_reads_printed) / total_reads : 0.0);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}