int ctx_correct(int argc, char **argv)
{
  size_t i, j;
  struct ReadThreadCmdArgs args = READ_THREAD_CMD_ARGS_INIT;
  read_thread_args_alloc(&args);
  read_thread_args_parse(&args, argc, argv, longopts, true);

  GraphFileReader *gfile = &args.gfile;
  PathFileBuffer *pfiles = &args.pfiles;
  CorrectAlnInputBuffer *inputs = &args.inputs;
  size_t ctx_total_cols = gfile->hdr.num_of_cols;
  size_t ctx_num_kmers = gfile->num_of_kmers;

  if(args.colour > ctx_total_cols)
    cmd_print_usage("-c %zu is too big [> %zu]", args.colour, ctx_total_cols);

  size_t ctp_usedcols = 0;
  for(i = 0; i < pfiles->len; i++) {
    if(!file_filter_iscolloaded(&pfiles->data[i].fltr, args.colour)) {
      cmd_print_usage("Path file doesn't load into colour %zu: %s",
                      args.colour, pfiles->data[i].fltr.orig_path.buff);
    }
    ctp_usedcols = MAX2(ctp_usedcols, path_file_usedcols(&pfiles->data[i]));
  }

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem;

  // Per-kmer memory: edges, one node_in_cols bit per colour,
  // and a 64-bit path index per kmer
  bits_per_kmer = sizeof(Edges)*8 + ctx_total_cols + sizeof(uint64_t)*8;
  kmers_in_hash = cmd_get_kmers_in_hash2(args.memargs.mem_to_use,
                                         args.memargs.mem_to_use_set,
                                         args.memargs.num_kmers,
                                         args.memargs.num_kmers_set,
                                         bits_per_kmer,
                                         ctx_num_kmers, ctx_num_kmers,
                                         false, &graph_mem);

  // Paths memory
  path_mem = path_files_mem_required(pfiles->data, pfiles->len, false, false,
                                     ctp_usedcols, 0);
  cmd_print_mem(path_mem, "paths");

  // Total memory
  total_mem = graph_mem + path_mem;
  cmd_check_mem_limit(args.memargs.mem_to_use, total_mem);
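  // (cmd_check_mem_limit() presumably aborts if the estimated total exceeds
  //  args.memargs.mem_to_use, i.e. before anything is actually allocated.)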

  //
  // Check we can write all output files
  //
  // Open output files
  SeqOutput *outputs = ctx_calloc(inputs->len, sizeof(SeqOutput));
  bool output_files_exist = false;

  for(i = 0; i < inputs->len; i++)
  {
    CorrectAlnInput *input = &inputs->data[i];
    input->crt_params.ctxcol = input->crt_params.ctpcol = args.colour;
    SeqOutput *output = &outputs[i];
    seq_output_alloc(output);
    seq_output_set_paths(output, input->out_base,
                         async_task_pe_output(&input->files));
    input->output = output;
    // the check prints warnings and returns true if any output file already exists
    output_files_exist |= seq_output_files_exist_check(output);
  }

  // Abandon if some of the output files already exist
  if(output_files_exist) die("Output files already exist");

  // Attempt to open all files
  for(i = 0; i < inputs->len && seq_output_open(&outputs[i]); i++) {}

  // Check if something went wrong - if so remove all output files
  if(i < inputs->len) {
    for(j = 0; j < i; j++) seq_output_delete(&outputs[j]);
    die("Couldn't open output files");
  }
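  // (Outputs are opened up-front so we fail before any reads are processed;
  //  if one cannot be opened, those already opened are removed again.)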

  //
  // Allocate memory
  //

  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfile->hdr.kmer_size, ctx_total_cols, 1, kmers_in_hash);

  size_t bytes_per_col = roundup_bits2bytes(db_graph.ht.capacity);

  db_graph.col_edges = ctx_calloc(db_graph.ht.capacity, sizeof(Edges));
  db_graph.node_in_cols = ctx_calloc(bytes_per_col * ctx_total_cols, 1);

  // Paths
  path_store_alloc(&db_graph.pstore, path_mem, false,
                   db_graph.ht.capacity, ctp_usedcols);

  //
  // Load Graph and Path files
  //
  LoadingStats gstats = LOAD_STATS_INIT_MACRO;
  GraphLoadingPrefs gprefs = {.db_graph = &db_graph,
                              .boolean_covgs = false,
                              .must_exist_in_graph = false,
                              .must_exist_in_edges = NULL,
                              .empty_colours = true};

  // Load graph, print stats, close file
  graph_load(gfile, gprefs, &gstats);
  hash_table_print_stats_brief(&db_graph.ht);
  graph_file_close(gfile);

  // Load path files (does nothing if pfiles->len == 0)
  paths_format_merge(pfiles->data, pfiles->len, false, false,
                     args.num_of_threads, &db_graph);

  //
  // Run alignment
  //
  correct_reads(args.num_of_threads, MAX_IO_THREADS,
                inputs->data, inputs->len,
                &db_graph);

  // Close and free output files
  for(i = 0; i < inputs->len; i++) seq_output_dealloc(&outputs[i]);
  ctx_free(outputs);

  read_thread_args_dealloc(&args);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
////////////////////////////////////////////////////////////
// Example #2
////////////////////////////////////////////////////////////
int ctx_correct(int argc, char **argv)
{
  size_t i;
  struct ReadThreadCmdArgs args;
  read_thread_args_alloc(&args);
  read_thread_args_parse(&args, argc, argv, longopts, true);

  GraphFileReader *gfile = &args.gfile;
  GPathFileBuffer *gpfiles = &args.gpfiles;
  CorrectAlnInputBuffer *inputs = &args.inputs;

  // Update colours in graph file - sample in 0, all others in 1
  size_t ncols = gpath_load_sample_pop(gfile, 1, gpfiles->b, gpfiles->len,
                                       args.colour);
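  // (After this remapping the sample's data presumably sits in colour 0 with the
  //  remaining colours pooled into colour 1, which is why ctxcol/ctpcol are set
  //  to 0 when the inputs are configured below.)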

  // Check for compatibility between graph files and link files
  graphs_gpaths_compatible(gfile, 1, gpfiles->b, gpfiles->len, 1);

  int64_t ctx_num_kmers = gfile->num_of_kmers;

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem, path_mem, total_mem;

  // Per-kmer memory: the kmer itself, its edges, a GPath pointer if we are
  // loading link files, and one in-colour bit per colour
  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Edges)*8 +
                  (gpfiles->len > 0 ? sizeof(GPath*)*8 : 0) +
                  ncols; // in colour

  kmers_in_hash = cmd_get_kmers_in_hash(args.memargs.mem_to_use,
                                        args.memargs.mem_to_use_set,
                                        args.memargs.num_kmers,
                                        args.memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_num_kmers, ctx_num_kmers,
                                        false, &graph_mem);

  // Paths memory
  size_t rem_mem = args.memargs.mem_to_use - MIN2(args.memargs.mem_to_use, graph_mem);
  path_mem = gpath_reader_mem_req(gpfiles->b, gpfiles->len, ncols, rem_mem, false,
                                  kmers_in_hash, false);

  cmd_print_mem(path_mem, "paths");

  // Shift path store memory from graphs->paths
  graph_mem -= sizeof(GPath*)*kmers_in_hash;
  path_mem  += sizeof(GPath*)*kmers_in_hash;
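  // (The estimator counted the per-kmer GPath* array as graph memory, but it is
  //  presumably allocated as part of the link/path store, so we re-attribute it
  //  before checking the total.)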

  // Total memory
  total_mem = graph_mem + path_mem;
  cmd_check_mem_limit(args.memargs.mem_to_use, total_mem);

  //
  // Check we can write all output files
  //
  // Open output files
  SeqOutput *outputs = ctx_calloc(inputs->len, sizeof(SeqOutput));
  bool err_occurred = false;

  for(i = 0; i < inputs->len && !err_occurred; i++)
  {
    CorrectAlnInput *input = &inputs->b[i];
    // We loaded target colour into colour zero
    input->crt_params.ctxcol = input->crt_params.ctpcol = 0;
    bool is_pe = asyncio_task_is_pe(&input->files);
    err_occurred = !seqout_open(&outputs[i], input->out_base, args.fmt, is_pe);
    input->output = &outputs[i];
  }

  // Abandon if we could not create all of the output files
  if(err_occurred) {
    for(i = 0; i < inputs->len; i++)
      seqout_close(&outputs[i], true);
    die("Error creating output files");
  }

  //
  // Allocate memory
  //

  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfile->hdr.kmer_size, ncols, 1, kmers_in_hash,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL);

  // Create a path store that does not track path counts
  gpath_reader_alloc_gpstore(gpfiles->b, gpfiles->len, path_mem, false, &db_graph);

  //
  // Load Graph and link files
  //
  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
  gprefs.empty_colours = true;

  // Load graph, print stats, close file
  graph_load(gfile, gprefs, NULL);
  hash_table_print_stats_brief(&db_graph.ht);
  graph_file_close(gfile);

  // Load link files
  for(i = 0; i < gpfiles->len; i++) {
    gpath_reader_load(&gpfiles->b[i], GPATH_DIE_MISSING_KMERS, &db_graph);
    gpath_reader_close(&gpfiles->b[i]);
  }

  //
  // Run alignment
  //
  correct_reads(inputs->b, inputs->len,
                args.dump_seq_sizes, args.dump_frag_sizes,
                args.fq_zero, args.append_orig_seq,
                args.nthreads, &db_graph);

  // Close and free output files
  for(i = 0; i < inputs->len; i++)
    seqout_close(&outputs[i], false);
  ctx_free(outputs);

  // Closes input files
  read_thread_args_dealloc(&args);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
////////////////////////////////////////////////////////////
// main
////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  parse_command_line(argc, argv);

  // prepare AT and GC counts
  unsigned long long atgc[2] = {0};

  // make trusted kmer data structure
  bithash *trusted = new bithash(k);

  // get good kmers from Hammer
  if (hammerf != NULL) {
    string hammerf_str(hammerf);
    if (hammerf_str.substr(hammerf_str.size()-3) == ".gz") {
	igzstream hammerf_in(hammerf);
	trusted->hammer_file_load(hammerf_in, atgc);
    } else {
	ifstream hammerf_in(hammerf);
	trusted->hammer_file_load(hammerf_in, atgc);
    }   
  }
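  // (Both branches above perform the same load; the only difference is reading
  //  through igzstream when the Hammer file is gzip-compressed.)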
  
  // get kmer counts
  if(merf != NULL) {
    string merf_str(merf);
    if(ATcutf != NULL) {
      if(merf_str.substr(merf_str.size()-3) == ".gz") {
	igzstream mer_in(merf);
	trusted->tab_file_load(mer_in, load_AT_cutoffs(), atgc);
      } else {
	ifstream mer_in(merf);
	trusted->tab_file_load(mer_in, load_AT_cutoffs(), atgc);
      }
    } else {
      if(merf_str.substr(merf_str.size()-3) == ".gz") {
	igzstream mer_in(merf);
	trusted->tab_file_load(mer_in, cutoff, atgc);
      } else {
	ifstream mer_in(merf);
	trusted->tab_file_load(mer_in, cutoff, atgc);
      }
    }

  // saved bithash
  } else if(bithashf != NULL) {
    if(strcmp(bithashf,"-") == 0) {
      cerr << "Saved bithash cannot be piped in.  Please specify file." << endl;
      exit(EXIT_FAILURE);
    } else
      trusted->binary_file_input(bithashf, atgc);
  }  
  cout << trusted->num_kmers() << " trusted kmers" << endl;

  double prior_prob[4];
  prior_prob[0] = (double)atgc[0] / (double)(atgc[0]+atgc[1]) / 2.0;
  prior_prob[1] = .5 - prior_prob[0];
  prior_prob[2] = prior_prob[1];
  prior_prob[3] = prior_prob[0];
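  // Priors are indexed A,C,G,T (assumed ordering): P(A) = P(T) = half the AT
  // fraction and P(C) = P(G) = half the GC fraction, so they always sum to 1.
  // e.g. atgc = {600, 400} gives priors {0.30, 0.20, 0.20, 0.30}.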
  
  //cout << "AT: " << atgc[0] << " GC: " << atgc[1] << endl;
  cout << "AT% = " << (2*prior_prob[0]) << endl;

  // make list of files
  vector<string> fastqfs;
  vector<int> pairedend_codes;
  parse_fastq(fastqfs, pairedend_codes);

  // process each file
  string fqf;
  bool zip;
  for(int f = 0; f < fastqfs.size(); f++) {
    fqf = fastqfs[f];
    cout << fqf << endl;

    // unzip
    if(fqf.substr(fqf.size()-3) == ".gz") {
      zip = true;
      unzip_fastq(fqf);
    } else
      zip = false;

    // determine quality value scale
    if(Read::quality_scale == -1)
      guess_quality_scale(fqf);

    // split file
    vector<streampos> starts;
    vector<unsigned long long> counts;
    chunkify_fastq(fqf, starts, counts);
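    // (chunkify_fastq() presumably records where each chunk of the FASTQ starts
    //  and how many reads it holds, so the learning and correction passes below
    //  can work on the file in parallel chunks.)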

    // learn nt->nt transitions
    double ntnt_prob[Read::max_qual][4][4] = {0};
    for(int q = 0; q < Read::max_qual; q++)
      for(int i = 0; i < 4; i++)
	for(int j = 0; j < 4; j++)
	  if(i != j)
	    ntnt_prob[q][i][j] = 1.0/3.0;
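    // (Substitution probabilities start out uniform: at every quality value each
    //  of the three possible mismatching bases gets probability 1/3 and the
    //  diagonal stays 0; learn_errors() then refines them from the data.)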

    if(!TESTING)
      learn_errors(fqf, trusted, starts, counts, ntnt_prob, prior_prob);

    // correct
    correct_reads(fqf, pairedend_codes[f], trusted, starts, counts, ntnt_prob, prior_prob);
    
    // combine
    if(pairedend_codes[f] == 0) {
      combine_output(fqf, string("cor"), uncorrected_out);
    }

    // combine paired end
    if(pairedend_codes[f] == 2) {
      if(!zip) {
	combine_output_paired(fastqfs[f-1], fqf, string("cor"), uncorrected_out);
      } else {
	combine_output_paired(fastqfs[f-1].substr(0,fastqfs[f-1].size()-3), fqf, string("cor"), uncorrected_out);
      }
    }

    if(zip)
      zip_fastq(fqf);
  }

  return 0;
}