Пример #1
0
/**
 * Remove the entry stored at index `pos` of the hash table: the slot is zeroed
 * and both the table-wide kmer count and the owning bucket's item count are
 * decremented with atomic fetch-and-sub.
 *
 * Safe to call on different entries at the same time.
 * NOT safe to do find() whilst doing delete().
 *
 * @param ht  hash table to delete from
 * @param pos index into ht->table; must be an assigned entry
 */
void hash_table_delete(HashTable *const ht, hkey_t pos)
{
  // Bucket that owns this slot
  const uint64_t bkt = pos / ht->bucket_size;
  uint64_t n, m;

  ctx_assert(pos != HASH_NOT_FOUND);
  ctx_assert(HASH_ENTRY_ASSIGNED(ht->table[pos]));

  // Wipe the slot
  memset(&ht->table[pos], 0, sizeof(BinaryKmer));

  // fetch-and-sub returns the value held *before* the decrement
  n = __sync_fetch_and_sub((volatile uint64_t *)&ht->num_kmers, 1);
  m = __sync_fetch_and_sub((volatile uint8_t *)&ht->buckets[bkt][HT_BITEMS], 1);

  ctx_assert2(n > 0, "Deleted from empty table");
  ctx_assert2(m > 0, "Deleted from empty bucket");
  ctx_assert(!HASH_ENTRY_ASSIGNED(ht->table[pos]));
}
Пример #2
0
/**
 * Callback for one loaded read (or read pair): print it to the sequence output
 * if whether-it-touches-the-graph differs from `input->invert`, then update
 * the read counters.
 *
 * @param data loaded read(s); `data->ptr` must point to an AlignReadsData.
 *             The second read is treated as absent when `data->r2.seq.end`
 *             is zero.
 * @param arg  unused
 */
void filter_reads(AsyncIOData *data, void *arg)
{
  (void)arg;
  read_t *r1 = (read_t*)&data->r1, *r2 = data->r2.seq.end ? (read_t*)&data->r2 : NULL;
  AlignReadsData *input = (AlignReadsData*)data->ptr;
  const dBGraph *db_graph = input->db_graph;
  LoadingStats *stats = input->stats;

  // %p requires a `void*` argument, hence the explicit cast
  ctx_assert2(r2 == NULL || input->seqout.is_pe,
              "Were not expecting r2: %p %i", (void*)r2, (int)input->seqout.is_pe);

  // Short-circuit: r2 is only tested when r1 does not touch the graph
  bool touches_graph = read_touches_graph(r1, db_graph, stats) ||
                       (r2 != NULL && read_touches_graph(r2, db_graph, stats));

  if(touches_graph != input->invert)
  {
    seqout_print(&input->seqout, r1, r2);
    // NOTE(review): plain (non-atomic) update, unlike the counters below -
    // confirm `input` is not shared between threads
    input->num_of_reads_printed += 1 + (r2 != NULL);
  }

  // A pair counts as two paired-end reads
  if(r2 == NULL) __sync_add_and_fetch((volatile size_t*)&stats->num_se_reads, 1);
  else           __sync_add_and_fetch((volatile size_t*)&stats->num_pe_reads, 2);

  size_t n = __sync_add_and_fetch(&read_counter, 1);
  ctx_update("FilterReads", n);
}
Пример #3
0
/**
 * Infer edges for every kmer record of a graph file by memory mapping the file
 * and rewriting the coverage/edge fields of updated records in place.
 *
 * Each record is laid out as: BinaryKmer, `ncols` Covg values, `ncols` Edges
 * values; records start `file->hdr_size` bytes into the file.
 *
 * @param db_graph      graph passed to the edge inference functions
 * @param add_all_edges if true use infer_all_edges(), else infer_pop_edges()
 * @param file          opened graph file; must not be a tty
 * @return number of kmer records that were modified
 */
static size_t inferedges_on_mmap(const dBGraph *db_graph, bool add_all_edges,
                                 GraphFileReader *file)
{
  ctx_assert(db_graph->num_of_cols == file->hdr.num_of_cols);
  ctx_assert(file_filter_is_direct(&file->fltr));
  ctx_assert2(!isatty(fileno(file->fh)), "Use inferedges_on_stream() instead");
  ctx_assert(file->num_of_kmers >= 0);
  ctx_assert(file->file_size >= 0);

  status("[inferedges] Processing mmap file: %s [hdr: %zu bytes file: %zu bytes]",
         file_filter_path(&file->fltr),
         (size_t)file->hdr_size, (size_t)file->file_size);

  const size_t ncols = file->hdr.num_of_cols;
  const size_t num_kmers = file->num_of_kmers;
  const size_t filekmersize = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg)) * ncols;
  const size_t hdr_size = (size_t)file->hdr_size;
  const size_t file_size = (size_t)file->file_size;

  // Every record touched by the loop below must lie inside the mapping
  // (written as a division to avoid overflow in num_kmers * filekmersize)
  if(hdr_size > file_size || num_kmers > (file_size - hdr_size) / filekmersize)
    die("File too small for %zu kmers: %s", num_kmers, file->fltr.path.b);

  if(fseek(file->fh, 0, SEEK_SET) != 0)
    die("fseek failed: %s", strerror(errno));

  // Open memory mapped file. Records are read as well as written, so request
  // PROT_READ explicitly: PROT_WRITE alone does not portably grant read access
  void *mmap_ptr = mmap(NULL, file->file_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        fileno(file->fh), 0);

  if(mmap_ptr == MAP_FAILED)
    die("Cannot memory map file: %s [%s]", file->fltr.path.b, strerror(errno));

  BinaryKmer bkmer;
  Edges edges[ncols];
  Covg covgs[ncols];

  bool updated;
  size_t i, num_kmers_edited = 0;

  char *ptr = (char*)mmap_ptr + hdr_size;

  for(i = 0; i < num_kmers; i++, ptr += filekmersize)
  {
    char *fh_covgs = ptr      + sizeof(BinaryKmer);
    char *fh_edges = fh_covgs + sizeof(Covg)*ncols;

    // Copy out of the mapping into local storage before use
    memcpy(bkmer.b, ptr,      sizeof(BinaryKmer));
    memcpy(covgs,   fh_covgs, ncols * sizeof(Covg));
    memcpy(edges,   fh_edges, ncols * sizeof(Edges));

    updated = (add_all_edges ? infer_all_edges(bkmer, edges, covgs, db_graph)
                             : infer_pop_edges(bkmer, edges, covgs, db_graph));

    // Only write back records that changed
    if(updated) {
      memcpy(fh_covgs, covgs, ncols * sizeof(Covg));
      memcpy(fh_edges, edges, ncols * sizeof(Edges));
      num_kmers_edited++;
    }
  }

  if(munmap(mmap_ptr, file->file_size) == -1)
    die("Cannot release mmap file: %s [%s]", file->fltr.path.b, strerror(errno));

  return num_kmers_edited;
}
Пример #4
0
/**
 * Orient a read pair according to `matedir`: FF and RR pairs are left as they
 * are, for FR the second read is reverse complemented and for RF the first
 * read is reverse complemented.
 *
 * @param r1      first read, must not be NULL
 * @param r2      second read, must not be NULL
 * @param matedir orientation of the pair as loaded
 */
void seq_reader_orient_mp_FF_or_RR(read_t *r1, read_t *r2, ReadMateDir matedir)
{
  ctx_assert(r1 != NULL);
  ctx_assert(r2 != NULL);

  // Nothing to do for these orientations
  if(matedir == READPAIR_FF || matedir == READPAIR_RR) return;

  if(matedir == READPAIR_FR) {
    seq_read_reverse_complement(r2);
    return;
  }

  if(matedir == READPAIR_RF) {
    seq_read_reverse_complement(r1);
    return;
  }

  // Should be unreachable
  ctx_assert2(0, "Invalid ReadMateDir value: %i", (int)matedir);
}
Пример #5
0
/**
 * Align each line (allele) of a call against the reference allele and pass the
 * alignment on to align_biallelic(). Counters in `dc->stats` record how many
 * calls / lines were seen, skipped and mapped.
 *
 * @param dc             decomposer holding aligner, scoring, alignment and stats
 * @param call           call to decompose; ignored (after counting) if it has
 *                       no chromosome
 * @param max_line_len   calls whose ref allele is longer are skipped entirely;
 *                       individual lines that are longer are skipped
 * @param max_allele_len passed through to align_biallelic()
 */
void acall_decompose(CallDecomp *dc, const AlignedCall *call,
                     size_t max_line_len, size_t max_allele_len)
{
  const read_t *chrom = call->chrom;

  dc->stats.ncalls++;
  if(chrom == NULL) return;
  dc->stats.ncalls_mapped++;

  const char *ref_allele = chrom->seq.b + call->start;
  size_t ref_len = call->end - call->start;

  ctx_assert2(call->start <= call->end, "%u .. %u", call->start, call->end);

  // Reference allele too long to align
  if(ref_len > max_line_len) {
    dc->stats.ncalls_ref_allele_too_long++;
    return;
  }

  dc->stats.nlines += call->n_lines;

  size_t line;
  for(line = 0; line < call->n_lines; line++)
  {
    const StrBuf *alt = &call->lines[line];
    ctx_assert(strlen(alt->b) == alt->end);

    // Cheap rejections first: line too long, or identical to the reference
    if(alt->end > max_line_len) {
      dc->stats.nlines_too_long++;
      continue;
    }

    if(ref_len == alt->end && strncasecmp(ref_allele, alt->b, ref_len) == 0) {
      dc->stats.nlines_match_ref++;
      continue;
    }

    needleman_wunsch_align2(ref_allele, alt->b, ref_len, alt->end,
                            dc->scoring, dc->nw_aligner, dc->aln);

    // Genotypes for this line start at gts + line * n_samples
    align_biallelic(dc->aln->result_a, dc->aln->result_b, chrom,
                    call->gts+line*call->n_samples, call->n_samples,
                    dc, call, max_allele_len);

    dc->stats.nlines_mapped++;
  }
}
Пример #6
0
/**
 * Parse command line options and positional arguments into variables declared
 * outside this function (out_path, sam_path, min_mapq, max_align_len,
 * max_allele_len, max_path_diff, the nw* alignment scores, input_path,
 * ref_paths, num_ref_paths).
 *
 * Options guarded by cmd_check() may only be given once. Numeric limits still
 * equal to SIZE_MAX after parsing are replaced by their DEFAULT_* values.
 * Requires at least two positional arguments: the input file followed by one
 * or more reference paths.
 */
static void parse_cmdline_args(int argc, char **argv)
{
  char cmd[100];
  char shortopts[300];
  int opt;

  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));

  // silence error messages from getopt_long
  // opterr = 0;

  while((opt = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1)
  {
    cmd_get_longopt_str(longopts, opt, cmd, sizeof(cmd));

    switch(opt) {
      case 0: /* flag set */
        break;
      case 'h':
        cmd_print_usage(NULL);
        break;
      case 'o':
        cmd_check(!out_path, cmd);
        out_path = optarg;
        break;
      case 'f':
        cmd_check(!futil_get_force(), cmd);
        futil_set_force(true);
        break;
      case 'F':
        cmd_check(!sam_path,cmd);
        sam_path = optarg;
        break;
      case 'Q':
        cmd_check(min_mapq == SIZE_MAX,cmd);
        min_mapq = cmd_uint32(cmd, optarg);
        break;
      case 'A':
        cmd_check(max_align_len  == SIZE_MAX,cmd);
        max_align_len = cmd_uint32(cmd, optarg);
        break;
      case 'L':
        cmd_check(max_allele_len == SIZE_MAX,cmd);
        max_allele_len = cmd_uint32(cmd, optarg);
        break;
      case 'D':
        cmd_check(max_path_diff  == SIZE_MAX, cmd);
        max_path_diff = cmd_uint32(cmd, optarg);
        break;
      case 'm':
        nwmatch = cmd_int32(cmd, optarg);
        break;
      case 'M':
        nwmismatch = cmd_int32(cmd, optarg);
        break;
      case 'g':
        nwgapopen = cmd_int32(cmd, optarg);
        break;
      case 'G':
        nwgapextend = cmd_int32(cmd, optarg);
        break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        die("`"CMD" calls2vcf -h` for help. Bad option: %s", argv[optind-1]);
      default:
        ctx_assert2(0, "shouldn't reach here: %c", opt);
    }
  }

  // Fall back to defaults for anything the user did not set
  if(out_path == NULL) out_path = default_out_path;
  if(min_mapq == SIZE_MAX) min_mapq = DEFAULT_MIN_MAPQ;
  if(max_align_len  == SIZE_MAX) max_align_len  = DEFAULT_MAX_ALIGN;
  if(max_allele_len == SIZE_MAX) max_allele_len = DEFAULT_MAX_ALLELE;
  if(max_path_diff  == SIZE_MAX) max_path_diff  = DEFAULT_MAX_PDIFF;

  // Need the input file plus one or more references
  if(optind+2 > argc)
    cmd_print_usage("Require <in.txt.gz> and at least one reference");

  input_path = argv[optind++];
  ref_paths = argv + optind;
  num_ref_paths = argc - optind;
}
Пример #7
0
/**
 * Remove from `src` every path that also occurs in `dst`. Before a matching
 * entry is dropped its colours are OR-ed into the `dst` entry and the nseen
 * counts are summed into it.
 *
 * Both subsets are sorted first if they are not already; they must belong to
 * path sets with the same number of colours.
 */
void gpath_subset_merge(GPathSubset *dst, GPathSubset *src)
{
  ctx_assert2(dst->gpset->ncols == src->gpset->ncols, "%zu vs %zu",
              dst->gpset->ncols, src->gpset->ncols);

  if(!dst->is_sorted) gpath_subset_sort(dst);
  if(!src->is_sorted) gpath_subset_sort(src);

  const size_t ncols = dst->gpset->ncols;
  GPath **dpaths = dst->list.b;
  GPath **spaths = src->list.b;

  // Nothing to match against
  if(dst->list.len == 0 || src->list.len == 0) return;

  // Walk the two sorted lists in step
  size_t d = 0, s = 0;
  while(d < dst->list.len && s < src->list.len)
  {
    int order = gpath_cmp(dpaths[d], spaths[s]);

    if(order < 0) {
      d++;
    }
    else if(order > 0) {
      s++;
    }
    else {
      // Same path in both: fold src's colours and counts into dst, then mark
      // the src entry for removal
      gpath_colset_or_mt(dpaths[d], spaths[s], ncols);
      gpath_set_nseen_sum_mt(dpaths[d], dst->gpset,
                             spaths[s], src->gpset);
      spaths[s] = NULL;
      s++;
    }
  }

  // Compact src, dropping the entries marked NULL above
  size_t rd, wr = 0;
  for(rd = 0; rd < src->list.len; rd++) {
    if(spaths[rd] != NULL) {
      spaths[wr] = spaths[rd];
      wr++;
    }
  }

  src->list.len = wr;
}
Пример #8
0
/**
 * Infer edges for every kmer of a graph file, writing the header and all
 * kmers (updated or not) to `fout`.
 * Uses a seekable file so the whole graph need not be loaded.
 *
 * @param db_graph      graph passed to the edge inference functions
 * @param add_all_edges if true use infer_all_edges(), else infer_pop_edges()
 * @param file          input graph file; must not be a tty
 * @param fout          output stream; must differ from the input file
 * @return number of kmers that were modified
 */
static size_t inferedges_on_file(const dBGraph *db_graph, bool add_all_edges,
                                 GraphFileReader *file, FILE *fout)
{
  ctx_assert(db_graph->num_of_cols == file->hdr.num_of_cols);
  ctx_assert(file_filter_is_direct(&file->fltr));
  ctx_assert2(!isatty(fileno(file->fh)), "Use inferedges_on_stream() instead");
  ctx_assert(fout != NULL);
  ctx_assert(fileno(file->fh) != fileno(fout));

  status("[inferedges] Processing file: %s", file_filter_path(&file->fltr));

  // Header is copied through unchanged
  graph_write_header(fout, &file->hdr);

  // Position the input on the first kmer record
  if(fseek(file->fh, file->hdr_size, SEEK_SET) != 0)
    die("fseek failed: %s", strerror(errno));

  const size_t ncols = file->hdr.num_of_cols;
  BinaryKmer bkmer;
  Edges edges[ncols];
  Covg covgs[ncols];
  size_t num_kmers_edited = 0;

  while(graph_file_read_reset(file, ncols, &bkmer, covgs, edges))
  {
    bool updated;

    if(add_all_edges)
      updated = infer_all_edges(bkmer, edges, covgs, db_graph);
    else
      updated = infer_pop_edges(bkmer, edges, covgs, db_graph);

    // Every kmer is written out, whether or not it changed
    graph_write_kmer(fout, file->hdr.num_of_bitfields, file->hdr.num_of_cols,
                     bkmer, covgs, edges);

    if(updated) num_kmers_edited++;
  }

  return num_kmers_edited;
}
Пример #9
0
/**
 * Translate the reason a traversal stopped into an AssemStopCause.
 * Exactly one of the four possible reasons is expected to hold (asserted).
 * The three flags are tested before the step status.
 *
 * @param step             status of the last graph step
 * @param hit_cycle        traversal stopped because of a cycle
 * @param low_step_confid  traversal stopped on low step confidence
 * @param low_cumul_confid traversal stopped on low cumulative confidence
 * @return the matching ASSEM_STOP_* value; calls die() on an unknown status
 */
enum AssemStopCause graphstep2assem(enum GraphStepStatus step, bool hit_cycle,
                                    bool low_step_confid, bool low_cumul_confid)
{
  // There should only be one reason to stop traversal
  ctx_assert2((!grap_step_status_is_good(step) +
               !!hit_cycle + !!low_step_confid + !!low_cumul_confid) == 1,
              "One and only one should be true %i %i %i %i",
              (int)step, (int)hit_cycle,
              (int)low_step_confid, (int)low_cumul_confid);

  if(hit_cycle) {
    return ASSEM_STOP_CYCLE;
  }
  else if(low_step_confid) {
    return ASSEM_STOP_LOW_STEP_CONF;
  }
  else if(low_cumul_confid) {
    return ASSEM_STOP_LOW_CUMUL_CONF;
  }

  // No flag set: the step status itself is the reason
  if(step == GRPHWLK_NOCOVG)        return ASSEM_STOP_NOCOVG;
  if(step == GRPHWLK_NOCOLCOVG)     return ASSEM_STOP_NOCOLCOVG;
  if(step == GRPHWLK_NOPATHS)       return ASSEM_STOP_NOPATHS;
  if(step == GRPHWLK_SPLIT_PATHS)   return ASSEM_STOP_SPLIT_PATHS;
  if(step == GRPHWLK_MISSING_PATHS) return ASSEM_STOP_MISSING_PATHS;

  die("Unknown %i", (int)step);
}
Пример #10
0
/**
 * Entry point of the calls2vcf command: read a bubble or breakpoint call file,
 * align each call against the reference and write the result as VCF.
 *
 * Positional arguments are the call file followed by one or more reference
 * paths. Bubble call files additionally require a flank SAM/BAM file (-F).
 *
 * @param argc number of command line arguments
 * @param argv command line arguments
 * @return EXIT_SUCCESS; errors are reported through die()/cmd_print_usage()
 */
int ctx_calls2vcf(int argc, char **argv)
{
  const char *in_path = NULL, *out_path = NULL, *out_type = NULL;
  // Filtering parameters (negative means "not set")
  int32_t min_mapq = -1, max_align_len = -1, max_allele_len = -1;
  // Alignment parameters
  int nwmatch = 1, nwmismatch = -2, nwgapopen = -4, nwgapextend = -1;
  // ref paths
  char const*const* ref_paths = NULL;
  size_t nref_paths = 0;
  // flank file
  const char *sam_path = NULL;

  //
  // Things we figure out by looking at the input
  //
  bool isbubble = false;
  // samples in VCF, (0 for bubble, does not include ref in breakpoint calls)
  size_t i, kmer_size, num_samples;

  //
  // Reference genome
  //
  // Hash map of chromosome name -> sequence
  ChromHash *genome;
  ReadBuffer chroms;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'O': cmd_check(!out_type, cmd); out_type = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'F': cmd_check(!sam_path,cmd); sam_path = optarg; break;
      case 'Q': cmd_check(min_mapq < 0,cmd); min_mapq = cmd_uint32(cmd, optarg); break;
      case 'A': cmd_check(max_align_len  < 0,cmd); max_align_len  = cmd_uint32(cmd, optarg); break;
      case 'L': cmd_check(max_allele_len < 0,cmd); max_allele_len = cmd_uint32(cmd, optarg); break;
      case 'm': nwmatch = cmd_int32(cmd, optarg); break;
      case 'M': nwmismatch = cmd_int32(cmd, optarg); break;
      case 'g': nwgapopen = cmd_int32(cmd, optarg); break;
      case 'G': nwgapextend = cmd_int32(cmd, optarg); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]);
      default: ctx_assert2(0, "shouldn't reach here: %c", c);
    }
  }

  // Defaults for unset values
  // (min_mapq default depends on the input format, see below)
  if(out_path == NULL) out_path = "-";
  if(max_align_len  < 0) max_align_len  = DEFAULT_MAX_ALIGN;
  if(max_allele_len < 0) max_allele_len = DEFAULT_MAX_ALLELE;

  if(optind+2 > argc)
    cmd_print_usage("Require <in.txt.gz> and at least one reference");

  in_path = argv[optind++];
  ref_paths = (char const*const*)argv + optind;
  nref_paths = argc - optind;

  // These functions call die() on error
  gzFile gzin = futil_gzopen(in_path, "r");

  // Read call file header
  cJSON *json = json_hdr_load(gzin, in_path);

  // Check we can handle the kmer size
  kmer_size = json_hdr_get_kmer_size(json, in_path);
  db_graph_check_kmer_size(kmer_size, in_path);

  // Get format (bubble or breakpoint file)
  cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, in_path);
  if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) isbubble = false;
  else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) isbubble = true;
  else die("Unknown format: '%s'", json_fmt->valuestring);

  status("Reading %s in %s format", futil_inpath_str(in_path),
         isbubble ? "bubble" : "breakpoint");

  if(isbubble) {
    // bubble specific
    if(sam_path == NULL)
      cmd_print_usage("Require -F <flanks.sam> with bubble file");
    if(min_mapq < 0) min_mapq = DEFAULT_MIN_MAPQ;
  }
  else {
    // breakpoint specific
    if(min_mapq >= 0)
      cmd_print_usage("-Q,--min-mapq <Q> only valid with bubble calls");
  }

  // Open flank file if it exists
  htsFile *samfh = NULL;
  bam_hdr_t *bam_hdr = NULL;
  bam1_t *mflank = NULL;

  if(sam_path)
  {
    if((samfh = hts_open(sam_path, "r")) == NULL)
      die("Cannot open SAM/BAM %s", sam_path);

    // Load BAM header
    bam_hdr = sam_hdr_read(samfh);
    if(bam_hdr == NULL) die("Cannot load BAM header: %s", sam_path);
    mflank = bam_init1();
    if(mflank == NULL) die("Cannot allocate BAM record: %s", sam_path);
  }

  // Output VCF has 0 samples if bubbles file, otherwise has N where N is
  // number of samples/colours in the breakpoint graph
  size_t num_graph_samples = json_hdr_get_ncols(json, in_path);
  size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, in_path);

  num_samples = 0;
  if(!isbubble) {
    // If last colour has "is_ref", drop number of samples by one
    num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1
                                                       : num_graph_samples;
  }

  //
  // Open output file (out_path already defaulted to "-" above)
  //
  int mode = vcf_misc_get_outtype(out_type, out_path);
  futil_create_output(out_path);
  htsFile *vcffh = hts_open(out_path, modes_htslib[mode]);
  // hts_open() returns NULL on failure; do not hand that to bcf_hdr_write()
  if(vcffh == NULL)
    die("Cannot open output file: %s", futil_outpath_str(out_path));

  status("[calls2vcf] Reading %s call file with %zu samples",
         isbubble ? "Bubble" : "Breakpoint", num_graph_samples);
  status("[calls2vcf] %zu sample output to: %s format: %s",
         num_samples, futil_outpath_str(out_path), hsmodes_htslib[mode]);

  if(isbubble) status("[calls2vcf] min. MAPQ: %i", min_mapq);
  status("[calls2vcf] max alignment length: %i", max_align_len);
  status("[calls2vcf] max VCF allele length: %i", max_allele_len);
  status("[calls2vcf] alignment match:%i mismatch:%i gap open:%i extend:%i",
         nwmatch, nwmismatch, nwgapopen, nwgapextend);

  // Load reference genome
  read_buf_alloc(&chroms, 1024);
  genome = chrom_hash_init();
  chrom_hash_load(ref_paths, nref_paths, &chroms, genome);

  // convert to upper case
  // (toupper() requires a value representable as unsigned char)
  char *s;
  for(i = 0; i < chroms.len; i++)
    for(s = chroms.b[i].seq.b; *s; s++) *s = (char)toupper((unsigned char)*s);

  if(!isbubble) brkpnt_check_refs_match(json, genome, in_path);

  bcf_hdr_t *vcfhdr = make_vcf_hdr(json, in_path, !isbubble, kmer_size,
                                   ref_paths, nref_paths,
                                   chroms.b, chroms.len);

  if(bcf_hdr_write(vcffh, vcfhdr) != 0) die("Cannot write VCF header");

  AlignedCall *call = acall_init();
  CallDecomp *aligner = call_decomp_init(vcffh, vcfhdr);

  scoring_t *scoring = call_decomp_get_scoring(aligner);
  scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend,
               false, false, 0, 0, 0, 0);

  CallFileEntry centry;
  call_file_entry_alloc(&centry);

  // Appended to the info of every call
  char kmer_str[50];
  snprintf(kmer_str, sizeof(kmer_str), ";K%zu", kmer_size);

  if(isbubble)
  {
    // Bubble calls
    DecompBubble *bubbles = decomp_bubble_init();

    // Set scoring for aligning 3' flank
    scoring = decomp_bubble_get_scoring(bubbles);
    scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend,
                 true, true, 0, 0, 0, 0);

    while(call_file_read(gzin, in_path, &centry)) {
      // Fetch the next SAM record that is neither secondary nor supplementary
      do {
        if(sam_read1(samfh, bam_hdr, mflank) < 0)
          die("We've run out of SAM entries!");
      } while(mflank->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY));

      // Align call
      strbuf_reset(&call->info);
      decomp_bubble_call(bubbles, genome, kmer_size, min_mapq,
                         &centry, mflank, bam_hdr, call);
      strbuf_append_str(&call->info, kmer_str);
      acall_decompose(aligner, call, max_align_len, max_allele_len);
    }

    // print bubble stats
    DecompBubbleStats *bub_stats = ctx_calloc(1, sizeof(*bub_stats));
    decomp_bubble_cpy_stats(bub_stats, bubbles);
    print_bubble_stats(bub_stats);
    ctx_free(bub_stats);

    decomp_bubble_destroy(bubbles);
  }
  else
  {
    // Breakpoint calls
    DecompBreakpoint *breakpoints = decomp_brkpt_init();

    while(call_file_read(gzin, in_path, &centry)) {
      strbuf_reset(&call->info);
      decomp_brkpt_call(breakpoints, genome, num_samples, &centry, call);
      strbuf_append_str(&call->info, kmer_str);
      acall_decompose(aligner, call, max_align_len, max_allele_len);
    }

    // print breakpoint stats
    DecompBreakpointStats *brk_stats = ctx_calloc(1, sizeof(*brk_stats));
    decomp_brkpt_cpy_stats(brk_stats, breakpoints);
    print_breakpoint_stats(brk_stats);
    ctx_free(brk_stats);

    decomp_brkpt_destroy(breakpoints);
  }

  // Print stats
  DecomposeStats *astats = ctx_calloc(1, sizeof(*astats));
  call_decomp_cpy_stats(astats, aligner);
  print_acall_stats(astats);
  ctx_free(astats);

  call_file_entry_dealloc(&centry);
  call_decomp_destroy(aligner);
  acall_destroy(call);

  // Finished - clean up
  cJSON_Delete(json);
  gzclose(gzin);

  bcf_hdr_destroy(vcfhdr);
  hts_close(vcffh);

  for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]);
  read_buf_dealloc(&chroms);
  chrom_hash_destroy(genome);

  if(sam_path) {
    hts_close(samfh);
    bam_hdr_destroy(bam_hdr);
    bam_destroy1(mflank);
  }

  return EXIT_SUCCESS;
}
Пример #11
0
/**
 * Read every entry of a call file, look up its reference coordinates and, if
 * it maps, align it and write the result to `fout`.
 *
 * Bubble format entries get their coordinates from sam_fetch_coords() after
 * trimming the alleles; other entries use brkpnt_fetch_coords(). The counter
 * `num_entries_read` (declared outside this function) is incremented once per
 * entry.
 *
 * @param gzin call file to read entries from
 * @param fout output stream passed on to align_entry()
 */
static void parse_entries(gzFile gzin, FILE *fout)
{
  CallFileEntry centry;
  ChromPosBuffer chrposbuf;
  StrBuf tmpbuf, flank3pbuf;

  call_file_entry_alloc(&centry);
  chrompos_buf_alloc(&chrposbuf, 32);
  strbuf_alloc(&tmpbuf, 1024);
  strbuf_alloc(&flank3pbuf, 1024);

  const char *flank5p, *flank3p;
  size_t flank5p_len, flank3p_len;
  size_t cpy_flnk_5p, cpy_flnk_3p;

  // Filled in by the *_fetch_coords() calls
  const read_t *chrom = NULL;
  size_t ref_start = 0, ref_end = 0;
  bool mapped = false, fw_strand = false;

  // Genotype array is only allocated when input is not in bubble format
  const char **genotypes = NULL;
  if(!input_bubble_format)
    genotypes = ctx_calloc(num_samples, sizeof(char*));

  for(; call_file_read(gzin, input_path, &centry); num_entries_read++)
  {
    size_t nlines = call_file_num_lines(&centry);
    ctx_assert2(!(nlines&1) && nlines >= 6, "Too few lines: %zu", nlines);

    flank5p = call_file_get_line(&centry,1);
    flank5p_len = call_file_line_len(&centry,1);
    cpy_flnk_5p = cpy_flnk_3p = 0;

    if(input_bubble_format)
    {
      // Trim down alleles, add to 3p flank, then use the SAM entry
      bubble_trim_alleles(&centry, &flank3pbuf);
      flank3p = flank3pbuf.b;
      flank3p_len = flank3pbuf.end;

      mapped = sam_fetch_coords(&centry, flank5p, flank5p_len, flank3p, flank3p_len,
                                &cpy_flnk_5p, &cpy_flnk_3p,
                                &chrom, &ref_start, &ref_end, &fw_strand);
    }
    else
    {
      flank3p = call_file_get_line(&centry, 3);
      flank3p_len = call_file_line_len(&centry, 3);

      mapped = brkpnt_fetch_coords(&centry, &chrposbuf,
                                   &chrom, &ref_start, &ref_end, &fw_strand,
                                   &cpy_flnk_5p, &cpy_flnk_3p);
    }

    // Unmapped entries are counted but otherwise ignored
    if(!mapped) continue;

    // Get call id from the entry's first line
    const char *hdrline = call_file_get_line(&centry, 0);
    char callid[100];
    int r = get_callid_str(hdrline, input_bubble_format, callid, sizeof(callid));
    if(r == -1) die("Poorly formatted: %s", hdrline);
    if(r == -2) die("Call id string is too long: %s", hdrline);

    align_entry(&centry, callid, flank5p, flank5p_len, flank3p, flank3p_len,
                cpy_flnk_5p, cpy_flnk_3p,
                chrom, ref_start, ref_end, fw_strand,
                &tmpbuf, genotypes,
                fout);
  }

  // Release everything allocated above
  ctx_free(genotypes);
  call_file_entry_dealloc(&centry);
  chrompos_buf_dealloc(&chrposbuf);
  strbuf_dealloc(&tmpbuf);
  strbuf_dealloc(&flank3pbuf);
}
Пример #12
0
/**
 * Pick a cleaning threshold from kmer coverage histogram. Assumes low coverage
 * kmers are all due to error. Fits a poisson with a gamma distributed mean.
 * Three strategies are tried in turn until one returns a cutoff:
 * an FDR threshold of 0.001, FP < FN, and loss vs error. The chosen cutoff
 * must then pass is_cutoff_good() with a fraction of 0.2.
 *
 * Translated from Gil McVean's initial proposed method in R code
 *
 * @param kmer_covg Histogram of kmer counts at coverages 1,2,.. arrlen-1
 *                  (kmer_covg[0] must be zero)
 * @param arrlen    Length of array kmer_covg, must be at least 10
 * @param alpha_est_ptr If not NULL, used to return estimate for alpha
 * @param beta_est_ptr  If not NULL, used to return estimate for beta
 * @param false_pos_ptr If not NULL, used to return the false positive value
 *                      from cutoff_get_FP_FN() (only set when a cutoff is
 *                      returned)
 * @param false_neg_ptr If not NULL, used to return the false negative value
 *                      from cutoff_get_FP_FN() (only set when a cutoff is
 *                      returned)
 * @return -1 if no acceptable cut-off was found, otherwise returns coverage
 *         cutoff
 */
int cleaning_pick_kmer_threshold(const uint64_t *kmer_covg, size_t arrlen,
                                 double *alpha_est_ptr, double *beta_est_ptr,
                                 double *false_pos_ptr, double *false_neg_ptr)
{
  ctx_assert(arrlen >= 10);
  ctx_assert2(kmer_covg[0] == 0, "Shouldn't see any kmers with coverage zero");

  size_t i, min_a_est_idx = 0;
  double r1, r2, rr, min_a_est = DBL_MAX, tmp;
  double aa, faa, a_est, b_est, c0;

  // Ratios of consecutive histogram bins
  r1 = (double)kmer_covg[2] / kmer_covg[1];
  r2 = (double)kmer_covg[3] / kmer_covg[2];
  rr = r2 / r1;

  // iterate aa = { 0.01, 0.02, ..., 1.99, 2.00 }
  // find aa value that minimises abs(faa-rr)
  for(i = 1; i <= 200; i++)
  {
    aa = i*0.01;
    faa = tgamma(aa)*tgamma(aa+2) / (2*pow(tgamma(aa+1),2));
    tmp = fabs(faa-rr);
    if(tmp < min_a_est) { min_a_est = tmp; min_a_est_idx = i; }
  }

  // a_est, b_est are estimates for alpha, beta of gamma distribution
  a_est = min_a_est_idx*0.01;
  b_est = tgamma(a_est + 1.0) / (r1 * tgamma(a_est)) - 1.0;
  b_est = MAX2(b_est, 1); // Avoid beta values <1
  c0 = kmer_covg[1] * pow(b_est/(1+b_est),-a_est);

  if(alpha_est_ptr) *alpha_est_ptr = a_est;
  if(beta_est_ptr)  *beta_est_ptr  = b_est;

  // keep coverage estimates on the stack - this should be ok
  double e_covg_tmp, e_covg[arrlen];
  double e_total = 0;
  uint64_t d_total = 0;

  // Index 0 is not filled by the loop below. Give it a defined value so the
  // helpers never see an indeterminate element (no kmers have coverage zero)
  e_covg[0] = 0;

  // Calculate some values here for speed
  double log_b_est          = log(b_est);
  double log_one_plus_b_est = log(1 + b_est);
  double lgamma_a_est       = lgamma(a_est);

  // note: lfactorial(x) = lgamma(x+1)

  // Expected number of error kmers at each coverage, plus totals of the
  // expected and observed counts
  for(i = 1; i < arrlen; i++)
  {
    e_covg_tmp = a_est * log_b_est - lgamma_a_est - lgamma(i)
                   + lgamma(a_est + i - 1)
                   - (a_est + i - 1) * log_one_plus_b_est;
    e_covg[i] = exp(e_covg_tmp) * c0;
    e_total += e_covg[i];
    d_total += kmer_covg[i];
  }

  // Find cutoff by finding first coverage level where errors make up less than
  // 0.1% of total coverage
  int cutoff = pick_cutoff_with_fdr_thresh(e_covg, kmer_covg, arrlen, 0.001);

  // Pick highest cutoff that keeps FP < FN
  if(cutoff < 0)
    cutoff = pick_cutoff_FP_lt_FN(e_covg, e_total, kmer_covg, d_total, arrlen);

  if(cutoff < 0)
    cutoff = pick_cutoff_loss_vs_error(e_covg, e_total, kmer_covg, arrlen);

  if(cutoff < 0) return -1;

  // Check cutoff keeps at least 20% of coverage
  // (WGS should be much higher, Exome sequencing needs low cutoff)
  if(!is_cutoff_good(kmer_covg, arrlen, cutoff, 0.2)) return -1;

  // Calculate FP,FN rates
  if(false_pos_ptr || false_neg_ptr) {
    double false_pos = 0, false_neg = 0;
    cutoff_get_FP_FN(e_covg, e_total, kmer_covg, d_total, cutoff,
                     &false_pos, &false_neg);
    if(false_pos_ptr) *false_pos_ptr = false_pos;
    if(false_neg_ptr) *false_neg_ptr = false_neg;
  }

  return cutoff;
}
Пример #13
0
/**
 * Pick a cleaning threshold from kmer coverage histogram. Assumes low coverage
 * kmers are all due to error, to which it fits a gamma distribution. Then
 * chooses a cleaning threshold such that FDR (uncleaned kmers) occur at a rate
 * of < the FDR parameter.
 *
 * Translated from Gil McVean's proposed method in R code
 *
 * @param kmer_covg Histogram of kmer counts at coverages 1,2,.. arrlen-1
 *                  (kmer_covg[0] must be zero)
 * @param arrlen    Length of array kmer_covg, must be at least 10
 * @param fdr_limit False discovery rate for a single kmer coverage
 *                  (1/1000 i.e. 0.001 is reasonable), must be in (0,1)
 * @param alpha_est_ptr If not NULL, used to return estimate for alpha
 * @param beta_est_ptr  If not NULL, used to return estimate for beta
 * @return -1 if no cut-off satisfies FDR or the cut-off would keep less than
 *         20% of coverage, otherwise returns coverage cutoff
 */
int cleaning_pick_kmer_threshold(const uint64_t *kmer_covg, size_t arrlen,
                                 double fdr_limit,
                                 double *alpha_est_ptr, double *beta_est_ptr)
{
  ctx_assert(arrlen >= 10);
  ctx_assert2(0 < fdr_limit && fdr_limit < 1, "expected 0 < FDR < 1: %f", fdr_limit);
  ctx_assert2(kmer_covg[0] == 0, "Shouldn't see any kmers with coverage zero");

  size_t i, min_a_est_idx = 0;
  double r1, r2, rr, min_a_est = DBL_MAX, tmp;
  double aa, faa, a_est, b_est, c0;

  // Ratios of consecutive histogram bins
  r1 = (double)kmer_covg[2] / kmer_covg[1];
  r2 = (double)kmer_covg[3] / kmer_covg[2];
  rr = r2 / r1;

  // iterate aa = { 0.01, 0.02, ..., 1.99, 2.00 }
  // find aa value that minimises abs(faa-rr)
  for(i = 1; i <= 200; i++)
  {
    aa = i*0.01;
    faa = tgamma(aa)*tgamma(aa+2) / (2*pow(tgamma(aa+1),2));
    tmp = fabs(faa-rr);
    if(tmp < min_a_est) { min_a_est = tmp; min_a_est_idx = i; }
  }

  // a_est, b_est are estimates for alpha, beta of gamma distribution
  a_est = min_a_est_idx*0.01;
  b_est = tgamma(a_est + 1.0) / (r1 * tgamma(a_est)) - 1.0;
  b_est = MAX2(b_est, 0.000001); // Avoid negative beta
  c0 = kmer_covg[1] * pow(b_est/(1+b_est),-a_est);

  if(alpha_est_ptr) *alpha_est_ptr = a_est;
  if(beta_est_ptr)  *beta_est_ptr  = b_est;

  // Initialise fdr to be greater than fdr_limit
  double e_cov, e_cov_c0, fdr = 2.0, log_b_est, log_one_plus_b_est, lgamma_a_est;

  // Calculate some values here for speed
  log_b_est          = log(b_est);
  log_one_plus_b_est = log(1 + b_est);
  lgamma_a_est       = lgamma(a_est);

  // note: lfactorial(x) = lgamma(x+1)

  // Start at coverage 1: coverage 0 holds no kmers and would evaluate
  // lgamma(0) and divide by zero. Empty bins are skipped for the same reason;
  // they could never satisfy the limit anyway.
  for(i = 1; i < arrlen; i++)
  {
    if(kmer_covg[i] == 0) continue;

    e_cov = a_est * log_b_est - lgamma_a_est - lgamma(i) + lgamma(a_est + i - 1) -
            (a_est + i - 1) * log_one_plus_b_est;
    e_cov_c0 = exp(e_cov) * c0;
    // Fraction of kmers at this coverage expected to be errors
    fdr = 1.0 - (kmer_covg[i] - e_cov_c0) / kmer_covg[i];
    if(fdr < fdr_limit) break;
  }
  size_t cutoff = i;

  // Coverage-weighted kmer counts below / at-or-above the cutoff
  uint64_t kmers_below = 0, kmers_above = 0;
  for(i = 0;      i < cutoff; i++) kmers_below += kmer_covg[i]*i;
  for(i = cutoff; i < arrlen; i++) kmers_above += kmer_covg[i]*i;

  // At least 20% of coverage should be kept
  bool good_cutoff = ((double)kmers_above/(kmers_below+kmers_above) >= 0.2);

  // If the loop ran to the end without a break, fdr is not below the limit
  return fdr < fdr_limit && good_cutoff ? (int)cutoff : -1;
}
Пример #14
0
// Print a summary of contig assembly statistics using status()/message().
//
// Reports, in order: contig counts and seed sources, length and junction
// distributions (mean, median, N50, min, max, total), the outdegree table,
// path usage distributions, and the reasons traversal continued or halted.
//
// Note: s->lengths.b and s->junctns.b are sorted in place by qsort(), even
// though `s` itself is passed as a pointer to const.
void assemble_contigs_stats_print(const AssembleContigStats *s)
{
  ctx_assert(s->lengths.len == s->junctns.len);
  ctx_assert(s->lengths.len == s->num_contigs);

  const size_t n_ctgs = s->num_contigs;

  // Nothing to summarise (also avoids dividing by zero below)
  if(n_ctgs == 0) {
    status("[asm] No contigs assembled");
    return;
  }

  // Sort both arrays; first / last entries are then reported as min / max
  qsort(s->lengths.b, n_ctgs, sizeof(s->lengths.b[0]), cmp_size);
  qsort(s->junctns.b, n_ctgs, sizeof(s->junctns.b[0]), cmp_size);

  // Contig length summary
  size_t length_n50    = calc_N50(s->lengths.b, n_ctgs, s->total_len);
  size_t length_median = MEDIAN(s->lengths.b, n_ctgs);
  size_t length_mean   = (double)s->total_len / n_ctgs;
  size_t length_min    = s->lengths.b[0];
  size_t length_max    = s->lengths.b[n_ctgs-1];

  // Junction count summary
  size_t junc_n50    = calc_N50(s->junctns.b, n_ctgs, s->total_junc);
  size_t junc_median = MEDIAN(s->junctns.b, n_ctgs);
  size_t junc_mean   = (double)s->total_junc / n_ctgs;
  size_t junc_min    = s->junctns.b[0];
  size_t junc_max    = s->junctns.b[n_ctgs-1];

  // Contig counts and where the contigs were seeded from
  char txt_nctgs[50], txt_kmer_seeds[50], txt_path_seeds[50];
  char txt_reseed[50], txt_seed_missing[50];

  long_to_str(n_ctgs, txt_nctgs);
  long_to_str(s->num_contigs_from_seed_kmers, txt_kmer_seeds);
  long_to_str(s->num_contigs_from_seed_paths, txt_path_seeds);
  long_to_str(s->num_reseed_abort, txt_reseed);
  long_to_str(s->num_seeds_not_found, txt_seed_missing);

  status(PREFIX"pulled out %s contigs, %s from seed kmers, %s from seed paths",
         txt_nctgs, txt_kmer_seeds, txt_path_seeds);
  status(PREFIX"no-reseed aborted %s times", txt_reseed);
  status(PREFIX"seed kmer not found %s times", txt_seed_missing);

  // ulong_to_str is used rather than num_to_str so that exact values are
  // printed, e.g. 966 instead of 1K
  char txt_len_mean[50], txt_len_median[50], txt_len_n50[50];
  char txt_len_min[50], txt_len_max[50], txt_len_total[50];

  ulong_to_str(length_mean,   txt_len_mean);
  ulong_to_str(length_median, txt_len_median);
  ulong_to_str(length_n50,    txt_len_n50);
  ulong_to_str(length_min,    txt_len_min);
  ulong_to_str(length_max,    txt_len_max);
  ulong_to_str(s->total_len,  txt_len_total);

  char txt_jnc_mean[50], txt_jnc_median[50], txt_jnc_n50[50];
  char txt_jnc_min[50], txt_jnc_max[50], txt_jnc_total[50];

  ulong_to_str(junc_mean,     txt_jnc_mean);
  ulong_to_str(junc_median,   txt_jnc_median);
  ulong_to_str(junc_n50,      txt_jnc_n50);
  ulong_to_str(junc_min,      txt_jnc_min);
  ulong_to_str(junc_max,      txt_jnc_max);
  ulong_to_str(s->total_junc, txt_jnc_total);

  status(PREFIX"Lengths: mean: %s  median: %s  N50: %s  min: %s  max: %s  total: %s [kmers]",
         txt_len_mean, txt_len_median, txt_len_n50,
         txt_len_min, txt_len_max, txt_len_total);
  status(PREFIX"Junctions: mean: %s  median: %s  N50: %s  min: %s  max: %s  total: %s [out >1]",
         txt_jnc_mean, txt_jnc_median, txt_jnc_n50,
         txt_jnc_min, txt_jnc_max, txt_jnc_total);
  status(PREFIX"Max junction density: %.2f\n", s->max_junc_density);

  // Outdegree table: counts for outdegree 0..4 on a single line
  timestamp();
  message(PREFIX" Outdegree: ");
  char nout_str[50];

  for(size_t deg = 0; deg <= 4; deg++) {
    // Percentage relative to 2*n_ctgs, rounded to the nearest integer
    size_t pct = (size_t)((100.0*s->contigs_outdegree[deg])/(2.0*n_ctgs)+0.5);
    message("\t%zu:%s [%zu%%]", deg,
            ulong_to_str(s->contigs_outdegree[deg], nout_str), pct);
  }
  message("\n");

  _print_path_dist(s->paths_held, AC_MAX_PATHS, "Paths held",    n_ctgs);
  _print_path_dist(s->paths_cntr, AC_MAX_PATHS, "Paths counter", n_ctgs);

  // Why the walker was able to take each step
  const uint64_t *states = s->grphwlk_steps;
  size_t nsteps = s->total_len - s->num_contigs;
  size_t ncontigends = 2*s->num_contigs;

  status(PREFIX"Traversal succeeded because:");
  _print_grphwlk_state("Pop straight ......... ", states[GRPHWLK_POPFWD],        nsteps);
  _print_grphwlk_state("Col straight ......... ", states[GRPHWLK_COLFWD],        nsteps);
  _print_grphwlk_state("PopFork use colour ... ", states[GRPHWLK_POPFRK_COLFWD], nsteps);
  _print_grphwlk_state("Go paths ............. ", states[GRPHWLK_USEPATH],       nsteps);

  // Why the walker stopped at each contig end
  const uint64_t *stops = s->stop_causes;

  status(PREFIX"Traversal halted because:");
  _print_grphwlk_state("No coverage .......... ", stops[ASSEM_STOP_NOCOVG],         ncontigends);
  _print_grphwlk_state("No colour covg ....... ", stops[ASSEM_STOP_NOCOLCOVG],      ncontigends);
  _print_grphwlk_state("No paths ............. ", stops[ASSEM_STOP_NOPATHS],        ncontigends);
  _print_grphwlk_state("Paths split .......... ", stops[ASSEM_STOP_SPLIT_PATHS],    ncontigends);
  _print_grphwlk_state("Missing paths ........ ", stops[ASSEM_STOP_MISSING_PATHS],  ncontigends);
  _print_grphwlk_state("Graph cycles ......... ", stops[ASSEM_STOP_CYCLE],          ncontigends);
  _print_grphwlk_state("Low step confidence .. ", stops[ASSEM_STOP_LOW_STEP_CONF],  ncontigends);
  _print_grphwlk_state("Low cumul. confidence  ", stops[ASSEM_STOP_LOW_CUMUL_CONF], ncontigends);

  // Junctions encountered = resolved with paths + stops caused by paths
  size_t njunc = states[GRPHWLK_USEPATH];
  njunc += stops[ASSEM_STOP_NOPATHS];
  njunc += stops[ASSEM_STOP_SPLIT_PATHS];
  njunc += stops[ASSEM_STOP_MISSING_PATHS];

  ctx_assert2(s->total_junc == states[GRPHWLK_USEPATH], "%zu vs %zu",
              (size_t)s->total_junc, (size_t)states[GRPHWLK_USEPATH]);

  status(PREFIX"Junctions:");
  _print_grphwlk_state("Paths resolved", states[GRPHWLK_USEPATH], njunc);
}
Пример #15
0
// Entry point for the `links` command.
//
// Reads a link file one kmer at a time, builds a LinkTree for each kmer and,
// depending on the options given:
//   --list:  writes a CSV (header "SeqLen,Covg") describing the links
//   --plot:  writes a DOT plot of a single kmer's link tree
//   --clean: removes links with coverage below the cutoff (requires an
//            output link file) and saves the remaining links, gzipped,
//            with an updated JSON header
//   -T / -H: accumulates coverage histograms, then writes a suggested
//            cleaning threshold and/or the histogram itself
//
// Only single-colour link files are supported.
//
// @param argc  number of command line arguments
// @param argv  command line arguments; exactly one positional argument
//              (the input link file) is expected after the options
// @return EXIT_SUCCESS; fatal errors are reported via die() /
//         cmd_print_usage() rather than by return value
int ctx_links(int argc, char **argv)
{
  size_t limit = 0;
  const char *link_out_path = NULL, *csv_out_path = NULL, *plot_out_path = NULL;
  const char *thresh_path = NULL, *hist_path = NULL;

  size_t hist_distsize = 0, hist_covgsize = 0;
  size_t cutoff = 0;
  bool clean = false;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!link_out_path, cmd); link_out_path = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'l': cmd_check(!csv_out_path, cmd); csv_out_path = optarg; break;
      case 'c': cmd_check(!cutoff, cmd); cutoff = cmd_size(cmd, optarg); clean = true; break;
      case 'L': cmd_check(!limit, cmd); limit = cmd_size(cmd, optarg); break;
      case 'P': cmd_check(!plot_out_path, cmd); plot_out_path = optarg; break;
      case 'T': cmd_check(!thresh_path, cmd); thresh_path = optarg; break;
      case 'H': cmd_check(!hist_path, cmd); hist_path = optarg; break;
      case 'C': cmd_check(!hist_covgsize, cmd); hist_covgsize = cmd_size(cmd, optarg); break;
      case 'D': cmd_check(!hist_distsize, cmd); hist_distsize = cmd_size(cmd, optarg); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" links -h` for help. Bad option: %s", argv[optind-1]);
      default: ctx_assert2(0, "shouldn't reach here: %c", c);
    }
  }

  if(hist_distsize && !hist_path) cmd_print_usage("--max-dist without --covg-hist");
  if(hist_covgsize && !hist_path) cmd_print_usage("--max-covg without --covg-hist");

  // Defaults
  if(!hist_distsize) hist_distsize = DEFAULT_MAX_DIST;
  if(!hist_covgsize) hist_covgsize = DEFAULT_MAX_COVG;

  if(optind + 1 != argc) cmd_print_usage("Wrong number of arguments");
  const char *ctp_path = argv[optind];

  bool list = (csv_out_path != NULL);
  bool plot = (plot_out_path != NULL);
  bool save = (link_out_path != NULL);
  bool hist_covg = (thresh_path != NULL || hist_path != NULL);

  // With a limit, plot the last kmer that will be read; otherwise the first
  size_t plot_kmer_idx = (limit == 0 ? 0 : limit - 1);

  if(clean && !save)
    cmd_print_usage("Need to give --out <out.ctp.gz> with --clean");

  if(!save && !list && !plot && !hist_covg)
    cmd_print_usage("Please specify one of --plot, --list or --clean");

  if(link_out_path && hist_covg && strcmp(link_out_path,"-") == 0)
    cmd_print_usage("Outputing both cleaning threshold (-T) and links (-o) to STDOUT!");

  // Output file handles (opened further down, once the input has been read)
  FILE *list_fh = NULL, *plot_fh = NULL, *link_tmp_fh = NULL;
  FILE *thresh_fh = NULL, *hist_fh = NULL;
  gzFile link_gz = NULL;

  // Check file don't exist or that we can overwrite
  // Will ignore if path is null
  bool err = false;
  err |= futil_check_outfile(csv_out_path);
  err |= futil_check_outfile(plot_out_path);
  err |= futil_check_outfile(link_out_path);
  err |= futil_check_outfile(thresh_path);
  err |= futil_check_outfile(hist_path);
  if(err) die("Use -f,--force to overwrite files");

  StrBuf link_tmp_path;
  strbuf_alloc(&link_tmp_path, 1024);

  // Open input file
  GPathReader ctpin;
  memset(&ctpin, 0, sizeof(ctpin));
  gpath_reader_open(&ctpin, ctp_path);

  size_t ncols = file_filter_into_ncols(&ctpin.fltr);
  size_t kmer_size = gpath_reader_get_kmer_size(&ctpin);
  // Work on a copy of the input header so it can be updated for the output
  cJSON *newhdr = cJSON_Duplicate(ctpin.json, 1);

  if(ncols != 1) die("Can only clean a single colour at a time. Sorry.");

  // 2D histogram indexed as hists[dist][covg]; only allocated when needed
  uint64_t (*hists)[hist_covgsize] = NULL;

  if(hist_covg) {
    hists = ctx_calloc(hist_distsize, sizeof(hists[0]));
  }

  if(hist_path && (hist_fh = futil_fopen_create(hist_path, "w")) == NULL)
      die("Cannot open file: %s", hist_path);

  if(thresh_path && (thresh_fh = futil_fopen_create(thresh_path, "w")) == NULL)
      die("Cannot open file: %s", thresh_path);

  if(limit)
    status("Limiting to the first %zu kmers", limit);

  if(clean)
  {
    timestamp();
    message(" Cleaning coverage below %zu", cutoff);
    message("\n");
  }

  if(save)
  {
    // Check we can find the fields we need
    cJSON *links_json  = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path);
    cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path);
    cJSON *nlinks_json = json_hdr_get(links_json, "num_paths",            cJSON_Number, link_out_path);
    cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes",           cJSON_Number, link_out_path);
    if(!nkmers_json || !nlinks_json || !nbytes_json)
      die("Cannot find required header entries");

    // Create a random temporary file
    link_tmp_fh = create_tmp_file(&link_tmp_path, link_out_path);

    status("Saving output to: %s", link_out_path);
    status("Temporary output: %s", link_tmp_path.b);

    // Open output file
    if((link_gz = futil_gzopen_create(link_out_path, "w")) == NULL)
      die("Cannot open output link file: %s", link_out_path);

    // Need to open output file first so we can get absolute path
    // Update the header to include this command
    json_hdr_add_curr_cmd(newhdr, link_out_path);
  }

  if(list)
  {
    status("Listing to %s", csv_out_path);
    if((list_fh = futil_fopen_create(csv_out_path, "w")) == NULL)
      die("Cannot open output CSV file %s", csv_out_path);

    // Print csv header
    fprintf(list_fh, "SeqLen,Covg\n");
  }

  if(plot)
  {
    status("Plotting kmer %zu to %s", plot_kmer_idx, plot_out_path);
    if((plot_fh = futil_fopen_create(plot_out_path, "w")) == NULL)
      die("Cannot open output .dot file %s", plot_out_path);
  }

  SizeBuffer countbuf, jposbuf;
  size_buf_alloc(&countbuf, 16);
  size_buf_alloc(&jposbuf, 1024);

  StrBuf kmerbuf, juncsbuf, seqbuf, outbuf;
  strbuf_alloc(&kmerbuf, 1024);
  strbuf_alloc(&juncsbuf, 1024);
  strbuf_alloc(&seqbuf, 1024);
  strbuf_alloc(&outbuf, 1024);

  bool link_fw;
  size_t njuncs;
  size_t knum, nlinks, num_links_exp = 0;

  LinkTree ltree;
  ltree_alloc(&ltree, kmer_size);

  LinkTreeStats tree_stats;
  memset(&tree_stats, 0, sizeof(tree_stats));
  size_t init_num_links = 0, num_links = 0;

  // Process one kmer (and all of its links) per iteration
  for(knum = 0; !limit || knum < limit; knum++)
  {
    ltree_reset(&ltree);
    if(!gpath_reader_read_kmer(&ctpin, &kmerbuf, &num_links_exp)) break;
    ctx_assert2(kmerbuf.end == kmer_size, "Kmer incorrect length %zu != %zu",
                kmerbuf.end, kmer_size);
    // status("kmer: %s", kmerbuf.b);

    for(nlinks = 0;
        gpath_reader_read_link(&ctpin, &link_fw, &njuncs,
                               &countbuf, &juncsbuf,
                               &seqbuf, &jposbuf);
        nlinks++)
    {
      // Only one colour is loaded (checked above), so use the first count
      ltree_add(&ltree, link_fw, countbuf.b[0], jposbuf.b,
                juncsbuf.b, seqbuf.b);
    }

    if(nlinks != num_links_exp)
      warn("Links count mismatch %zu != %zu", nlinks, num_links_exp);

    // Histograms are updated before cleaning
    if(hist_covg)
    {
      ltree_update_covg_hists(&ltree, (uint64_t*)hists,
                              hist_distsize, hist_covgsize);
    }
    if(clean)
    {
      ltree_clean(&ltree, cutoff);
    }

    // Accumulate statistics
    // tree_stats is carried across kmers, so the change in num_links is the
    // number of links attributed to this kmer
    ltree_get_stats(&ltree, &tree_stats);
    num_links = tree_stats.num_links - init_num_links;
    init_num_links = tree_stats.num_links;

    if(list)
    {
      ltree_write_list(&ltree, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, list_fh) != outbuf.end)
        die("Cannot write CSV file to: %s", csv_out_path);
      strbuf_reset(&outbuf);
    }
    if(save && num_links)
    {
      // Links go to the temporary file first; the header is only written
      // once the final counts are known
      ltree_write_ctp(&ltree, kmerbuf.b, num_links, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, link_tmp_fh) != outbuf.end)
        die("Cannot write ctp file to: %s", link_tmp_path.b);
      strbuf_reset(&outbuf);
    }
    if(plot && knum == plot_kmer_idx)
    {
      status("Plotting tree...");
      ltree_write_dot(&ltree, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, plot_fh) != outbuf.end)
        die("Cannot write plot DOT file to: %s", plot_out_path);
      strbuf_reset(&outbuf);
    }
  }

  gpath_reader_close(&ctpin);

  cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path);
  cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path);
  cJSON *nlinks_json = json_hdr_get(links_json, "num_paths",            cJSON_Number, link_out_path);
  cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes",           cJSON_Number, link_out_path);

  // These entries are dereferenced below even when not saving, so apply the
  // same check as in the save branch above
  if(!nkmers_json || !nlinks_json || !nbytes_json)
    die("Cannot find required header entries");

  // Cast explicitly so that the argument always matches the %li conversion
  status("Number of kmers with links %li -> %zu", (long)nkmers_json->valueint, tree_stats.num_trees_with_links);
  status("Number of links %li -> %zu", (long)nlinks_json->valueint, tree_stats.num_links);
  status("Number of bytes %li -> %zu", (long)nbytes_json->valueint, tree_stats.num_link_bytes);

  if(save)
  {
    // Update JSON
    nkmers_json->valuedouble = nkmers_json->valueint = tree_stats.num_trees_with_links;
    nlinks_json->valuedouble = nlinks_json->valueint = tree_stats.num_links;
    nbytes_json->valuedouble = nbytes_json->valueint = tree_stats.num_link_bytes;

    char *json_str = cJSON_Print(newhdr);
    if(gzputs(link_gz, json_str) != (int)strlen(json_str))
      die("Cannot write ctp file to: %s", link_out_path);
    free(json_str);

    gzputs(link_gz, "\n\n");
    gzputs(link_gz, ctp_explanation_comment);
    gzputs(link_gz, "\n");

    // Append the links from the temporary file after the header
    fseek(link_tmp_fh, 0, SEEK_SET);
    char *tmp = ctx_malloc(4*ONE_MEGABYTE);
    size_t s;
    while((s = fread(tmp, 1, 4*ONE_MEGABYTE, link_tmp_fh)) > 0) {
      if(gzwrite(link_gz, tmp, s) != (int)s)
        die("Cannot write to output: %s", link_out_path);
    }
    // fread() returns 0 on both EOF and error: do not silently truncate
    if(ferror(link_tmp_fh))
      die("Cannot read temporary file: %s", link_tmp_path.b);
    ctx_free(tmp);

    // gzclose() flushes pending output; it returns Z_OK (0) on success
    if(gzclose(link_gz) != 0)
      die("Cannot write ctp file to: %s", link_out_path);
    fclose(link_tmp_fh);
  }

  // Write histogram to file
  // Row 0 and column 0 are not printed
  if(hist_fh)
  {
    size_t i, j;
    fprintf(hist_fh, "  ");
    for(j = 1; j < hist_covgsize; j++) fprintf(hist_fh, ",covg.%02zu", j);
    fprintf(hist_fh, "\n");
    for(i = 1; i < hist_distsize; i++) {
      fprintf(hist_fh, "dist.%02zu", i);
      for(j = 1; j < hist_covgsize; j++) {
        fprintf(hist_fh, ",%"PRIu64, hists[i][j]);
      }
      fprintf(hist_fh, "\n");
    }
  }

  if(thresh_fh)
  {
    // Use median of first five cutoffs
    print_suggest_cutoff(6, hist_covgsize, hists, thresh_fh);
  }

  // Close histogram / threshold outputs (but never close stdout)
  if(hist_fh && hist_fh != stdout) fclose(hist_fh);
  if(thresh_fh && thresh_fh != stdout) fclose(thresh_fh);

  if(list)
  {
    fclose(list_fh);
  }

  if(plot)
  {
    fclose(plot_fh);
  }

  ctx_free(hists);
  cJSON_Delete(newhdr);
  strbuf_dealloc(&link_tmp_path);
  ltree_dealloc(&ltree);
  size_buf_dealloc(&countbuf);
  size_buf_dealloc(&jposbuf);
  strbuf_dealloc(&kmerbuf);
  strbuf_dealloc(&juncsbuf);
  strbuf_dealloc(&seqbuf);
  strbuf_dealloc(&outbuf);

  return EXIT_SUCCESS;
}
Пример #16
0
// Test a single A->B->C statement for node B (`node`) using the worker's
// graph walker and repeat walker.
//
// Steps:
//  1. Walk away from B (up to wrkr->max_AB_dist nodes in the buffer) to find A
//  2. Reverse complement the buffer so it reads A..B, then either prime the
//     walker along A->B and extend (wrkr->prime_AB), or confirm the A->B
//     traversal with confirm_seq() and extend past B
//  3. Restart from B and check with confirm_seq() whether C is reached
//
// Returns one of the RES_* codes. On a CONFIRM_SHORT result the matching
// ab_fail_state / bc_fail_state counter in `wrkr` is incremented.
static inline
int test_statement_node(dBNode node, ExpABCWorker *wrkr)
{
  const dBGraph *db_graph = wrkr->db_graph;
  dBNodeBuffer *nbuf = &wrkr->nbuf;
  GraphWalker *wlk = &wrkr->gwlk;
  RepeatWalker *rpt = &wrkr->rptwlk;
  const size_t col = wrkr->colour;
  const size_t max_nodes = wrkr->max_AB_dist;

  // Buffer starts out holding only B
  db_node_buf_reset(nbuf);
  db_node_buf_add(nbuf, node);

  // Walk from B to find A
  graph_walker_setup(wlk, true, col, col, db_graph);
  graph_walker_start(wlk, nbuf->b[0]);

  for(;;) {
    if(!graph_walker_next(wlk)) break;
    if(nbuf->len >= max_nodes) break;
    if(!rpt_walker_attempt_traverse(rpt, wlk)) {
      reset(wlk,rpt,nbuf); return RES_LOST_IN_RPT;
    }
    db_node_buf_add(nbuf, wlk->node);
  }

  reset(wlk,rpt,nbuf);

  // Could not step away from B at all
  if(nbuf->len == 1) return RES_NO_TRAVERSAL;

  // Traverse A->B: after reversing, B is the last node in the buffer
  db_nodes_reverse_complement(nbuf->b, nbuf->len);
  const size_t b_pos = nbuf->len - 1;

  if(wrkr->prime_AB)
  {
    // Prime A->B without attempting to cross
    graph_walker_prime(wlk, nbuf->b, nbuf->len, nbuf->len, true);

    for(;;) {
      if(!graph_walker_next(wlk)) break;
      if(!rpt_walker_attempt_traverse(rpt, wlk)) {
        reset(wlk,rpt,nbuf); return RES_LOST_IN_RPT;
      }
      db_node_buf_add(nbuf, wlk->node);
    }
  }
  else
  {
    // Attempt to traverse A->B then extend past B
    const int ab = confirm_seq(0, true, wlk, rpt, nbuf, col, db_graph);

    if(ab == CONFIRM_REPEAT) return RES_LOST_IN_RPT;

    if(ab == CONFIRM_OVERSHOT) {
      // Not expected here; treated as a wrong traversal if the assert is
      // compiled out
      ctx_assert2(0,"Can't 'overshoot' when extending");
      return RES_AB_WRONG;
    }

    if(ab == CONFIRM_WRONG) return RES_AB_WRONG;

    if(ab == CONFIRM_SHORT) {
      if(wrkr->print_failed_contigs)
        print_failed(node, nbuf, db_graph, true, wrkr->prime_AB);
      wrkr->ab_fail_state[wlk->last_step.status]++;
      return RES_AB_FAILED;
    }
    // Any other result: carry on to the B->C test
  }

  reset(wlk,rpt,nbuf);

  // Couldn't get past B
  if(nbuf->len == b_pos+1) return RES_NO_TRAVERSAL;

  // Last node is now C
  // Walk from B... record whether or not we reach C
  ctx_assert(db_nodes_are_equal(nbuf->b[b_pos], db_node_reverse(node)));

  const int bc = confirm_seq(b_pos, false, wlk, rpt, nbuf, col, db_graph);

  if(bc == CONFIRM_REPEAT)   return RES_LOST_IN_RPT;
  if(bc == CONFIRM_OVERSHOT) return RES_BC_OVERSHOT;
  if(bc == CONFIRM_WRONG)    return RES_BC_WRONG;

  if(bc == CONFIRM_SHORT) {
    if(wrkr->print_failed_contigs)
      print_failed(node, nbuf, db_graph, false, wrkr->prime_AB);
    wrkr->bc_fail_state[wlk->last_step.status]++;
    return RES_BC_FAILED;
  }

  if(bc == CONFIRM_SUCCESS) return RES_ABC_SUCCESS;

  die("Shouldn't reach here: r=%i", bc);
  return -1;
}