Esempio n. 1
0
/**
 * Worker routine that writes out paths for one slice of the hash table.
 *
 * @param arg  pointer to a GPathSaver; its threadid / nthreads select the
 *             slice handed to HASH_ITERATE_PART, and its gzout / outlock are
 *             passed on for output.
 *
 * Scratch buffers are allocated on entry and released before returning.
 * Anything still held in the string buffer after the iteration is flushed
 * with _gpath_save_flush().
 */
static void gpath_save_thread(void *arg)
{
  GPathSaver *saver = (GPathSaver*)arg;
  const dBGraph *db_graph = saver->db_graph;

  // Per-call scratch: path subset and output text
  GPathSubset path_subset;
  StrBuf out_text;

  gpath_subset_alloc(&path_subset);
  gpath_subset_init(&path_subset, &saver->db_graph->gpstore.gpset);
  strbuf_alloc(&out_text, 2 * DEFAULT_IO_BUFSIZE);

  dBNodeBuffer node_scratch;
  SizeBuffer jpos_scratch;
  db_node_buf_alloc(&node_scratch, 1024);
  size_buf_alloc(&jpos_scratch, 256);

  // The node and position buffers are only passed on when save_seq is set;
  // otherwise the per-node callback receives NULL for both
  dBNodeBuffer *seq_nodes = NULL;
  SizeBuffer *seq_jpos = NULL;
  if(saver->save_seq) {
    seq_nodes = &node_scratch;
    seq_jpos = &jpos_scratch;
  }

  HASH_ITERATE_PART(&db_graph->ht, saver->threadid, saver->nthreads,
                    _gpath_gzsave_node,
                    &out_text, &path_subset,
                    seq_nodes, seq_jpos,
                    saver->gzout, saver->outlock,
                    db_graph);

  // Write out whatever text is still buffered
  _gpath_save_flush(saver->gzout, &out_text, saver->outlock);

  db_node_buf_dealloc(&node_scratch);
  size_buf_dealloc(&jpos_scratch);
  gpath_subset_dealloc(&path_subset);
  strbuf_dealloc(&out_text);
}
Esempio n. 2
0
char *
strbuf_double(idnconv_strbuf_t *buf) {
	/*
	 * Request a buffer twice as large as BUF's current size.
	 * The result of strbuf_alloc() is handed straight back to the caller.
	 */
	char *grown = strbuf_alloc(buf, 2 * buf->size);

	return (grown);
}
Esempio n. 3
0
// Load `num_rows` rows of comma-separated values from the CSV file at `path`.
//
// The first line of the file is only used to count commas; that count is the
// number of types and is the return value. Each following line is chomped and
// handed to load_comma_bool_line(), which fills one row of `num_types` chars;
// every row is then NUL-terminated.
//
// On return *bools_ptr points to an array of `num_rows` row pointers. All rows
// live in one contiguous allocation starting at (*bools_ptr)[0].
// Calls die() if the file cannot be opened, is empty, or has too few rows.
int load_hla_csv(const char *path, char ***bools_ptr, int num_rows)
{
  assert(num_rows > 0);

  StrBuf linebuf;
  strbuf_alloc(&linebuf, 1024);

  FILE *csv = fopen(path, "r");
  if(csv == NULL) die("Cannot open file: %s.", path);

  // Header line: count the commas
  if(strbuf_readline(&linebuf, csv) == 0) die("Empty CSV file: %s.", path);
  int num_types = count_char(linebuf.b, ',');
  int row_width = num_types + 1; // one extra char per row for the NUL

  char **rows = my_malloc(sizeof(char*) * num_rows, __FILE__, __LINE__);
  char *cells = my_malloc(sizeof(char) * num_rows * row_width, __FILE__, __LINE__);
  printf("Number of rows: %i.\n",num_rows);

  int nread = 0;
  while(nread < num_rows && strbuf_reset_readline(&linebuf, csv))
  {
    char *row = cells + nread * row_width;
    strbuf_chomp(&linebuf);
    rows[nread] = row;
    load_comma_bool_line(linebuf.b, row, num_types);
    row[num_types] = '\0';
    nread++;
  }

  if(nread < num_rows) die("Not enough rows in CSV file: %s.", path);

  fclose(csv);
  strbuf_dealloc(&linebuf);

  *bools_ptr = rows;
  return num_types;
}
Esempio n. 4
0
char *
strbuf_copy(idnconv_strbuf_t *buf, const char *str) {
	/*
	 * Replace the contents of BUF with a copy of STR.
	 * Returns BUF's string, or NULL if strbuf_alloc() fails.
	 */
	size_t	nbytes = strlen(str) + 1;	/* includes the terminator */

	if (strbuf_alloc(buf, nbytes) == NULL)
		return (NULL);
	memcpy(buf->str, str, nbytes);
	return (buf->str);
}
Esempio n. 5
0
/**
 * Allocate a CallDecomp and set up its members.
 *
 * @param vcffh   VCF file handle; the pointer is stored, not copied
 * @param vcfhdr  VCF header; the pointer is stored, not copied
 * @return newly allocated CallDecomp (zeroed by ctx_calloc before setup)
 */
CallDecomp* call_decomp_init(htsFile *vcffh, bcf_hdr_t *vcfhdr)
{
  CallDecomp *decomp = ctx_calloc(1, sizeof(*decomp));

  // Caller-supplied output handles
  decomp->vcffh = vcffh;
  decomp->vcfhdr = vcfhdr;

  // Alignment state
  decomp->nw_aligner = needleman_wunsch_new();
  decomp->aln = alignment_create(1024);
  decomp->scoring = ctx_calloc(1, sizeof(*decomp->scoring));
  scoring_system_default(decomp->scoring);

  // Record and text scratch space
  decomp->v = bcf_init();
  strbuf_alloc(&decomp->sbuf, 256);

  return decomp;
}
Esempio n. 6
0
char *
strbuf_append(idnconv_strbuf_t *buf, const char *str) {
	/*
	 * Add STR after the current contents of BUF.
	 * MARGIN extra bytes are requested on top of what the joined
	 * string needs.  Returns the result of strbuf_alloc(), which is
	 * NULL on failure (BUF's contents are then left as they were).
	 */
	size_t	used = strlen(buf->str);
	size_t	extra = strlen(str);
	char *grown;
#define MARGIN	50

	grown = strbuf_alloc(buf, used + extra + 1 + MARGIN);
	if (grown == NULL)
		return (NULL);
	memcpy(buf->str + used, str, extra + 1);
	return (grown);
}
Esempio n. 7
0
/**
 * Append a "##mccortex_<key>=<...>" line describing a command to a VCF header.
 *
 * @param hdr      header to append to (via bcf_hdr_append)
 * @param cmdline  command line text written into the cmd="" field
 * @param cwd      working directory text written into the cwd="" field
 *
 * The key comes from hex_rand_str(), prev is always written as "NULL", and
 * the datetime field is the current local time.
 */
void vcf_misc_hdr_add_cmd(bcf_hdr_t *hdr, const char *cmdline, const char *cwd)
{
  char key[8], stamp[100];

  // Timestamp in local time
  time_t now = time(NULL);
  strftime(stamp, sizeof(stamp), "%Y%m%d-%H:%M:%S", localtime(&now));

  StrBuf hdrline;
  strbuf_alloc(&hdrline, 1024);

  const char *keyhex = hex_rand_str(key, sizeof(key));
  strbuf_sprintf(&hdrline, "##mccortex_%s=<prev=\"NULL\",cmd=\"%s\",cwd=\"%s\","
                           "datetime=\"%s\",version="CTX_VERSION">\n",
                 keyhex, cmdline, cwd, stamp);

  bcf_hdr_append(hdr, hdrline.b);
  strbuf_dealloc(&hdrline);
}
Esempio n. 8
0
/**
 * Read and parse the JSON header at the start of the input stream.
 *
 * @param gzin  open gzip stream to read the header from
 * @return parsed cJSON tree
 *
 * Uses `input_path` for error messages and stores the header's kmer size in
 * `kmer_size`; both are declared outside this function. Calls die() if the
 * header text does not parse as JSON.
 */
static cJSON* read_input_header(gzFile gzin)
{
  StrBuf hdrtext;
  strbuf_alloc(&hdrtext, 1024);
  json_hdr_read(NULL, gzin, input_path, &hdrtext);

  cJSON *hdr = cJSON_Parse(hdrtext.b);
  if(hdr == NULL) die("Invalid JSON header: %s", input_path);

  // Make sure this build can handle the file's kmer size
  kmer_size = json_hdr_get_kmer_size(hdr, input_path);
  db_graph_check_kmer_size(kmer_size, input_path);

  strbuf_dealloc(&hdrtext);
  return hdr;
}
Esempio n. 9
0
/**
 * Unit test for the graph crawler.
 *
 * Builds a three-colour graph (kmer size 11) from three sequences that share
 * their first kmers, crawls from the first kmer of sequence 0 through its
 * successor, and checks that two paths are found, each spelling one of the
 * input sequences minus its first base.
 */
void test_graph_crawler()
{
  test_status("Testing graph crawler...");

  // Construct a 3 colour graph with kmer-size=11
  dBGraph graph;
  const size_t kmer_size = 11, ncols = 3;

  db_graph_alloc(&graph, kmer_size, ncols, 1, 2048,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS);

  char graphseq[3][77] =
//           <               X                 X              X...............
{"GTTCCAGAGCGGAGGTCTCCCAACAACATGGTATAAGTTGTCTAGCCCCGGTTCGCGCGGGTACTTCTTACAGCGC",
 "GTTCCAGAGCGGAGGTCTCCCAACAACTTGGTATAAGTTGTCTAGTCCCGGTTCGCGCGGCATTTCAGCATTGTTA",
 "GTTCCAGAGCGCGACAGAGTGCATATCACGCTAAGCACAGCCCTCTTCTATCTGCTTTTAAATGGATCAATAATCG"};

  // One sequence per colour
  size_t col;
  for(col = 0; col < ncols; col++)
    build_graph_from_str_mt(&graph, col, graphseq[col], strlen(graphseq[col]));

  // Crawl graph
  GraphCrawler crawler;
  graph_crawler_alloc(&crawler, &graph);

  // Start node and the node one base further along sequence 0
  dBNode node = db_graph_find_str(&graph, graphseq[0]);
  dBNode next_node = db_graph_find_str(&graph, graphseq[0]+1);
  TASSERT(node.key != HASH_NOT_FOUND);
  TASSERT(next_node.key != HASH_NOT_FOUND);

  BinaryKmer bkey = db_node_get_bkmer(&graph, node.key);
  Edges edges = db_node_get_edges(&graph, node.key, 0);

  dBNode next_nodes[4];
  Nucleotide next_nucs[4];
  size_t i, p, num_next, next_idx;

  num_next = db_graph_next_nodes(&graph, bkey, node.orient, edges,
                                 next_nodes, next_nucs);

  // Locate next_node among the successors of node
  for(next_idx = 0; next_idx < num_next; next_idx++) {
    if(db_nodes_are_equal(next_nodes[next_idx],next_node)) break;
  }

  TASSERT(next_idx < num_next && db_nodes_are_equal(next_nodes[next_idx],next_node));

  // Crawl in all colours
  graph_crawler_fetch(&crawler, node, next_nodes, next_idx, num_next,
                      NULL, graph.num_of_cols, NULL, NULL, NULL);

  TASSERT2(crawler.num_paths == 2, "crawler.num_paths: %u", crawler.num_paths);

  // Fetch paths
  dBNodeBuffer nbuf;
  db_node_buf_alloc(&nbuf, 16);
  StrBuf sbuf;
  strbuf_alloc(&sbuf, 128);

  for(p = 0; p < crawler.num_paths; p++) {
    db_node_buf_reset(&nbuf);
    graph_crawler_get_path_nodes(&crawler, p, &nbuf);
    strbuf_ensure_capacity(&sbuf, nbuf.len+graph.kmer_size);
    sbuf.end = db_nodes_to_str(nbuf.b, nbuf.len, &graph, sbuf.b);

    // The path should spell one of the inputs with its first base dropped
    i = 0;
    while(i < 3 && strcmp(graphseq[i]+1,sbuf.b) != 0) i++;

    TASSERT2(i < 3, "seq: %s", sbuf.b);
    TASSERT2(sbuf.end == 75, "sbuf.end: %zu", sbuf.end);
    TASSERT2(nbuf.len == 65, "nbuf.len: %zu", nbuf.len);
  }

  strbuf_dealloc(&sbuf);
  db_node_buf_dealloc(&nbuf);

  graph_crawler_dealloc(&crawler);

  db_graph_dealloc(&graph);
}
Esempio n. 10
0
/**
 * Allocate the members of a GraphInfo and put it into its initial state.
 *
 * @param ginfo  structure to set up; its sample name buffer and cleaning
 *               record are allocated here, then graph_info_init() is applied
 */
void graph_info_alloc(GraphInfo *ginfo)
{
  // Starting capacity requested for the sample name buffer
  enum { SAMPLE_NAME_INIT_CAP = 256 };

  strbuf_alloc(&ginfo->sample_name, SAMPLE_NAME_INIT_CAP);
  error_cleaning_alloc(&ginfo->cleaning);

  graph_info_init(ginfo);
}
Esempio n. 11
0
/**
 * Allocate the members of an ErrorCleaning record and initialise it.
 *
 * @param ec  record to set up; its intersection name buffer is allocated
 *            here, then error_cleaning_init() is applied
 */
static void error_cleaning_alloc(ErrorCleaning *ec)
{
  // Starting capacity requested for the intersection name buffer
  enum { ISEC_NAME_INIT_CAP = 256 };

  strbuf_alloc(&ec->intersection_name, ISEC_NAME_INIT_CAP);

  error_cleaning_init(ec);
}
Esempio n. 12
0
/**
 * Read every call entry from the input stream and align the mapped ones.
 *
 * @param gzin  open gzip stream of call entries
 * @param fout  output stream passed on to align_entry()
 *
 * Relies on names declared outside this function: `input_bubble_format`
 * selects between the bubble and the breakpoint branch, `input_path` is
 * passed to call_file_read(), `num_samples` sizes the genotype array and
 * `num_entries_read` is incremented once per entry read.
 */
static void parse_entries(gzFile gzin, FILE *fout)
{
  CallFileEntry centry;
  call_file_entry_alloc(&centry);

  ChromPosBuffer chrposbuf;
  chrompos_buf_alloc(&chrposbuf, 32);

  // tmpbuf is scratch for align_entry(); flank3pbuf receives the 3' flank
  // built by bubble_trim_alleles() in the bubble branch
  StrBuf tmpbuf, flank3pbuf;
  strbuf_alloc(&tmpbuf, 1024);
  strbuf_alloc(&flank3pbuf, 1024);

  const char *flank5p, *flank3p;
  size_t flank5p_len, flank3p_len;
  size_t cpy_flnk_5p, cpy_flnk_3p;

  const read_t *chrom = NULL;
  size_t ref_start = 0, ref_end = 0;
  bool mapped = false, fw_strand = false;

  const char **genotypes = NULL;

  // Genotype array is only allocated for non-bubble input; in bubble mode
  // align_entry() is handed NULL
  if(!input_bubble_format)
    genotypes = ctx_calloc(num_samples, sizeof(char*));

  for(; call_file_read(gzin, input_path, &centry); num_entries_read++)
  {
    // An entry must have an even number of lines and at least six
    size_t nlines = call_file_num_lines(&centry);
    ctx_assert2(!(nlines&1) && nlines >= 6, "Too few lines: %zu", nlines);

    // Line 1 of the entry is the 5' flank
    flank5p = call_file_get_line(&centry,1);
    flank5p_len = call_file_line_len(&centry,1);
    cpy_flnk_5p = cpy_flnk_3p = 0;

    // Read a corresponding SAM entry
    if(input_bubble_format)
    {
      // Trim down alleles, add to 3p flank
      bubble_trim_alleles(&centry, &flank3pbuf);
      flank3p = flank3pbuf.b;
      flank3p_len = flank3pbuf.end;

      mapped = sam_fetch_coords(&centry, flank5p, flank5p_len, flank3p, flank3p_len,
                                &cpy_flnk_5p, &cpy_flnk_3p,
                                &chrom, &ref_start, &ref_end, &fw_strand);
    }
    else {
      // Line 3 of the entry is the 3' flank
      flank3p = call_file_get_line(&centry, 3);
      flank3p_len = call_file_line_len(&centry, 3);

      mapped = brkpnt_fetch_coords(&centry, &chrposbuf,
                                   &chrom, &ref_start, &ref_end, &fw_strand,
                                   &cpy_flnk_5p, &cpy_flnk_3p);
    }

    // Entries without coordinates are skipped
    if(mapped)
    {
      // Get call id
      const char *hdrline = call_file_get_line(&centry, 0);
      char callid[100];
      int r = get_callid_str(hdrline, input_bubble_format, callid, sizeof(callid));
      if(r == -1) die("Poorly formatted: %s", hdrline);
      if(r == -2) die("Call id string is too long: %s", hdrline);

      align_entry(&centry, callid, flank5p, flank5p_len, flank3p, flank3p_len,
                  cpy_flnk_5p, cpy_flnk_3p,
                  chrom, ref_start, ref_end, fw_strand,
                  &tmpbuf, genotypes,
                  fout);
    }
  }

  // genotypes is NULL in bubble mode
  // NOTE(review): presumably ctx_free(NULL) is a no-op like free() - confirm
  ctx_free(genotypes);
  call_file_entry_dealloc(&centry);
  chrompos_buf_dealloc(&chrposbuf);
  strbuf_dealloc(&tmpbuf);
  strbuf_dealloc(&flank3pbuf);
}
Esempio n. 13
0
/**
 * Filter VCF records so that only those agreeing with a reference are kept.
 *
 * Arguments: [-s] <input.vcf[.gz]> [ref files ...]
 *   -s  if the ALT allele (rather than REF) matches the reference, swap the
 *       REF and ALT columns and print the record anyway.
 *
 * Reference sequences are loaded with load_reads() from every path after the
 * input path, or from "-" when no reference path is given, and indexed by
 * name. Header lines (starting with '#') are echoed to stdout. A data line is
 * only considered when both alleles have length 1 or REF and ALT start with
 * the same character; it is printed when REF matches the reference at POS-1
 * (case-insensitive), or, with -s, when ALT does.
 *
 * Returns 0 on completion; fatal problems go through die().
 */
int main(int argc, char **argv)
{
  // compiler complains about unused functions without these lines
  (void)kh_clear_ghash;
  (void)kh_del_ghash;

  if(argc < 2) print_usage(usage, NULL);

  char swap_alleles = 0;

  int c;
  while((c = getopt(argc, argv, "s")) >= 0) {
    switch (c) {
      case 's': swap_alleles = 1; break;
      default: die("Unknown option: %c", c);
    }
  }

  if(optind == argc) print_usage(usage, "Not enough arguments");

  char *inputpath = argv[optind];
  char **refpaths = argv + optind + 1;
  size_t num_refs = argc - optind - 1;

  gzFile gzin = gzopen(inputpath, "r");
  if(gzin == NULL) die("Cannot read file: %s", inputpath);

  size_t i, nchroms = 0, capacity = 1024;
  khash_t(ghash) *genome = kh_init(ghash);
  read_t *reads = malloc(capacity * sizeof(*reads)), *r;
  int hret;
  khiter_t k;

  // load_reads() is handed this array, so it must exist before the first call
  if(reads == NULL) die("Out of memory");

  for(i = 0; i < num_refs; i++) {
    fprintf(stderr, "Loading %s\n", refpaths[i]);
    load_reads(refpaths[i], &reads, &capacity, &nchroms);
  }

  if(num_refs == 0) {
    fprintf(stderr, "Loading from stdin\n");
    load_reads("-", &reads, &capacity, &nchroms);
  }

  if(nchroms == 0) die("No chromosomes loaded");

  // Index the loaded sequences by name; the first of any duplicates wins
  for(i = 0; i < nchroms; i++) {
    r = reads + i;
    fprintf(stderr, "Loaded: '%s'\n", r->name.b);
    k = kh_put(ghash, genome, r->name.b, &hret);
    if(hret == 0) warn("Duplicate read name (taking first): %s", r->name.b);
    else kh_value(genome, k) = r;
  }

  // Now read VCF
  StrBuf line;
  strbuf_alloc(&line, 1024);
  char *fields[9];
  char *chr;
  int pos, reflen, altlen, chrlen;

  while(strbuf_reset_gzreadline(&line, gzin) > 0)
  {
    if(line.b[0] == '#') fputs(line.b, stdout);
    else
    {
      strbuf_chomp(&line);
      vcf_columns(line.b, fields);

      // Temporarily terminate the CHROM and POS columns so they can be used
      // as C strings; the tabs are put back below
      fields[1][-1] = fields[2][-1] = '\0';
      chr = line.b;
      chrlen = (int)(fields[1] - 1 - line.b);
      pos = atoi(fields[1])-1;
      k = kh_get(ghash, genome, chr);

      // Only read the stored value when the lookup succeeded: on a miss
      // kh_get() returns kh_end(), which is not a valid bucket to read from
      r = (k == kh_end(genome)) ? NULL : kh_value(genome, k);

      fields[1][-1] = fields[2][-1] = '\t';
      reflen = fields[4] - fields[3] - 1;
      altlen = fields[5] - fields[4] - 1;

      // The tab after CHROM has been restored, so bound the name explicitly
      // instead of printing the whole line
      if(k == kh_end(genome)) warn("Cannot find chrom: %.*s", chrlen, chr);
      else if(pos < 0) warn("Bad line: %s\n", line.b);
      else if((reflen == 1 && altlen == 1) || fields[3][0] == fields[4][0])
      {
        if((unsigned)pos + reflen <= r->seq.end &&
           strncasecmp(r->seq.b+pos,fields[3],reflen) == 0)
        {
          // REF agrees with the reference: print unchanged
          fputs(line.b, stdout);
          fputc('\n', stdout);
        }
        else if(swap_alleles && (unsigned)pos + altlen <= r->seq.end &&
                strncasecmp(r->seq.b+pos,fields[4],altlen) == 0)
        {
          // swap alleles in place: REF and ALT are adjacent columns, so the
          // combined width (reflen + 1 + altlen) is unchanged
          char tmp[altlen], *ref = fields[3], *alt = fields[4];
          memcpy(tmp, alt, altlen);
          memmove(ref+altlen+1, ref, reflen);
          memcpy(ref, tmp, altlen);
          ref[altlen] = '\t';
          fputs(line.b, stdout);
          fputc('\n', stdout);
        }
      }
    }
  }

  kh_destroy(ghash, genome);
  strbuf_dealloc(&line);
  gzclose(gzin);

  for(i = 0; i < nchroms; i++) seq_read_dealloc(reads+i);
  free(reads);

  fprintf(stderr, " Done.\n");

  return 0;
}
Esempio n. 14
0
/**
 * Create an array of bubble callers, one per thread.
 *
 * @param num_callers  number of callers to create (must be > 0)
 * @param prefs        calling preferences, copied into every caller
 * @param gzout        output stream shared by all callers
 * @param db_graph     graph shared by all callers
 * @return array of `num_callers` callers
 *
 * Every caller points at the same output mutex and the same bubble counter,
 * both allocated here. Each caller gets its own buffers, walkers and cache.
 */
BubbleCaller* bubble_callers_new(size_t num_callers,
                                 BubbleCallingPrefs prefs,
                                 gzFile gzout,
                                 const dBGraph *db_graph)
{
  ctx_assert(num_callers > 0);

  // Max usage is 4 * max_allele_len * cols
  size_t path_cap = MAX2(prefs.max_flank_len, prefs.max_allele_len);

  BubbleCaller *callers = ctx_malloc(num_callers * sizeof(BubbleCaller));

  // State shared between all callers
  pthread_mutex_t *shared_lock = ctx_malloc(sizeof(pthread_mutex_t));
  if(pthread_mutex_init(shared_lock, NULL) != 0) die("mutex init failed");

  size_t *shared_count = ctx_calloc(1, sizeof(size_t));

  size_t tid;
  for(tid = 0; tid < num_callers; tid++)
  {
    BubbleCaller *caller = &callers[tid];

    // Fill in a template and copy it over the uninitialised slot
    BubbleCaller proto = {.threadid = tid, .nthreads = num_callers,
                          .haploid_seen = ctx_calloc(1+prefs.num_haploid, sizeof(bool)),
                          .num_bubbles_ptr = shared_count,
                          .prefs = prefs,
                          .db_graph = db_graph, .gzout = gzout,
                          .out_lock = shared_lock};

    memcpy(caller, &proto, sizeof(BubbleCaller));

    // First two buffers don't actually need to grow
    db_node_buf_alloc(&caller->flank5p, prefs.max_flank_len);
    db_node_buf_alloc(&caller->pathbuf, path_cap);

    graph_walker_alloc(&caller->wlk, db_graph);
    rpt_walker_alloc(&caller->rptwlk, db_graph->ht.capacity, 22); // 4MB

    graph_cache_alloc(&caller->cache, db_graph);
    cache_stepptr_buf_alloc(&caller->spp_forward, 1024);
    cache_stepptr_buf_alloc(&caller->spp_reverse, 1024);
    strbuf_alloc(&caller->output_buf, 2048);
  }

  return callers;
}

/**
 * Release an array of bubble callers.
 *
 * @param callers      array of callers to release
 * @param num_callers  number of entries in `callers` (must be > 0)
 *
 * Per-caller resources are released first; the output mutex and bubble
 * counter are then destroyed/freed once, through the first caller's pointers.
 */
void bubble_callers_destroy(BubbleCaller *callers, size_t num_callers)
{
  ctx_assert(num_callers > 0);

  size_t tid;
  for(tid = 0; tid < num_callers; tid++)
  {
    BubbleCaller *caller = &callers[tid];

    ctx_free(caller->haploid_seen);

    db_node_buf_dealloc(&caller->flank5p);
    db_node_buf_dealloc(&caller->pathbuf);

    rpt_walker_dealloc(&caller->rptwlk);
    graph_walker_dealloc(&caller->wlk);

    graph_cache_dealloc(&caller->cache);
    cache_stepptr_buf_dealloc(&caller->spp_forward);
    cache_stepptr_buf_dealloc(&caller->spp_reverse);
    strbuf_dealloc(&caller->output_buf);
  }

  // Shared state, reached through the first caller
  BubbleCaller *first = &callers[0];
  pthread_mutex_destroy(first->out_lock);
  ctx_free(first->out_lock);
  ctx_free(first->num_bubbles_ptr);
  ctx_free(callers);
}
Esempio n. 15
0
// Reset the graph, load each sequence into its own colour, then run one
// bubble caller and hand it to _call_bubble() together with the flanks and
// alleles supplied by the caller of this helper.
static void test_bubbles(dBGraph *graph, const char **seqs, size_t nseqs,
                         const char *flank5p, const char *flank3p,
                         const char **alleles, size_t nalleles)
{
  db_graph_reset(graph);

  TASSERT(graph->num_of_cols >= nseqs);

  // Sequence number doubles as colour number
  size_t col;
  for(col = 0; col < nseqs; col++) {
    const char *seq = seqs[col];
    build_graph_from_str_mt(graph, col, seq, strlen(seq), false);
  }

  // At least one colour must be marked as used
  if(graph->num_of_cols_used < 1) graph->num_of_cols_used = 1;

  StrBuf textbuf;
  dBNodeBuffer nodebuf;
  strbuf_alloc(&textbuf, 128);
  db_node_buf_alloc(&nodebuf, 128);

  BubbleCallingPrefs prefs = {.max_allele_len = 100, .max_flank_len = 100,
                              .haploid_cols = NULL, .nhaploid_cols = 0,
                              .remove_serial_bubbles = true};

  BubbleCaller *caller = bubble_callers_new(1, &prefs, NULL, graph);

  _call_bubble(caller, flank5p, flank3p, alleles, nalleles, &nodebuf, &textbuf);

  strbuf_dealloc(&textbuf);
  db_node_buf_dealloc(&nodebuf);
  bubble_callers_destroy(caller, 1);
}

/**
 * Unit test for bubble calling.
 *
 * Allocates one graph and reuses it for five test_bubbles() runs: a single
 * substitution between two sequences, then two substitutions across three
 * sequences, each checked with forward flanks and with the reverse-complement
 * flanks listed in the comments below.
 */
void test_bubble_caller()
{
  test_status("Testing bubble calling...");

  // Construct a 3 colour graph with kmer-size=11
  dBGraph graph;
  const size_t kmer_size = 11, ncols = 3;

  // Create graph
  db_graph_alloc(&graph, kmer_size, ncols, 1, 2000,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS);

  // Case 0: two sequences differing at one base (lower-case marks the change)
  //   mutations:                      x
  const char *seqs0[] = {"AGGGATAAAACTCTGTACTGGATCTCCCT",
                         "AGGGATAAAACTCTcTACTGGATCTCCCT"};
  const char flank5p0[] = "AGGGATAAAACTCT";
  const char flank3p0[] = "TACTGGATCTCCCT";
  const char *alleles0[] = {"ATAAAACTCTGTACTGGATCT", "ATAAAACTCTcTACTGGATCT"};

  test_bubbles(&graph, seqs0, 2, flank5p0, flank3p0, alleles0, 2);

  // Case 1: three sequences; the second carries change x, the third change y
  //   mutations:                     x                  y
  const char *seqs1[] = {"CCCGTAGGTAAGGGCGTTAGTGCAAGGCCACATTGGGACACGAGTTGATA",
                         "CCCGTAGGTAAGtGCGTTAGTGCAAGGCCACATTGGGACACGAGTTGATA",
                         "CCCGTAGGTAAGGGCGTTAGTGCAAGGCCACtTTGGGACACGAGTTGATA"};

  // forwards
  // 1a: flanks and alleles around change x
  const char flank5p1a[] = "CCCGTAGGTAAG";
  const char flank3p1a[] = "GCGTTAGTGCAAGGCCAC";
  const char *alleles1a[] = {"CGTAGGTAAGGGCGTTAGTGC", "CGTAGGTAAGtGCGTTAGTGC"};

  // 1b: flanks and alleles around change y
  const char flank5p1b[] = "GCGTTAGTGCAAGGCCAC";
  const char flank3p1b[] = "TTGGGACACGAGTTGATA";
  const char *alleles1b[] = {"GCAAGGCCACATTGGGACACG", "GCAAGGCCACtTTGGGACACG"};

  test_bubbles(&graph, seqs1, 3, flank5p1a, flank3p1a, alleles1a, 2);
  test_bubbles(&graph, seqs1, 3, flank5p1b, flank3p1b, alleles1b, 2);

  // reverse
  // The three lines below are the reverse complements of seqs1
  // mutations:        y                  x
  // TATCAACTCGTGTCCCAATGTGGCCTTGCACTAACGCCCTTACCTACGGG
  // TATCAACTCGTGTCCCAATGTGGCCTTGCACTAACGCaCTTACCTACGGG
  // TATCAACTCGTGTCCCAAaGTGGCCTTGCACTAACGCCCTTACCTACGGG
  //
  // 1c: flanks and alleles around change x on the reverse strand
  const char flank5p1c[] = "GTGGCCTTGCACTAACGC";
  const char flank3p1c[] = "CTTACCTACGGG";
  const char *alleles1c[] = {"GCACTAACGCCCTTACCTACG", "GCACTAACGCaCTTACCTACG"};

  // 1d: flanks and alleles around change y on the reverse strand
  const char flank5p1d[] = "TATCAACTCGTGTCCCAA";
  const char flank3p1d[] = "GTGGCCTTGCACTAACGC";
  const char *alleles1d[] = {"CGTGTCCCAATGTGGCCTTGC", "CGTGTCCCAAaGTGGCCTTGC"};

  test_bubbles(&graph, seqs1, 3, flank5p1c, flank3p1c, alleles1c, 2);
  test_bubbles(&graph, seqs1, 3, flank5p1d, flank3p1d, alleles1d, 2);

  db_graph_dealloc(&graph);
}
Esempio n. 16
0
/**
 * Build a VCF header from the JSON header of a call file.
 *
 * @param json           parsed JSON header of the input (must not be NULL)
 * @param in_path        input path, passed to json_hdr_get() for its messages
 * @param is_breakpoint  true: declare INFO=BRKPNT and add one column per
 *                       non-ref colour; false: declare INFO=BUBBLE and add no
 *                       sample columns
 * @param kmer_size      kmer size, used to name the K<kmer_size> INFO flag
 * @param ref_paths      reference paths, written comma-separated on the
 *                       ##reference line
 * @param nref_paths     number of entries in ref_paths (may be 0, in which
 *                       case the ##reference value is empty)
 * @param chroms         chromosomes, one ##contig line each
 * @param nchroms        number of entries in chroms
 * @return header created with bcf_hdr_init("w") and filled by bcf_hdr_parse()
 *
 * Calls die() if there are no colours (breakpoint mode) or if the assembled
 * text is rejected by bcf_hdr_parse().
 */
static bcf_hdr_t* make_vcf_hdr(cJSON *json, const char *in_path,
                               bool is_breakpoint, size_t kmer_size,
                               char const*const* ref_paths, size_t nref_paths,
                               read_t *chroms, size_t nchroms)
{
  ctx_assert(json != NULL);

  StrBuf hdrbuf;
  strbuf_alloc(&hdrbuf, 1024);

  // "YYYYMMDD" plus terminator
  char datestr[9];
  time_t date = time(NULL);
  strftime(datestr, sizeof(datestr), "%Y%m%d", localtime(&date));

  strbuf_append_str(&hdrbuf, "##fileformat=VCFv4.2\n##fileDate=");
  strbuf_append_str(&hdrbuf, datestr);
  strbuf_append_str(&hdrbuf, "\n");

  // Print commands used to generate header
  cJSON *commands = json_hdr_get(json, "commands", cJSON_Array, in_path);
  cJSON *command = commands->child;

  // Print this command
  char keystr[8];
  char *prevstr = NULL;
  size_t i;

  // The first listed command, if any, becomes this command's "prev" key
  if(command) {
    cJSON *key = json_hdr_get(command, "key", cJSON_String, in_path);
    prevstr = key->valuestring;
  }

  // Print command entry for this command
  strbuf_append_str(&hdrbuf, "##mccortex_");
  strbuf_append_str(&hdrbuf, hex_rand_str(keystr, sizeof(keystr)));
  strbuf_append_str(&hdrbuf, "=<prev=\"");
  strbuf_append_str(&hdrbuf, prevstr ? prevstr : "NULL");
  strbuf_append_str(&hdrbuf, "\",cmd=\"");
  strbuf_append_str(&hdrbuf, cmd_get_cmdline());
  strbuf_append_str(&hdrbuf, "\",cwd=\"");
  strbuf_append_str(&hdrbuf, cmd_get_cwd());
  strbuf_append_str(&hdrbuf, "\",version="CTX_VERSION">\n");

  // Print previous commands
  vcf_hdrtxt_append_commands(command, &hdrbuf, in_path);

  // Print field definitions
  if(is_breakpoint)
    strbuf_append_str(&hdrbuf, "##INFO=<ID=BRKPNT,Number=1,Type=String,Description=\"Breakpoint call\">\n");
  else
    strbuf_append_str(&hdrbuf, "##INFO=<ID=BUBBLE,Number=1,Type=String,Description=\"Bubble call\">\n");

  strbuf_sprintf(&hdrbuf, "##INFO=<ID=K%zu,Number=0,Type=Flag,Description=\"Found at k=%zu\">\n", kmer_size, kmer_size);

  strbuf_append_str(&hdrbuf, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n");
  strbuf_append_str(&hdrbuf, "##FILTER=<ID=PASS,Description=\"All filters passed\">\n");

  // Print reference paths, comma-separated. Looping from 0 (rather than
  // emitting ref_paths[0] unconditionally) keeps this safe when the list is
  // empty; output is unchanged for one or more paths
  strbuf_append_str(&hdrbuf, "##reference=");
  for(i = 0; i < nref_paths; i++) {
    if(i > 0) strbuf_append_char(&hdrbuf, ',');
    strbuf_append_str(&hdrbuf, ref_paths[i]);
  }
  strbuf_append_str(&hdrbuf, "\n");

  // Print contigs lengths
  for(i = 0; i < nchroms; i++) {
    strbuf_sprintf(&hdrbuf, "##contig=<ID=%s,length=%zu>\n",
                   chroms[i].name.b, chroms[i].seq.end);
  }

  // Print VCF column header
  strbuf_append_str(&hdrbuf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");

  if(is_breakpoint)
  {
    // Print a column for each sample
    cJSON *graph_json   = json_hdr_get(json,       "graph",   cJSON_Object, in_path);
    cJSON *colours_json = json_hdr_get(graph_json, "colours", cJSON_Array,  in_path);
    cJSON *colour_json  = colours_json->child;
    if(colour_json == NULL) die("Missing colours");
    for(; colour_json; colour_json = colour_json->next)
    {
      // Reference colours do not get a sample column
      if(!json_hdr_colour_is_ref(colour_json)) {
        cJSON *sample_json = json_hdr_get(colour_json, "sample", cJSON_String, in_path);
        strbuf_append_str(&hdrbuf, "\t");
        strbuf_append_str(&hdrbuf, sample_json->valuestring);
      }
    }
  }

  strbuf_append_char(&hdrbuf, '\n');
  bcf_hdr_t *hdr = bcf_hdr_init("w");
  if(bcf_hdr_parse(hdr, hdrbuf.b) != 0) die("Cannot construct VCF header");

  strbuf_dealloc(&hdrbuf);

  return hdr;
}
Esempio n. 17
0
/**
 * Entry point of the `join` command: merge graph files into one output graph,
 * optionally restricted to kmers present in a set of intersection graphs.
 *
 * @param argc  argument count for this command
 * @param argv  argument vector; options are parsed with getopt_long_only()
 *              against the file-level `longopts`, and every remaining
 *              argument is an input graph path
 * @return EXIT_SUCCESS (error conditions go through cmd_print_usage()/die())
 *
 * Outline:
 *  1. parse options, opening each --intersect style graph ('i') as it is seen
 *  2. open every input graph, checking that kmer sizes agree
 *  3. single input and no intersection: stream it straight to the output
 *  4. otherwise size a hash table, load the intersection set (if any) and
 *     hand everything to graph_writer_merge_mkhdr()
 */
int ctx_join(int argc, char **argv)
{
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL;
  size_t use_ncols = 0;

  GraphFileReader tmp_gfile;
  GraphFileBuffer isec_gfiles_buf;
  gfile_buf_alloc(&isec_gfiles_buf, 8);

  // Arg parsing
  char cmd[100], shortopts[100];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'N': cmd_check(!use_ncols, cmd); use_ncols = cmd_uint32_nonzero(cmd, optarg); break;
      case 'i':
        // Open an intersection graph; it is flattened into colour 0 and a
        // copy of the reader is pushed onto isec_gfiles_buf
        // NOTE(review): tmp_gfile has no initialiser, so graph_file_reset()
        // is its first touch - confirm reset does not read existing fields
        graph_file_reset(&tmp_gfile);
        graph_file_open(&tmp_gfile, optarg);
        if(file_filter_into_ncols(&tmp_gfile.fltr) > 1)
          warn("Flattening intersection graph into colour 0: %s", optarg);
        file_filter_flatten(&tmp_gfile.fltr, 0);
        gfile_buf_push(&isec_gfiles_buf, &tmp_gfile, 1);
        break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" join -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  GraphFileReader *igfiles = isec_gfiles_buf.b;
  size_t num_igfiles = isec_gfiles_buf.len;

  if(!out_path) cmd_print_usage("--out <out.ctx> required");

  if(optind >= argc)
    cmd_print_usage("Please specify at least one input graph file");

  // optind .. argend-1 are graphs to load
  size_t num_gfiles = (size_t)(argc - optind);
  char **gfile_paths = argv + optind;

  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));

  status("Probing %zu graph files and %zu intersect files", num_gfiles, num_igfiles);

  // Check all binaries are valid binaries with matching kmer size
  size_t i;
  size_t ctx_max_cols = 0;
  uint64_t min_intersect_num_kmers = 0, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  for(i = 0; i < num_gfiles; i++)
  {
    // The running column maximum is passed to each open call
    // NOTE(review): presumably it is the colour offset the file loads into
    // - confirm against graph_file_open2()
    graph_file_open2(&gfiles[i], gfile_paths[i], "r", true, ctx_max_cols);

    if(gfiles[0].hdr.kmer_size != gfiles[i].hdr.kmer_size) {
      cmd_print_usage("Kmer sizes don't match [%u vs %u]",
                      gfiles[0].hdr.kmer_size, gfiles[i].hdr.kmer_size);
    }

    ctx_max_cols = MAX2(ctx_max_cols, file_filter_into_ncols(&gfiles[i].fltr));
    ctx_max_kmers = MAX2(ctx_max_kmers, graph_file_nkmers(&gfiles[i]));
    ctx_sum_kmers += graph_file_nkmers(&gfiles[i]);
  }

  // Probe intersection graph files
  for(i = 0; i < num_igfiles; i++)
  {
    if(gfiles[0].hdr.kmer_size != igfiles[i].hdr.kmer_size) {
      cmd_print_usage("Kmer sizes don't match [%u vs %u]",
                  gfiles[0].hdr.kmer_size, igfiles[i].hdr.kmer_size);
    }

    uint64_t nkmers = graph_file_nkmers(&igfiles[i]);

    if(i == 0) min_intersect_num_kmers = nkmers;
    else if(nkmers < min_intersect_num_kmers)
    {
      // Put smallest intersection binary first
      SWAP(igfiles[i], igfiles[0]);
      min_intersect_num_kmers = nkmers;
    }
  }

  bool take_intersect = (num_igfiles > 0);

  // If we are taking an intersection, only the intersection kmers need to be
  // held, so the smallest intersection graph bounds the kmer count
  if(take_intersect)
    ctx_max_kmers = ctx_sum_kmers = min_intersect_num_kmers;

  bool use_ncols_set = (use_ncols > 0);
  bool output_to_stdout = (strcmp(out_path,"-") == 0);

  // if(use_ncols == 0) use_ncols = 1;
  if(use_ncols_set) {
    if(use_ncols < ctx_max_cols && output_to_stdout)
      die("I need %zu colours if outputting to STDOUT (--ncols)", ctx_max_cols);
    if(use_ncols > ctx_max_cols) {
      warn("I only need %zu colour%s ('--ncols %zu' ignored)",
           ctx_max_cols, util_plural_str(ctx_max_cols), use_ncols);
      use_ncols = ctx_max_cols;
    }
  }
  else {
    // Default: every colour when writing to stdout, otherwise one
    use_ncols = output_to_stdout ? ctx_max_cols : 1;
  }

  // Check out_path is writable
  futil_create_output(out_path);

  status("Output %zu cols; from %zu files; intersecting %zu graphs; ",
         ctx_max_cols, num_gfiles, num_igfiles);

  if(num_gfiles == 1 && num_igfiles == 0)
  {
    // Loading only one file with no intersection files
    // Don't need to store a graph in memory, can filter as stream
    // Don't actually store anything in the de Bruijn graph, but we need to
    // pass it, so mock one up
    dBGraph db_graph;
    db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size,
                   file_filter_into_ncols(&gfiles[0].fltr), 0, 1024, 0);

    graph_writer_stream_mkhdr(out_path, &gfiles[0], &db_graph, NULL, NULL);
    graph_file_close(&gfiles[0]);
    gfile_buf_dealloc(&isec_gfiles_buf);
    ctx_free(gfiles);

    db_graph_dealloc(&db_graph);

    return EXIT_SUCCESS;
  }

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem;

  // One BinaryKmer per kmer plus a Covg and an Edges per in-memory colour
  bits_per_kmer = sizeof(BinaryKmer)*8 +
                  (sizeof(Covg) + sizeof(Edges)) * 8 * use_ncols;

  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        ctx_max_kmers, ctx_sum_kmers,
                                        true, &graph_mem);

  if(!use_ncols_set)
  {
    // Maximise use_ncols
    // NOTE(review): this quotient is (memory in bits) / (bits per kmer) and
    // does not involve kmers_in_hash; it is then capped at ctx_max_cols -
    // confirm this is the intended column estimate
    size_t max_usencols = (memargs.mem_to_use*8) / bits_per_kmer;

    use_ncols = MIN2(max_usencols, ctx_max_cols);
    bits_per_kmer = sizeof(BinaryKmer)*8 +
                    (sizeof(Covg) + sizeof(Edges)) * 8 * use_ncols;

    // Re-check memory used
    kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                          memargs.mem_to_use_set,
                                          memargs.num_kmers,
                                          memargs.num_kmers_set,
                                          bits_per_kmer,
                                          ctx_max_kmers, ctx_sum_kmers,
                                          true, &graph_mem);
  }

  status("Using %zu colour%s in memory", use_ncols, util_plural_str(use_ncols));

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  // Create db_graph
  dBGraph db_graph;
  Edges *intersect_edges = NULL;
  // One extra edge colour is reserved when intersecting
  size_t edge_cols = (use_ncols + take_intersect);

  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols,
                 kmers_in_hash, DBG_ALLOC_COVGS);

  // We allocate edges ourself since it's a special case
  db_graph.col_edges = ctx_calloc(db_graph.ht.capacity*edge_cols, sizeof(Edges));

  // Load intersection binaries
  char *intsct_gname_ptr = NULL;
  StrBuf intersect_gname;
  strbuf_alloc(&intersect_gname, 1024);

  if(take_intersect)
  {
    GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
    gprefs.boolean_covgs = true; // covg++ only

    for(i = 0; i < num_igfiles; i++)
    {
      graph_load(&igfiles[i], gprefs, NULL);

      // Update intersect header
      // note: intersection graphs all load exactly one colour into colour 0
      graph_info_make_intersect(&igfiles[i].hdr.ginfo[0], &intersect_gname);

      // From the second intersection graph on, only kmers/edges already
      // loaded are accepted
      gprefs.must_exist_in_graph = true;
      gprefs.must_exist_in_edges = db_graph.col_edges;
    }

    if(num_igfiles > 1)
    {
      // Remove nodes where covg != num_igfiles
      HASH_ITERATE_SAFE(&db_graph.ht, remove_non_intersect_nodes,
                        db_graph.col_covgs, (Covg)num_igfiles, &db_graph.ht);
    }

    status("Loaded intersection set\n");
    intsct_gname_ptr = intersect_gname.b;

    for(i = 0; i < num_igfiles; i++) graph_file_close(&igfiles[i]);

    // Reset graph info
    for(i = 0; i < db_graph.num_of_cols; i++)
      graph_info_init(&db_graph.ginfo[i]);

    // Zero covgs
    memset(db_graph.col_covgs, 0, db_graph.ht.capacity * sizeof(Covg));

    // Use union edges we loaded to intersect new edges
    // col_edges is advanced past the first ht.capacity entries, which stay
    // reachable through intersect_edges; the offset is undone after merging
    intersect_edges = db_graph.col_edges;
    db_graph.col_edges += db_graph.ht.capacity;
  }

  bool kmers_loaded = take_intersect, colours_loaded = false;

  graph_writer_merge_mkhdr(out_path, gfiles, num_gfiles,
                          kmers_loaded, colours_loaded, intersect_edges,
                          intsct_gname_ptr, &db_graph);

  // Restore the original allocation address before it is released
  if(take_intersect)
    db_graph.col_edges -= db_graph.ht.capacity;

  for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]);

  strbuf_dealloc(&intersect_gname);
  gfile_buf_dealloc(&isec_gfiles_buf);
  ctx_free(gfiles);

  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
Esempio n. 18
0
int ctx_links(int argc, char **argv)
{
  size_t limit = 0;
  const char *link_out_path = NULL, *csv_out_path = NULL, *plot_out_path = NULL;
  const char *thresh_path = NULL, *hist_path = NULL;

  size_t hist_distsize = 0, hist_covgsize = 0;
  size_t cutoff = 0;
  bool clean = false;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!link_out_path, cmd); link_out_path = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'l': cmd_check(!csv_out_path, cmd); csv_out_path = optarg; break;
      case 'c': cmd_check(!cutoff, cmd); cutoff = cmd_size(cmd, optarg); clean = true; break;
      case 'L': cmd_check(!limit, cmd); limit = cmd_size(cmd, optarg); break;
      case 'P': cmd_check(!plot_out_path, cmd); plot_out_path = optarg; break;
      case 'T': cmd_check(!thresh_path, cmd); thresh_path = optarg; break;
      case 'H': cmd_check(!hist_path, cmd); hist_path = optarg; break;
      case 'C': cmd_check(!hist_covgsize, cmd); hist_covgsize = cmd_size(cmd, optarg); break;
      case 'D': cmd_check(!hist_distsize, cmd); hist_distsize = cmd_size(cmd, optarg); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" links -h` for help. Bad option: %s", argv[optind-1]);
      default: ctx_assert2(0, "shouldn't reach here: %c", c);
    }
  }

  if(hist_distsize && !hist_path) cmd_print_usage("--max-dist without --covg-hist");
  if(hist_covgsize && !hist_path) cmd_print_usage("--max-covg without --covg-hist");

  // Defaults
  if(!hist_distsize) hist_distsize = DEFAULT_MAX_DIST;
  if(!hist_covgsize) hist_covgsize = DEFAULT_MAX_COVG;

  if(optind + 1 != argc) cmd_print_usage("Wrong number of arguments");
  const char *ctp_path = argv[optind];

  bool list = (csv_out_path != NULL);
  bool plot = (plot_out_path != NULL);
  bool save = (link_out_path != NULL);
  bool hist_covg = (thresh_path != NULL || hist_path != NULL);

  size_t plot_kmer_idx = (limit == 0 ? 0 : limit - 1);

  if(clean && !save)
    cmd_print_usage("Need to give --out <out.ctp.gz> with --clean");

  if(!save && !list && !plot && !hist_covg)
    cmd_print_usage("Please specify one of --plot, --list or --clean");

  if(link_out_path && hist_covg && strcmp(link_out_path,"-") == 0)
    cmd_print_usage("Outputing both cleaning threshold (-T) and links (-o) to STDOUT!");

  // Open input file
  FILE *list_fh = NULL, *plot_fh = NULL, *link_tmp_fh = NULL;
  FILE *thresh_fh = NULL, *hist_fh = NULL;
  gzFile link_gz = NULL;

  // Check file don't exist or that we can overwrite
  // Will ignore if path is null
  bool err = false;
  err |= futil_check_outfile(csv_out_path);
  err |= futil_check_outfile(plot_out_path);
  err |= futil_check_outfile(link_out_path);
  err |= futil_check_outfile(thresh_path);
  err |= futil_check_outfile(hist_path);
  if(err) die("Use -f,--force to overwrite files");

  StrBuf link_tmp_path;
  strbuf_alloc(&link_tmp_path, 1024);

  GPathReader ctpin;
  memset(&ctpin, 0, sizeof(ctpin));
  gpath_reader_open(&ctpin, ctp_path);

  size_t ncols = file_filter_into_ncols(&ctpin.fltr);
  size_t kmer_size = gpath_reader_get_kmer_size(&ctpin);
  cJSON *newhdr = cJSON_Duplicate(ctpin.json, 1);

  if(ncols != 1) die("Can only clean a single colour at a time. Sorry.");

  uint64_t (*hists)[hist_covgsize] = NULL;

  if(hist_covg) {
    hists = ctx_calloc(hist_distsize, sizeof(hists[0]));
  }

  if(hist_path && (hist_fh = futil_fopen_create(hist_path, "w")) == NULL)
      die("Cannot open file: %s", hist_path);

  if(thresh_path && (thresh_fh = futil_fopen_create(thresh_path, "w")) == NULL)
      die("Cannot open file: %s", thresh_path);

  if(limit)
    status("Limiting to the first %zu kmers", limit);

  if(clean)
  {
    timestamp();
    message(" Cleaning coverage below %zu", cutoff);
    message("\n");
  }

  if(save)
  {
    // Check we can find the fields we need
    cJSON *links_json  = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path);
    cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path);
    cJSON *nlinks_json = json_hdr_get(links_json, "num_paths",            cJSON_Number, link_out_path);
    cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes",           cJSON_Number, link_out_path);
    if(!nkmers_json || !nlinks_json || !nbytes_json)
      die("Cannot find required header entries");

    // Create a random temporary file
    link_tmp_fh = create_tmp_file(&link_tmp_path, link_out_path);

    status("Saving output to: %s", link_out_path);
    status("Temporary output: %s", link_tmp_path.b);

    // Open output file
    if((link_gz = futil_gzopen_create(link_out_path, "w")) == NULL)
      die("Cannot open output link file: %s", link_out_path);

    // Need to open output file first so we can get absolute path
    // Update the header to include this command
    json_hdr_add_curr_cmd(newhdr, link_out_path);
  }

  if(list)
  {
    status("Listing to %s", csv_out_path);
    if((list_fh = futil_fopen_create(csv_out_path, "w")) == NULL)
      die("Cannot open output CSV file %s", csv_out_path);

    // Print csv header
    fprintf(list_fh, "SeqLen,Covg\n");
  }

  if(plot)
  {
    status("Plotting kmer %zu to %s", plot_kmer_idx, plot_out_path);
    if((plot_fh = futil_fopen_create(plot_out_path, "w")) == NULL)
      die("Cannot open output .dot file %s", plot_out_path);
  }

  SizeBuffer countbuf, jposbuf;
  size_buf_alloc(&countbuf, 16);
  size_buf_alloc(&jposbuf, 1024);

  StrBuf kmerbuf, juncsbuf, seqbuf, outbuf;
  strbuf_alloc(&kmerbuf, 1024);
  strbuf_alloc(&juncsbuf, 1024);
  strbuf_alloc(&seqbuf, 1024);
  strbuf_alloc(&outbuf, 1024);

  bool link_fw;
  size_t njuncs;
  size_t knum, nlinks, num_links_exp = 0;

  LinkTree ltree;
  ltree_alloc(&ltree, kmer_size);

  LinkTreeStats tree_stats;
  memset(&tree_stats, 0, sizeof(tree_stats));
  size_t init_num_links = 0, num_links = 0;

  for(knum = 0; !limit || knum < limit; knum++)
  {
    ltree_reset(&ltree);
    if(!gpath_reader_read_kmer(&ctpin, &kmerbuf, &num_links_exp)) break;
    ctx_assert2(kmerbuf.end == kmer_size, "Kmer incorrect length %zu != %zu",
                kmerbuf.end, kmer_size);
    // status("kmer: %s", kmerbuf.b);

    for(nlinks = 0;
        gpath_reader_read_link(&ctpin, &link_fw, &njuncs,
                               &countbuf, &juncsbuf,
                               &seqbuf, &jposbuf);
        nlinks++)
    {
      ltree_add(&ltree, link_fw, countbuf.b[0], jposbuf.b,
                juncsbuf.b, seqbuf.b);
    }

    if(nlinks != num_links_exp)
      warn("Links count mismatch %zu != %zu", nlinks, num_links_exp);

    if(hist_covg)
    {
      ltree_update_covg_hists(&ltree, (uint64_t*)hists,
                              hist_distsize, hist_covgsize);
    }
    if(clean)
    {
      ltree_clean(&ltree, cutoff);
    }

    // Accumulate statistics
    ltree_get_stats(&ltree, &tree_stats);
    num_links = tree_stats.num_links - init_num_links;
    init_num_links = tree_stats.num_links;

    if(list)
    {
      ltree_write_list(&ltree, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, list_fh) != outbuf.end)
        die("Cannot write CSV file to: %s", csv_out_path);
      strbuf_reset(&outbuf);
    }
    if(save && num_links)
    {
      ltree_write_ctp(&ltree, kmerbuf.b, num_links, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, link_tmp_fh) != outbuf.end)
        die("Cannot write ctp file to: %s", link_tmp_path.b);
      strbuf_reset(&outbuf);
    }
    if(plot && knum == plot_kmer_idx)
    {
      status("Plotting tree...");
      ltree_write_dot(&ltree, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, plot_fh) != outbuf.end)
        die("Cannot write plot DOT file to: %s", plot_out_path);
      strbuf_reset(&outbuf);
    }
  }

  gpath_reader_close(&ctpin);

  cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path);
  cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path);
  cJSON *nlinks_json = json_hdr_get(links_json, "num_paths",            cJSON_Number, link_out_path);
  cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes",           cJSON_Number, link_out_path);

  status("Number of kmers with links %li -> %zu", nkmers_json->valueint, tree_stats.num_trees_with_links);
  status("Number of links %li -> %zu", nlinks_json->valueint, tree_stats.num_links);
  status("Number of bytes %li -> %zu", nbytes_json->valueint, tree_stats.num_link_bytes);

  if(save)
  {
    // Update JSON
    nkmers_json->valuedouble = nkmers_json->valueint = tree_stats.num_trees_with_links;
    nlinks_json->valuedouble = nlinks_json->valueint = tree_stats.num_links;
    nbytes_json->valuedouble = nbytes_json->valueint = tree_stats.num_link_bytes;

    char *json_str = cJSON_Print(newhdr);
    if(gzputs(link_gz, json_str) != (int)strlen(json_str))
      die("Cannot write ctp file to: %s", link_out_path);
    free(json_str);

    gzputs(link_gz, "\n\n");
    gzputs(link_gz, ctp_explanation_comment);
    gzputs(link_gz, "\n");

    fseek(link_tmp_fh, 0, SEEK_SET);
    char *tmp = ctx_malloc(4*ONE_MEGABYTE);
    size_t s;
    while((s = fread(tmp, 1, 4*ONE_MEGABYTE, link_tmp_fh)) > 0) {
      if(gzwrite(link_gz, tmp, s) != (int)s)
        die("Cannot write to output: %s", link_out_path);
    }
    ctx_free(tmp);

    gzclose(link_gz);
    fclose(link_tmp_fh);
  }

  // Write histogram to file
  if(hist_fh)
  {
    size_t i, j;
    fprintf(hist_fh, "  ");
    for(j = 1; j < hist_covgsize; j++) fprintf(hist_fh, ",covg.%02zu", j);
    fprintf(hist_fh, "\n");
    for(i = 1; i < hist_distsize; i++) {
      fprintf(hist_fh, "dist.%02zu", i);
      for(j = 1; j < hist_covgsize; j++) {
        fprintf(hist_fh, ",%"PRIu64, hists[i][j]);
      }
      fprintf(hist_fh, "\n");
    }
  }

  if(thresh_fh)
  {
    // Use median of first five cutoffs
    print_suggest_cutoff(6, hist_covgsize, hists, thresh_fh);
  }

  if(hist_fh && hist_fh != stdout) fclose(hist_fh);

  if(list)
  {
    fclose(list_fh);
  }

  if(plot)
  {
    fclose(plot_fh);
  }

  ctx_free(hists);
  cJSON_Delete(newhdr);
  strbuf_dealloc(&link_tmp_path);
  ltree_dealloc(&ltree);
  size_buf_dealloc(&countbuf);
  size_buf_dealloc(&jposbuf);
  strbuf_dealloc(&kmerbuf);
  strbuf_dealloc(&juncsbuf);
  strbuf_dealloc(&seqbuf);
  strbuf_dealloc(&outbuf);

  return EXIT_SUCCESS;
}