Exemple #1
0
// Check contig entries match reference
// We check that these match the reference just loaded
static void brkpnt_check_refs_match(cJSON *json,
                                    const ChromHash *genome,
                                    const char *path)
{
  cJSON *version = json_hdr_get(json, "format_version", cJSON_Number, path);
  if(version->valueint <= 2) return;

  cJSON *command = json_hdr_get_curr_cmd(json, path);
  cJSON *brkpnts = json_hdr_get(command, "breakpoints", cJSON_Object, path);
  cJSON *contigs = json_hdr_get(brkpnts, "contigs",     cJSON_Array,  path);
  cJSON *contig;
  size_t num_chroms = 0;

  for(contig = contigs->child; contig; contig = contig->next, num_chroms++)
  {
    cJSON *id  = json_hdr_get(contig, "id",     cJSON_String, path);
    cJSON *len = json_hdr_get(contig, "length", cJSON_Number, path);

    // Check chrom is loaded in ref and of expected length
    khiter_t k = kh_get(kChromHash, genome, id->valuestring);
    if(k == kh_end(genome)) warn("Cannot find chrom [%s]", id->valuestring);
    else {
      const read_t *r = kh_value(genome, k);
      if(r->seq.end != (size_t)len->valueint) {
        warn("Chrom lengths do not match %s input:%li ref:%zu",
             id->valuestring, len->valueint, r->seq.end);
      }
    }
  }

  if(num_chroms != kh_size(genome)) {
    warn("Number of chromosomes differ: %zu in header vs %zu in ref",
         num_chroms, (size_t)kh_size(genome));
  }
}
Exemple #2
0
void vcf_hdrtxt_append_commands(cJSON *command, StrBuf *hdr, const char *path)
{
  bool first;
  for(; command != NULL; command = command->next)
  {
    cJSON *key  = json_hdr_get(command, "key",    cJSON_String,  path);
    cJSON *cmd  = json_hdr_get(command, "cmd",    cJSON_Array,   path);
    cJSON *cwd  = json_hdr_get(command, "cwd",    cJSON_String,  path);
    cJSON *prev = json_hdr_get(command, "prev",   cJSON_Array,   path);
    cJSON *ver  = json_hdr_try(command, "mccortex",cJSON_String, path);

    prev = prev->child; // result could be NULL
    if(prev && prev->type != cJSON_String) die("Invalid 'prev' field");
    strbuf_append_str(hdr, "##mccortex_");
    strbuf_append_str(hdr, key->valuestring);
    strbuf_append_str(hdr, "=<prev=\"");
    strbuf_append_str(hdr, prev ? prev->valuestring : "NULL");

    if(prev) {
      while((prev = prev->next) != NULL) {
        strbuf_append_str(hdr, ";");
        strbuf_append_str(hdr, prev->valuestring);
      }
    }
    strbuf_append_str(hdr, "\",cmd=\"");
    for(first = true, cmd = cmd->child; cmd; cmd = cmd->next, first = false) {
      if(!first) strbuf_append_char(hdr, ' ');
      strbuf_append_str(hdr, cmd->valuestring);
    }
    strbuf_append_str(hdr, "\",cwd=\"");
    strbuf_append_str(hdr, cwd->valuestring);
    strbuf_append_str(hdr, "\"");
    if(ver) {
      strbuf_append_str(hdr, ",version=\"");
      strbuf_append_str(hdr, ver->valuestring);
      strbuf_append_str(hdr, "\"");
    }
    strbuf_append_str(hdr, ">\n");
  }
}
Exemple #3
0
// Check contig entries match reference
// We check that these match the reference just loaded
static void brkpnt_check_refs_match(cJSON *json, const char *path)
{
  cJSON *version = json_hdr_get(json, "format_version", cJSON_Number, path);
  if(version->valueint <= 2) return;

  cJSON *command = json_hdr_get_curr_cmd(json, path);
  cJSON *brkpnts = json_hdr_get(command, "breakpoints", cJSON_Object, path);
  cJSON *contigs = json_hdr_get(brkpnts, "contigs",     cJSON_Array,  path);
  cJSON *contig;
  size_t num_chroms = 0;

  for(contig = contigs->child; contig; contig = contig->next, num_chroms++)
  {
    cJSON *id  = json_hdr_get(contig, "id",     cJSON_String, path);
    cJSON *len = json_hdr_get(contig, "length", cJSON_Number, path);

    const char *chrom_name = id->valuestring;
    long chrom_len = len->valueint;
    size_t reflen;

    khiter_t k = kh_get(ChromHash, genome, chrom_name);
    if(k == kh_end(genome))
      die("Cannot find ref chrom: %s", chrom_name);
    else {
      reflen = kh_value(genome, k)->seq.end;
      if(reflen != (size_t)chrom_len) {
        die("Chrom lengths do not match %s input:%li ref:%zu",
            chrom_name, chrom_len, reflen);
      }
    }
  }

  if(num_chroms != chroms.len) {
    die("Number of chromosomes differ: %zu in header vs %zu in ref",
        num_chroms, chroms.len);
  }
}
Exemple #4
0
int ctx_calls2vcf(int argc, char **argv)
{
  const char *in_path = NULL, *out_path = NULL, *out_type = NULL;
  // Filtering parameters
  int32_t min_mapq = -1, max_align_len = -1, max_allele_len = -1;
  // Alignment parameters
  int nwmatch = 1, nwmismatch = -2, nwgapopen = -4, nwgapextend = -1;
  // ref paths
  char const*const* ref_paths = NULL;
  size_t nref_paths = 0;
  // flank file
  const char *sam_path = NULL;

  //
  // Things we figure out by looking at the input
  //
  bool isbubble = false;
  // samples in VCF, (0 for bubble, does not include ref in breakpoint calls)
  size_t i, kmer_size, num_samples;

  //
  // Reference genome
  //
  // Hash map of chromosome name -> sequence
  ChromHash *genome;
  ReadBuffer chroms;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'O': cmd_check(!out_type, cmd); out_type = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'F': cmd_check(!sam_path,cmd); sam_path = optarg; break;
      case 'Q': cmd_check(min_mapq < 0,cmd); min_mapq = cmd_uint32(cmd, optarg); break;
      case 'A': cmd_check(max_align_len  < 0,cmd); max_align_len  = cmd_uint32(cmd, optarg); break;
      case 'L': cmd_check(max_allele_len < 0,cmd); max_allele_len = cmd_uint32(cmd, optarg); break;
      case 'm': nwmatch = cmd_int32(cmd, optarg); break;
      case 'M': nwmismatch = cmd_int32(cmd, optarg); break;
      case 'g': nwgapopen = cmd_int32(cmd, optarg); break;
      case 'G': nwgapextend = cmd_int32(cmd, optarg); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]);
      default: ctx_assert2(0, "shouldn't reach here: %c", c);
    }
  }

  // Defaults for unset values
  if(out_path == NULL) out_path = "-";
  if(max_align_len  < 0) max_align_len  = DEFAULT_MAX_ALIGN;
  if(max_allele_len < 0) max_allele_len = DEFAULT_MAX_ALLELE;

  if(optind+2 > argc)
    cmd_print_usage("Require <in.txt.gz> and at least one reference");

  in_path = argv[optind++];
  ref_paths = (char const*const*)argv + optind;
  nref_paths = argc - optind;

  // These functions call die() on error
  gzFile gzin = futil_gzopen(in_path, "r");

  // Read call file header
  cJSON *json = json_hdr_load(gzin, in_path);

  // Check we can handle the kmer size
  kmer_size = json_hdr_get_kmer_size(json, in_path);
  db_graph_check_kmer_size(kmer_size, in_path);

  // Get format (bubble or breakpoint file)
  cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, in_path);
  if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) isbubble = false;
  else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) isbubble = true;
  else die("Unknown format: '%s'", json_fmt->valuestring);

  status("Reading %s in %s format", futil_inpath_str(in_path),
         isbubble ? "bubble" : "breakpoint");

  if(isbubble) {
    // bubble specific
    if(sam_path == NULL)
      cmd_print_usage("Require -F <flanks.sam> with bubble file");
    if(min_mapq < 0) min_mapq = DEFAULT_MIN_MAPQ;
  }
  else {
    // breakpoint specific
    if(min_mapq >= 0)
      cmd_print_usage("-Q,--min-mapq <Q> only valid with bubble calls");
  }

  // Open flank file if it exists
  htsFile *samfh = NULL;
  bam_hdr_t *bam_hdr = NULL;
  bam1_t *mflank = NULL;

  if(sam_path)
  {
    if((samfh = hts_open(sam_path, "r")) == NULL)
      die("Cannot open SAM/BAM %s", sam_path);

    // Load BAM header
    bam_hdr = sam_hdr_read(samfh);
    if(bam_hdr == NULL) die("Cannot load BAM header: %s", sam_path);
    mflank = bam_init1();
  }

  // Output VCF has 0 samples if bubbles file, otherwise has N where N is
  // number of samples/colours in the breakpoint graph
  size_t num_graph_samples = json_hdr_get_ncols(json, in_path);
  size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, in_path);

  num_samples = 0;
  if(!isbubble) {
    // If last colour has "is_ref", drop number of samples by one
    num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1
                                                       : num_graph_samples;
  }

  //
  // Open output file
  //
  if(!out_path) out_path = "-";
  int mode = vcf_misc_get_outtype(out_type, out_path);
  futil_create_output(out_path);
  htsFile *vcffh = hts_open(out_path, modes_htslib[mode]);

  status("[calls2vcf] Reading %s call file with %zu samples",
         isbubble ? "Bubble" : "Breakpoint", num_graph_samples);
  status("[calls2vcf] %zu sample output to: %s format: %s",
         num_samples, futil_outpath_str(out_path), hsmodes_htslib[mode]);

  if(isbubble) status("[calls2vcf] min. MAPQ: %i", min_mapq);
  status("[calls2vcf] max alignment length: %i", max_align_len);
  status("[calls2vcf] max VCF allele length: %i", max_allele_len);
  status("[calls2vcf] alignment match:%i mismatch:%i gap open:%i extend:%i",
         nwmatch, nwmismatch, nwgapopen, nwgapextend);

  // Load reference genome
  read_buf_alloc(&chroms, 1024);
  genome = chrom_hash_init();
  chrom_hash_load(ref_paths, nref_paths, &chroms, genome);

  // convert to upper case
  char *s;
  for(i = 0; i < chroms.len; i++)
    for(s = chroms.b[i].seq.b; *s; s++) *s = toupper(*s);

  if(!isbubble) brkpnt_check_refs_match(json, genome, in_path);

  bcf_hdr_t *vcfhdr = make_vcf_hdr(json, in_path, !isbubble, kmer_size,
                                   ref_paths, nref_paths,
                                   chroms.b, chroms.len);

  if(bcf_hdr_write(vcffh, vcfhdr) != 0) die("Cannot write VCF header");

  AlignedCall *call = acall_init();
  CallDecomp *aligner = call_decomp_init(vcffh, vcfhdr);

  scoring_t *scoring = call_decomp_get_scoring(aligner);
  scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend,
               false, false, 0, 0, 0, 0);

  CallFileEntry centry;
  call_file_entry_alloc(&centry);

  char kmer_str[50];
  sprintf(kmer_str, ";K%zu", kmer_size);

  if(isbubble)
  {
    // Bubble calls
    DecompBubble *bubbles = decomp_bubble_init();

    // Set scoring for aligning 3' flank
    scoring = decomp_bubble_get_scoring(bubbles);
    scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend,
                 true, true, 0, 0, 0, 0);

    while(call_file_read(gzin, in_path, &centry)) {
      do {
        if(sam_read1(samfh, bam_hdr, mflank) < 0)
          die("We've run out of SAM entries!");
      } while(mflank->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY));

      // Align call
      strbuf_reset(&call->info);
      decomp_bubble_call(bubbles, genome, kmer_size, min_mapq,
                         &centry, mflank, bam_hdr, call);
      strbuf_append_str(&call->info, kmer_str);
      acall_decompose(aligner, call, max_align_len, max_allele_len);
    }

    // print bubble stats
    DecompBubbleStats *bub_stats = ctx_calloc(1, sizeof(*bub_stats));
    decomp_bubble_cpy_stats(bub_stats, bubbles);
    print_bubble_stats(bub_stats);
    ctx_free(bub_stats);

    decomp_bubble_destroy(bubbles);
  }
  else
  {
    // Breakpoint calls
    DecompBreakpoint *breakpoints = decomp_brkpt_init();

    while(call_file_read(gzin, in_path, &centry)) {
      strbuf_reset(&call->info);
      decomp_brkpt_call(breakpoints, genome, num_samples, &centry, call);
      strbuf_append_str(&call->info, kmer_str);
      acall_decompose(aligner, call, max_align_len, max_allele_len);
    }

    // print bubble stats
    DecompBreakpointStats *brk_stats = ctx_calloc(1, sizeof(*brk_stats));
    decomp_brkpt_cpy_stats(brk_stats, breakpoints);
    print_breakpoint_stats(brk_stats);
    ctx_free(brk_stats);

    decomp_brkpt_destroy(breakpoints);
  }

  // Print stats
  DecomposeStats *astats = ctx_calloc(1, sizeof(*astats));
  call_decomp_cpy_stats(astats, aligner);
  print_acall_stats(astats);
  ctx_free(astats);

  call_file_entry_dealloc(&centry);
  call_decomp_destroy(aligner);
  acall_destroy(call);

  // Finished - clean up
  cJSON_Delete(json);
  gzclose(gzin);

  bcf_hdr_destroy(vcfhdr);
  hts_close(vcffh);

  for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]);
  read_buf_dealloc(&chroms);
  chrom_hash_destroy(genome);

  if(sam_path) {
    hts_close(samfh);
    bam_hdr_destroy(bam_hdr);
    bam_destroy1(mflank);
  }

  return EXIT_SUCCESS;
}
Exemple #5
0
static bcf_hdr_t* make_vcf_hdr(cJSON *json, const char *in_path,
                               bool is_breakpoint, size_t kmer_size,
                               char const*const* ref_paths, size_t nref_paths,
                               read_t *chroms, size_t nchroms)
{
  ctx_assert(json != NULL);

  StrBuf hdrbuf;
  strbuf_alloc(&hdrbuf, 1024);

  char datestr[9];
  time_t date = time(NULL);
  strftime(datestr, 9, "%Y%m%d", localtime(&date));

  strbuf_append_str(&hdrbuf, "##fileformat=VCFv4.2\n##fileDate=");
  strbuf_append_str(&hdrbuf, datestr);
  strbuf_append_str(&hdrbuf, "\n");

  // Print commands used to generate header
  cJSON *commands = json_hdr_get(json, "commands", cJSON_Array, in_path);
  cJSON *command = commands->child;

  // Print this command
  char keystr[8];
  char *prevstr = NULL;
  size_t i;

  if(command) {
    cJSON *key = json_hdr_get(command, "key", cJSON_String, in_path);
    prevstr = key->valuestring;
  }

  // Print command entry for this command
  strbuf_append_str(&hdrbuf, "##mccortex_");
  strbuf_append_str(&hdrbuf, hex_rand_str(keystr, sizeof(keystr)));
  strbuf_append_str(&hdrbuf, "=<prev=\"");
  strbuf_append_str(&hdrbuf, prevstr ? prevstr : "NULL");
  strbuf_append_str(&hdrbuf, "\",cmd=\"");
  strbuf_append_str(&hdrbuf, cmd_get_cmdline());
  strbuf_append_str(&hdrbuf, "\",cwd=\"");
  strbuf_append_str(&hdrbuf, cmd_get_cwd());
  strbuf_append_str(&hdrbuf, "\",version="CTX_VERSION">\n");

  // Print previous commands
  vcf_hdrtxt_append_commands(command, &hdrbuf, in_path);

  // Print field definitions
  if(is_breakpoint)
    strbuf_append_str(&hdrbuf, "##INFO=<ID=BRKPNT,Number=1,Type=String,Description=\"Breakpoint call\">\n");
  else
    strbuf_append_str(&hdrbuf, "##INFO=<ID=BUBBLE,Number=1,Type=String,Description=\"Bubble call\">\n");

  strbuf_sprintf(&hdrbuf, "##INFO=<ID=K%zu,Number=0,Type=Flag,Description=\"Found at k=%zu\">\n", kmer_size, kmer_size);

  strbuf_append_str(&hdrbuf, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n");
  strbuf_append_str(&hdrbuf, "##FILTER=<ID=PASS,Description=\"All filters passed\">\n");

  // Print reference paths
  strbuf_append_str(&hdrbuf, "##reference=");
  strbuf_append_str(&hdrbuf, ref_paths[0]);
  for(i = 1; i < nref_paths; i++) {
    strbuf_append_char(&hdrbuf, ',');
    strbuf_append_str(&hdrbuf, ref_paths[i]);
  }
  strbuf_append_str(&hdrbuf, "\n");

  // Print contigs lengths
  for(i = 0; i < nchroms; i++) {
    strbuf_sprintf(&hdrbuf, "##contig=<ID=%s,length=%zu>\n",
                   chroms[i].name.b, chroms[i].seq.end);
  }

  // Print VCF column header
  strbuf_append_str(&hdrbuf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");

  if(is_breakpoint)
  {
    // Print a column for each sample
    cJSON *graph_json   = json_hdr_get(json,       "graph",   cJSON_Object, in_path);
    cJSON *colours_json = json_hdr_get(graph_json, "colours", cJSON_Array,  in_path);
    cJSON *colour_json  = colours_json->child;
    if(colour_json == NULL) die("Missing colours");
    for(; colour_json; colour_json = colour_json->next)
    {
      if(!json_hdr_colour_is_ref(colour_json)) {
        cJSON *sample_json = json_hdr_get(colour_json, "sample", cJSON_String, in_path);
        strbuf_append_str(&hdrbuf, "\t");
        strbuf_append_str(&hdrbuf, sample_json->valuestring);
      }
    }
  }

  strbuf_append_char(&hdrbuf, '\n');
  bcf_hdr_t *hdr = bcf_hdr_init("w");
  if(bcf_hdr_parse(hdr, hdrbuf.b) != 0) die("Cannot construct VCF header");

  strbuf_dealloc(&hdrbuf);

  return hdr;
}
Exemple #6
0
static void print_vcf_header(cJSON *json, bool is_breakpoint, FILE *fout)
{
  ctx_assert(json != NULL);

  char datestr[9];
  time_t date = time(NULL);
  strftime(datestr, 9, "%Y%m%d", localtime(&date));

  fprintf(fout, "##fileformat=VCFv4.1\n##fileDate=%s\n", datestr);

  // Print commands used to generate header
  cJSON *commands = json_hdr_get(json, "commands", cJSON_Array, input_path);
  cJSON *command = commands->child;

  // Print this command
  char keystr[8];
  char *prevstr = NULL;
  size_t i;

  if(command) {
    cJSON *key = json_hdr_get(command, "key", cJSON_String, input_path);
    prevstr = key->valuestring;
  }

  // Print command entry for this command
  fprintf(fout, "##mccortex_%s=<prev=\"%s\",cmd=\"%s\",cwd=\"%s\",version="CTX_VERSION">\n",
          hex_rand_str(keystr, sizeof(keystr)),
          prevstr ? prevstr : "NULL",
          cmd_get_cmdline(), cmd_get_cwd());

  // Print previous commands
  for(; command != NULL; command = command->next)
  {
    cJSON *key  = json_hdr_get(command, "key",    cJSON_String,  input_path);
    cJSON *cmd  = json_hdr_get(command, "cmd",    cJSON_Array,   input_path);
    cJSON *cwd  = json_hdr_get(command, "cwd",    cJSON_String,  input_path);
    cJSON *prev = json_hdr_get(command, "prev",   cJSON_Array,   input_path);
    cJSON *ver  = json_hdr_try(command, "mccortex",cJSON_String, input_path);

    prev = prev->child; // result could be NULL
    if(prev && prev->type != cJSON_String) die("Invalid 'prev' field");
    fprintf(fout, "##mccortex_%s=<prev=\"%s", key->valuestring,
                  prev ? prev->valuestring : "NULL");
    if(prev) {
      while((prev = prev->next) != NULL) fprintf(fout, ";%s", prev->valuestring);
    }
    fprintf(fout, "\",cmd=\"");
    for(i = 0, cmd = cmd->child; cmd; cmd = cmd->next, i++) {
      if(i > 0) fputc(' ', fout);
      fputs(cmd->valuestring, fout);
    }
    fprintf(fout, "\",cwd=\"%s\"", cwd->valuestring);
    if(ver) { fprintf(fout, ",version=\"%s\"", ver->valuestring); }
    fprintf(fout, ">\n");
  }

  // Print field definitions
  if(is_breakpoint)
    fprintf(fout, "##INFO=<ID=BRKPNT,Number=1,Type=String,Description=\"Breakpoint call\">\n");
  else
    fprintf(fout, "##INFO=<ID=BUBBLE,Number=1,Type=String,Description=\"Bubble call\">\n");

  fprintf(fout, "##INFO=<ID=K%zu,Number=0,Type=Flag,Description=\"Found at k=%zu\">\n", kmer_size, kmer_size);

  fprintf(fout, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n");
  fprintf(fout, "##FILTER=<ID=PASS,Description=\"All filters passed\">\n");

  // Print reference paths
  fprintf(fout, "##reference=%s", ref_paths[0]);
  for(i = 1; i < num_ref_paths; i++) printf(",%s", ref_paths[i]);
  fprintf(fout, "\n");

  // Print contigs lengths
  for(i = 0; i < chroms.len; i++) {
    fprintf(fout, "##contig=<ID=%s,length=%zu>\n",
            chroms.b[i].name.b, chroms.b[i].seq.end);
  }

  // Print VCF column header
  fputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", fout);

  if(is_breakpoint)
  {
    // Print a column for each sample
    cJSON *graph_json   = json_hdr_get(json,       "graph",   cJSON_Object, input_path);
    cJSON *colours_json = json_hdr_get(graph_json, "colours", cJSON_Array,  input_path);
    cJSON *colour_json  = colours_json->child;
    if(colour_json == NULL) die("Missing colours");
    for(; colour_json; colour_json = colour_json->next)
    {
      if(!json_hdr_colour_is_ref(colour_json)) {
        cJSON *sample_json = json_hdr_get(colour_json, "sample", cJSON_String, input_path);
        fputc('\t', fout);
        fputs(sample_json->valuestring, fout);
      }
    }
  }

  fputc('\n', fout);
}
Exemple #7
0
int ctx_calls2vcf(int argc, char **argv)
{
  parse_cmdline_args(argc, argv);
  size_t i;

  // These functions call die() on error
  gzFile gzin = futil_gzopen(input_path, "r");

  nw_aligner_setup();

  // Read file header
  cJSON *json = read_input_header(gzin);

  // Get format (bubble or breakpoint file)
  cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, input_path);
  if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) input_bubble_format = false;
  else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) input_bubble_format = true;
  else die("Unknown format: '%s'", json_fmt->valuestring);

  status("Reading %s in %s format", futil_inpath_str(input_path),
         input_bubble_format ? "bubble" : "breakpoint");

  if(input_bubble_format && sam_path == NULL)
    cmd_print_usage("Require -F <flanks.sam> with bubble file");

  // Open flank file if it exists
  if(sam_path) flanks_sam_open();

  // Open output file
  FILE *fout = futil_fopen_create(out_path, "w");

  // Load reference genome
  read_buf_alloc(&chroms, 1024);
  genome = kh_init(ChromHash);
  seq_reader_load_ref_genome(ref_paths, num_ref_paths, &chroms, genome);

  // convert to upper case
  char *s;
  for(i = 0; i < chroms.len; i++)
    for(s = chroms.b[i].seq.b; *s; s++) *s = toupper(*s);

  if(!input_bubble_format) brkpnt_check_refs_match(json, input_path);

  // Output VCF has 0 samples if bubbles file, otherwise has N where N is
  // number of samples/colours in the breakpoint graph
  size_t num_graph_samples = json_hdr_get_ncols(json, input_path);
  size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, input_path);

  num_samples = 0;
  if(!input_bubble_format) {
    // If last colour has "is_ref", drop number of samples by one
    num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1
                                                       : num_graph_samples;
  }

  print_vcf_header(json, !input_bubble_format, fout);
  status("Reading %s call file with %zu samples",
         input_bubble_format ? "Bubble" : "Breakpoint", num_graph_samples);
  status("Writing a VCF with %zu samples", num_samples);
  parse_entries(gzin, fout);

  // Print stats
  char num_entries_read_str[50];
  char num_vars_printed_str[50];
  ulong_to_str(num_entries_read, num_entries_read_str);
  ulong_to_str(num_vars_printed, num_vars_printed_str);

  status("Read %s entries, printed %s vcf entries to: %s",
         num_entries_read_str, num_vars_printed_str, futil_outpath_str(out_path));

  if(input_bubble_format) {
    char msg[200];
    // Bubble caller specific
    print_stat(num_flank5p_unmapped,    num_entries_read, "flank 5p unmapped");
    sprintf(msg, "flank 5p low mapq (<%zu)", min_mapq);
    print_stat(num_flank5p_lowqual,     num_entries_read, msg);
    print_stat(num_flank3p_not_found,   num_entries_read, "flank 3p not found");
    print_stat(num_flank3p_multihits,   num_entries_read, "flank 3p multiple hits");
    print_stat(num_flank3p_approx_match,num_entries_read, "flank 3p approx match used");
    print_stat(num_flank3p_exact_match, num_entries_read, "flank 3p exact match");
  } else {
    // Breakpoint caller specific
    print_stat(num_flanks_not_uniquely_mapped, num_entries_read, "flank pairs contain one flank not mapped uniquely");
    print_stat(num_flanks_diff_chroms,         num_entries_read, "flank pairs map to diff chroms");
    print_stat(num_flanks_diff_strands,        num_entries_read, "flank pairs map to diff strands");
  }
  print_stat(num_flanks_too_far_apart,       num_entries_read, "flank pairs too far apart");
  print_stat(num_flanks_overlap_too_large,   num_entries_read, "flank pairs overlap too much");
  print_stat(num_entries_well_mapped,        num_entries_read, "flank pairs map well");

  status("Aligned %zu allele pairs and %zu flanks", num_nw_allele, num_nw_flank);

  // Finished - clean up
  cJSON_Delete(json);
  gzclose(gzin);
  fclose(fout);

  for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]);
  read_buf_dealloc(&chroms);
  kh_destroy_ChromHash(genome);
  nw_aligner_destroy();

  if(sam_path) flanks_sam_close();

  // hide unused method warnings
  (void)kh_del_ChromHash;
  (void)kh_put_ChromHash;
  (void)kh_get_ChromHash;
  (void)kh_clear_ChromHash;
  (void)kh_destroy_ChromHash;
  (void)kh_init_ChromHash;

  return EXIT_SUCCESS;
}
Exemple #8
0
int ctx_links(int argc, char **argv)
{
  size_t limit = 0;
  const char *link_out_path = NULL, *csv_out_path = NULL, *plot_out_path = NULL;
  const char *thresh_path = NULL, *hist_path = NULL;

  size_t hist_distsize = 0, hist_covgsize = 0;
  size_t cutoff = 0;
  bool clean = false;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!link_out_path, cmd); link_out_path = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'l': cmd_check(!csv_out_path, cmd); csv_out_path = optarg; break;
      case 'c': cmd_check(!cutoff, cmd); cutoff = cmd_size(cmd, optarg); clean = true; break;
      case 'L': cmd_check(!limit, cmd); limit = cmd_size(cmd, optarg); break;
      case 'P': cmd_check(!plot_out_path, cmd); plot_out_path = optarg; break;
      case 'T': cmd_check(!thresh_path, cmd); thresh_path = optarg; break;
      case 'H': cmd_check(!hist_path, cmd); hist_path = optarg; break;
      case 'C': cmd_check(!hist_covgsize, cmd); hist_covgsize = cmd_size(cmd, optarg); break;
      case 'D': cmd_check(!hist_distsize, cmd); hist_distsize = cmd_size(cmd, optarg); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" links -h` for help. Bad option: %s", argv[optind-1]);
      default: ctx_assert2(0, "shouldn't reach here: %c", c);
    }
  }

  if(hist_distsize && !hist_path) cmd_print_usage("--max-dist without --covg-hist");
  if(hist_covgsize && !hist_path) cmd_print_usage("--max-covg without --covg-hist");

  // Defaults
  if(!hist_distsize) hist_distsize = DEFAULT_MAX_DIST;
  if(!hist_covgsize) hist_covgsize = DEFAULT_MAX_COVG;

  if(optind + 1 != argc) cmd_print_usage("Wrong number of arguments");
  const char *ctp_path = argv[optind];

  bool list = (csv_out_path != NULL);
  bool plot = (plot_out_path != NULL);
  bool save = (link_out_path != NULL);
  bool hist_covg = (thresh_path != NULL || hist_path != NULL);

  size_t plot_kmer_idx = (limit == 0 ? 0 : limit - 1);

  if(clean && !save)
    cmd_print_usage("Need to give --out <out.ctp.gz> with --clean");

  if(!save && !list && !plot && !hist_covg)
    cmd_print_usage("Please specify one of --plot, --list or --clean");

  if(link_out_path && hist_covg && strcmp(link_out_path,"-") == 0)
    cmd_print_usage("Outputing both cleaning threshold (-T) and links (-o) to STDOUT!");

  // Open input file
  FILE *list_fh = NULL, *plot_fh = NULL, *link_tmp_fh = NULL;
  FILE *thresh_fh = NULL, *hist_fh = NULL;
  gzFile link_gz = NULL;

  // Check file don't exist or that we can overwrite
  // Will ignore if path is null
  bool err = false;
  err |= futil_check_outfile(csv_out_path);
  err |= futil_check_outfile(plot_out_path);
  err |= futil_check_outfile(link_out_path);
  err |= futil_check_outfile(thresh_path);
  err |= futil_check_outfile(hist_path);
  if(err) die("Use -f,--force to overwrite files");

  StrBuf link_tmp_path;
  strbuf_alloc(&link_tmp_path, 1024);

  GPathReader ctpin;
  memset(&ctpin, 0, sizeof(ctpin));
  gpath_reader_open(&ctpin, ctp_path);

  size_t ncols = file_filter_into_ncols(&ctpin.fltr);
  size_t kmer_size = gpath_reader_get_kmer_size(&ctpin);
  cJSON *newhdr = cJSON_Duplicate(ctpin.json, 1);

  if(ncols != 1) die("Can only clean a single colour at a time. Sorry.");

  uint64_t (*hists)[hist_covgsize] = NULL;

  if(hist_covg) {
    hists = ctx_calloc(hist_distsize, sizeof(hists[0]));
  }

  if(hist_path && (hist_fh = futil_fopen_create(hist_path, "w")) == NULL)
      die("Cannot open file: %s", hist_path);

  if(thresh_path && (thresh_fh = futil_fopen_create(thresh_path, "w")) == NULL)
      die("Cannot open file: %s", thresh_path);

  if(limit)
    status("Limiting to the first %zu kmers", limit);

  if(clean)
  {
    timestamp();
    message(" Cleaning coverage below %zu", cutoff);
    message("\n");
  }

  if(save)
  {
    // Check we can find the fields we need
    cJSON *links_json  = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path);
    cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path);
    cJSON *nlinks_json = json_hdr_get(links_json, "num_paths",            cJSON_Number, link_out_path);
    cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes",           cJSON_Number, link_out_path);
    if(!nkmers_json || !nlinks_json || !nbytes_json)
      die("Cannot find required header entries");

    // Create a random temporary file
    link_tmp_fh = create_tmp_file(&link_tmp_path, link_out_path);

    status("Saving output to: %s", link_out_path);
    status("Temporary output: %s", link_tmp_path.b);

    // Open output file
    if((link_gz = futil_gzopen_create(link_out_path, "w")) == NULL)
      die("Cannot open output link file: %s", link_out_path);

    // Need to open output file first so we can get absolute path
    // Update the header to include this command
    json_hdr_add_curr_cmd(newhdr, link_out_path);
  }

  if(list)
  {
    status("Listing to %s", csv_out_path);
    if((list_fh = futil_fopen_create(csv_out_path, "w")) == NULL)
      die("Cannot open output CSV file %s", csv_out_path);

    // Print csv header
    fprintf(list_fh, "SeqLen,Covg\n");
  }

  if(plot)
  {
    status("Plotting kmer %zu to %s", plot_kmer_idx, plot_out_path);
    if((plot_fh = futil_fopen_create(plot_out_path, "w")) == NULL)
      die("Cannot open output .dot file %s", plot_out_path);
  }

  SizeBuffer countbuf, jposbuf;
  size_buf_alloc(&countbuf, 16);
  size_buf_alloc(&jposbuf, 1024);

  StrBuf kmerbuf, juncsbuf, seqbuf, outbuf;
  strbuf_alloc(&kmerbuf, 1024);
  strbuf_alloc(&juncsbuf, 1024);
  strbuf_alloc(&seqbuf, 1024);
  strbuf_alloc(&outbuf, 1024);

  bool link_fw;
  size_t njuncs;
  size_t knum, nlinks, num_links_exp = 0;

  LinkTree ltree;
  ltree_alloc(&ltree, kmer_size);

  LinkTreeStats tree_stats;
  memset(&tree_stats, 0, sizeof(tree_stats));
  size_t init_num_links = 0, num_links = 0;

  for(knum = 0; !limit || knum < limit; knum++)
  {
    ltree_reset(&ltree);
    if(!gpath_reader_read_kmer(&ctpin, &kmerbuf, &num_links_exp)) break;
    ctx_assert2(kmerbuf.end == kmer_size, "Kmer incorrect length %zu != %zu",
                kmerbuf.end, kmer_size);
    // status("kmer: %s", kmerbuf.b);

    for(nlinks = 0;
        gpath_reader_read_link(&ctpin, &link_fw, &njuncs,
                               &countbuf, &juncsbuf,
                               &seqbuf, &jposbuf);
        nlinks++)
    {
      ltree_add(&ltree, link_fw, countbuf.b[0], jposbuf.b,
                juncsbuf.b, seqbuf.b);
    }

    if(nlinks != num_links_exp)
      warn("Links count mismatch %zu != %zu", nlinks, num_links_exp);

    if(hist_covg)
    {
      ltree_update_covg_hists(&ltree, (uint64_t*)hists,
                              hist_distsize, hist_covgsize);
    }
    if(clean)
    {
      ltree_clean(&ltree, cutoff);
    }

    // Accumulate statistics
    ltree_get_stats(&ltree, &tree_stats);
    num_links = tree_stats.num_links - init_num_links;
    init_num_links = tree_stats.num_links;

    if(list)
    {
      ltree_write_list(&ltree, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, list_fh) != outbuf.end)
        die("Cannot write CSV file to: %s", csv_out_path);
      strbuf_reset(&outbuf);
    }
    if(save && num_links)
    {
      ltree_write_ctp(&ltree, kmerbuf.b, num_links, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, link_tmp_fh) != outbuf.end)
        die("Cannot write ctp file to: %s", link_tmp_path.b);
      strbuf_reset(&outbuf);
    }
    if(plot && knum == plot_kmer_idx)
    {
      status("Plotting tree...");
      ltree_write_dot(&ltree, &outbuf);
      if(fwrite(outbuf.b, 1, outbuf.end, plot_fh) != outbuf.end)
        die("Cannot write plot DOT file to: %s", plot_out_path);
      strbuf_reset(&outbuf);
    }
  }

  gpath_reader_close(&ctpin);

  cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path);
  cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path);
  cJSON *nlinks_json = json_hdr_get(links_json, "num_paths",            cJSON_Number, link_out_path);
  cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes",           cJSON_Number, link_out_path);

  status("Number of kmers with links %li -> %zu", nkmers_json->valueint, tree_stats.num_trees_with_links);
  status("Number of links %li -> %zu", nlinks_json->valueint, tree_stats.num_links);
  status("Number of bytes %li -> %zu", nbytes_json->valueint, tree_stats.num_link_bytes);

  if(save)
  {
    // Update JSON
    nkmers_json->valuedouble = nkmers_json->valueint = tree_stats.num_trees_with_links;
    nlinks_json->valuedouble = nlinks_json->valueint = tree_stats.num_links;
    nbytes_json->valuedouble = nbytes_json->valueint = tree_stats.num_link_bytes;

    char *json_str = cJSON_Print(newhdr);
    if(gzputs(link_gz, json_str) != (int)strlen(json_str))
      die("Cannot write ctp file to: %s", link_out_path);
    free(json_str);

    gzputs(link_gz, "\n\n");
    gzputs(link_gz, ctp_explanation_comment);
    gzputs(link_gz, "\n");

    fseek(link_tmp_fh, 0, SEEK_SET);
    char *tmp = ctx_malloc(4*ONE_MEGABYTE);
    size_t s;
    while((s = fread(tmp, 1, 4*ONE_MEGABYTE, link_tmp_fh)) > 0) {
      if(gzwrite(link_gz, tmp, s) != (int)s)
        die("Cannot write to output: %s", link_out_path);
    }
    ctx_free(tmp);

    gzclose(link_gz);
    fclose(link_tmp_fh);
  }

  // Write histogram to file
  if(hist_fh)
  {
    size_t i, j;
    fprintf(hist_fh, "  ");
    for(j = 1; j < hist_covgsize; j++) fprintf(hist_fh, ",covg.%02zu", j);
    fprintf(hist_fh, "\n");
    for(i = 1; i < hist_distsize; i++) {
      fprintf(hist_fh, "dist.%02zu", i);
      for(j = 1; j < hist_covgsize; j++) {
        fprintf(hist_fh, ",%"PRIu64, hists[i][j]);
      }
      fprintf(hist_fh, "\n");
    }
  }

  if(thresh_fh)
  {
    // Use median of first five cutoffs
    print_suggest_cutoff(6, hist_covgsize, hists, thresh_fh);
  }

  if(hist_fh && hist_fh != stdout) fclose(hist_fh);

  if(list)
  {
    fclose(list_fh);
  }

  if(plot)
  {
    fclose(plot_fh);
  }

  ctx_free(hists);
  cJSON_Delete(newhdr);
  strbuf_dealloc(&link_tmp_path);
  ltree_dealloc(&ltree);
  size_buf_dealloc(&countbuf);
  size_buf_dealloc(&jposbuf);
  strbuf_dealloc(&kmerbuf);
  strbuf_dealloc(&juncsbuf);
  strbuf_dealloc(&seqbuf);
  strbuf_dealloc(&outbuf);

  return EXIT_SUCCESS;
}