예제 #1
0
/**
 * Read every entry from a single sequence file and pass each one to a
 * callback as a single-ended read.
 *
 * @param sf               open sequence file to read from
 * @param ascii_fq_offset  quality score offset; if 0, guess_fastq_format() is
 *                         asked for the format and, when it returns something
 *                         other than -1, the min/max/offset values are taken
 *                         from the FASTQ_MIN / FASTQ_MAX / FASTQ_OFFSET tables
 * @param r1               read buffer that each entry is loaded into
 * @param read_func        callback invoked once per read; always called with
 *                         a NULL second read and a second offset of 0
 * @param reader_ptr       opaque pointer forwarded to read_func unchanged
 */
void seq_parse_se_sf(seq_file_t *sf, uint8_t ascii_fq_offset,
                     read_t *r1,
                     void (*read_func)(read_t *r1, read_t *r2,
                                       uint8_t qoffset1, uint8_t qoffset2,
                                       void *ptr),
                     void *reader_ptr)
{
  status("[seq] Parsing sequence file %s", futil_inpath_str(sf->path));

  // Start from the caller's offset; only guess when none was given
  uint8_t qual_offset = ascii_fq_offset;
  uint8_t qual_lo = ascii_fq_offset;
  uint8_t qual_hi = 126;

  if(ascii_fq_offset == 0)
  {
    int fmt = guess_fastq_format(sf);
    if(fmt != -1) {
      qual_lo = (uint8_t)FASTQ_MIN[fmt];
      qual_hi = (uint8_t)FASTQ_MAX[fmt];
      qual_offset = (uint8_t)FASTQ_OFFSET[fmt];
    }
  }

  // `warned` carries the flags returned by check_new_read() from one read to
  // the next, so each kind of warning is only printed once for this file
  uint8_t warned = 0;
  size_t n_single = 0;
  size_t n_pairs = 0; // never incremented here: this reader emits no pairs
  int ret;

  for(;;)
  {
    ret = seq_read_primary(sf, r1);
    if(ret <= 0) break;

    warned = check_new_read(r1, qual_lo, qual_hi, sf->path, warned);
    read_func(r1, NULL, qual_offset, 0, reader_ptr);
    n_single++;
  }

  // Negative return from the reader signals an input error, zero is the end
  if(ret < 0) warn("Input error: %s\n", sf->path);

  char n_single_str[100], n_pairs_str[100];
  ulong_to_str(n_pairs, n_pairs_str);
  ulong_to_str(n_single, n_single_str);
  status("[seq] Loaded %s reads and %s reads pairs (file: %s)",
         n_single_str, n_pairs_str, futil_inpath_str(sf->path));
}
예제 #2
0
/**
 * Entry point of the calls2vcf command: convert a bubble or breakpoint call
 * file into VCF.
 *
 * Steps, in order: parse command line options, read the JSON header of the
 * call file, open the optional flank SAM/BAM (-F), open the VCF output, load
 * the reference genome, align and decompose every call, print statistics and
 * release all resources.
 *
 * @param argc  number of entries in argv
 * @param argv  command line; after the options it must contain the input
 *              call file followed by at least one reference path
 * @return EXIT_SUCCESS. Failures are reported through die() and
 *         cmd_print_usage(), which presumably do not return (confirm).
 */
int ctx_calls2vcf(int argc, char **argv)
{
  const char *in_path = NULL, *out_path = NULL, *out_type = NULL;
  // Filtering parameters (negative means "not set on the command line")
  int32_t min_mapq = -1, max_align_len = -1, max_allele_len = -1;
  // Alignment parameters
  int nwmatch = 1, nwmismatch = -2, nwgapopen = -4, nwgapextend = -1;
  // ref paths
  char const*const* ref_paths = NULL;
  size_t nref_paths = 0;
  // flank file
  const char *sam_path = NULL;

  //
  // Things we figure out by looking at the input
  //
  bool isbubble = false;
  // samples in VCF, (0 for bubble, does not include ref in breakpoint calls)
  size_t i, kmer_size, num_samples;

  //
  // Reference genome
  //
  // Hash map of chromosome name -> sequence
  ChromHash *genome;
  ReadBuffer chroms;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  // NOTE(review): -Q/-A/-L store the result of cmd_uint32() in an int32_t;
  // presumably the values are small enough to fit -- confirm.
  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'O': cmd_check(!out_type, cmd); out_type = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'F': cmd_check(!sam_path,cmd); sam_path = optarg; break;
      case 'Q': cmd_check(min_mapq < 0,cmd); min_mapq = cmd_uint32(cmd, optarg); break;
      case 'A': cmd_check(max_align_len  < 0,cmd); max_align_len  = cmd_uint32(cmd, optarg); break;
      case 'L': cmd_check(max_allele_len < 0,cmd); max_allele_len = cmd_uint32(cmd, optarg); break;
      case 'm': nwmatch = cmd_int32(cmd, optarg); break;
      case 'M': nwmismatch = cmd_int32(cmd, optarg); break;
      case 'g': nwgapopen = cmd_int32(cmd, optarg); break;
      case 'G': nwgapextend = cmd_int32(cmd, optarg); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]);
      default: ctx_assert2(0, "shouldn't reach here: %c", c);
    }
  }

  // Defaults for unset values ("-" is used as the output path when none given)
  if(out_path == NULL) out_path = "-";
  if(max_align_len  < 0) max_align_len  = DEFAULT_MAX_ALIGN;
  if(max_allele_len < 0) max_allele_len = DEFAULT_MAX_ALLELE;

  if(optind+2 > argc)
    cmd_print_usage("Require <in.txt.gz> and at least one reference");

  // First positional argument is the call file, the rest are references
  in_path = argv[optind++];
  ref_paths = (char const*const*)argv + optind;
  nref_paths = argc - optind;

  // These functions call die() on error
  gzFile gzin = futil_gzopen(in_path, "r");

  // Read call file header
  cJSON *json = json_hdr_load(gzin, in_path);

  // Check we can handle the kmer size
  kmer_size = json_hdr_get_kmer_size(json, in_path);
  db_graph_check_kmer_size(kmer_size, in_path);

  // Get format (bubble or breakpoint file)
  cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, in_path);
  if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) isbubble = false;
  else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) isbubble = true;
  else die("Unknown format: '%s'", json_fmt->valuestring);

  status("Reading %s in %s format", futil_inpath_str(in_path),
         isbubble ? "bubble" : "breakpoint");

  if(isbubble) {
    // bubble specific
    if(sam_path == NULL)
      cmd_print_usage("Require -F <flanks.sam> with bubble file");
    if(min_mapq < 0) min_mapq = DEFAULT_MIN_MAPQ;
  }
  else {
    // breakpoint specific
    if(min_mapq >= 0)
      cmd_print_usage("-Q,--min-mapq <Q> only valid with bubble calls");
  }

  // Open flank file if it exists
  htsFile *samfh = NULL;
  bam_hdr_t *bam_hdr = NULL;
  bam1_t *mflank = NULL;

  if(sam_path)
  {
    if((samfh = hts_open(sam_path, "r")) == NULL)
      die("Cannot open SAM/BAM %s", sam_path);

    // Load BAM header
    bam_hdr = sam_hdr_read(samfh);
    if(bam_hdr == NULL) die("Cannot load BAM header: %s", sam_path);
    mflank = bam_init1();
  }

  // Output VCF has 0 samples if bubbles file, otherwise has N where N is
  // number of samples/colours in the breakpoint graph
  size_t num_graph_samples = json_hdr_get_ncols(json, in_path);
  size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, in_path);

  num_samples = 0;
  if(!isbubble) {
    // If last colour has "is_ref", drop number of samples by one
    num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1
                                                       : num_graph_samples;
  }

  //
  // Open output file
  //
  // out_path was already defaulted to "-" above, so it is never NULL here
  int mode = vcf_misc_get_outtype(out_type, out_path);
  futil_create_output(out_path);
  htsFile *vcffh = hts_open(out_path, modes_htslib[mode]);
  // Fail here rather than passing a NULL handle to bcf_hdr_write() below
  if(vcffh == NULL)
    die("Cannot open VCF output: %s", futil_outpath_str(out_path));

  status("[calls2vcf] Reading %s call file with %zu samples",
         isbubble ? "Bubble" : "Breakpoint", num_graph_samples);
  status("[calls2vcf] %zu sample output to: %s format: %s",
         num_samples, futil_outpath_str(out_path), hsmodes_htslib[mode]);

  if(isbubble) status("[calls2vcf] min. MAPQ: %i", min_mapq);
  status("[calls2vcf] max alignment length: %i", max_align_len);
  status("[calls2vcf] max VCF allele length: %i", max_allele_len);
  status("[calls2vcf] alignment match:%i mismatch:%i gap open:%i extend:%i",
         nwmatch, nwmismatch, nwgapopen, nwgapextend);

  // Load reference genome
  read_buf_alloc(&chroms, 1024);
  genome = chrom_hash_init();
  chrom_hash_load(ref_paths, nref_paths, &chroms, genome);

  // convert to upper case
  // toupper() requires a value representable as unsigned char (or EOF), so
  // cast first to avoid undefined behaviour on negative plain-char values
  char *s;
  for(i = 0; i < chroms.len; i++)
    for(s = chroms.b[i].seq.b; *s; s++)
      *s = (char)toupper((unsigned char)*s);

  if(!isbubble) brkpnt_check_refs_match(json, genome, in_path);

  bcf_hdr_t *vcfhdr = make_vcf_hdr(json, in_path, !isbubble, kmer_size,
                                   ref_paths, nref_paths,
                                   chroms.b, chroms.len);

  if(bcf_hdr_write(vcffh, vcfhdr) != 0) die("Cannot write VCF header");

  AlignedCall *call = acall_init();
  CallDecomp *aligner = call_decomp_init(vcffh, vcfhdr);

  scoring_t *scoring = call_decomp_get_scoring(aligner);
  scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend,
               false, false, 0, 0, 0, 0);

  CallFileEntry centry;
  call_file_entry_alloc(&centry);

  // Tag appended to the info string of every call: ";K<kmer_size>"
  char kmer_str[50];
  snprintf(kmer_str, sizeof(kmer_str), ";K%zu", kmer_size);

  if(isbubble)
  {
    // Bubble calls
    DecompBubble *bubbles = decomp_bubble_init();

    // Set scoring for aligning 3' flank
    scoring = decomp_bubble_get_scoring(bubbles);
    scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend,
                 true, true, 0, 0, 0, 0);

    while(call_file_read(gzin, in_path, &centry)) {
      // Fetch the next SAM record, skipping secondary/supplementary entries
      do {
        if(sam_read1(samfh, bam_hdr, mflank) < 0)
          die("We've run out of SAM entries!");
      } while(mflank->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY));

      // Align call
      strbuf_reset(&call->info);
      decomp_bubble_call(bubbles, genome, kmer_size, min_mapq,
                         &centry, mflank, bam_hdr, call);
      strbuf_append_str(&call->info, kmer_str);
      acall_decompose(aligner, call, max_align_len, max_allele_len);
    }

    // print bubble stats
    DecompBubbleStats *bub_stats = ctx_calloc(1, sizeof(*bub_stats));
    decomp_bubble_cpy_stats(bub_stats, bubbles);
    print_bubble_stats(bub_stats);
    ctx_free(bub_stats);

    decomp_bubble_destroy(bubbles);
  }
  else
  {
    // Breakpoint calls
    DecompBreakpoint *breakpoints = decomp_brkpt_init();

    while(call_file_read(gzin, in_path, &centry)) {
      strbuf_reset(&call->info);
      decomp_brkpt_call(breakpoints, genome, num_samples, &centry, call);
      strbuf_append_str(&call->info, kmer_str);
      acall_decompose(aligner, call, max_align_len, max_allele_len);
    }

    // print breakpoint stats
    DecompBreakpointStats *brk_stats = ctx_calloc(1, sizeof(*brk_stats));
    decomp_brkpt_cpy_stats(brk_stats, breakpoints);
    print_breakpoint_stats(brk_stats);
    ctx_free(brk_stats);

    decomp_brkpt_destroy(breakpoints);
  }

  // Print stats
  DecomposeStats *astats = ctx_calloc(1, sizeof(*astats));
  call_decomp_cpy_stats(astats, aligner);
  print_acall_stats(astats);
  ctx_free(astats);

  call_file_entry_dealloc(&centry);
  call_decomp_destroy(aligner);
  acall_destroy(call);

  // Finished - clean up
  cJSON_Delete(json);
  gzclose(gzin);

  bcf_hdr_destroy(vcfhdr);
  hts_close(vcffh);

  for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]);
  read_buf_dealloc(&chroms);
  chrom_hash_destroy(genome);

  // SAM resources were only allocated when a flank file was given
  if(sam_path) {
    hts_close(samfh);
    bam_hdr_destroy(bam_hdr);
    bam_destroy1(mflank);
  }

  return EXIT_SUCCESS;
}
예제 #3
0
/**
 * Entry point of the calls2vcf command: convert a bubble or breakpoint call
 * file into VCF text output.
 *
 * This version works on names declared outside this function (input_path,
 * out_path, sam_path, ref_paths, num_ref_paths, chroms, genome, num_samples,
 * input_bubble_format, min_mapq and the num_* counters). They are presumably
 * file-scope variables set by parse_cmdline_args() and parse_entries() --
 * confirm against the rest of the file.
 *
 * @param argc  number of entries in argv, forwarded to parse_cmdline_args()
 * @param argv  command line, forwarded to parse_cmdline_args()
 * @return EXIT_SUCCESS. Failures are reported through die() and
 *         cmd_print_usage(), which presumably do not return (confirm).
 */
int ctx_calls2vcf(int argc, char **argv)
{
  parse_cmdline_args(argc, argv);
  size_t i;

  // These functions call die() on error
  gzFile gzin = futil_gzopen(input_path, "r");

  nw_aligner_setup();

  // Read file header
  cJSON *json = read_input_header(gzin);

  // Get format (bubble or breakpoint file)
  cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, input_path);
  if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) input_bubble_format = false;
  else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) input_bubble_format = true;
  else die("Unknown format: '%s'", json_fmt->valuestring);

  status("Reading %s in %s format", futil_inpath_str(input_path),
         input_bubble_format ? "bubble" : "breakpoint");

  if(input_bubble_format && sam_path == NULL)
    cmd_print_usage("Require -F <flanks.sam> with bubble file");

  // Open flank file if it exists
  if(sam_path) flanks_sam_open();

  // Open output file
  FILE *fout = futil_fopen_create(out_path, "w");

  // Load reference genome
  read_buf_alloc(&chroms, 1024);
  genome = kh_init(ChromHash);
  seq_reader_load_ref_genome(ref_paths, num_ref_paths, &chroms, genome);

  // convert to upper case
  // toupper() requires a value representable as unsigned char (or EOF), so
  // cast first to avoid undefined behaviour on negative plain-char values
  char *s;
  for(i = 0; i < chroms.len; i++)
    for(s = chroms.b[i].seq.b; *s; s++)
      *s = (char)toupper((unsigned char)*s);

  if(!input_bubble_format) brkpnt_check_refs_match(json, input_path);

  // Output VCF has 0 samples if bubbles file, otherwise has N where N is
  // number of samples/colours in the breakpoint graph
  size_t num_graph_samples = json_hdr_get_ncols(json, input_path);
  size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, input_path);

  num_samples = 0;
  if(!input_bubble_format) {
    // If last colour has "is_ref", drop number of samples by one
    num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1
                                                       : num_graph_samples;
  }

  print_vcf_header(json, !input_bubble_format, fout);
  status("Reading %s call file with %zu samples",
         input_bubble_format ? "Bubble" : "Breakpoint", num_graph_samples);
  status("Writing a VCF with %zu samples", num_samples);
  parse_entries(gzin, fout);

  // Print stats
  char num_entries_read_str[50];
  char num_vars_printed_str[50];
  ulong_to_str(num_entries_read, num_entries_read_str);
  ulong_to_str(num_vars_printed, num_vars_printed_str);

  status("Read %s entries, printed %s vcf entries to: %s",
         num_entries_read_str, num_vars_printed_str, futil_outpath_str(out_path));

  if(input_bubble_format) {
    char msg[200];
    // Bubble caller specific
    print_stat(num_flank5p_unmapped,    num_entries_read, "flank 5p unmapped");
    // assumes min_mapq is a size_t to match %zu -- TODO confirm
    snprintf(msg, sizeof(msg), "flank 5p low mapq (<%zu)", min_mapq);
    print_stat(num_flank5p_lowqual,     num_entries_read, msg);
    print_stat(num_flank3p_not_found,   num_entries_read, "flank 3p not found");
    print_stat(num_flank3p_multihits,   num_entries_read, "flank 3p multiple hits");
    print_stat(num_flank3p_approx_match,num_entries_read, "flank 3p approx match used");
    print_stat(num_flank3p_exact_match, num_entries_read, "flank 3p exact match");
  } else {
    // Breakpoint caller specific
    print_stat(num_flanks_not_uniquely_mapped, num_entries_read, "flank pairs contain one flank not mapped uniquely");
    print_stat(num_flanks_diff_chroms,         num_entries_read, "flank pairs map to diff chroms");
    print_stat(num_flanks_diff_strands,        num_entries_read, "flank pairs map to diff strands");
  }
  // Statistics reported for both input formats
  print_stat(num_flanks_too_far_apart,       num_entries_read, "flank pairs too far apart");
  print_stat(num_flanks_overlap_too_large,   num_entries_read, "flank pairs overlap too much");
  print_stat(num_entries_well_mapped,        num_entries_read, "flank pairs map well");

  status("Aligned %zu allele pairs and %zu flanks", num_nw_allele, num_nw_flank);

  // Finished - clean up
  cJSON_Delete(json);
  gzclose(gzin);
  fclose(fout);

  for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]);
  read_buf_dealloc(&chroms);
  kh_destroy_ChromHash(genome);
  nw_aligner_destroy();

  if(sam_path) flanks_sam_close();

  // hide unused method warnings
  (void)kh_del_ChromHash;
  (void)kh_put_ChromHash;
  (void)kh_get_ChromHash;
  (void)kh_clear_ChromHash;
  (void)kh_destroy_ChromHash;
  (void)kh_init_ChromHash;

  return EXIT_SUCCESS;
}
예제 #4
0
/**
 * Read two sequence files in lock step and pass each pair of entries to a
 * callback as a paired-end read.
 *
 * If sf2 is NULL the work is delegated to seq_parse_se_sf() on sf1 alone.
 *
 * @param sf1              first sequence file
 * @param sf2              second sequence file, or NULL for single-ended
 * @param ascii_fq_offset  quality score offset; if 0, guess_fastq_format() is
 *                         asked separately for each file and, when it returns
 *                         something other than -1, that file's min/max/offset
 *                         come from FASTQ_MIN / FASTQ_MAX / FASTQ_OFFSET
 * @param r1               read buffer filled from sf1
 * @param r2               read buffer filled from sf2
 * @param read_func        callback invoked once per pair
 * @param reader_ptr       opaque pointer forwarded to read_func unchanged
 */
void seq_parse_pe_sf(seq_file_t *sf1, seq_file_t *sf2, uint8_t ascii_fq_offset,
                     read_t *r1, read_t *r2,
                     void (*read_func)(read_t *_r1, read_t *_r2,
                                       uint8_t _qoffset1, uint8_t _qoffset2,
                                       void *_ptr),
                     void *reader_ptr)
{
  // No second file: treat the input as single-ended
  if(sf2 == NULL) {
    seq_parse_se_sf(sf1, ascii_fq_offset, r1, read_func, reader_ptr);
    return;
  }

  status("[seq] Parsing sequence files %s %s\n",
         futil_inpath_str(sf1->path), futil_inpath_str(sf2->path));

  // Per-file quality settings, starting from the caller's offset
  uint8_t offset_a = ascii_fq_offset, offset_b = ascii_fq_offset;
  uint8_t lo_a = ascii_fq_offset, lo_b = ascii_fq_offset;
  uint8_t hi_a = 126, hi_b = 126;

  // Guess offset if needed
  if(ascii_fq_offset == 0)
  {
    int fmt_a = guess_fastq_format(sf1);
    if(fmt_a != -1) {
      lo_a = (uint8_t)FASTQ_MIN[fmt_a];
      hi_a = (uint8_t)FASTQ_MAX[fmt_a];
      offset_a = (uint8_t)FASTQ_OFFSET[fmt_a];
    }

    int fmt_b = guess_fastq_format(sf2);
    if(fmt_b != -1) {
      lo_b = (uint8_t)FASTQ_MIN[fmt_b];
      hi_b = (uint8_t)FASTQ_MAX[fmt_b];
      offset_b = (uint8_t)FASTQ_OFFSET[fmt_b];
    }
  }

  // `warned` carries the flags returned by check_new_read() across reads;
  // a single set of flags is shared by both files
  uint8_t warned = 0;
  size_t n_pairs = 0;

  for(;;)
  {
    // Always read from both files, even if the first one has finished
    int got_a = seq_read_primary(sf1, r1);
    int got_b = seq_read_primary(sf2, r2);

    if(got_a < 0) warn("input error: %s", sf1->path);
    if(got_b < 0) warn("input error: %s", sf2->path);

    // Exactly one of the two reads returned zero
    if((got_a == 0) != (got_b == 0)) {
      warn("Different number of reads in pe files [%s; %s]\n",
           sf1->path, sf2->path);
    }

    // Stop unless both files produced a read
    if(got_a <= 0 || got_b <= 0) break;

    // PE
    // We don't care about read orientation at this point
    warned = check_new_read(r1, lo_a, hi_a, sf1->path, warned);
    warned = check_new_read(r2, lo_b, hi_b, sf2->path, warned);
    read_func(r1, r2, offset_a, offset_b, reader_ptr);
    n_pairs++;
  }

  char n_pairs_str[100];
  ulong_to_str(n_pairs, n_pairs_str);
  status("[seq] Loaded %s read pairs (files: %s, %s)", n_pairs_str,
         futil_inpath_str(sf1->path), futil_inpath_str(sf2->path));
}
예제 #5
0
/**
 * Read a file that may mix single-ended and paired-end reads, pairing up
 * consecutive entries whose names compare equal.
 *
 * A read is held back until the next one has been loaded. If
 * seq_read_names_cmp() returns 0 for the two names they are passed to the
 * callback as a pair; otherwise the held read is passed on its own and the
 * newer read becomes the held one.
 *
 * @param sf               sequence file to read from
 * @param ascii_fq_offset  quality score offset; if 0, guess_fastq_format() is
 *                         asked for the format and, when it returns something
 *                         other than -1, the min/max/offset values are taken
 *                         from the FASTQ_MIN / FASTQ_MAX / FASTQ_OFFSET tables
 * @param r1               first read buffer
 * @param r2               second read buffer
 * @param read_func        callback; called with a NULL second read and a
 *                         second offset of 0 for unpaired reads
 * @param reader_ptr       opaque pointer forwarded to read_func unchanged
 */
void seq_parse_interleaved_sf(seq_file_t *sf, uint8_t ascii_fq_offset,
                              read_t *r1, read_t *r2,
                              void (*read_func)(read_t *_r1, read_t *_r2,
                                                uint8_t _qoffset1,
                                                uint8_t _qoffset2,
                                                void *_ptr),
                              void *reader_ptr)
{
  status("[seq] Reading a (possibly) interleaved file (expect both S.E. & P.E. reads)");

  // Start from the caller's offset; only guess when none was given
  uint8_t qual_offset = ascii_fq_offset;
  uint8_t qual_lo = ascii_fq_offset;
  uint8_t qual_hi = 126;

  if(ascii_fq_offset == 0)
  {
    int fmt = guess_fastq_format(sf);
    if(fmt != -1) {
      qual_lo = (uint8_t)FASTQ_MIN[fmt];
      qual_hi = (uint8_t)FASTQ_MAX[fmt];
      qual_offset = (uint8_t)FASTQ_OFFSET[fmt];
    }
  }

  // slot[0] is the held read (if any), slot[1] receives the following read
  read_t *slot[2] = {r1, r2};
  int held = 0; // 1 while slot[0] holds a read waiting for a possible mate
  int ret;
  uint8_t warned = 0;
  size_t n_single = 0, n_pairs = 0;

  while((ret = seq_read_primary(sf, slot[held])) > 0)
  {
    warned = check_new_read(slot[held], qual_lo, qual_hi, sf->path, warned);

    // First read of a potential pair: hold it and fetch the next one
    if(!held) {
      held = 1;
      continue;
    }

    if(seq_read_names_cmp(slot[0]->name.b, slot[1]->name.b) != 0)
    {
      // Names differ: emit the held read alone, then hold the newer read
      read_func(slot[0], NULL, qual_offset, 0, reader_ptr);
      n_single++;

      read_t *tmp = slot[0];
      slot[0] = slot[1];
      slot[1] = tmp;
      // `held` stays 1: the next read is loaded into slot[1]
      continue;
    }

    // Either read may be the first in the pair if from SAM/BAM
    int first = (slot[1]->from_sam &&
                 (seq_read_bam(slot[1])->core.flag & BAM_FREAD1));
    read_func(slot[first], slot[!first], qual_offset, qual_offset, reader_ptr);
    n_pairs++;
    held = 0;
  }

  // Process last read
  if(held == 1) {
    read_func(slot[0], NULL, qual_offset, 0, reader_ptr);
    n_single++;
  }

  // Negative return from the reader signals an input error, zero is the end
  if(ret < 0) warn("Input error: %s\n", sf->path);

  char n_single_str[100], n_pairs_str[100];
  ulong_to_str(n_pairs, n_pairs_str);
  ulong_to_str(n_single, n_single_str);
  status("[seq] Loaded %s reads and %s reads pairs (file: %s)",
         n_single_str, n_pairs_str, futil_inpath_str(sf->path));
}