Exemple #1
0
// Build one VCF text record for a single variant in dc->sbuf, parse it with
// vcf_parse() and write it out with bcf_write().
//
// Every call increments dc->stats.nvars. If the alt allele has more than
// `max_allele_len` non-gap characters the variant is skipped and
// dc->stats.nallele_too_long is incremented instead of nvars_printed.
//
// @param vcf_pos   is 0-based (written to the POS column as vcf_pos+1)
// @param prev_base is -1 if SNP otherwise previous base
// @param next_base is -1 unless indel at position 0
// @param ref,alt   aligned alleles, `len` characters each; '-' marks a gap
// @param gts       one entry per sample: non-zero => '1', zero => '.'
// @param nsamples  number of entries in `gts`
// @param call      supplies the chromosome name and the INFO string
static void print_vcf_entry(size_t vcf_pos, int8_t prev_base, int8_t next_base,
                            const char *ref, const char *alt, size_t len,
                            const uint8_t *gts, size_t nsamples,
                            CallDecomp *dc, const AlignedCall *call,
                            size_t max_allele_len)
{
  dc->stats.nvars++;

  StrBuf *line = &dc->sbuf;
  strbuf_reset(line);

  // Gap characters do not count towards the allele length limit
  size_t k, n_alt_bases = 0;
  for(k = 0; k < len; k++) {
    if(alt[k] != '-') n_alt_bases++;
  }

  if(n_alt_bases > max_allele_len) {
    dc->stats.nallele_too_long++;
    return;
  }

  // Columns: CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
  strbuf_append_str(line, call->chrom->name.b);
  strbuf_append_char(line, '\t');
  strbuf_append_ulong(line, vcf_pos+1);
  strbuf_append_str(line, "\t.\t");
  print_vcf_allele(ref, len, prev_base, next_base, line);
  strbuf_append_char(line, '\t');
  print_vcf_allele(alt, len, prev_base, next_base, line);
  strbuf_append_str(line, "\t.\tPASS\t");

  // Missing INFO is written as '.'
  const char *info = call->info.b;
  if(!info) info = ".";
  strbuf_append_str(line, info);
  strbuf_append_str(line, "\tGT");

  // One genotype column per sample
  for(k = 0; k < nsamples; k++)
  {
    strbuf_append_char(line, '\t');
    if(gts[k])
      strbuf_append_char(line, '1');
    else
      strbuf_append_char(line, '.');
  }

  strbuf_append_char(line, '\n');

  // Lend our buffer to a kstring_t so the parser can work on it directly
  kstring_t ks = {.l = line->end, .m = line->size, .s = line->b};

  if(vcf_parse(&ks, dc->vcfhdr, dc->v) != 0)
    die("Cannot construct VCF entry: %s", line->b);

  if(bcf_write(dc->vcffh, dc->vcfhdr, dc->v) != 0)
    die("Cannot write VCF entry [nsamples: %zu vs %zu]", nsamples, (size_t)bcf_hdr_nsamples(dc->vcfhdr));

  // Take the memory back; presumably guards against the parser having
  // reallocated the kstring
  line->b = ks.s;
  line->size = ks.m;

  dc->stats.nvars_printed++;
}

// Find where two aligned alleles first differ.
// `ref` and `alt` are NUL-terminated aligned alleles over 'ACGT-' and are
// expected to be the same length.
// Returns the 0-based index of the first position at which they differ,
// or -1 if the end of `ref` is reached without finding a difference.
static int align_get_start(const char *ref, const char *alt)
{
  size_t pos;

  for(pos = 0; ref[pos] != '\0'; pos++) {
    if(ref[pos] != alt[pos]) {
      return (int)pos;
    }
  }

  return -1;
}

// Find where two aligned alleles first agree.
// `ref` and `alt` are NUL-terminated aligned alleles over 'ACGT-' and are
// expected to be the same length.
// Returns the 0-based index of the first position at which both strings hold
// the same character, or the length of `ref` if there is no such position.
static int align_get_end(const char *ref, const char *alt)
{
  int pos;

  for(pos = 0; ref[pos] != '\0'; pos++) {
    if(ref[pos] == alt[pos]) break;
  }

  return pos;
}
Exemple #2
0
/**
 * Append every path stored for one kmer to a string buffer, as text.
 * The paths are loaded into `subset` and sorted before being written.
 * Nothing is appended if the kmer has no paths. `sbuf` is not reset here.
 *
 * Output is a header line "<kmer> <npaths>" followed by one line per path:
 *   <F|R> <num_juncs> <nseen>[,<nseen>...] <junctions>[ seq=...][ juncpos=...]
 *
 * @param hkey     All paths associated with hkey are written to the buffer
 * @param sbuf     string buffer the paths are appended to
 * @param subset   scratch path subset, reset and refilled on every call
 * @param nbuf     scratch node buffer; if not NULL, seq=... is added to the
 *                 output. Must be NULL unless the graph has exactly one colour
 * @param jposbuf  scratch buffer; if not NULL (and nbuf is not NULL),
 *                 juncpos=... is added to the output. Must be NULL unless the
 *                 graph has exactly one colour
 * @param db_graph graph supplying the path store, the kmer table and kmer size
 */
void gpath_save_sbuf(hkey_t hkey, StrBuf *sbuf, GPathSubset *subset,
                     dBNodeBuffer *nbuf, SizeBuffer *jposbuf,
                     const dBGraph *db_graph)
{
  ctx_assert(db_graph->num_of_cols == 1 || nbuf == NULL);
  ctx_assert(db_graph->num_of_cols == 1 || jposbuf == NULL);

  const GPathStore *store = &db_graph->gpstore;
  const GPathSet *pathset = &store->gpset;
  const size_t ncols = store->gpset.ncols;
  GPath *head = gpath_store_fetch(store, hkey);
  size_t p, c, jn;

  // Collect this kmer's paths and put them in order
  gpath_subset_reset(subset);
  gpath_subset_load_llist(subset, head);
  gpath_subset_sort(subset);

  if(subset->list.len == 0) return;

  // Header line: "<kmer> <npaths>"
  BinaryKmer bkmer = db_graph->ht.table[hkey];
  char kmer_str[MAX_KMER_SIZE+1];
  binary_kmer_to_str(bkmer, db_graph->kmer_size, kmer_str);

  strbuf_append_strn(sbuf, kmer_str, db_graph->kmer_size);
  strbuf_append_char(sbuf, ' ');
  strbuf_append_ulong(sbuf, subset->list.len);
  strbuf_append_char(sbuf, '\n');

  // Orientation -> single character code
  char orient_code[2] = {0};
  orient_code[FORWARD] = 'F';
  orient_code[REVERSE] = 'R';

  for(p = 0; p < subset->list.len; p++)
  {
    const GPath *path = subset->list.b[p];
    const uint8_t *nseen = gpath_set_get_nseen(pathset, path);

    // "<F|R> <num_juncs> <nseen0>[,<nseen1>...]"
    strbuf_append_char(sbuf, orient_code[path->orient]);
    strbuf_append_char(sbuf, ' ');
    strbuf_append_ulong(sbuf, path->num_juncs);
    strbuf_append_char(sbuf, ' ');
    strbuf_append_ulong(sbuf, nseen[0]);

    for(c = 1; c < ncols; c++)
    {
      strbuf_append_char(sbuf, ',');
      strbuf_append_ulong(sbuf, nseen[c]);
    }

    // Junction choices are written straight into the buffer's spare space
    strbuf_append_char(sbuf, ' ');
    strbuf_ensure_capacity(sbuf, sbuf->end + path->num_juncs + 2);
    binary_seq_to_str(path->seq, path->num_juncs, sbuf->b+sbuf->end);
    sbuf->end += path->num_juncs;

    if(nbuf != NULL)
    {
      // Need a colour to walk in: take the first one this path belongs to
      c = 0;
      while(c < ncols && !gpath_has_colour(path, ncols, c)) c++;
      if(c == ncols) die("path is not in any colours");

      // Follow the path through the graph, collecting the nodes visited
      // and (optionally) the indices of the junctions within nbuf
      dBNode start = {.key = hkey, .orient = path->orient};
      db_node_buf_reset(nbuf);
      if(jposbuf != NULL) size_buf_reset(jposbuf);
      gpath_fetch(start, path, nbuf, jposbuf, c, db_graph);

      strbuf_append_str(sbuf, " seq=");
      strbuf_ensure_capacity(sbuf, sbuf->end + db_graph->kmer_size + nbuf->len);
      sbuf->end += db_nodes_to_str(nbuf->b, nbuf->len, db_graph,
                                   sbuf->b+sbuf->end);

      if(jposbuf != NULL)
      {
        // NOTE(review): the first entry is read unconditionally; this assumes
        // gpath_fetch() always records at least one junction -- confirm
        strbuf_append_str(sbuf, " juncpos=");
        strbuf_append_ulong(sbuf, jposbuf->b[0]);

        for(jn = 1; jn < jposbuf->len; jn++)
        {
          strbuf_append_char(sbuf, ',');
          strbuf_append_ulong(sbuf, jposbuf->b[jn]);
        }
      }
    }

    strbuf_append_char(sbuf, '\n');
  }
}

// Append the paths of one node to `sbuf`, then flush `sbuf` via
// _gpath_save_flush() once it holds more than DEFAULT_IO_BUFSIZE bytes.
//
// @subset is scratch space, reused on every call
// @sbuf   is the output buffer, reused on every call; it is only flushed
//         here when over the threshold, so content may remain in it on return
// @nbuf, @jposbuf are passed through to gpath_save_sbuf() unchanged
// Always returns 0, which tells the iterating caller to carry on.
static inline int _gpath_gzsave_node(hkey_t hkey,
                                     StrBuf *sbuf, GPathSubset *subset,
                                     dBNodeBuffer *nbuf, SizeBuffer *jposbuf,
                                     gzFile gzout, pthread_mutex_t *outlock,
                                     const dBGraph *db_graph)
{
  gpath_save_sbuf(hkey, sbuf, subset, nbuf, jposbuf, db_graph);

  if(sbuf->end > DEFAULT_IO_BUFSIZE) {
    _gpath_save_flush(gzout, sbuf, outlock);
  }

  return 0;
}
Exemple #3
0
// Potential bubble - filter ref and duplicate alleles
static void print_bubble(BubbleCaller *caller,
                         GCacheStep **steps, size_t num_paths)
{
  const BubbleCallingPrefs prefs = caller->prefs;
  const dBGraph *db_graph = caller->db_graph;
  GCacheSnode *snode;
  size_t i;

  dBNodeBuffer *flank5p = &caller->flank5p;
  if(flank5p->len == 0)
  {
    // Haven't fetched 5p flank yet
    // flank5p[0] already contains the first node
    flank5p->len = 1;
    supernode_extend(flank5p, prefs.max_flank_len, db_graph);
    db_nodes_reverse_complement(flank5p->b, flank5p->len);
  }

  //
  // Print Bubble
  //

  // write to string buffer then flush to gzFile
  StrBuf *sbuf = &caller->output_buf;
  strbuf_reset(sbuf);

  // Temporary node buffer to use
  dBNodeBuffer *pathbuf = &caller->pathbuf;
  db_node_buf_reset(pathbuf);

  // Get bubble number (threadsafe num_bubbles_ptr++)
  size_t id = __sync_fetch_and_add((volatile size_t*)caller->num_bubbles_ptr, 1);

  // This can be set to anything without a '.' in it
  const char prefix[] = "call";

  // 5p flank
  // strbuf_sprintf(sbuf, ">bubble.%s%zu.5pflank kmers=%zu\n", prefix, id, flank5p->len);
  strbuf_append_str(sbuf, ">bubble.");
  strbuf_append_str(sbuf, prefix);
  strbuf_append_ulong(sbuf, id);
  strbuf_append_str(sbuf, ".5pflank kmers=");
  strbuf_append_ulong(sbuf, flank5p->len);
  strbuf_append_char(sbuf, '\n');
  branch_to_str(flank5p->b, flank5p->len, true, sbuf, db_graph);

  // 3p flank
  db_node_buf_reset(pathbuf);
  snode = graph_cache_snode(&caller->cache, steps[0]->supernode);
  graph_cache_snode_fetch_nodes(&caller->cache, snode, steps[0]->orient, pathbuf);

  // strbuf_sprintf(sbuf, ">bubble.%s%zu.3pflank kmers=%zu\n", prefix, id, pathbuf->len);
  strbuf_append_str(sbuf, ">bubble.");
  strbuf_append_str(sbuf, prefix);
  strbuf_append_ulong(sbuf, id);
  strbuf_append_str(sbuf, ".3pflank kmers=");
  strbuf_append_ulong(sbuf, pathbuf->len);
  strbuf_append_char(sbuf, '\n');
  branch_to_str(pathbuf->b, pathbuf->len, false, sbuf, db_graph);

  // Print alleles
  for(i = 0; i < num_paths; i++)
  {
    db_node_buf_reset(pathbuf);
    graph_cache_step_fetch_nodes(&caller->cache, steps[i], pathbuf);

    // strbuf_sprintf(sbuf, ">bubble.%s%zu.branch.%zu kmers=%zu\n",
    //                prefix, id, i, pathbuf->len);
    strbuf_append_str(sbuf, ">bubble.");
    strbuf_append_str(sbuf, prefix);
    strbuf_append_ulong(sbuf, id);
    strbuf_append_str(sbuf, ".branch.");
    strbuf_append_ulong(sbuf, i);
    strbuf_append_str(sbuf, " kmers=");
    strbuf_append_ulong(sbuf, pathbuf->len);
    strbuf_append_char(sbuf, '\n');

    branch_to_str(pathbuf->b, pathbuf->len, false, sbuf, db_graph);
  }

  strbuf_append_char(sbuf, '\n');

  ctx_assert(strlen(sbuf->b) == sbuf->end);

  // lock, print, unlock
  pthread_mutex_lock(caller->out_lock);
  gzwrite(caller->gzout, sbuf->b, sbuf->end);
  pthread_mutex_unlock(caller->out_lock);
}