Ejemplo n.º 1
0
// Get bkey:orient string representation e.g. "AGAGTTTTATC:1".
//   :0 means forward, :1 means reverse
//   `str` must be at least kmer_size+3 chars long
// Returns length in bytes. Null terminates `str`.
size_t db_node_to_str(const dBGraph *db_graph, dBNode node, char *str)
{
  const size_t kmer_size = db_graph->kmer_size;
  BinaryKmer bkmer = db_node_get_bkmer(db_graph, node.key);
  binary_kmer_to_str(bkmer, kmer_size, str);
  str[kmer_size] = ':';
  str[kmer_size+1] = '0' + node.orient;
  str[kmer_size+2] = '\0';
  return kmer_size + 2;
}
Ejemplo n.º 2
0
static void print_failed(dBNode node, const dBNodeBuffer *nbuf,
                         const dBGraph *db_graph, bool is_AB, bool prime_AB)
{
  const size_t kmer_size = db_graph->kmer_size;
  char bkmerstr[MAX_KMER_SIZE+1];
  BinaryKmer bkmer = db_node_get_bkey(db_graph, node.key);
  binary_kmer_to_str(bkmer, kmer_size, bkmerstr);
  printf(">%s:%i %s %s\n", bkmerstr, node.orient,
         is_AB ? "A->B" : "B->C", prime_AB ? "prime_AB" : "walk_AB");
  db_nodes_print(nbuf->b, nbuf->len, db_graph, stdout);
  fputc('\n', stdout);
}
Ejemplo n.º 3
0
// Print:
// 0: AAACCCAAATGCAAACCCAAATGCAAACCCA:1 TGGGTTTGCATTTGGGTTTGCATTTGGGTTT
// 1: CAAACCCAAATGCAAACCCAAATGCAAACCC:1 GGGTTTGCATTTGGGTTTGCATTTGGGTTTG
// ...
void db_nodes_print_verbose(const dBNode *nodes, size_t num,
                            const dBGraph *db_graph, FILE *out)
{
  if(num == 0) return;

  const size_t kmer_size = db_graph->kmer_size;
  size_t i;
  BinaryKmer bkmer, bkey;
  char kmerstr[MAX_KMER_SIZE+1], keystr[MAX_KMER_SIZE+1];

  bkmer = db_node_get_bkmer(db_graph, nodes[0].key);
  bkey = db_node_oriented_bkmer(db_graph, nodes[0]);
  binary_kmer_to_str(bkmer, kmer_size, kmerstr);
  binary_kmer_to_str(bkey, kmer_size, keystr);
  fprintf(out, "%3zu: %s:%i %s\n", (size_t)0, kmerstr, (int)nodes[0].orient, keystr);

  for(i = 1; i < num; i++) {
    bkmer = db_node_get_bkmer(db_graph, nodes[i].key);
    bkey = db_node_oriented_bkmer(db_graph, nodes[i]);
    binary_kmer_to_str(bkmer, kmer_size, kmerstr);
    binary_kmer_to_str(bkey, kmer_size, keystr);
    fprintf(out, "%3zu: %s:%i %s\n", i, kmerstr, (int)nodes[i].orient, keystr);
  }
}
Ejemplo n.º 4
0
void db_nodes_gzprint(const dBNode *nodes, size_t num,
                      const dBGraph *db_graph, gzFile out)
{
  size_t i, kmer_size = db_graph->kmer_size;
  Nucleotide nuc;
  BinaryKmer bkmer;
  char tmp[MAX_KMER_SIZE+1];

  bkmer = db_node_oriented_bkmer(db_graph, nodes[0]);
  binary_kmer_to_str(bkmer, kmer_size, tmp);
  gzputs(out, tmp);

  for(i = 1; i < num; i++) {
    nuc = db_node_get_last_nuc(nodes[i], db_graph);
    gzputc(out, dna_nuc_to_char(nuc));
  }
}
Ejemplo n.º 5
0
void db_nodes_print(const dBNode *nodes, size_t num,
                    const dBGraph *db_graph, FILE *out)
{
  const size_t kmer_size = db_graph->kmer_size;
  size_t i;
  Nucleotide nuc;
  BinaryKmer bkmer;
  char tmp[MAX_KMER_SIZE+1];

  bkmer = db_node_oriented_bkmer(db_graph, nodes[0]);
  binary_kmer_to_str(bkmer, kmer_size, tmp);
  fputs(tmp, out);

  for(i = 1; i < num; i++) {
    nuc = db_node_get_last_nuc(nodes[i], db_graph);
    fputc(dna_nuc_to_char(nuc), out);
  }
}
Ejemplo n.º 6
0
// Returns number of bytes added
size_t db_nodes_to_str(const dBNode *nodes, size_t num,
                       const dBGraph *db_graph, char *str)
{
  if(num == 0) return 0;

  size_t i;
  size_t kmer_size = db_graph->kmer_size;
  BinaryKmer bkmer = db_node_get_bkmer(db_graph, nodes[0].key);
  Nucleotide nuc;

  binary_kmer_to_str(bkmer, kmer_size, str);
  if(nodes[0].orient == REVERSE) dna_reverse_complement_str(str, kmer_size);

  for(i = 1; i < num; i++) {
    nuc = db_node_get_last_nuc(nodes[i], db_graph);
    str[kmer_size+i-1] = dna_nuc_to_char(nuc);
  }

  str[kmer_size+num-1] = '\0';
  return kmer_size+num-1;
}
Ejemplo n.º 7
0
static void branch_to_str(const dBNode *nodes, size_t len, bool print_first_kmer,
                          StrBuf *sbuf, const dBGraph *db_graph)
{
  size_t i = print_first_kmer, kmer_size = db_graph->kmer_size;
  Nucleotide nuc;
  BinaryKmer bkmer;

  if(print_first_kmer) {
    strbuf_ensure_capacity(sbuf, sbuf->end + kmer_size);
    bkmer = db_node_oriented_bkmer(db_graph, nodes[0]);
    binary_kmer_to_str(bkmer, kmer_size, sbuf->b+sbuf->end);
    sbuf->end += kmer_size;
  }

  // i == 1 if print_first_kmer, otherwise 0
  strbuf_ensure_capacity(sbuf, sbuf->end + len + 1); // +1 for '\n'
  for(; i < len; i++) {
    nuc = db_node_get_last_nuc(nodes[i], db_graph);
    sbuf->b[sbuf->end++] = dna_nuc_to_char(nuc);
  }

  sbuf->b[sbuf->end++] = '\n';
  sbuf->b[sbuf->end] = '\0';
}
Ejemplo n.º 8
0
int ctx_index(int argc, char **argv)
{
  const char *out_path = NULL;
  size_t block_size = 0, block_kmers = 0;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'b':
        cmd_check(!block_kmers, cmd);
        block_kmers = cmd_size_nonzero(cmd, optarg);
        break;
      case 's':
        cmd_check(!block_size, cmd);
        block_size = cmd_size_nonzero(cmd, optarg);
        break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" index -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  if(optind+1 != argc)
    cmd_print_usage("Require exactly one input graph file (.ctx)");

  if(block_size && block_kmers)
    cmd_print_usage("Cannot use --block-kmers and --block-size together");

  const char *ctx_path = argv[optind];

  //
  // Open Graph file
  //
  GraphFileReader gfile;
  memset(&gfile, 0, sizeof(GraphFileReader));
  graph_file_open2(&gfile, ctx_path, "r+", true, 0);

  if(!file_filter_is_direct(&gfile.fltr))
    die("Cannot open graph file with a filter ('in.ctx:blah' syntax)");

  // Open output file
  FILE *fout = out_path ? futil_fopen_create(out_path, "w") : stdout;

  // Start
  size_t filencols = gfile.hdr.num_of_cols;
  size_t kmer_size = gfile.hdr.kmer_size;
  const char *path = file_filter_path(&gfile.fltr);

  size_t ncols = file_filter_into_ncols(&gfile.fltr);
  size_t kmer_mem = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg))*filencols;

  if(block_size) {
    block_kmers = block_size / kmer_mem;
  } else if(!block_size && !block_kmers) {
    block_size = 4 * ONE_MEGABYTE;
    block_kmers = block_size / kmer_mem;
  }

  // Update block-size
  block_size = block_kmers * kmer_mem;

  status("[index] block bytes: %zu kmers: %zu; kmer bytes: %zu, hdr: %zu",
         block_size, block_kmers, kmer_mem, (size_t)gfile.hdr_size);

  if(block_kmers == 0) die("Cannot set block_kmers to zero");

  // Print header
  fputs("#block_start\tnext_block\tfirst_kmer\tkmer_idx\tnext_kmer_idx\n", fout);

  BinaryKmer bkmer = BINARY_KMER_ZERO_MACRO;
  BinaryKmer prev_bkmer = BINARY_KMER_ZERO_MACRO;
  Covg *covgs = ctx_malloc(ncols * sizeof(Covg));
  Edges *edges = ctx_malloc(ncols * sizeof(Edges));
  char bkmerstr[MAX_KMER_SIZE+1];

  size_t rem_block = block_size - kmer_mem; // block after first kmer
  char *tmp_mem = ctx_malloc(rem_block);

  // Read in file, print index
  size_t nblocks = 0;
  size_t bl_bytes = 0, bl_kmers = 0;
  size_t bl_byte_offset = gfile.hdr_size, bl_kmer_offset = 0;

  while(1)
  {
    if(!graph_file_read(&gfile, &bkmer, covgs, edges)) {
      status("Read kmer failed"); break; }
    binary_kmer_to_str(bkmer, kmer_size, bkmerstr);
    if(nblocks > 0 && !binary_kmer_less_than(prev_bkmer,bkmer))
      die("File is not sorted: %s [%s]", bkmerstr, path);
    // We've already read one kmer entry, read rest of block
    bl_bytes = kmer_mem + gfr_fread_bytes(&gfile, tmp_mem, rem_block);
    bl_kmers = 1 + bl_bytes / kmer_mem;
    fprintf(fout, "%zu\t%zu\t%s\t%zu\t%zu\n",
            bl_byte_offset, bl_byte_offset+bl_bytes, bkmerstr,
            bl_kmer_offset, bl_kmer_offset+bl_kmers);
    bl_byte_offset += bl_bytes;
    bl_kmer_offset += bl_kmers;
    nblocks++;
    if(bl_kmers < block_kmers) {
      status("last block %zu < %zu; %zu vs %zu",
             bl_kmers, block_kmers, bl_bytes, block_size);
      break;
    }
    prev_bkmer = bkmer;
  }

  ctx_free(covgs);
  ctx_free(edges);
  ctx_free(tmp_mem);

  // done
  char num_kmers_str[50], num_blocks_str[50];
  char block_mem_str[50], block_kmers_str[50];
  ulong_to_str(bl_kmer_offset, num_kmers_str);
  ulong_to_str(nblocks, num_blocks_str);
  bytes_to_str(block_size, 1, block_mem_str);
  ulong_to_str(block_kmers, block_kmers_str);

  status("Read %s kmers in %s block%s (block size %s / %s kmers)",
         num_kmers_str, num_blocks_str, util_plural_str(nblocks),
         block_mem_str, block_kmers_str);

  if(fout != stdout) status("Saved to %s", out_path);

  graph_file_close(&gfile);
  fclose(fout);

  return EXIT_SUCCESS;
}
Ejemplo n.º 9
0
/**
 * Print paths to a string buffer. Paths are sorted before being written.
 *
 * @param hkey    All paths associated with hkey are written to the buffer
 * @param sbuf    paths are written this string buffer
 * @param subset  is a temp variable that is reused each time
 * @param nbuf    temporary buffer, if not NULL, used to add seq=... to output
 * @param jposbuf temporary buffer, if not NULL, used to add juncpos=... to output
 */
void gpath_save_sbuf(hkey_t hkey, StrBuf *sbuf, GPathSubset *subset,
                     dBNodeBuffer *nbuf, SizeBuffer *jposbuf,
                     const dBGraph *db_graph)
{
  ctx_assert(db_graph->num_of_cols == 1 || nbuf == NULL);
  ctx_assert(db_graph->num_of_cols == 1 || jposbuf == NULL);

  const GPathStore *gpstore = &db_graph->gpstore;
  const GPathSet *gpset = &gpstore->gpset;
  const size_t ncols = gpstore->gpset.ncols;
  GPath *first_gpath = gpath_store_fetch(gpstore, hkey);
  const GPath *gpath;
  size_t i, j, col;

  // Load and sort paths for given kmer
  gpath_subset_reset(subset);
  gpath_subset_load_llist(subset, first_gpath);
  gpath_subset_sort(subset);

  if(subset->list.len == 0) return;

  // Print "<kmer> <npaths>"
  BinaryKmer bkmer = db_graph->ht.table[hkey];
  char bkstr[MAX_KMER_SIZE+1];
  binary_kmer_to_str(bkmer, db_graph->kmer_size, bkstr);

  // strbuf_sprintf(sbuf, "%s %zu\n", bkstr, subset->list.len);
  strbuf_append_strn(sbuf, bkstr, db_graph->kmer_size);
  strbuf_append_char(sbuf, ' ');
  strbuf_append_ulong(sbuf, subset->list.len);
  strbuf_append_char(sbuf, '\n');

  char orchar[2] = {0};
  orchar[FORWARD] = 'F';
  orchar[REVERSE] = 'R';
  const uint8_t *nseenptr;

  for(i = 0; i < subset->list.len; i++)
  {
    gpath = subset->list.b[i];
    nseenptr = gpath_set_get_nseen(gpset, gpath);

    // strbuf_sprintf(sbuf, "%c %zu %u %u", orchar[gpath->orient], klen,
    //                                      gpath->num_juncs, (uint32_t)nseenptr[0]);

    strbuf_append_char(sbuf, orchar[gpath->orient]);
    strbuf_append_char(sbuf, ' ');
    strbuf_append_ulong(sbuf, gpath->num_juncs);
    strbuf_append_char(sbuf, ' ');
    strbuf_append_ulong(sbuf, nseenptr[0]);

    for(col = 1; col < ncols; col++) {
      // strbuf_sprintf(sbuf, ",%u", (uint32_t)nseenptr[col]);
      strbuf_append_char(sbuf, ',');
      strbuf_append_ulong(sbuf, nseenptr[col]);
    }

    strbuf_append_char(sbuf, ' ');
    strbuf_ensure_capacity(sbuf, sbuf->end + gpath->num_juncs + 2);
    binary_seq_to_str(gpath->seq, gpath->num_juncs, sbuf->b+sbuf->end);
    sbuf->end += gpath->num_juncs;

    if(nbuf)
    {
      // Trace this path through the graph
      // First, find a colour this path is in
      for(col = 0; col < ncols && !gpath_has_colour(gpath, ncols, col); col++) {}
      if(col == ncols) die("path is not in any colours");

      dBNode node = {.key = hkey, .orient = gpath->orient};
      db_node_buf_reset(nbuf);
      if(jposbuf) size_buf_reset(jposbuf); // indices of junctions in nbuf
      gpath_fetch(node, gpath, nbuf, jposbuf, col, db_graph);

      strbuf_append_str(sbuf, " seq=");
      strbuf_ensure_capacity(sbuf, sbuf->end + db_graph->kmer_size + nbuf->len);
      sbuf->end += db_nodes_to_str(nbuf->b, nbuf->len, db_graph,
                                   sbuf->b+sbuf->end);

      if(jposbuf) {
        strbuf_append_str(sbuf, " juncpos=");
        strbuf_append_ulong(sbuf, jposbuf->b[0]);

        for(j = 1; j < jposbuf->len; j++) {
          strbuf_append_char(sbuf, ',');
          strbuf_append_ulong(sbuf, jposbuf->b[j]);
        }
      }
    }

    strbuf_append_char(sbuf, '\n');
  }
}

// @subset is a temp variable that is reused each time
// @sbuf   is a temp variable that is reused each time
static inline int _gpath_gzsave_node(hkey_t hkey,
                                     StrBuf *sbuf, GPathSubset *subset,
                                     dBNodeBuffer *nbuf, SizeBuffer *jposbuf,
                                     gzFile gzout, pthread_mutex_t *outlock,
                                     const dBGraph *db_graph)
{
  gpath_save_sbuf(hkey, sbuf, subset, nbuf, jposbuf, db_graph);

  if(sbuf->end > DEFAULT_IO_BUFSIZE)
    _gpath_save_flush(gzout, sbuf, outlock);

  return 0; // => keep iterating
}