Exemple #1
0
/**
 * Copy the name of a graph-walker step status into a caller-supplied buffer.
 * @param status step status; asserted to be below GRPHWLK_NUM_STATES
 * @param str    destination buffer
 * @param len    capacity of `str`; asserted to be at least 20
 * @return `str`
 */
char* graph_step_status2str(enum GraphStepStatus status, char *str, size_t len)
{
  // `len` is only read by the assertion; the cast marks it as used otherwise
  ctx_assert(len >= 20);
  (void)len;
  ctx_assert(status < GRPHWLK_NUM_STATES);

  const char *label = graph_step_str[status];
  strcpy(str, label);
  return str;
}
size_t infer_edges(size_t nthreads, bool add_all_edges, const dBGraph *db_graph)
{
  ctx_assert(db_graph->node_in_cols != NULL);
  ctx_assert(db_graph->col_edges != NULL);

  size_t i, num_nodes_modified = 0;
  status("[inferedges] Processing stream");

  InferEdgesWorker *wrkrs = ctx_calloc(nthreads, sizeof(InferEdgesWorker));

  for(i = 0; i < nthreads; i++) {
    InferEdgesWorker tmp = {.threadid = i, .nthreads = nthreads,
                            .add_all_edges = add_all_edges,
                            .db_graph = db_graph,
                            .num_nodes_modified = 0};
    memcpy(&wrkrs[i], &tmp, sizeof(InferEdgesWorker));
  }

  util_run_threads(wrkrs, nthreads, sizeof(InferEdgesWorker),
                   nthreads, infer_edges_worker);

  // Sum up nodes modified
  for(i = 0; i < nthreads; i++)
    num_nodes_modified += wrkrs[i].num_nodes_modified;

  ctx_free(wrkrs);

  return num_nodes_modified;
}
Exemple #3
0
/**
 * Copy the name of an assembly stop cause into a caller-supplied buffer.
 * @param assem stop cause; asserted to be below ASSEM_NUM_STOPS
 * @param str   destination buffer
 * @param size  capacity of `str`; asserted to exceed the name's length
 * @return `str`
 */
char* assem2str(enum AssemStopCause assem, char *str, size_t size)
{
  ctx_assert(assem < ASSEM_NUM_STOPS);

  const char *label = assem_stop_str[assem];
  ctx_assert(strlen(label) < size);

  strcpy(str, label);
  return str;
}
Exemple #4
0
/**
 * Edges of `node` restricted to colour `col`, in one direction only
 * (node.orient).
 *
 * When the graph stores more than one edge colour, the stored edges for
 * `col` are fetched and masked by orientation. Otherwise edges are merged
 * into a single colour, so the result is rebuilt from the next nodes that
 * db_graph_next_nodes_in_col() reports for `col`.
 */
Edges db_node_edges_in_col(dBNode node, size_t col, const dBGraph *db_graph)
{
  if(db_graph->num_edge_cols > 1)
  {
    Edges stored = db_node_get_edges(db_graph, node.key, col);
    return edges_mask_orientation(stored, node.orient);
  }

  // Edges are merged into one colour
  ctx_assert(db_graph->num_edge_cols == 1);
  ctx_assert(db_graph->node_in_cols != NULL || db_graph->col_covgs != NULL);

  // Ask which next nodes are in the given colour
  dBNode next_nodes[4];
  Nucleotide next_nucs[4];
  size_t num_next = db_graph_next_nodes_in_col(db_graph, node, col,
                                               next_nodes, next_nucs);

  // Set one edge per reported nucleotide
  Edges col_edges = 0;
  size_t k = 0;
  while(k < num_next) {
    col_edges = edges_set_edge(col_edges, next_nucs[k], node.orient);
    k++;
  }

  return col_edges;
}
Exemple #5
0
/**
 * Reset a hash table to empty without releasing its memory.
 * The bucket counters and the kmer array are zeroed, then the struct is
 * rebuilt: the existing arrays and sizing fields are carried over, while
 * num_kmers, collisions and any field not named in the initializer are zero.
 */
void hash_table_empty(HashTable *const ht)
{
  memset(ht->buckets, 0, ht->num_of_buckets * sizeof(uint8_t[2]));
  memset(ht->table, 0, ht->capacity * sizeof(BinaryKmer));

  HashTable fresh = {
    .table = ht->table,
    .buckets = ht->buckets,
    .capacity = ht->capacity,
    .num_of_buckets = ht->num_of_buckets,
    .bucket_size = ht->bucket_size,
    .hash_mask = ht->hash_mask,
    .num_kmers = 0,
    .collisions = {0}};

  // Copied with memcpy rather than assigned
  // NOTE(review): presumably because HashTable has const members -- confirm
  memcpy(ht, &fresh, sizeof(fresh));
}

/**
 * Linear search for `bkmer` within one bucket.
 * The query has BKMER_SET_FLAG set in its first word before comparing, so it
 * is compared against entries in their stored (flagged) form.
 * @return pointer to the matching entry, or NULL if none matches
 */
static inline const BinaryKmer* hash_table_find_in_bucket(const HashTable *const ht,
                                                          uint_fast32_t bucket,
                                                          BinaryKmer bkmer)
{
  const BinaryKmer *entry = ht_bckt_ptr(ht, bucket);
  const BinaryKmer *const stop = entry + hash_table_bsize(ht, bucket);

  // Stored kmers carry the "assigned" flag, so the query must carry it too
  bkmer.b[0] |= BKMER_SET_FLAG;

  for(; entry < stop; entry++) {
    if(binary_kmer_eq(bkmer, *entry))
      return entry;
  }

  return NULL; // Not found
}

/**
 * Store `bkmer` in the given bucket and return a pointer to its slot.
 * The bucket must have room (asserted). If the item count equals the bucket's
 * bsize the kmer is placed at offset bsize and HT_BSIZE is bumped; otherwise
 * the first unassigned slot (one left by an earlier delete) is reused.
 * HT_BITEMS is incremented in both cases.
 *
 * This does NOT update ht->num_kmers: the caller must increment it.
 */
static inline BinaryKmer* hash_table_insert_in_bucket(HashTable *ht,
                                                      uint_fast32_t bucket,
                                                      BinaryKmer bkmer)
{
  size_t used_slots = hash_table_bsize(ht, bucket);
  size_t live_items = hash_table_bitems(ht, bucket);
  ctx_assert(live_items < ht->bucket_size);
  ctx_assert(live_items <= used_slots);

  BinaryKmer *slot = ht_bckt_ptr(ht, bucket);
  bkmer.b[0] |= BKMER_SET_FLAG; // mark as assigned in the hash table

  if(live_items != used_slots) {
    // There is a hole: skip forward to the first unassigned entry
    while(HASH_ENTRY_ASSIGNED(*slot)) slot++;
  }
  else {
    // No holes: take the slot just past the used region
    slot += used_slots;
    ht->buckets[bucket][HT_BSIZE]++;
  }

  *slot = bkmer;
  ht->buckets[bucket][HT_BITEMS]++;
  return slot;
}
Exemple #6
0
// Returns 1 if a read is a substring of ANY read in the list or a complete
// match with a read before it in the list. Returns <= 0 otherwise.
//  1 => is substr
//  0 => not substr
// -1 => not enough bases of ACGT
//
// `rbuf`     buffer of reads; `idx` selects the read being tested
// `kograph`  used to look up occurrences (hits) of the read's first kmer
// `db_graph` supplies kmer_size and is used to look up the first kmer
static int _is_substr(const ReadBuffer *rbuf, size_t idx,
                      const KOGraph *kograph, const dBGraph *db_graph)
{
  const size_t kmer_size = db_graph->kmer_size;
  const read_t *r = &rbuf->b[idx], *r2;
  size_t contig_start;

  // Offset in the read where the first usable kmer starts
  contig_start = seq_contig_start(r, 0, kmer_size, 0, 0);
  if(contig_start >= r->seq.end) return -1; // No kmers in this sequence

  dBNode node = db_graph_find_str(db_graph, r->seq.b+contig_start);
  ctx_assert(node.key != HASH_NOT_FOUND);

  // expect at least one hit (for this read!)
  ctx_assert(kograph_occurs(kograph, node.key));
  KOccur *hit;

  // Walk consecutive hits for this kmer; the last one has hit->next unset
  for(hit = kograph_get(kograph, node.key); 1; hit++)
  {
    // Skip hits that come from the read under test itself
    if(hit->chrom != idx)
    {
      r2 = &rbuf->b[hit->chrom];

      // A read is a duplicate (i.e. return 1) if it is a substring of ANY
      // read in the list or a complete match with a read before it in the list.
      // That is why we have: (hit->chrom < idx || r->seq.end < r2->seq.end)
      // since identical strings have equal length
      if(hit->chrom < idx || r->seq.end < r2->seq.end) {
        if(hit->orient == node.orient) {
          // potential FORWARD match
          // r would start at offset (hit->offset - contig_start) within r2
          // NOTE(review): the upper bound check uses hit->offset rather than
          // hit->offset - contig_start, which looks stricter than needed when
          // contig_start > 0 -- confirm whether that is intended
          if(hit->offset >= contig_start &&
             hit->offset + r->seq.end <= r2->seq.end &&
             strncasecmp(r->seq.b, r2->seq.b+hit->offset-contig_start, r->seq.end) == 0)
          {
            return 1;
          }
        }
        else {
          // potential REVERSE match
          // if read is '<NNNN>[kmer]<rem>' rX_rem is the number of chars after
          // the first valid kmer
          size_t r1_rem =  r->seq.end - (contig_start   + kmer_size);
          size_t r2_rem = r2->seq.end - (hit->offset + kmer_size);

          // r's tail must fit before the hit in r2, and r's head after it
          if(r1_rem <= hit->offset && r2_rem >= contig_start &&
             dna_revncasecmp(r->seq.b, r2->seq.b+hit->offset-r1_rem, r->seq.end) == 0)
          {
            return 1;
          }
        }
      }
    }

    if(!hit->next) break;
  }

  return 0;
}
Exemple #7
0
/**
 * Print the links leaving one end of a unitig, in DOT or GFA syntax.
 *
 * The next nodes reachable from `bkey` are looked up and, for each one, the
 * unitig end it belongs to is fetched from p->ugraph.unitig_ends. A link is
 * written to p->fout while holding p->outlock. Each link is printed once:
 * only when `node` is less than the next node's key, or for a self-link
 * unless both ends are in the reverse orientation.
 *
 * @param node       hash key of the kmer at this unitig end
 * @param right_edge is true iff this kmer is the last in a unitig
 * @param bkey       binary kmer of `node`
 * @param edges      edges of `node`
 * @param uend0      unitig end record for `node`
 * @param p          printer state (graph, unitig ends, output, lock, syntax)
 */
static inline void _print_edge(hkey_t node, bool right_edge,
                               BinaryKmer bkey, Edges edges,
                               UnitigEnd uend0,
                               UnitigPrinter *p)
{
  // DOT: leave from east end if +, west end if -
  //      connect to west end if +, east end if -
  const char dot_exit[2] = "ew", dot_join[2] = "we", gfa_orient[2] = "+-";
  size_t i, n;
  dBNode next_nodes[4];
  Nucleotide next_nucs[4];
  Orientation orient = right_edge ? uend0.rorient : !uend0.lorient;
  // Unitig orientations
  Orientation ut_or0 = right_edge ? FORWARD : REVERSE, ut_or1;

  n = db_graph_next_nodes(p->db_graph, bkey, orient, edges,
                          next_nodes, next_nucs);

  for(i = 0; i < n; i++)
  {
    // Validate the key before it is used as an array index
    ctx_assert(next_nodes[i].key != HASH_NOT_FOUND);

    UnitigEnd uend1 = p->ugraph.unitig_ends[next_nodes[i].key];

    if(!uend1.assigned) {
      // Only build the node string when it is actually reported
      char tmpstr[100];
      db_node_to_str(p->db_graph, next_nodes[i], tmpstr);
      // Cast to match %zu, as done for the fprintf() calls below
      status(" -> node %zu [%s]", (size_t)uend1.unitigid, tmpstr);
    }

    ctx_assert(uend1.assigned);

    ut_or1 = next_nodes[i].orient == uend1.lorient ? FORWARD : REVERSE;

    // Don't do reverse-to-reverse links when node links to itself,
    // these are duplicates of forward-to-forward
    if(node < next_nodes[i].key ||
       (node == next_nodes[i].key && ut_or0 + ut_or1 < 2))
    {
      pthread_mutex_lock(&p->outlock);

      switch(p->syntax) {
        case PRINT_DOT:
          fprintf(p->fout, "  node%zu:%c -> node%zu:%c\n",
                  (size_t)uend0.unitigid, dot_exit[ut_or0],
                  (size_t)uend1.unitigid, dot_join[ut_or1]);
          break;
        case PRINT_GFA:
          fprintf(p->fout, "L\tnode%zu\t%c\tnode%zu\t%c\t%zuM\n",
                  (size_t)uend0.unitigid, gfa_orient[ut_or0],
                  (size_t)uend1.unitigid, gfa_orient[ut_or1],
                  p->db_graph->kmer_size - 1);
          break;
        default: die("Bad syntax: %i", p->syntax);
      }

      pthread_mutex_unlock(&p->outlock);
    }
  }
}
Exemple #8
0
/**
 * Infer edges for every kmer record of a graph file, editing the file in
 * place through a shared memory mapping (so the whole graph need not be
 * loaded and the file must be seekable).
 *
 * Records start at file->hdr_size. Each record is read as a BinaryKmer,
 * followed by `ncols` coverages, followed by `ncols` edges. A record is
 * written back only if the inference call reports a change.
 *
 * @param db_graph      graph used by the inference functions
 * @param add_all_edges if true use infer_all_edges(), else infer_pop_edges()
 * @param file          open graph file; must not be a tty
 * @return number of kmer records that were modified
 */
static size_t inferedges_on_mmap(const dBGraph *db_graph, bool add_all_edges,
                                 GraphFileReader *file)
{
  ctx_assert(db_graph->num_of_cols == file->hdr.num_of_cols);
  ctx_assert(file_filter_is_direct(&file->fltr));
  ctx_assert2(!isatty(fileno(file->fh)), "Use inferedges_on_stream() instead");
  ctx_assert(file->num_of_kmers >= 0);
  ctx_assert(file->file_size >= 0);

  status("[inferedges] Processing mmap file: %s [hdr: %zu bytes file: %zu bytes]",
         file_filter_path(&file->fltr),
         (size_t)file->hdr_size, (size_t)file->file_size);

  if(fseek(file->fh, 0, SEEK_SET) != 0)
    die("fseek failed: %s", strerror(errno));

  // Open memory mapped file.
  // The mapping is read from as well as written to, so request both:
  // PROT_WRITE on its own is not guaranteed to permit reads.
  void *mmap_ptr = mmap(NULL, file->file_size, PROT_READ | PROT_WRITE,
                        MAP_SHARED, fileno(file->fh), 0);

  if(mmap_ptr == MAP_FAILED)
    die("Cannot memory map file: %s [%s]", file->fltr.path.b, strerror(errno));

  const size_t ncols = file->hdr.num_of_cols;
  BinaryKmer bkmer;
  Edges edges[ncols];
  Covg covgs[ncols];

  bool updated;
  size_t i, num_kmers = file->num_of_kmers, num_kmers_edited = 0;
  size_t filekmersize = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg)) * ncols;

  char *ptr = (char*)mmap_ptr + file->hdr_size;

  for(i = 0; i < num_kmers; i++, ptr += filekmersize)
  {
    char *fh_covgs = ptr      + sizeof(BinaryKmer);
    char *fh_edges = fh_covgs + sizeof(Covg)*ncols;

    // Copy the record out of the mapping into local, typed storage
    memcpy(bkmer.b, ptr,      sizeof(BinaryKmer));
    memcpy(covgs,   fh_covgs, ncols * sizeof(Covg));
    memcpy(edges,   fh_edges, ncols * sizeof(Edges));

    updated = (add_all_edges ? infer_all_edges(bkmer, edges, covgs, db_graph)
                             : infer_pop_edges(bkmer, edges, covgs, db_graph));

    // Only touch the mapping when something changed
    if(updated) {
      memcpy(fh_covgs, covgs, ncols * sizeof(Covg));
      memcpy(fh_edges, edges, ncols * sizeof(Edges));
      num_kmers_edited++;
    }
  }

  if(munmap(mmap_ptr, file->file_size) == -1)
    die("Cannot release mmap file: %s [%s]", file->fltr.path.b, strerror(errno));

  return num_kmers_edited;
}
Exemple #9
0
// `node1` should be the first node of a supernode
// `node0` should be the previous node
// `next_base` is the last base of `node1`
// `jmpfunc` is called with each supernode traversed and if it returns true
//           we continue crawling, otherwise we stop
// `endfunc` is a function called at the end of traversal
void graph_crawler_fetch(GraphCrawler *crawler, dBNode node0,
                         dBNode next_nodes[4],
                         size_t take_idx, size_t num_next,
                         uint32_t *cols, size_t ncols,
                         bool (*jmpfunc)(GraphCache *_cache, GCacheStep *_step, void *_arg),
                         void (*endfunc)(GraphCache *_cache, uint32_t _pathid, void *_arg),
                         void *arg)
{
  const dBGraph *db_graph = crawler->cache.db_graph;
  GraphCache *cache = &crawler->cache;
  GraphWalker *wlk = &crawler->wlk;
  RepeatWalker *rptwlk = &crawler->rptwlk;
  GCUniColPath *unipaths = crawler->unicol_paths;

  ctx_assert(take_idx < num_next);
  ctx_assert(!db_nodes_are_equal(node0, next_nodes[take_idx]));

  // Fetch all paths in all colours
  dBNode node1 = next_nodes[take_idx];
  bool is_fork;
  size_t i, c, col, nedges_cols, num_unicol_paths = 0;
  int pathid;

  for(c = 0; c < ncols; c++)
  {
    col = (cols != NULL ? cols[c] : c);

    if(db_node_has_col(db_graph, node0.key, col) &&
       db_node_has_col(db_graph, node1.key, col))
    {
      // Determine if this fork is a fork in the current colour
      for(nedges_cols = 0, i = 0; i < num_next && nedges_cols <= 1; i++)
        nedges_cols += db_node_has_col(db_graph, next_nodes[i].key, col);

      is_fork = (nedges_cols > 1);

      graph_walker_setup(wlk, true, col, col, db_graph);
      graph_walker_start(wlk, node0);
      graph_walker_force(wlk, node1, is_fork);

      pathid = graph_crawler_load_path(cache, node1, wlk, rptwlk, jmpfunc, arg);

      if(endfunc != NULL) endfunc(cache, pathid, arg);

      graph_walker_finish(wlk);
      graph_crawler_reset_rpt_walker(rptwlk, cache, pathid);

      unipaths[num_unicol_paths++] = (GCUniColPath){.colour = col,
                                                    .pathid = pathid};
    }
    else
      pathid = -1;

    crawler->col_paths[col] = pathid;
  }
Exemple #10
0
// Returns sorted array of hkey_t from the hash table
hkey_t* hash_table_sorted(const HashTable *htable)
{
  ctx_assert(sizeof(hkey_t) == sizeof(BinaryKmer*));
  ctx_assert(sizeof(hkey_t) == sizeof(BkmerPtrHkeyUnion));
  BkmerPtrHkeyUnion *kmers, *nxt, *end;
  nxt = kmers = ctx_malloc(sizeof(BkmerPtrHkeyUnion) * htable->num_kmers);
  end = kmers + htable->num_kmers;
  HASH_ITERATE(htable, _fetch_kmer_union, htable, &nxt);
  // Can sort ignoring that the top flag bit is set on all kmers
  qsort(kmers, htable->num_kmers, sizeof(BinaryKmer*), binary_kmers_qcmp_ptrs);
  for(nxt = kmers; nxt < end; nxt++) nxt->h = nxt->bptr - htable->table;
  return (hkey_t*)kmers;
}
Exemple #11
0
/**
 * Re-orient a read pair so both mates point the same way (FF or RR).
 * FF and RR pairs are left alone; for FR the second read is reverse
 * complemented, for RF the first read is.
 * @param r1,r2   the two reads; both must be non-NULL
 * @param matedir current orientation of the pair
 */
void seq_reader_orient_mp_FF_or_RR(read_t *r1, read_t *r2, ReadMateDir matedir)
{
  ctx_assert(r1 != NULL);
  ctx_assert(r2 != NULL);

  switch(matedir) {
    case READPAIR_FF:
    case READPAIR_RR:
      // Already in the same orientation
      break;
    case READPAIR_FR:
      seq_read_reverse_complement(r2);
      break;
    case READPAIR_RF:
      seq_read_reverse_complement(r1);
      break;
    default:
      // Should be unreachable
      ctx_assert2(0, "Invalid ReadMateDir value: %i", (int)matedir);
  }
}
Exemple #12
0
/**
 * Save paths to a file.
 * Writes the JSON header built by gpath_save_mkhdr(), then the format
 * explanation comment, then runs gpath_save_thread on `nthreads` workers
 * that share one output lock.
 * @param gzout         gzFile to write to
 * @param path          path of output file (used for messages and header)
 * @param nthreads      number of workers; must be > 0
 * @param save_path_seq if true, save seq= and juncpos= for links, requires
 *                      exactly one colour in the graph
 * @param cmdstr,cmdhdr passed through to gpath_save_mkhdr()
 * @param hdrs          array of JSON headers of input files
 * @param nhdrs         number of entries in `hdrs`
 * @param contig_hists  passed through to gpath_save_mkhdr()
 * @param ncols         must equal the number of colours in the path set
 * @param db_graph      graph whose path store is saved
 */
void gpath_save(gzFile gzout, const char *path,
                size_t nthreads, bool save_path_seq,
                const char *cmdstr, cJSON *cmdhdr,
                cJSON **hdrs, size_t nhdrs,
                const ZeroSizeBuffer *contig_hists, size_t ncols,
                dBGraph *db_graph)
{
  ctx_assert(nthreads > 0);
  ctx_assert(gpath_set_has_nseen(&db_graph->gpstore.gpset));
  ctx_assert(ncols == db_graph->gpstore.gpset.ncols);
  ctx_assert(!save_path_seq || db_graph->num_of_cols == 1); // save_path => 1 colour

  char num_paths_txt[50];
  ulong_to_str(db_graph->gpstore.num_paths, num_paths_txt);

  status("Saving %s paths to: %s", num_paths_txt, path);
  status("  using %zu threads", nthreads);

  // Header first
  cJSON *hdr_json = gpath_save_mkhdr(path, cmdstr, cmdhdr, hdrs, nhdrs,
                                     contig_hists, ncols, db_graph);
  json_hdr_gzprint(hdr_json, gzout);
  cJSON_Delete(hdr_json);

  // Then the comments describing the format
  gzputs(gzout, ctp_explanation_comment);

  // One saver per thread, all sharing a single output lock
  GPathSaver *savers = ctx_calloc(nthreads, sizeof(GPathSaver));
  pthread_mutex_t outlock;

  if(pthread_mutex_init(&outlock, NULL) != 0) die("Mutex init failed");

  size_t t = 0;
  while(t < nthreads) {
    savers[t] = (GPathSaver){.threadid = t,
                             .nthreads = nthreads,
                             .save_seq = save_path_seq,
                             .gzout = gzout,
                             .outlock = &outlock,
                             .db_graph = db_graph};
    t++;
  }

  // Iterate over kmers writing paths
  util_run_threads(savers, nthreads, sizeof(*savers), nthreads, gpath_save_thread);

  pthread_mutex_destroy(&outlock);
  ctx_free(savers);

  status("[GPathSave] Graph paths saved to %s", path);
}
Exemple #13
0
/**
 * Generate a JSON header object for a .ctp file
 * @param path         path to output file
 * @param cmdstr       name of the command being run, to be used to add @cmdhdr
 * @param cmdhdr       JSON header to add under current command->@cmdstr
 *                     cmdstr and cmdhdr must be both NULL or both non-NULL;
 *                     if NULL they are ignored
 * @param hdrs         JSON headers of input files, passed to json_hdr_make_std()
 * @param nhdrs        number of entries in `hdrs`
 * @param contig_hists one histogram buffer per colour
 * @param ncols        number of entries in `contig_hists`
 * @param db_graph     graph whose path store is described
 * @return newly created cJSON object
 */
cJSON* gpath_save_mkhdr(const char *path,
                        const char *cmdstr, cJSON *cmdhdr,
                        cJSON **hdrs, size_t nhdrs,
                        const ZeroSizeBuffer *contig_hists, size_t ncols,
                        const dBGraph *db_graph)
{
  ctx_assert(!cmdstr == !cmdhdr);

  const GPathStore *store = &db_graph->gpstore;
  const GPathSet *pathset = &store->gpset;

  // using json_hdr_make_std() assumes the following
  ctx_assert(pathset->ncols == db_graph->num_of_cols);

  cJSON *root = cJSON_CreateObject();

  cJSON_AddStringToObject(root, "file_format", "ctp");
  cJSON_AddNumberToObject(root, "format_version", CTP_FORMAT_VERSION);

  // Add standard cortex header info, including the command being run
  json_hdr_make_std(root, path, hdrs, nhdrs, db_graph,
                    hash_table_nkmers(&db_graph->ht));

  // Attach the command specific extra info to the current command
  if(cmdstr != NULL) {
    cJSON *curr_cmd = json_hdr_get_curr_cmd(root, path);
    cJSON_AddItemToObject(curr_cmd, cmdstr, cmdhdr);
  }

  // "paths" section with the path store counters
  cJSON *paths_obj = cJSON_CreateObject();
  cJSON_AddItemToObject(root, "paths", paths_obj);

  cJSON_AddNumberToObject(paths_obj, "num_kmers_with_paths", store->num_kmers_with_paths);
  cJSON_AddNumberToObject(paths_obj, "num_paths", store->num_paths);
  cJSON_AddNumberToObject(paths_obj, "path_bytes", store->path_bytes);

  // One contig length histogram per colour
  cJSON *hists_arr = cJSON_CreateArray();
  cJSON_AddItemToObject(paths_obj, "contig_hists", hists_arr);

  size_t col = 0;
  while(col < ncols) {
    _gpath_save_contig_hist2json(hists_arr, contig_hists[col].b, contig_hists[col].len);
    col++;
  }

  return root;
}
Exemple #14
0
/**
 * Calculate cleaning threshold for supernodes from a given distribution
 * of supernode coverages
 *
 * Ratios of consecutive histogram entries (delta1) and ratios of consecutive
 * ratios (delta2) are computed. The threshold is taken from the first index
 * where delta1 drops below 1 (if that index is under 0.75 * seq_depth), else
 * from the first index where delta2 is not above 1, else from a fallback
 * derived from the sequencing depth.
 *
 * @param covgs     histogram of supernode coverages
 * @param len       number of entries in `covgs`; must be > 5
 * @param seq_depth sequencing depth; if <= 0 it is estimated from the graph
 * @param db_graph  graph; must contain at least one kmer
 * @return chosen coverage threshold
 */
size_t cleaning_pick_supernode_threshold(const uint64_t *covgs, size_t len,
                                         double seq_depth,
                                         const dBGraph *db_graph)
{
  ctx_assert(len > 5);
  ctx_assert(db_graph->ht.num_kmers > 0);

  size_t i, d1len = len-2, d2len = len-3, first_d1, first_d2;

  // One allocation holds both arrays: delta1 then delta2
  double *scratch = ctx_malloc((d1len+d2len) * sizeof(double));
  double *delta1 = scratch, *delta2 = scratch + d1len;

  // Estimate sequencing depth as total coverage / number of kmers
  uint64_t covg_sum = 0, capacity = db_graph->ht.capacity * db_graph->num_of_cols;
  for(i = 0; i < capacity; i++) {
    covg_sum += db_graph->col_covgs[i];
  }
  double seq_depth_est = (double)covg_sum / db_graph->ht.num_kmers;

  status("[cleaning] Kmer depth before cleaning supernodes: %.2f", seq_depth_est);
  if(seq_depth <= 0) {
    seq_depth = seq_depth_est;
  }
  else {
    status("[cleaning] Using sequence depth argument: %f", seq_depth);
  }

  size_t fallback_thresh = (size_t)MAX2(1, (seq_depth+1)/2);

  // +1 to ensure covgs is never 0
  for(i = 0; i < d1len; i++) {
    delta1[i] = (double)(covgs[i+1]+1) / (covgs[i+2]+1);
  }

  d1len = i;
  d2len = d1len - 1;

  if(d1len <= 2) {
    status("[cleaning]  (using fallback1)\n");
    ctx_free(scratch);
    return fallback_thresh;
  }

  // d2len is d1len-1
  for(i = 0; i < d2len; i++) {
    delta2[i] = delta1[i] / delta1[i+1];
  }

  // First index where each series stops exceeding 1
  first_d1 = 0;
  while(first_d1 < d1len && delta1[first_d1] >= 1) first_d1++;

  first_d2 = 0;
  while(first_d2 < d2len && delta2[first_d2] > 1) first_d2++;

  ctx_free(scratch);

  if(first_d1 < d1len && first_d1 < (seq_depth*0.75)) {
    status("[cleaning]   (using f1)");
    return first_d1+1;
  }

  if(first_d2 < d2len) {
    status("[cleaning]   (using f2)");
    return first_d2+1;
  }

  status("[cleaning]   (using fallback1)");
  return fallback_thresh+1;
}
Exemple #15
0
/**
 * Remove the entry at index `pos` from the hash table.
 * The slot is overwritten with unset_bkmer and both the bucket's item count
 * and ht->num_kmers are decremented with atomic builtins.
 *
 * Safe to call on different entries at the same time
 * NOT safe to do find() whilst doing delete()
 */
void hash_table_delete(HashTable *const ht, hkey_t pos)
{
  const uint64_t bckt = pos / ht->bucket_size;

  ctx_assert(pos != HASH_NOT_FOUND);
  ctx_assert(ht->buckets[bckt][HT_BITEMS] > 0);
  ctx_assert(ht->num_kmers > 0);
  ctx_assert(HASH_ENTRY_ASSIGNED(ht->table[pos]));

  // Clear the slot first, then fix up the counters
  ht->table[pos] = unset_bkmer;

  volatile uint8_t *bucket_items = (volatile uint8_t *)&ht->buckets[bckt][HT_BITEMS];
  volatile uint64_t *total_items = (volatile uint64_t *)&ht->num_kmers;
  __sync_fetch_and_sub(bucket_items, 1);
  __sync_fetch_and_sub(total_items, 1);

  ctx_assert(!HASH_ENTRY_ASSIGNED(ht->table[pos]));
}
Exemple #16
0
/**
 * Remove the entry at index `pos` from the hash table.
 * The slot is zeroed, then ht->num_kmers and the bucket's item count are
 * decremented with atomic builtins. The values they held before the
 * decrement are asserted to have been non-zero.
 *
 * Safe to call on different entries at the same time
 * NOT safe to do find() whilst doing delete()
 */
void hash_table_delete(HashTable *const ht, hkey_t pos)
{
  uint64_t bckt = pos / ht->bucket_size;
  uint64_t kmers_before, items_before;

  ctx_assert(pos != HASH_NOT_FOUND);
  ctx_assert(HASH_ENTRY_ASSIGNED(ht->table[pos]));

  memset(&ht->table[pos], 0, sizeof(BinaryKmer));

  volatile uint64_t *total_items = (volatile uint64_t *)&ht->num_kmers;
  volatile uint8_t *bucket_items = (volatile uint8_t *)&ht->buckets[bckt][HT_BITEMS];
  kmers_before = __sync_fetch_and_sub(total_items, 1);
  items_before = __sync_fetch_and_sub(bucket_items, 1);

  ctx_assert2(kmers_before > 0, "Deleted from empty table");
  ctx_assert2(items_before > 0, "Deleted from empty bucket");
  ctx_assert(!HASH_ENTRY_ASSIGNED(ht->table[pos]));
}
Exemple #17
0
/**
 * Initialise a GraphCrawler and allocate everything it owns.
 * Four zeroed arrays with one entry per graph colour are allocated, the
 * struct is overwritten with a fresh value pointing at them, and the cache,
 * graph walker and repeat walker are then allocated in place.
 * Release with graph_crawler_dealloc().
 * @param crawler  struct to initialise; previous contents are overwritten
 * @param db_graph graph to crawl; node_in_cols must be non-NULL
 */
void graph_crawler_alloc(GraphCrawler *crawler, const dBGraph *db_graph)
{
  ctx_assert(db_graph->node_in_cols != NULL);

  const size_t num_cols = db_graph->num_of_cols;

  GraphCrawler init = {
    .num_paths = 0,
    .col_paths = ctx_calloc(num_cols, sizeof(int)),
    .multicol_paths = ctx_calloc(num_cols, sizeof(GCMultiColPath)),
    .unicol_paths = ctx_calloc(num_cols, sizeof(GCUniColPath)),
    .col_list = ctx_calloc(num_cols, sizeof(uint32_t))};

  memcpy(crawler, &init, sizeof(GraphCrawler));

  graph_cache_alloc(&crawler->cache, db_graph);
  graph_walker_alloc(&crawler->wlk, db_graph);
  rpt_walker_alloc(&crawler->rptwlk, db_graph->ht.capacity, 22); // 4MB
}

/**
 * Release everything allocated by graph_crawler_alloc() and zero the struct.
 * The GraphCrawler itself is not freed.
 */
void graph_crawler_dealloc(GraphCrawler *crawler)
{
  // Per-colour arrays
  ctx_free(crawler->col_list);
  ctx_free(crawler->unicol_paths);
  ctx_free(crawler->multicol_paths);
  ctx_free(crawler->col_paths);

  // Embedded helpers
  graph_cache_dealloc(&crawler->cache);
  graph_walker_dealloc(&crawler->wlk);
  rpt_walker_dealloc(&crawler->rptwlk);

  // Leave no dangling pointers behind
  memset(crawler, 0, sizeof(*crawler));
}
/**
 * Store the reference runs collected for one path.
 * Runs in `koruns_ended` are appended to `runs_buf`, followed by the runs
 * from `koruns` that koruns_filter() keeps (given caller->min_ref_nkmers).
 * Both input buffers are then reset, and ref_runs[pathid] records where
 * this path's runs start in `runs_buf` and how many there are.
 */
static inline void gcrawler_finish_ref_covg(BreakpointCaller *caller,
                                            uint32_t pathid,
                                            KOccurRunBuffer *koruns,
                                            KOccurRunBuffer *koruns_ended,
                                            KOccurRunBuffer *runs_buf,
                                            PathRefRun *ref_runs)
{
  const size_t first_run = runs_buf->len;

  // Make room for the worst case: every run from both buffers is kept
  kmer_run_buf_ensure_capacity(runs_buf,
                               runs_buf->len+koruns->len+koruns_ended->len);

  // Ended runs are copied as they are
  kmer_run_buf_append(runs_buf, koruns_ended->data, koruns_ended->len);

  // Runs still open are filtered straight onto the end of runs_buf
  runs_buf->len += koruns_filter(koruns->data, koruns->len,
                                 runs_buf->data+runs_buf->len,
                                 caller->min_ref_nkmers);

  kmer_run_buf_reset(koruns);
  kmer_run_buf_reset(koruns_ended);

  ctx_assert(pathid < MAX_REFRUNS_PER_ORIENT(caller->db_graph->num_of_cols));

  PathRefRun *entry = &ref_runs[pathid];
  entry->first_runid = first_run;
  entry->num_runs = runs_buf->len - first_run;
}
/**
 * Traverse from node0 -> node1 to collect the 5' flank.
 * The nodes following node0 (using its colour-0 edges) are fetched, node1 is
 * located among them (asserted to be present), the caller's 5' run buffers
 * are reset, and the crawler is started along that edge for every colour.
 */
static void traverse_5pflank(BreakpointCaller *caller, GraphCrawler *crawler,
                             dBNode node0, dBNode node1)
{
  const dBGraph *db_graph = crawler->cache.db_graph;
  dBNode next_nodes[4];
  Nucleotide next_nucs[4];
  BinaryKmer bkmer0 = db_node_get_bkmer(db_graph, node0.key);

  size_t num_next = db_graph_next_nodes(db_graph, bkmer0, node0.orient,
                                        db_node_edges(db_graph, node0.key, 0),
                                        next_nodes, next_nucs);

  // Find which of the next nodes is node1
  size_t take = 0;
  while(take < num_next && !db_nodes_are_equal(next_nodes[take],node1)) {
    take++;
  }

  ctx_assert(take < num_next && db_nodes_are_equal(next_nodes[take],node1));

  kmer_run_buf_reset(&caller->koruns_5p);
  kmer_run_buf_reset(&caller->koruns_5p_ended);
  kmer_run_buf_reset(&caller->flank5p_run_buf);

  // Go backwards to get 5p flank
  // NULL means loop from 0..(ncols-1)
  graph_crawler_fetch(crawler, node0,
                      next_nodes, next_nucs, take, num_next,
                      NULL, db_graph->num_of_cols,
                      gcrawler_flank5p_stop_at_ref_covg,
                      gcrawler_flank5p_finish_ref_covg,
                      caller);
}
Exemple #20
0
/**
 * Debug driver: load a path file into a small graph and write it back out.
 *
 * argv[1] is the path file to read; argv[2] is the output path, which is
 * created and written to. The graph is allocated with fixed parameters
 * (kmer size 7, 3 colours) that the input file is checked against.
 */
int main(int argc, char **argv)
{
  cortex_init();
  cmd_init(argc, argv);

  // argv[2] is created and written, so it is described as an output
  if(argc != 3) die("usage: ./debug <in.ctp> <out.ctp>");

  const char *out_path = argv[2];

  GPathReader pfile;
  memset(&pfile, 0, sizeof(GPathReader));
  gpath_reader_open(&pfile, argv[1], true);
  status("Got file with %zu colours", pfile.ncolours);

  size_t i, kmer_size = 7, ncols = 3;

  gpath_reader_check(&pfile, kmer_size, ncols);
  gzFile gzout = futil_gzopen_create(out_path, "w");

  dBGraph db_graph;
  db_graph_alloc(&db_graph, kmer_size, ncols, 1, 1024, DBG_ALLOC_EDGES);

  // Create a path store that tracks path counts
  gpath_store_alloc(&db_graph.gpstore,
                    db_graph.num_of_cols, db_graph.ht.capacity,
                    ONE_MEGABYTE, true, false);

  // Create path hash table for fast lookup
  gpath_hash_alloc(&db_graph.gphash, &db_graph.gpstore, ONE_MEGABYTE);

  // Set sample names
  for(i = 0; i < pfile.ncolours; i++) {
    const char *sample_name = gpath_reader_get_sample_name(&pfile, i);
    ctx_assert(sample_name != NULL);
    strbuf_set(&db_graph.ginfo[i].sample_name, sample_name);
  }

  // Load path files, add kmers that are missing
  gpath_reader_load(&pfile, GPATH_ADD_MISSING_KMERS, &db_graph);

  hash_table_print_stats(&db_graph.ht);

  // Write output file
  // NOTE(review): this call passes 9 arguments -- confirm it matches the
  // gpath_save() prototype this file is built against
  gpath_save(gzout, out_path, 1, true, NULL, NULL, &pfile.json, 1, &db_graph);
  gzclose(gzout);

  // Checks
  // gpath_checks_all_paths(&db_graph, 2); // use two threads
  gpath_checks_counts(&db_graph);

  // Clean up
  gpath_reader_close(&pfile);
  db_graph_dealloc(&db_graph);
  cortex_destroy();

  return EXIT_SUCCESS;
}
Exemple #21
0
/**
 * Add a path to the store for kmer `hkey` without checking for duplicates.
 * The path is added to the store's path set and then linked onto the list
 * for `hkey`. If newpath could be a duplicate, use gpathhash instead.
 *
 * Note: it is not safe to call _add and _find_add simultaneously, since _add
 *       avoids the use of locks.
 *
 * @param newgpath path to add; its `seq` must be non-NULL
 * @return the GPath returned by gpath_set_add_mt()
 */
GPath* gpath_store_add_mt(GPathStore *gpstore, hkey_t hkey, GPathNew newgpath)
{
  ctx_assert(newgpath.seq != NULL);

  GPath *added = gpath_set_add_mt(&gpstore->gpset, newgpath);
  _gpstore_add_to_llist_mt(gpstore, hkey, added);
  return added;
}
Exemple #22
0
/**
 * Point every entry of the filter at the same destination colour.
 * @param fltr    filter to modify; its filter array must be non-NULL
 * @param intocol value to set all intocols to
 */
void file_filter_flatten(FileFilter *fltr, size_t intocol)
{
  ctx_assert(fltr->filter.b != NULL);

  size_t idx = 0;
  while(idx < file_filter_num(fltr)) {
    file_filter_intocol(fltr,idx) = intocol;
    idx++;
  }

  // Refresh the filter after the bulk change
  file_filter_update(fltr);
}
Exemple #23
0
/**
 * Fold the statistics in `src` into `dst`.
 * The length and junction lists of `src` are appended to those of `dst`,
 * counters and histograms are summed, and the "max" fields keep the larger
 * of the two values. `src` is not modified.
 */
void assemble_contigs_stats_merge(AssembleContigStats *dst,
                                  const AssembleContigStats *src)
{
  // Both structs must be internally consistent before merging
  ctx_assert(dst->lengths.len == dst->junctns.len);
  ctx_assert(dst->lengths.len == dst->num_contigs);
  ctx_assert(src->lengths.len == src->junctns.len);
  ctx_assert(src->lengths.len == src->num_contigs);

  size_t k;

  // Per-contig lists
  size_buf_push(&dst->lengths, src->lengths.b, src->lengths.len);
  size_buf_push(&dst->junctns, src->junctns.b, src->junctns.len);

  // Totals
  dst->num_contigs += src->num_contigs;
  dst->total_len   += src->total_len;
  dst->total_junc  += src->total_junc;

  // Histograms
  for(k = 0; k < 5; k++) {
    dst->contigs_outdegree[k] += src->contigs_outdegree[k];
  }

  for(k = 0; k < AC_MAX_PATHS; k++) {
    dst->paths_held[k] += src->paths_held[k];
    dst->paths_cntr[k] += src->paths_cntr[k];
  }

  dst->paths_held_max = MAX2(dst->paths_held_max, src->paths_held_max);
  dst->paths_cntr_max = MAX2(dst->paths_cntr_max, src->paths_cntr_max);

  for(k = 0; k < GRPHWLK_NUM_STATES; k++) {
    dst->grphwlk_steps[k] += src->grphwlk_steps[k];
  }

  for(k = 0; k < ASSEM_NUM_STOPS; k++) {
    dst->stop_causes[k] += src->stop_causes[k];
  }

  dst->max_junc_density = MAX2(dst->max_junc_density, src->max_junc_density);

  // Seeding counters
  dst->num_contigs_from_seed_kmers += src->num_contigs_from_seed_kmers;
  dst->num_contigs_from_seed_paths += src->num_contigs_from_seed_paths;

  dst->num_reseed_abort    += src->num_reseed_abort;
  dst->num_seeds_not_found += src->num_seeds_not_found;
}
Exemple #24
0
/**
 * Write a unitig length histogram as CSV.
 * Columns are length in kmers, length in bp (kmer_size + kmers - 1) and
 * count. The row for length 1 is always written; longer lengths are written
 * only when their count is non-zero. Returns silently if the file cannot be
 * opened.
 * @param path      output path
 * @param hist      histogram indexed by length in kmers; hist[0] must be 0
 * @param len       number of entries in `hist`; must be >= 2
 * @param kmer_size kmer size used to convert kmer counts to bp
 */
void cleaning_write_len_histogram(const char *path,
                                  const uint64_t *hist, size_t len,
                                  size_t kmer_size)
{
  ctx_assert(len >= 2);
  ctx_assert(hist[0] == 0);

  FILE *out = _open_histogram_file(path, "unitig length");
  if(out == NULL) return;

  fprintf(out, "UnitigKmerLength,bp,Count\n");

  // Trim trailing empty bins
  size_t last = len-1;
  while(last > 1 && hist[last] == 0) last--;

  fprintf(out, "1,%zu,%"PRIu64"\n", kmer_size, hist[1]);

  size_t nkmers;
  for(nkmers = 2; nkmers <= last; nkmers++) {
    if(hist[nkmers] == 0) continue;
    fprintf(out, "%zu,%zu,%"PRIu64"\n", nkmers, kmer_size+nkmers-1, hist[nkmers]);
  }

  fclose(out);
}
Exemple #25
0
/**
 * Get coverages from nodes in nbuf, store in cbuf.
 * `cbuf` is reset and sized to nbuf.len; entry k receives the coverage of
 * node k. Only valid for single-colour graphs (asserted).
 */
static inline void fetch_coverages(dBNodeBuffer nbuf, CovgBuffer *cbuf,
                                   const dBGraph *db_graph)
{
  ctx_assert(db_graph->num_of_cols == 1);

  covg_buf_reset(cbuf);
  covg_buf_capacity(cbuf, nbuf.len);
  cbuf->len = nbuf.len;

  size_t k = 0;
  while(k < nbuf.len) {
    cbuf->b[k] = db_graph->col_covgs[nbuf.b[k].key];
    k++;
  }
}
Exemple #26
0
// Add edges to a kmer's per-colour edge sets where another colour already
// has the edge and both this kmer and the neighbouring kmer are in the colour.
// `node_bkey` kmer whose edges are being updated
// `edges`     per-colour edges (one entry per graph colour), updated in place
// `covgs`     per-colour coverages of this kmer; a colour with covg 0 is skipped
// Return 1 if changed; 0 otherwise
bool infer_pop_edges(const BinaryKmer node_bkey, Edges *edges,
                     const Covg *covgs, const dBGraph *db_graph)
{
  Edges uedges = 0, iedges = 0xf, add_edges, edge;
  size_t orient, nuc, col, kmer_size = db_graph->kmer_size;
  const size_t ncols = db_graph->num_of_cols;
  BinaryKmer bkey, bkmer;
  hkey_t next;
  Edges newedges[ncols];

  // char tmp[MAX_KMER_SIZE+1];
  // binary_kmer_to_str(node_bkey, db_graph->kmer_size, tmp);
  // status("Inferring %s", tmp);

  for(col = 0; col < ncols; col++) {
    uedges |= edges[col]; // union of edges
    iedges &= edges[col]; // intersection of edges
    newedges[col] = edges[col];
  }

  // Candidate edges: present in at least one colour, missing from at least one
  add_edges = uedges & ~iedges;

  if(!add_edges) return 0;

  for(orient = 0; orient < 2; orient++)
  {
    // Shift once per orientation; only the end base is rewritten per nucleotide
    bkmer = (orient == FORWARD ? binary_kmer_left_shift_one_base(node_bkey, kmer_size)
                               : binary_kmer_right_shift_one_base(node_bkey));

    for(nuc = 0; nuc < 4; nuc++)
    {
      edge = nuc_orient_to_edge(nuc, orient);
      if(add_edges & edge)
      {
        // get next bkmer, look up in graph
        if(orient == FORWARD) binary_kmer_set_last_nuc(&bkmer, nuc);
        else binary_kmer_set_first_nuc(&bkmer, dna_nuc_complement(nuc), kmer_size);

        bkey = bkmer_get_key(bkmer, kmer_size);
        next = hash_table_find(&db_graph->ht, bkey);
        ctx_assert(next != HASH_NOT_FOUND);

        // Add the edge in every colour that contains both kmers
        for(col = 0; col < ncols; col++)
          if(covgs[col] > 0 && db_node_has_col(db_graph, next, col))
            newedges[col] |= edge;
      }
    }
  }

  // Compare before overwriting so the return value reflects a real change
  int cmp = memcmp(edges, newedges, sizeof(Edges)*ncols);
  memcpy(edges, newedges, sizeof(Edges)*ncols);
  return (cmp != 0);
}
Exemple #27
0
/**
 * Write a unitig coverage histogram as CSV.
 * Columns are coverage, value from `covg_hist` and value from
 * `mean_covg_hist`. A row is written for each coverage >= 1 whose
 * `covg_hist` entry is non-zero. Returns silently if the file cannot be
 * opened.
 * @param path           output path
 * @param covg_hist      histogram indexed by coverage; entry 0 must be 0
 * @param mean_covg_hist histogram indexed by coverage; entry 0 must be 0
 * @param len            number of entries in each histogram; must be >= 2
 */
void cleaning_write_covg_histogram(const char *path,
                                   const uint64_t *covg_hist,
                                   const uint64_t *mean_covg_hist,
                                   size_t len)
{
  ctx_assert(len >= 2);
  ctx_assert(covg_hist[0] == 0);
  ctx_assert(mean_covg_hist[0] == 0);

  FILE *out = _open_histogram_file(path, "unitig coverage");
  if(out == NULL) return;

  fprintf(out, "Covg,NumKmers,NumUnitigs\n");

  // Trim trailing empty bins
  size_t last = len-1;
  while(last > 2 && covg_hist[last] == 0) last--;

  size_t covg;
  for(covg = 1; covg <= last; covg++) {
    if(covg_hist[covg] == 0) continue;
    fprintf(out, "%zu,%"PRIu64",%"PRIu64"\n", covg, covg_hist[covg], mean_covg_hist[covg]);
  }

  fclose(out);
}
/**
 * Print one breakpoint call to caller->gzout as three FASTA-style records:
 * the 5' flank, the 3' flank and the path between them.
 *
 * Nothing is printed (and no call id is consumed) when there are no 3' flank
 * runs. All output for one call is written while holding caller->out_lock.
 *
 * @param caller            supplies the output stream, lock, graph and call-id
 *                          counter
 * @param cols,ncols        colours to list on the path record (ncols > 0)
 * @param flank5p           nodes printed as the 5' flank sequence
 * @param allelebuf         nodes of the allele; the first num_path_kmers are
 *                          printed as the path, the remainder as the 3' flank
 * @param flank5p_runs,num_flank5p_runs  runs printed on the 5' flank header
 * @param flank3p_runs,num_flank3p_runs  runs printed on the 3' flank header;
 *                          flank3p_runs[0].qoffset marks where the 3' flank
 *                          starts within allelebuf
 */
static void process_contig(BreakpointCaller *caller,
                           const uint32_t *cols, size_t ncols,
                           const dBNodeBuffer *flank5p,
                           const dBNodeBuffer *allelebuf,
                           const KOccurRun *flank5p_runs, size_t num_flank5p_runs,
                           const KOccurRun *flank3p_runs, size_t num_flank3p_runs)
{
  gzFile gzout = caller->gzout;
  KOGraph kograph = caller->kograph;
  const size_t kmer_size = caller->db_graph->kmer_size;

  ctx_assert(ncols > 0);

  // we never re-met the ref
  if(num_flank3p_runs == 0) return;

  // Atomically claim the next call id (counter is shared via caller->callid)
  size_t callid = __sync_fetch_and_add((volatile size_t*)caller->callid, 1);

  // Swallow up some of the path into the 3p flank:
  // up to kmer_size-1 kmers preceding the first 3p run are moved to the flank
  size_t i, flank3pidx = flank3p_runs[0].qoffset;
  size_t extra3pbases = MIN2(kmer_size-1, flank3pidx);
  size_t num_path_kmers = flank3pidx - extra3pbases;
  size_t kmer3poffset = kmer_size-1-extra3pbases;

  pthread_mutex_lock(caller->out_lock);

  // 5p flank with list of ref intersections
  gzprintf(gzout, ">brkpnt.%zu.5pflank chr=", callid);
  koruns_gzprint(gzout, kmer_size, kograph, flank5p_runs, num_flank5p_runs, 0, 0);
  gzputc(gzout, '\n');
  db_nodes_gzprint(flank5p->data, flank5p->len, caller->db_graph, gzout);
  gzputc(gzout, '\n');

  // 3p flank with list of ref intersections
  gzprintf(gzout, ">brkpnt.%zu.3pflank chr=", callid);
  koruns_gzprint(gzout, kmer_size, kograph, flank3p_runs, num_flank3p_runs,
                 flank3pidx, kmer3poffset);
  gzputc(gzout, '\n');
  db_nodes_gzprint_cont(allelebuf->data+num_path_kmers,
                        allelebuf->len-num_path_kmers,
                        caller->db_graph, gzout);
  gzputc(gzout, '\n');

  // Print path with list of colours.
  // cols[] holds uint32_t values: cast so the argument matches %zu.
  gzprintf(gzout, ">brkpnt.%zu.path cols=%zu", callid, (size_t)cols[0]);
  for(i = 1; i < ncols; i++) gzprintf(gzout, ",%zu", (size_t)cols[i]);
  gzputc(gzout, '\n');
  db_nodes_gzprint_cont(allelebuf->data, num_path_kmers, caller->db_graph, gzout);
  gzprintf(gzout, "\n\n");

  pthread_mutex_unlock(caller->out_lock);
}
Exemple #29
0
/**
 * Align one allele against the reference region [ref_start, ref_end) of `chr`
 * and pass the resulting alignment to align_biallelic().
 *
 * When no flank bases are requested and fw_strand is true, `line` is used
 * directly as the alt allele. Otherwise the alt allele is built in tmpbuf as:
 *   last cpy_flnk_5p chars of flank5p + line + first cpy_flnk_3p chars of flank3p
 * and, if fw_strand is false, run through dna_revcomp_str() in place.
 *
 * @param cpy_flnk_5p how many characters to copy from end of 5' flank to start of allele
 * @param cpy_flnk_3p how many characters to copy from start of 3' flank to end of allele
 * @param tmpbuf      scratch buffer; reset and overwritten when the alt allele
 *                    has to be constructed
 */
static void align_entry_allele(const char *line, size_t linelen,
                               const char *flank5p, size_t flank5p_len,
                               const char *flank3p, size_t flank3p_len,
                               size_t cpy_flnk_5p, size_t cpy_flnk_3p,
                               const read_t *chr,
                               size_t ref_start, size_t ref_end,
                               bool fw_strand,
                               const char *info, const char **genotypes,
                               StrBuf *tmpbuf, FILE *fout)
{
  (void)flank3p_len;
  ctx_assert(ref_start <= ref_end);

  // Reference side of the alignment
  const size_t ref_len = ref_end - ref_start;
  const char *ref_seq = chr->seq.b + ref_start;

  // Alt side: start from the raw line, replace it below if it needs editing
  const char *alt_seq = line;
  size_t alt_len = linelen;

  const bool use_line_as_is = fw_strand && (cpy_flnk_5p + cpy_flnk_3p == 0);

  if(!use_line_as_is)
  {
    const char *flank5p_tail = flank5p + flank5p_len - cpy_flnk_5p;

    strbuf_reset(tmpbuf);
    strbuf_append_strn(tmpbuf, flank5p_tail, cpy_flnk_5p);
    strbuf_append_strn(tmpbuf, line, linelen);
    strbuf_append_strn(tmpbuf, flank3p, cpy_flnk_3p);

    if(!fw_strand)
      dna_revcomp_str(tmpbuf->b, tmpbuf->b, tmpbuf->end);

    // Read the buffer pointer only after all appends are done
    alt_seq = tmpbuf->b;
    alt_len = tmpbuf->end;
  }

  // Align chrom and seq
  needleman_wunsch_align2(ref_seq, alt_seq, ref_len, alt_len,
                          &nw_scoring_allele, nw_aligner, aln);
  num_nw_allele++;

  // Break into variants and print VCF
  align_biallelic(aln->result_a, aln->result_b,
                  chr, ref_start,
                  info, genotypes, fout);
}
Exemple #30
0
// Return sum of bases on right of alignment with:
// * hard masked (H)
// * soft masked (S)
// * inserted bases relative to ref (I)
// Only the trailing run of such ops is summed: scanning starts at the last
// CIGAR op and stops at the first op that is none of the above, so insertions
// inside the alignment and clipping at the start of the read are not counted.
// The first op (index 0) is never counted.
static inline uint32_t bam_get_end_padding(int n_cigar, const uint32_t *cigar)
{
  ctx_assert(n_cigar > 0);

  // Bitmask of the CIGAR op codes that count as padding
  const uint32_t pad_ops = (1<<BAM_CINS)|(1<<BAM_CSOFT_CLIP)|(1<<BAM_CHARD_CLIP);
  uint32_t i, l = 0;

  for(i = (uint32_t)n_cigar-1; i > 0; i--) {
    // First non-padding op from the right marks the end of the alignment
    if(!((pad_ops >> bam_cigar_op(cigar[i])) & 1)) break;
    l += bam_cigar_oplen(cigar[i]);
  }

  return l;
}