Esempio n. 1
0
// Decides whether read `idx` in `rbuf` is redundant: returns 1 if it is a
// substring of ANY read in the list or a complete match with a read before it
// in the list. Returns <= 0 otherwise.
//  1 => is substr
//  0 => not substr
// -1 => not enough bases of ACGT (no kmer could be located in the read)
//
// The read's first kmer is looked up in `db_graph`, then every occurrence of
// that kmer recorded in `kograph` is tried as an anchor for aligning the whole
// read against the read that owns the occurrence.
static int _is_substr(const ReadBuffer *rbuf, size_t idx,
                      const KOGraph *kograph, const dBGraph *db_graph)
{
  const size_t kmer_size = db_graph->kmer_size;
  const read_t *r = &rbuf->b[idx], *r2;
  size_t contig_start;

  // Offset of the first kmer in the read (quality / homopolymer cutoffs off)
  contig_start = seq_contig_start(r, 0, kmer_size, 0, 0);
  if(contig_start >= r->seq.end) return -1; // No kmers in this sequence

  dBNode node = db_graph_find_str(db_graph, r->seq.b+contig_start);
  ctx_assert(node.key != HASH_NOT_FOUND);

  // expect at least one hit (for this read!)
  ctx_assert(kograph_occurs(kograph, node.key));
  KOccur *hit;

  // Walk consecutive occurrences of the kmer; the last one has `next` unset
  for(hit = kograph_get(kograph, node.key); 1; hit++)
  {
    // Skip occurrences that lie in the read being tested
    if(hit->chrom != idx)
    {
      r2 = &rbuf->b[hit->chrom];

      // A read is a duplicate (i.e. return 1) if it is a substring of ANY
      // read in the list or a complete match with a read before it in the list.
      // That is why we have: (hit->chrom < idx || r->seq.end < r2->seq.end)
      // since identical strings have equal length
      if(hit->chrom < idx || r->seq.end < r2->seq.end) {
        if(hit->orient == node.orient) {
          // potential FORWARD match
          // Anchoring r's first kmer (at contig_start) on r2's copy (at
          // hit->offset) places r at r2[hit->offset - contig_start]. The
          // compared window therefore ends at
          //   hit->offset - contig_start + r->seq.end
          // which must not run past the end of r2. The first test guarantees
          // the subtraction cannot wrap.
          if(hit->offset >= contig_start &&
             hit->offset - contig_start + r->seq.end <= r2->seq.end &&
             strncasecmp(r->seq.b, r2->seq.b+hit->offset-contig_start, r->seq.end) == 0)
          {
            return 1;
          }
        }
        else {
          // potential REVERSE match
          // if read is '<NNNN>[kmer]<rem>' rX_rem is the number of chars after
          // the first valid kmer
          size_t r1_rem =  r->seq.end - (contig_start   + kmer_size);
          size_t r2_rem = r2->seq.end - (hit->offset + kmer_size);

          // On the opposite strand r's trailing bases land before the kmer in
          // r2 and r's leading bases land after it, so both sides need room.
          // NOTE(review): dna_revncasecmp presumably compares against the
          // reverse complement, case-insensitively -- confirm in its definition
          if(r1_rem <= hit->offset && r2_rem >= contig_start &&
             dna_revncasecmp(r->seq.b, r2->seq.b+hit->offset-r1_rem, r->seq.end) == 0)
          {
            return 1;
          }
        }
      }
    }

    if(!hit->next) break;
  }

  return 0;
}
Esempio n. 2
0
// Reports whether at least one kmer of read `r` is present in `db_graph`.
// The scan walks the read contig by contig and stops at the first kmer found.
// Reads shorter than the graph's kmer size are not scanned at all.
// Counters in `stats` are updated with atomic adds: bases read, bases in the
// contigs visited, kmers looked up, kmers looked up minus the hit (if any),
// and whether the read yielded any contig (good read) or none (bad read).
static bool read_touches_graph(const read_t *r, const dBGraph *db_graph,
                               LoadingStats *stats)
{
  const size_t kmer_size = db_graph->kmer_size;
  const size_t read_len = r->seq.end;
  size_t contigs_seen = 0, kmers_checked = 0;
  size_t next_search = 0, cstart, cend, pos;
  BinaryKmer bkmer;
  Nucleotide nuc;
  dBNode node;
  bool hit = false;

  if(read_len >= kmer_size)
  {
    for(;;)
    {
      // Locate the next contig; stop when the read is exhausted or on a hit
      cstart = seq_contig_start(r, next_search, kmer_size, 0, 0);
      if(cstart >= read_len || hit) break;

      cend = seq_contig_end(r, cstart, kmer_size, 0, 0, &next_search);
      __sync_fetch_and_add((volatile size_t*)&stats->total_bases_loaded,
                           cend - cstart);
      contigs_seen++;

      // First kmer of the contig is built from the string directly
      bkmer = binary_kmer_from_str(r->seq.b + cstart, kmer_size);
      kmers_checked++;
      node = db_graph_find(db_graph, bkmer);
      if(node.key != HASH_NOT_FOUND) { hit = true; break; }

      // Remaining kmers are obtained by shifting in one base at a time
      pos = cstart + kmer_size;
      while(pos < cend)
      {
        nuc = dna_char_to_nuc(r->seq.b[pos]);
        bkmer = binary_kmer_left_shift_add(bkmer, kmer_size, nuc);
        kmers_checked++;
        node = db_graph_find(db_graph, bkmer);
        if(node.key != HASH_NOT_FOUND) { hit = true; break; }
        pos++;
      }
    }
  }

  // Publish per-read totals to the shared statistics
  __sync_fetch_and_add((volatile size_t*)&stats->total_bases_read, read_len);
  __sync_fetch_and_add((volatile size_t*)&stats->num_kmers_loaded, kmers_checked);
  __sync_fetch_and_add((volatile size_t*)&stats->num_kmers_novel, kmers_checked - hit);
  __sync_fetch_and_add((volatile size_t*)&stats->num_good_reads, contigs_seen > 0);
  __sync_fetch_and_add((volatile size_t*)&stats->num_bad_reads, contigs_seen == 0);

  return hit;
}
Esempio n. 3
0
// Appends to `aln` the graph nodes for the kmers of read `r` that are present
// in `db_graph`, recording for each node the read offset of its kmer.
// If colour is -1 a kmer is accepted whatever colour it is in, otherwise it
// must be present in the given colour.
// `qcutoff` and `hp_cutoff` are passed through to the contig finder.
// Sets aln->seq_gaps if two successively added kmers are not adjacent in the
// read (the flag is never cleared here).
// Returns the number of read bases lying after the last kmer added, or the
// full read length if no kmer was added.
static size_t db_alignment_from_read(dBAlignment *aln, const read_t *r,
                                     uint8_t qcutoff, uint8_t hp_cutoff,
                                     const dBGraph *db_graph, int colour)
{
  const size_t kmer_size = db_graph->kmer_size;
  const size_t last_base = kmer_size - 1; // index of a kmer's final base
  dBNodeBuffer *nodes = &aln->nodes;
  Int32Buffer *rpos = &aln->rpos;

  BinaryKmer bkmer, bkey;
  Nucleotide nuc;
  hkey_t node;
  size_t span_start, span_end = 0, next_search = 0;
  size_t idx, k, bases_left;

  ctx_assert(nodes->len == rpos->len);
  const size_t first_new = nodes->len;
  size_t n = first_new;

  // Reserve room for the most entries this read could add
  db_node_buf_capacity(nodes, n + r->seq.end);
  int32_buf_capacity(rpos, n + r->seq.end);

  for(;;)
  {
    span_start = seq_contig_start(r, next_search, kmer_size,
                                  qcutoff, hp_cutoff);
    if(span_start >= r->seq.end) break;

    span_end = seq_contig_end(r, span_start, kmer_size,
                              qcutoff, hp_cutoff, &next_search);

    const char *span = r->seq.b + span_start;
    const size_t span_len = span_end - span_start;

    // Prime with the first kmer shifted right by one base, so the loop below
    // can shift its final base back in and treat every kmer uniformly
    bkmer = binary_kmer_from_str(span, kmer_size);
    bkmer = binary_kmer_right_shift_one_base(bkmer);

    for(idx = last_base; idx < span_len; idx++)
    {
      nuc = dna_char_to_nuc(span[idx]);
      bkmer = binary_kmer_left_shift_add(bkmer, kmer_size, nuc);
      bkey = binary_kmer_get_key(bkmer, kmer_size);
      node = hash_table_find(&db_graph->ht, bkey);

      if(node == HASH_NOT_FOUND) continue;
      if(colour != -1 && !db_node_has_col(db_graph, node, colour)) continue;

      nodes->b[n].key = node;
      nodes->b[n].orient = bkmer_get_orientation(bkmer, bkey);
      rpos->b[n] = span_start + (idx - last_base); // read offset of this kmer
      n++;
    }
  }

  // Bases between the end of the last kmer added and the end of the read
  if(n == first_new)
    bases_left = r->seq.end; /* No kmers found */
  else
    bases_left = r->seq.end - (rpos->b[n-1] + kmer_size);

  nodes->len = rpos->len = n;

  // Flag a gap if any two neighbouring new entries skip a read position
  for(k = first_new + 1; k < n; k++) {
    if(rpos->b[k-1]+1 < rpos->b[k]) {
      aln->seq_gaps = true;
      break;
    }
  }

  return bases_left;
}