Ejemplo n.º 1
0
/* Decode the characters <start>..<end> (both inclusive, relative to the
   beginning of sequence number <idx>) of <bs> into a freshly allocated
   buffer and return it.
   The buffer holds exactly end - start + 1 bytes; no room is reserved for a
   terminating NUL. It is obtained from gt_malloc() and never released here,
   so the caller is presumably expected to free it. */
char* gt_bioseq_get_sequence_range(const GtBioseq *bs, GtUword idx,
                                   GtUword start, GtUword end)
{
  GtUword seqoffset, nofchars;
  char *buffer;
  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq) && end >= start);
  nofchars = end - start + 1;
  buffer = gt_malloc(nofchars * sizeof (char));
  /* translate the sequence-relative range into absolute encseq positions */
  seqoffset = gt_encseq_seqstartpos(bs->encseq, idx);
  gt_encseq_extract_decoded(bs->encseq, buffer, seqoffset + start,
                            seqoffset + end);
  return buffer;
}
Ejemplo n.º 2
0
/* Decode the complete sequence number <idx> of <bs> into a freshly allocated
   buffer and return it.
   The buffer has exactly as many bytes as the sequence is long, so it is not
   NUL-terminated. It is obtained from gt_calloc() and never released here,
   so the caller is presumably expected to free it. */
char* gt_bioseq_get_sequence(const GtBioseq *bs, GtUword idx)
{
  GtUword seqlen, first;
  char *buffer;
  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));
  seqlen = gt_encseq_seqlength(bs->encseq, idx);
  buffer = gt_calloc(seqlen, sizeof (char));
  first = gt_encseq_seqstartpos(bs->encseq, idx);
  /* the extraction range is inclusive, hence the - 1 on the last position */
  gt_encseq_extract_decoded(bs->encseq, buffer, first, first + seqlen - 1);
  return buffer;
}
Ejemplo n.º 3
0
/* Lua binding: expects an encoded sequence as argument 1 and the inclusive
   range <from>..<to> as arguments 2 and 3. Raises a Lua argument error if
   from > to or if <to> is not smaller than the total length of the encoded
   sequence. Pushes the decoded range as a Lua string of to - from + 1 bytes
   and returns 1 (the number of results). */
static int encseq_lua_extract_decoded(lua_State *L)
{
  GtEncseq **encseq = check_encseq(L, 1);
  GtUword from = luaL_checknumber(L, 2),
          to = luaL_checknumber(L, 3),
          nofchars;
  char *decoded;
  luaL_argcheck(L, from <= to, 2, "must be <= range endposition");
  luaL_argcheck(L, to < gt_encseq_total_length(*encseq), 3,
                "cannot exceed total length of encoded sequence");
  nofchars = to - from + 1;
  decoded = gt_malloc(nofchars * sizeof (char));
  gt_encseq_extract_decoded(*encseq, decoded, from, to);
  /* lua_pushlstring() is given an explicit length, so the temporary buffer
     needs no terminator and can be released right afterwards */
  lua_pushlstring(L, decoded, nofchars);
  gt_free(decoded);
  return 1;
}
Ejemplo n.º 4
0
/* Return a newly allocated buffer with the decoded characters <start>..<end>
   (inclusive, relative to the sequence start) of the sequence <seqnum> in
   file <filenum> of the collection <sc>.
   The buffer holds exactly end - start + 1 bytes (no terminating NUL is
   reserved) and is not released here. */
static char* gt_encseq_col_get_sequence(const GtSeqCol *sc,
                                        GtUword filenum,
                                        GtUword seqnum,
                                        GtUword start,
                                        GtUword end)
{
  GtEncseqCol *esc = gt_encseq_col_cast(sc);
  GtUword global_seqnum, seqoffset;
  char *buffer;
  gt_assert(esc && filenum < gt_encseq_num_of_files(esc->encseq));
  /* <seqnum> is local to the file; shift it by the number of the first
     sequence of that file to index into the encoded sequence */
  global_seqnum = gt_encseq_filenum_first_seqnum(esc->encseq, filenum)
                    + seqnum;
  gt_assert(global_seqnum < gt_encseq_num_of_sequences(esc->encseq));
  gt_assert(start <= end);
  seqoffset = gt_encseq_seqstartpos(esc->encseq, global_seqnum);
  buffer = gt_calloc(end - start + 1, sizeof (char));
  gt_encseq_extract_decoded(esc->encseq, buffer, seqoffset + start,
                            seqoffset + end);
  return buffer;
}
Ejemplo n.º 5
0
/* Look up the sequence whose MD5 sequence id is <md5_seqid> in the
   collection <sc> and store a newly allocated buffer with its decoded
   characters <start>..<end> (inclusive, relative to the sequence start) in
   <*seq>. The buffer holds exactly end - start + 1 bytes.
   Returns 0 on success. Returns -1 and sets <err> if <md5_seqid> is long
   enough to carry a separator but does not have one at the expected
   position, or if no sequence with that id is known; <*seq> is left
   untouched in both cases. */
static int gt_encseq_col_md5_to_seq(GtSeqCol *sc, char **seq,
                                    GtUword start, GtUword end,
                                    GtStr *md5_seqid, GtError *err)
{
  GtUword seqnum = GT_UNDEF_UWORD;
  char seqid[GT_MD5_SEQID_HASH_LEN + 1];
  int had_err = 0;
  GtEncseqCol *esc;
  esc = gt_encseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(esc && seq && start <= end && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));
  if (gt_str_length(md5_seqid) >= GT_MD5_SEQID_TOTAL_LEN) {
    const char *cstrseqid = gt_str_get(md5_seqid);
    if (cstrseqid[GT_MD5_SEQID_TOTAL_LEN-1] != GT_MD5_SEQID_SEPARATOR) {
      gt_error_set(err, "MD5 sequence id %s not terminated with '%c'",
                   gt_str_get(md5_seqid), GT_MD5_SEQID_SEPARATOR);
      had_err = -1;
    }
    if (!had_err) {
      /* isolate the hash part between the prefix and the separator */
      strncpy(seqid, cstrseqid + GT_MD5_SEQID_PREFIX_LEN,
              GT_MD5_SEQID_HASH_LEN);
      seqid[GT_MD5_SEQID_HASH_LEN] = '\0';
      /* only query the table once <seqid> has actually been filled in;
         for ids that are too short <seqnum> stays undefined and is reported
         as "not found" below instead of looking up uninitialized memory */
      seqnum = gt_md5_tab_map(esc->md5_tab, seqid);
    }
  }
  /* a malformed id has already been reported; do not overwrite that error
     and do not allocate a result for it */
  if (!had_err) {
    if (seqnum != GT_UNDEF_UWORD) {
      GtUword startpos = gt_encseq_seqstartpos(esc->encseq, seqnum);
      *seq = gt_calloc(end - start + 1, sizeof (char));
      gt_encseq_extract_decoded(esc->encseq, *seq, startpos + start,
                                startpos + end);
    } else {
      gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
      had_err = -1;
    }
  }
  return had_err;
}
Ejemplo n.º 6
0
/* Pairwise overlap search over the sequences (reads) stored in <encseq>.

   For every pair (i, j) with i <= j < n the two reads are decoded into
   temporary NUL-terminated buffers stored in <u> and <v>, and then
   find_approx_overlaps() (if <use_dp>) or find_exact_overlaps() (otherwise)
   is called on <d>. <d> is set up by GT_RDJ_PAIRWISE_INIT_STRUCT_DATA with
   <proc>, <proc_a>, <procdata> and pointers to <u> and <v>, so the overlap
   finders presumably reach the current pair and the callbacks through it.

   If <revcompl> is set, only the first half of the sequences is iterated
   and each <v> is additionally compared using the sequence stored at the
   mirrored index nofsequences - j - 1.

   Unless <m> is GT_OVLFIND_SPM, the status returned by the overlap finder is
   passed to mark_contained() together with the bit table <cntreads>.
   <cntreads_in>, if non-NULL, is used as that table; otherwise a table for
   n reads is initialized here (again only if <m> is not GT_OVLFIND_SPM).
   If <cntfilter> is set, pairs involving a read whose bit is set in the
   table are skipped.

   On return: if <cntreads_out> is non-NULL it receives the table; otherwise
   the table is freed if it was not supplied via <cntreads_in>. If
   <nofreads> is non-NULL it receives n, the number of reads iterated. */
static inline void rdj_pairwise_generic(bool use_dp, GtOvlfindMode m,
    GtEncseq *encseq, bool revcompl, bool show_progressbar, bool use_kmp,
    double max_error, GtUword min_length, bool find_nonmaximal,
    GtSpmproc proc, GtSpmprocA proc_a, void* procdata, bool cntfilter,
    GtBitsequence *cntreads_in, GtBitsequence **cntreads_out,
    GtUword *nofreads)
{
  GtContfind containment_status;
  GtBitsequence *cntreads = NULL;
  GtUint64 progress = 0;
  GtUword i, j, startpos, v_seqnum, nofsequences, n;
  struct Read u, v;
  struct Data d;
  gt_kmp_t** kmp_values = NULL;

  GT_RDJ_PAIRWISE_INIT_STRUCT_DATA(d, proc, proc_a, procdata, &u, &v, 0);

  gt_assert(encseq != NULL);

  /* with containment filtering, a request for all overlaps is narrowed to
     GT_OVLFIND_PROPER_SPM; note that <m> itself (not d.mode) still decides
     below whether containments are recorded */
  d.mode = m;
  if ((m == GT_OVLFIND_ALL) && cntfilter)
    d.mode = GT_OVLFIND_PROPER_SPM;

  n = gt_encseq_num_of_sequences(encseq);
  /* KMP tables are prepared for all sequences, before n is halved */
  if (use_kmp)
    kmp_values = prepare_kmp_values(encseq, n);
  nofsequences = n;
  /* with revcompl only the first half of the sequences is iterated; the
     second half is addressed through the mirrored index further below */
  if (revcompl)
    n = n >> 1;
  if (cntreads_in != NULL)
    cntreads = cntreads_in;
  else if (m != GT_OVLFIND_SPM)
    GT_INITBITTAB(cntreads, n);
  /* NOTE(review): the announced total n*(n-1)/2 does not count the i == j
     pairs which the inner loop below does visit -- confirm whether this is
     intended */
  if (show_progressbar) gt_progressbar_start(&progress, (GtUint64)n *
      ((GtUint64)n - 1ULL) / 2ULL);

  for (i = 0; i < n; i++)
  {
    /* decode read i into a NUL-terminated buffer owned by this iteration */
    u.seqnum = i;
    u.direct = true;
    u.len = gt_encseq_seqlength(encseq, i);
    u.seq = gt_malloc(sizeof (char) * (u.len + 1));
    startpos = gt_encseq_seqstartpos(encseq, i);
    gt_encseq_extract_decoded(encseq, u.seq, startpos, startpos + u.len - 1);
    u.seq[u.len] = '\0';
    if (use_kmp)
    {
      gt_assert(kmp_values != NULL);
      u.pi = kmp_values[i];
    }

    /* j starts at i, so every read is also compared against itself */
    for (j = i; j < n; j++)
    {
      if (cntfilter)
      {
        gt_assert(cntreads != NULL);
        /* bit of u set: give up on the remaining partners of u;
           bit of v set: skip just this partner */
        if ((bool)GT_ISIBITSET(cntreads, i)) break;
        if ((bool)GT_ISIBITSET(cntreads, j)) continue;
      }

      v.seqnum = j;

      /* find overlaps using direct v */
      v.direct = true;
      v.len = gt_encseq_seqlength(encseq, j);
      v.seq = gt_malloc(sizeof (char) * (v.len + 1));
      startpos = gt_encseq_seqstartpos(encseq, j);
      gt_encseq_extract_decoded(encseq, v.seq, startpos,
          startpos + v.len - 1);
      v.seq[v.len] = '\0';
      if (use_kmp)
      {
        gt_assert(kmp_values != NULL);
        v.pi = kmp_values[j];
      }
      containment_status = use_dp
          ? find_approx_overlaps(&d, max_error, min_length, find_nonmaximal)
          : find_exact_overlaps(&d, use_kmp, min_length, find_nonmaximal);
      if (m != GT_OVLFIND_SPM)
        mark_contained(containment_status, u.seqnum, v.seqnum, cntreads);

      /* find overlaps using reverse complement of v */
      if (revcompl)
      {
        /* the counterpart of read j is taken from the mirrored index; it is
           asserted to have the same length, so the buffer of v (including
           its terminator written above) is reused as is. v.seqnum keeps the
           value j, which is what mark_contained() receives below. */
        v_seqnum =  nofsequences - j - 1;
        v.direct = false;
        gt_assert(gt_encseq_seqlength(encseq, j) ==
            gt_encseq_seqlength(encseq, v_seqnum));
        startpos = gt_encseq_seqstartpos(encseq, v_seqnum);
        gt_encseq_extract_decoded(encseq, v.seq, startpos,
            startpos + v.len - 1);
        if (use_kmp)
        {
          gt_assert(kmp_values != NULL);
          v.pi = kmp_values[v_seqnum];
        }
        containment_status = use_dp
          ? find_approx_overlaps(&d, max_error, min_length, find_nonmaximal)
          : find_exact_overlaps(&d, use_kmp, min_length, find_nonmaximal);
        if (m != GT_OVLFIND_SPM)
          mark_contained(containment_status, u.seqnum, v.seqnum, cntreads);
      }
      gt_free(v.seq);
      progress++;
    }
    gt_free(u.seq);
  }

  /* hand the table to the caller if requested; otherwise release it, but
     only if it was allocated here rather than supplied by the caller */
  if (cntreads_out != NULL)
    *cntreads_out = cntreads;
  else if (cntreads_in == NULL)
    gt_free(cntreads);
  if (nofreads != NULL)
    *nofreads = n;
  /* n was halved for revcompl; double it again to free as many KMP tables
     as were prepared (assumes the original count was even -- TODO confirm) */
  if (use_kmp)
    free_kmp_values(kmp_values, revcompl ? n << 1 : n);
  if (show_progressbar)
    gt_progressbar_stop();
}
Ejemplo n.º 7
0
/* Advance the iterator <mi> to the next pair of sequences (one from the
   first, one from the second encoded sequence) whose Smith-Waterman
   alignment is at least min_len long and has an evaluation of at most
   max_edist, and store a newly created match describing it in <*match>.
   Pairs are visited with the sequence number of the second encoded sequence
   running fastest.
   Returns GT_MATCHER_STATUS_OK if a match was stored, or
   GT_MATCHER_STATUS_END (leaving <*match> untouched) once all pairs have
   been visited. <err> is not used. */
static GtMatchIteratorStatus gt_match_iterator_sw_next(GtMatchIterator *mi,
                                                      GT_UNUSED GtMatch **match,
                                                      GT_UNUSED GtError *err)
{
  GtMatchIteratorSW *mis;
  GtSeq *seq_a, *seq_b;
  char *a, *b;
  const char *adesc, *bdesc;
  GtAlignment *ali = NULL;
  unsigned long seqlen_a, seqlen_b, seqpos, desclen_a, desclen_b;
  GtRange arng, brng;
  gt_assert(mi && match);

  mis = gt_match_iterator_sw_cast(mi);
  while (true) {
    /* the very first call aligns the initial pair; every later step moves
       on to the next pair first */
    if (!mis->pvt->firstali)
      mis->pvt->seqno_es2++;
    if (mis->pvt->seqno_es2 == gt_encseq_num_of_sequences(mis->pvt->es2)) {
      mis->pvt->seqno_es1++;
      if (mis->pvt->seqno_es1 == gt_encseq_num_of_sequences(mis->pvt->es1))
        return GT_MATCHER_STATUS_END;
      mis->pvt->seqno_es2 = 0;
    }
    seqlen_a = gt_encseq_seqlength(mis->pvt->es1, mis->pvt->seqno_es1);
    seqlen_b = gt_encseq_seqlength(mis->pvt->es2, mis->pvt->seqno_es2);
    /* XXX: reuse buffers for performance improvement */
    a = gt_malloc(seqlen_a * sizeof (char));
    seqpos = gt_encseq_seqstartpos(mis->pvt->es1, mis->pvt->seqno_es1);
    gt_encseq_extract_decoded(mis->pvt->es1, a, seqpos, seqpos + seqlen_a - 1);
    b = gt_malloc(seqlen_b * sizeof (char));
    seqpos = gt_encseq_seqstartpos(mis->pvt->es2, mis->pvt->seqno_es2);
    /* the second sequence must be decoded from es2: its start position,
       length and alphabet all refer to es2 */
    gt_encseq_extract_decoded(mis->pvt->es2, b, seqpos, seqpos + seqlen_b - 1);
    seq_a = gt_seq_new(a, seqlen_a, gt_encseq_alphabet(mis->pvt->es1));
    seq_b = gt_seq_new(b, seqlen_b, gt_encseq_alphabet(mis->pvt->es2));
    ali = gt_swalign(seq_a, seq_b, mis->pvt->sf);
    mis->pvt->firstali = false;
    if (ali && gt_alignment_get_length(ali) >= mis->pvt->min_len
          && gt_alignment_eval(ali) <= mis->pvt->max_edist) {
      break;
    }
    /* pair rejected: release everything created for it and try the next */
    gt_alignment_delete(ali);
    gt_seq_delete(seq_a);
    gt_seq_delete(seq_b);
    gt_free(a);
    gt_free(b);
  }
  arng = gt_alignment_get_urange(ali);
  brng = gt_alignment_get_vrange(ali);
  /* the descriptions are returned together with their lengths; keep those
     apart from the sequence lengths used above */
  adesc = gt_encseq_description(mis->pvt->es1, &desclen_a,
                                mis->pvt->seqno_es1);
  bdesc = gt_encseq_description(mis->pvt->es2, &desclen_b,
                                mis->pvt->seqno_es2);
  *match = gt_match_sw_new("", "",
                           mis->pvt->seqno_es1,
                           mis->pvt->seqno_es2,
                           gt_alignment_get_length(ali),
                           gt_alignment_eval(ali),
                           arng.start, brng.start,
                           arng.end, brng.end,
                           GT_MATCH_DIRECT);
  gt_match_set_seqid1_nt(*match, adesc, desclen_a);
  gt_match_set_seqid2_nt(*match, bdesc, desclen_b);
  gt_alignment_delete(ali);
  gt_seq_delete(seq_a);
  gt_seq_delete(seq_b);
  gt_free(a);
  gt_free(b);
  return GT_MATCHER_STATUS_OK;
}
Ejemplo n.º 8
0
/* Store in <*seq> a newly allocated buffer containing the part <start>..<end>
   of the sequence identified by <seqid>, as resolved by the region mapping
   <rm>. <start> and <end> are inclusive; the code converts them to 0-based
   positions by subtracting 1 (or the offset delivered by the sequence id
   mapping), i.e. they are treated as 1-based. Buffers allocated in this
   function hold exactly end - start + 1 bytes.

   The lookup strategy depends on <rm>:
   - userawseq: copy directly from the raw sequence;
   - <seqid> carries the MD5 prefix: delegate to gt_seq_col_md5_to_seq();
   - usedesc: map <seqid> via the seqid-to-seqnum mapping;
   - matchdesc: delegate to gt_seq_col_grep_desc();
   - useseqno: <seqid> must have the form "seqX", X being a sequence number
     in the encoded sequence;
   - otherwise: use the first sequence of the first file of the sequence
     collection.

   Returns 0 on success, or a non-zero value with <err> set on failure.
   NOTE(review): the raw sequence path performs no range check, and no path
   rejects start == 0 or end < start -- callers presumably guarantee
   1 <= start <= end; confirm. */
int gt_region_mapping_get_sequence(GtRegionMapping *rm, char **seq,
                                   GtStr *seqid, unsigned long start,
                                   unsigned long end, GtError *err)
{
  int had_err = 0;
  unsigned long offset = 1;
  GtRange range = {GT_UNDEF_ULONG, GT_UNDEF_ULONG};
  gt_error_check(err);
  gt_assert(rm && seq && seqid && gt_str_length(seqid) > 0);

  /* handle rawseq access first; this is the only place raw sequences are
     served from, all code below may rely on !rm->userawseq */
  if (rm->userawseq) {
    gt_assert(!rm->seqid2seqnum_mapping);
    *seq = gt_calloc(end - start + 1, sizeof (char));
    strncpy(*seq, rm->rawseq + start - 1, (end - start + 1) * sizeof (char));
    return 0;
  }

  /* make sure that correct sequence is loaded */
  had_err = update_seq_col_if_necessary(rm, seqid, err);
  if (had_err)
    return had_err;

  /* MD5 sequence id */
  if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
    return gt_seq_col_md5_to_seq(rm->seq_col, seq, start - offset,
                                 end - offset, seqid, err);
  }

  /* ``regular'' sequence ID */
  gt_assert(!rm->usedesc || rm->seqid2seqnum_mapping);
  gt_assert(rm->mapping || rm->seq_col);
  if (rm->usedesc) {
    unsigned long seqnum, filenum;
    gt_assert(rm->seqid2seqnum_mapping);
    range.start = start;
    range.end = end;
    /* the mapping may also replace <offset> */
    had_err = gt_seqid2seqnum_mapping_map(rm->seqid2seqnum_mapping,
                                          gt_str_get(seqid), &range, &seqnum,
                                          &filenum, &offset, err);

    if (!had_err) {
      if (range.end != GT_UNDEF_ULONG && range.start != GT_UNDEF_ULONG &&
            range.end >= gt_seq_col_get_sequence_length(rm->seq_col, filenum,
                                                        seqnum)
            + offset) {
        gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                     "``%s'' which is not covered by that sequence (with "
                     "boundaries %lu-%lu). Has the sequence-region "
                     "to sequence mapping been defined correctly?",
                     start, end, gt_str_get(seqid),
                     range.start, range.end);
        had_err = -1;
      }
    }
    if (!had_err) {
      *seq = gt_seq_col_get_sequence(rm->seq_col, filenum, seqnum,
                                     start - offset, end - offset);
    }
  } else if (rm->matchdesc) {
    gt_assert(!rm->seqid2seqnum_mapping);
    gt_assert(rm->seq_col);
    had_err = gt_seq_col_grep_desc(rm->seq_col, seq, start - 1, end - 1,
                                   seqid, err);
  } else if (rm->useseqno) {
    unsigned long seqno = GT_UNDEF_ULONG;
    gt_assert(rm->encseq);
    if (1 != sscanf(gt_str_get(seqid), "seq%lu", &seqno)) {
      gt_error_set(err, "seqid '%s' does not have the form 'seqX' "
                        "where X is a sequence number in the encoded "
                        "sequence", gt_str_get(seqid));
      had_err = -1;
    }
    gt_assert(had_err || seqno != GT_UNDEF_ULONG);
    if (!had_err && seqno >= gt_encseq_num_of_sequences(rm->encseq)) {
        gt_error_set(err, "trying to access sequence %lu, but encoded "
                          "sequence contains only %lu sequences",
                          seqno, gt_encseq_num_of_sequences(rm->encseq));
        had_err = -1;
    }
    if (!had_err) {
      unsigned long seqlength = gt_encseq_seqlength(rm->encseq, seqno);
      if (start > seqlength || end > seqlength) {
        gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                     "``%s'' which is not covered by that sequence (only "
                     "%lu characters in size). Has the sequence-region "
                     "to sequence mapping been defined correctly?",
                     start, end, gt_str_get(seqid), seqlength);
        had_err = -1;
      }
    }
    if (!had_err) {
      unsigned long seqstartpos;
      *seq = gt_calloc(end - start + 1, sizeof (char));
      seqstartpos = gt_encseq_seqstartpos(rm->encseq, seqno);
      gt_encseq_extract_decoded(rm->encseq, *seq, seqstartpos + start - 1,
                                seqstartpos + end - 1);
    }
  } else {
    /* fallback: first sequence of the first file of the collection */
    unsigned long seqlength;
    gt_assert(rm->seq_col);
    seqlength = gt_seq_col_get_sequence_length(rm->seq_col, 0, 0);
    if (start > seqlength || end > seqlength) {
      had_err = -1;
      gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                   "``%s'' which is not covered by that sequence (only "
                   "%lu characters in size). Has the sequence-region "
                   "to sequence mapping been defined correctly?",
                   start, end, gt_str_get(seqid), seqlength);
    }
    if (!had_err) {
      *seq = gt_seq_col_get_sequence(rm->seq_col, 0, 0, start - offset,
                                     end - offset);
    }
  }
  return had_err;
}