예제 #1
0
/*
 * Parse a contained-reads list in ASCII format from <infp>.
 *
 * As read by the fscanf calls below, the input consists of a header line
 * "[n: <number of reads>]" followed by one read number per line.
 *
 * If <alloc_cntlist> is true, a bit table for <*nofreads> bits is set up in
 * <*cntlist> using GT_INITBITTAB; otherwise the existing table in <*cntlist>
 * is updated (NOTE(review): its size is not verified here -- presumably the
 * caller guarantees it holds at least <*nofreads> bits). For every read
 * number found in the file the corresponding bit is set.
 *
 * Returns 0 on success. On malformed input (bad header, zero reads, a line
 * that is not a number, or a read number >= <*nofreads>) <err> is set and -1
 * is returned. A table allocated here is left in <*cntlist> on failure.
 */
static int gt_cntlist_parse_ascii(FILE *infp, bool alloc_cntlist,
    GtBitsequence **cntlist, GtUword *nofreads, GtError *err)
{
  int n;
  GtUword seqnum;

  gt_assert(infp != NULL && nofreads != NULL && cntlist != NULL);
  /*@i1@*/ gt_error_check(err);
  n = fscanf(infp, "[n: " GT_WU "]\n", nofreads);
  if (n != 1 || *nofreads == 0)
  {
    gt_error_set(err, "contained reads file: unrecognized format");
    return -1;
  }
  if (alloc_cntlist)
  {
    GT_INITBITTAB(*cntlist, *nofreads);
  }
  while (true)
  {
    n = fscanf(infp, "" GT_WU "\n", &seqnum);
    if (n == EOF)
      break;
    /* reject unparsable lines as well as read numbers that would address
       a bit outside the table announced by the header */
    if (n != 1 || seqnum >= *nofreads)
    {
      gt_error_set(err, "contained reads file: unrecognized format");
      return -1;
    }
    GT_SETIBIT(*cntlist, seqnum);
  }
  return 0;
}
예제 #2
0
/*
 * Parse a contained-reads list in binary format from <infp>.
 *
 * The header is consumed by gt_cntlist_parse_bin_or_bit_header(), which
 * stores the number of reads in <*nofreads>. The rest of the file is read
 * as a sequence of GtUword values, each one a read number whose bit is set
 * in <*cntlist>.
 *
 * If <alloc_cntlist> is true, a bit table for <*nofreads> bits is set up in
 * <*cntlist> using GT_INITBITTAB; otherwise the existing table is updated
 * (NOTE(review): its size is not verified here).
 *
 * Returns 0 on success. If the header parser fails its result is returned
 * unchanged. A read failure that is not end-of-file, or a read number
 * >= <*nofreads>, sets <err> and yields -1.
 */
static int gt_cntlist_parse_bin(FILE *infp, bool alloc_cntlist,
    GtBitsequence **cntlist, GtUword *nofreads, GtError *err)
{
  int had_err = gt_cntlist_parse_bin_or_bit_header(infp, nofreads, err);
  if (had_err == 0)
  {
    size_t n;
    GtUword seqnum;
    gt_assert(cntlist != NULL);
    if (alloc_cntlist)
    {
      GT_INITBITTAB(*cntlist, *nofreads);
    }
    while (true)
    {
      n = fread(&seqnum, sizeof (GtUword), (size_t)1, infp);
      if (n != (size_t)1)
      {
        /* a short read is only acceptable at end of file */
        if (!feof(infp))
        {
          gt_error_set(err, "contained reads file: unrecognized format");
          had_err = -1;
        }
        break;
      }
      /* never set a bit outside the table announced by the header */
      if (seqnum >= *nofreads)
      {
        gt_error_set(err, "contained reads file: unrecognized format");
        had_err = -1;
        break;
      }
      GT_SETIBIT(*cntlist, seqnum);
    }
  }
  return had_err;
}
예제 #3
0
/*
 * Initialise <storematch> for <encseq>: set up the hasmatch bit table with
 * one bit per sequence of <encseq> (via GT_INITBITTAB) and remember the
 * encoded sequence itself.
 */
void gt_initstorematch(Storematchinfo *storematch,
                    const GtEncseq *encseq)
{
  /* fetched once into a local so the macro argument has no side effects */
  unsigned long nofseqs;

  nofseqs = gt_encseq_num_of_sequences(encseq);
  GT_INITBITTAB(storematch->hasmatch, nofseqs);
  storematch->encseq = encseq;
}
예제 #4
0
/*
 * Prepare state->sspbittab and determine the length of the shortest
 * sequence.
 *
 * A bit table with <totallength> + 1 bits is set up. For every sequence
 * i >= 1 the position just before its start (the separator position) is
 * marked, and position <totallength> is marked as well. While walking the
 * separators, the distance between consecutive sequence starts and
 * separators is used as the sequence length, and the minimum is stored in
 * state->shortest.
 */
static void prepare_sspbittab_and_shortest(unsigned long totallength,
    ContfindBUstate *state)
{
  unsigned long length, lastseqstart = 0, i, ssp;

  GT_INITBITTAB(state->sspbittab, totallength + 1);
  state->shortest = totallength;
  /* "i < nofsequences" rather than "i <= nofsequences - 1": identical for
     any count >= 1, but does not wrap around when the count is 0 */
  for (i = 1UL; i < state->nofsequences; i++)
  {
    ssp = gt_encseq_seqstartpos(state->encseq, i) - 1;
    GT_SETIBIT(state->sspbittab, ssp);
    length = ssp - lastseqstart;
    lastseqstart = ssp + 1;
    if (length < state->shortest)
      state->shortest = length;
  }
  /* the last sequence ends at totallength */
  GT_SETIBIT(state->sspbittab, totallength);
  length = totallength - lastseqstart;
  if (length < state->shortest)
    state->shortest = length;
}
예제 #5
0
/*
 * Parse a contained-reads list stored as a raw bit table from <infp>.
 *
 * The header is consumed by gt_cntlist_parse_bin_or_bit_header(), which
 * stores the number of reads in <*nofreads>. The payload is expected to
 * hold GT_NUMOFINTSFORBITS(<*nofreads>) words of type GtBitsequence.
 *
 * If <alloc_cntlist> is true, a table is set up in <*cntlist> using
 * GT_INITBITTAB and the payload is read straight into it. Otherwise each
 * word of the payload is OR-ed into the corresponding word of the existing
 * table in <*cntlist> (NOTE(review): the existing table's size is not
 * verified here).
 *
 * Returns 0 on success. If the header parser fails its result is returned
 * unchanged. A payload shorter than expected sets <err> and yields -1.
 */
static int gt_cntlist_parse_bit(FILE *infp, bool alloc_cntlist,
    GtBitsequence **cntlist, GtUword *nofreads, GtError *err)
{
  int had_err = gt_cntlist_parse_bin_or_bit_header(infp, nofreads, err);
  if (had_err == 0)
  {
    size_t n;
    /* number of table words covering *nofreads bits */
    const size_t nofwords = (size_t) GT_NUMOFINTSFORBITS(*nofreads);
    gt_assert(cntlist != NULL);
    if (alloc_cntlist)
    {
      GT_INITBITTAB(*cntlist, *nofreads);
      n = fread(*cntlist, sizeof (GtBitsequence), nofwords, infp);
      if (n != nofwords)
      {
        gt_error_set(err, "contained reads file: unrecognized format");
        had_err = -1;
      }
    }
    else
    {
      /* combine using OR with existing data */
      size_t i;
      for (i = 0; i < nofwords; i++)
      {
        GtBitsequence value;
        n = fread(&value, sizeof (GtBitsequence), (size_t)1, infp);
        if (n != (size_t)1)
        {
          gt_error_set(err, "contained reads file: unrecognized format");
          had_err = -1;
          break;
        }
        /* parentheses are required: [] binds tighter than unary *, so
           "*cntlist[i]" would index the pointer-to-pointer instead of the
           table it points to */
        (*cntlist)[i] |= value;
      }
    }
  }
  return had_err;
}
예제 #6
0
/*
 * Run a pairwise overlap search over the reads in <encseq>.
 *
 * Every pair (i, j) with i <= j < n is visited, where n is the number of
 * sequences in <encseq> (halved if <revcompl> is true). For each pair the
 * decoded sequences are extracted into the local Read structures u and v and
 * passed, through the Data structure d, to find_approx_overlaps() (if
 * <use_dp>) or find_exact_overlaps().
 *
 * use_dp           select find_approx_overlaps() over find_exact_overlaps()
 * m                search mode stored in d.mode; replaced by
 *                  GT_OVLFIND_PROPER_SPM if it is GT_OVLFIND_ALL and
 *                  <cntfilter> is set. Unless it is GT_OVLFIND_SPM, each
 *                  search result is handed to mark_contained()
 * encseq           the reads; must not be NULL
 * revcompl         if true, only the first half of the sequences is
 *                  iterated and v is additionally searched using the
 *                  sequence at index (number of sequences - j - 1)
 * show_progressbar start/stop a progress bar around the search
 * use_kmp          prepare per-sequence values with prepare_kmp_values()
 *                  and attach them to u and v; also forwarded to
 *                  find_exact_overlaps()
 * max_error        forwarded to find_approx_overlaps() only
 * min_length,
 * find_nonmaximal  forwarded to both search functions
 * proc, proc_a,
 * procdata         forwarded to GT_RDJ_PAIRWISE_INIT_STRUCT_DATA
 * cntfilter        if true, skip pairs where the bit of i or j is set in
 *                  the table of contained reads
 * cntreads_in      existing table to use; if NULL and <m> is not
 *                  GT_OVLFIND_SPM, a table with n bits is set up here
 * cntreads_out     if not NULL, receives the table pointer; otherwise a
 *                  table created here is freed before returning
 * nofreads         if not NULL, receives n
 */
static inline void rdj_pairwise_generic(bool use_dp, GtOvlfindMode m,
    GtEncseq *encseq, bool revcompl, bool show_progressbar, bool use_kmp,
    double max_error, GtUword min_length, bool find_nonmaximal,
    GtSpmproc proc, GtSpmprocA proc_a, void* procdata, bool cntfilter,
    GtBitsequence *cntreads_in, GtBitsequence **cntreads_out,
    GtUword *nofreads)
{
  GtContfind containment_status;
  GtBitsequence *cntreads = NULL;
  GtUint64 progress = 0;
  GtUword i, j, startpos, v_seqnum, nofsequences, n;
  struct Read u, v;
  struct Data d;
  gt_kmp_t** kmp_values = NULL;

  GT_RDJ_PAIRWISE_INIT_STRUCT_DATA(d, proc, proc_a, procdata, &u, &v, 0);

  gt_assert(encseq != NULL);

  d.mode = m;
  if ((m == GT_OVLFIND_ALL) && cntfilter)
    d.mode = GT_OVLFIND_PROPER_SPM;

  n = gt_encseq_num_of_sequences(encseq);
  /* kmp values are prepared for all sequences, before n is halved below */
  if (use_kmp)
    kmp_values = prepare_kmp_values(encseq, n);
  nofsequences = n;
  /* with revcompl only the first half of the sequences is iterated; the
     second half is addressed through v_seqnum further down */
  if (revcompl)
    n = n >> 1;
  if (cntreads_in != NULL)
    cntreads = cntreads_in;
  else if (m != GT_OVLFIND_SPM)
    GT_INITBITTAB(cntreads, n);
  /* NOTE(review): the announced total is n*(n-1)/2, while the loops below
     (j starts at i) can increment progress up to n*(n+1)/2 times */
  if (show_progressbar) gt_progressbar_start(&progress, (GtUint64)n *
      ((GtUint64)n - 1ULL) / 2ULL);

  for (i = 0; i < n; i++)
  {
    /* decode read i into a freshly allocated, NUL-terminated buffer */
    u.seqnum = i;
    u.direct = true;
    u.len = gt_encseq_seqlength(encseq, i);
    u.seq = gt_malloc(sizeof (char) * (u.len + 1));
    startpos = gt_encseq_seqstartpos(encseq, i);
    gt_encseq_extract_decoded(encseq, u.seq, startpos, startpos + u.len - 1);
    u.seq[u.len] = '\0';
    if (use_kmp)
    {
      gt_assert(kmp_values != NULL);
      u.pi = kmp_values[i];
    }

    for (j = i; j < n; j++)
    {
      if (cntfilter)
      {
        gt_assert(cntreads != NULL);
        /* the bit of i is re-tested on every iteration because
           mark_contained() is called with cntreads inside this loop;
           both tests happen before v.seq is allocated, so leaving the
           iteration here does not leak */
        if ((bool)GT_ISIBITSET(cntreads, i)) break;
        if ((bool)GT_ISIBITSET(cntreads, j)) continue;
      }

      v.seqnum = j;

      /* find overlaps using direct v */
      v.direct = true;
      v.len = gt_encseq_seqlength(encseq, j);
      v.seq = gt_malloc(sizeof (char) * (v.len + 1));
      startpos = gt_encseq_seqstartpos(encseq, j);
      gt_encseq_extract_decoded(encseq, v.seq, startpos,
          startpos + v.len - 1);
      v.seq[v.len] = '\0';
      if (use_kmp)
      {
        gt_assert(kmp_values != NULL);
        v.pi = kmp_values[j];
      }
      containment_status = use_dp
          ? find_approx_overlaps(&d, max_error, min_length, find_nonmaximal)
          : find_exact_overlaps(&d, use_kmp, min_length, find_nonmaximal);
      if (m != GT_OVLFIND_SPM)
        mark_contained(containment_status, u.seqnum, v.seqnum, cntreads);

      /* find overlaps using reverse complement of v */
      if (revcompl)
      {
        v_seqnum =  nofsequences - j - 1;
        v.direct = false;
        gt_assert(gt_encseq_seqlength(encseq, j) ==
            gt_encseq_seqlength(encseq, v_seqnum));
        /* the buffer of v is reused: lengths are asserted equal above, so
           the terminating NUL written earlier stays in place */
        startpos = gt_encseq_seqstartpos(encseq, v_seqnum);
        gt_encseq_extract_decoded(encseq, v.seq, startpos,
            startpos + v.len - 1);
        if (use_kmp)
        {
          gt_assert(kmp_values != NULL);
          v.pi = kmp_values[v_seqnum];
        }
        containment_status = use_dp
          ? find_approx_overlaps(&d, max_error, min_length, find_nonmaximal)
          : find_exact_overlaps(&d, use_kmp, min_length, find_nonmaximal);
        /* v.seqnum is still j here, not v_seqnum */
        if (m != GT_OVLFIND_SPM)
          mark_contained(containment_status, u.seqnum, v.seqnum, cntreads);
      }
      gt_free(v.seq);
      progress++;
    }
    gt_free(u.seq);
  }

  /* hand the table to the caller, or free it if it was created here and
     nobody asked for it */
  if (cntreads_out != NULL)
    *cntreads_out = cntreads;
  else if (cntreads_in == NULL)
    gt_free(cntreads);
  if (nofreads != NULL)
    *nofreads = n;
  /* n was halved for revcompl; double it again to pass the count that was
     used with prepare_kmp_values() (assuming that count was even) */
  if (use_kmp)
    free_kmp_values(kmp_values, revcompl ? n << 1 : n);
  if (show_progressbar)
    gt_progressbar_stop();
}
/*
 * Runner of the containment test tool.
 *
 * Depending on arguments->test, the table of contained reads is obtained in
 * one of three ways:
 *  - GT_READJOINER_CNTTEST_SHOWLIST: parsed from the file named
 *    <readset> + GT_READJOINER_SUFFIX_CNTLIST using gt_cntlist_parse();
 *  - GT_READJOINER_CNTTEST_BRUTEFORCE / GT_READJOINER_CNTTEST_KMP: computed
 *    by gt_rdj_pairwise_exact() on the loaded (and, unless
 *    arguments->singlestrand is set, mirrored) encoded sequence;
 *  - GT_READJOINER_CNTTEST_ESA: computed by gt_contfind_bottomup() on a
 *    sequential suffix array reader opened for <readset>.
 * On success the table is passed to gt_cntlist_show().
 *
 * <tool_arguments> must point to a GtReadjoinerCnttestArguments structure.
 * Returns 0 on success and a non-zero value on error, in which case <err>
 * is set by the failing step.
 */
static int gt_readjoiner_cnttest_runner(GT_UNUSED int argc,
    GT_UNUSED const char **argv, GT_UNUSED int parsed_args,
    void *tool_arguments, GT_UNUSED GtError *err)
{
  GtReadjoinerCnttestArguments *arguments = tool_arguments;
  GtEncseqLoader *el = NULL;
  GtEncseq *reads = NULL;
  GtBitsequence *bits = NULL;
  /* initialised so that no path can hand an indeterminate value on */
  GtUword nofreads = 0;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->test == GT_READJOINER_CNTTEST_SHOWLIST)
  {
    GtStr *fn = NULL;
    fn = gt_str_clone(arguments->readset);
    gt_str_append_cstr(fn, GT_READJOINER_SUFFIX_CNTLIST);
    had_err = gt_cntlist_parse(gt_str_get(fn), true, &bits, &nofreads, err);
    gt_str_delete(fn);
  }
  else if (arguments->test == GT_READJOINER_CNTTEST_BRUTEFORCE ||
      arguments->test == GT_READJOINER_CNTTEST_KMP)
  {
    el = gt_encseq_loader_new();
    gt_encseq_loader_drop_description_support(el);
    gt_encseq_loader_disable_autosupport(el);
    if (!arguments->singlestrand)
      gt_encseq_loader_mirror(el);
    reads = gt_encseq_loader_load(el, gt_str_get(arguments->readset), err);
    if (reads == NULL)
      had_err = -1;
    else
    {
      gt_rdj_pairwise_exact(GT_OVLFIND_CNT, reads, !arguments->singlestrand,
          false, arguments->test == GT_READJOINER_CNTTEST_KMP, 1UL, true,
          NULL, NULL, false, NULL, &bits, &nofreads);
    }
    gt_encseq_delete(reads);
    gt_encseq_loader_delete(el);
  }
  else if (arguments->test == GT_READJOINER_CNTTEST_ESA)
  {
    Sequentialsuffixarrayreader *ssar = NULL;
    GtUword readlength = 0, firstrevcompl = 0;
    GtLogger *verbose_logger = gt_logger_new(arguments->verbose,
        GT_LOGGER_DEFLT_PREFIX, stderr);
    ssar = gt_newSequentialsuffixarrayreaderfromfile(gt_str_get(
          arguments->readset), SARR_LCPTAB | SARR_SUFTAB | SARR_SSPTAB,
        true, verbose_logger, err);
    if (gt_error_is_set(err))
      had_err = -1;
    else
    {
      nofreads = gt_encseq_num_of_sequences(ssar->encseq);
      if (!arguments->singlestrand)
      {
        /* only half of the sequences are reads when both strands are
           considered; the second half starts at index nofreads */
        nofreads = GT_DIV2(nofreads);
        firstrevcompl = nofreads;
      }
      GT_INITBITTAB(bits, nofreads);
      /* readlength stays 0 unless both strands are considered and the
         encoded sequence uses the equal-length access type */
      if (!arguments->singlestrand &&
          gt_encseq_accesstype_get(ssar->encseq) == GT_ACCESS_TYPE_EQUALLENGTH)
      {
        readlength = gt_encseq_seqlength(ssar->encseq, 0);
      }
      (void)gt_contfind_bottomup(ssar, false, bits, arguments->singlestrand ? 0
          : firstrevcompl, readlength);
    }
    if (ssar != NULL)
      gt_freeSequentialsuffixarrayreader(&ssar);
    gt_logger_delete(verbose_logger);
  }
  else
  {
    /* unknown test selector: a programming error. Should the assertion be
       compiled out, report an error instead of showing an empty table */
    gt_assert(false);
    gt_error_set(err, "contained reads test: unknown test");
    had_err = -1;
  }
  if (!had_err)
    had_err = gt_cntlist_show(bits, nofreads, NULL, false, err);
  gt_free(bits);
  return had_err;
}