Пример #1
0
static int processleafedge_spmsk(bool firstedge,
                                 GtUword fd,
                                 GtBUinfo_spmsk *finfo,
                                 GtUword seqnum,
                                 GtUword relpos,
                                 GtBUstate_spmsk *state,
                                 GT_UNUSED GtError *err)

{
  if (fd >= state->minmatchlength)
  {
    if (firstedge)
    {
      gt_assert(finfo != NULL);
      ((GtBUinfo_spmsk *) finfo)->firstinW = state->Wset.nextfreeGtUlong;
    }
    if (relpos == 0)
    {
      GT_STOREINARRAY(&state->Wset,GtUlong,128,seqnum);
    }
    if (relpos + fd == gt_encseq_seqlength(state->encseq,seqnum))
    {
      GT_STOREINARRAY(&state->Lset,GtUlong,128,seqnum);
    }
  }
  return 0;
}
Пример #2
0
Pckbuckettable *gt_pckbuckettable_new(const FMindex *fmindex,
                                      unsigned int numofchars,
                                      unsigned long totallength,
                                      unsigned int maxdepth)
{
  GtArrayPckbck_Boundsatdepth stack;
  Pckbck_Boundsatdepth parent, child;
  unsigned long rangesize, idx, *rangeOccs;
  Pckbuckettable *pckbt;
  Mbtab *tmpmbtab;

  GT_INITARRAY(&stack,Pckbck_Boundsatdepth);
  child.lowerbound = 0;
  child.upperbound = totallength+1;
  child.depth = 0;
  child.code = (GtCodetype) 0;
  GT_STOREINARRAY(&stack,Pckbck_Boundsatdepth,128,child);
  rangeOccs = gt_malloc(sizeof (*rangeOccs) * GT_MULT2(numofchars));
  tmpmbtab = gt_malloc(sizeof (*tmpmbtab) * numofchars);
  pckbt = pckbuckettable_allocandinittable(numofchars,maxdepth,true);
  while (stack.nextfreePckbck_Boundsatdepth > 0)
  {
    parent
      = stack.spacePckbck_Boundsatdepth[--stack.nextfreePckbck_Boundsatdepth];
    gt_assert(parent.lowerbound < parent.upperbound);
    rangesize = gt_bwtrangesplitallwithoutspecial(tmpmbtab,
                                                  rangeOccs,
                                                  fmindex,
                                                  parent.lowerbound,
                                                  parent.upperbound);
    gt_assert(rangesize <= (unsigned long) numofchars);
    for (idx = 0; idx < rangesize; idx++)
    {
      child.lowerbound = tmpmbtab[idx].lowerbound;
      child.upperbound = tmpmbtab[idx].upperbound;
      child.depth = parent.depth + 1;
      gt_assert(child.depth <= maxdepth);
      child.code = parent.code * numofchars + idx;
      pckbuckettable_storeBoundsatdepth(pckbt,&child);
      if (child.depth < maxdepth)
      {
        if (child.lowerbound + 1 < child.upperbound)
        {
          GT_STOREINARRAY(&stack,Pckbck_Boundsatdepth,128,child);
        } else
        {
          pckbuckettable_followleafedge(pckbt,fmindex,&child);
        }
      }
    }
  }
  GT_FREEARRAY(&stack,Pckbck_Boundsatdepth);
  gt_free(rangeOccs);
  gt_free(tmpmbtab);
  printf("filled: %lu (%.2f)\n",pckbt->numofvalues,
                        (double) pckbt->numofvalues/pckbt->maxnumofvalues);
  return pckbt;
}
Пример #3
0
static void outkmeroccurrence(void *processinfo,
                              const GtKmercode *kmercode)
{
  GtArrayGtCodetype *codelist = (GtArrayGtCodetype *) processinfo;

  GT_STOREINARRAY(codelist,GtCodetype,1024,kmercode->code);
}
static void gt_kmer_database_add_to_hash(GtHashmap *hash, GtCodetype kmercode,
                                         GtUword position)
{
  GtArrayGtUword *arr =
    (GtArrayGtUword *) gt_hashmap_get(hash, (void *) kmercode);

  if (arr == NULL) {
    arr = gt_malloc(sizeof (*arr));
    GT_INITARRAY(arr, GtUword);
    gt_hashmap_add(hash, (void *) kmercode, (void *) arr);
  }
  if (arr->allocatedGtUword == 0)
    GT_STOREINARRAY(arr, GtUword,
                    (GtUword) 20,
                    position);
  else
    GT_STOREINARRAY(arr, GtUword,
                    arr->allocatedGtUword * 0.1,
                    position);
}
Пример #5
0
static void collectkmercode(GtArrayGtCodetype *codelist,
                            const GtEncseq *encseq,
                            unsigned int kmersize,
                            unsigned int numofchars,
                            GtUword stringtotallength)
{
  GtUword offset;
  GtCodetype code;

  for (offset=0; offset<=stringtotallength; offset++)
  {
    code = qgram2codefillspecial(numofchars,
                                 kmersize,
                                 encseq,
                                 GT_READMODE_FORWARD,
                                 offset,
                                 stringtotallength);
    GT_STOREINARRAY(codelist,GtCodetype,1024,code);
  }
}
Пример #6
0
/*read condenseq data structure from file*/
GtCondenseq *gt_condenseq_new_from_file(const char *indexname,
                                        GtLogger *logger, GtError *err)
{
  int had_err = 0;
  FILE* fp;
  GtEncseqLoader *esl;
  GtEncseq *unique_es;
  GtCondenseq *condenseq = NULL;
  /*load unique_es*/
  esl = gt_encseq_loader_new();
  unique_es = gt_encseq_loader_load(esl, indexname, err);
  if (!unique_es)
    had_err = -1;
  if (!had_err) {
    gt_encseq_loader_delete(esl);
    condenseq = condenseq_new_empty(gt_encseq_alphabet(unique_es));
    condenseq->filename = gt_cstr_dup(indexname);
    condenseq->unique_es = unique_es;
    fp = gt_fa_fopen_with_suffix(indexname, GT_CONDENSEQ_FILE_SUFFIX,
                                 "rb", err);
    if (fp == NULL) {
      had_err = -1;
    }
    else {
      had_err = condenseq_io(condenseq, fp, gt_io_error_fread, err);
      if (!had_err) {
        GtUword i;
        gt_assert(condenseq->uniques);
        gt_assert(condenseq->links);
        gt_fa_fclose(fp);
        /*create link array for each unique entry*/
        for (i = 0; i < condenseq->udb_nelems; i++) {
          GT_INITARRAY(&(condenseq->uniques[i].links),uint32_t);
        }
        /* check for overflows */
        if (condenseq->ldb_nelems > (GtUword) ((uint32_t) 0 - (uint32_t) 1)) {
          gt_error_set(err, "Overflow, to many link-elements. Can't be stored");
          had_err = -1;
        }
        /* iterate through link entrys and store ids in corresponding unique
          entry array */
        for (i = 0; !had_err && (GtUword) i < condenseq->ldb_nelems; i++) {
          GtUword uid = condenseq->links[i].unique_id;
          gt_assert(uid < condenseq->udb_nelems);
          GT_STOREINARRAY(&(condenseq->uniques[uid].links),
                          uint32_t,
                          10,
                          (uint32_t) i);
        }
      }
    }
  }
  if (!had_err) {
    gt_assert(condenseq != NULL);
    if (condenseq->id_len != GT_UNDEF_UWORD)
      gt_logger_log(logger, "IDs const len: " GT_WU, condenseq->id_len);
    else
      gt_logger_log(logger, "using sdstab to access IDs");
  }
  if (had_err) {
    gt_condenseq_delete(condenseq);
    condenseq = NULL;
  }
  return (condenseq);
}
Пример #7
0
void gt_evalxdroparbitscoresextend(bool forward,
                                   GtXdropbest *xdropbest,
                                   GtXdropresources *res,
                                   const GtSeqabstract *useq,
                                   const GtSeqabstract *vseq,
                                   GtXdropscore xdropbelowscore)
{
  const GtWord ulen = (GtWord) gt_seqabstract_length(useq),
               vlen = (GtWord) gt_seqabstract_length(vseq),
               end_k =
                 (GtWord) ulen - vlen, /* diagonal of endpoint (ulen, vlen) */
               integermax = (GtWord) MAX(ulen, vlen),
               integermin = -integermax,
               dback = GT_XDROP_SETDBACK(xdropbelowscore);
  GtWord idx,
         lbound,    /* diagonal lower bound */
         ubound,    /* diagonal upper bound */
         currd = 0, /* distance */
         k;         /* lbound - 1 <= k <= ubound + 1*/
  /*The following function calculates the maximal allowed number of
    generations with all front values equal minus infinity.*/
  const int allowedMININFINITYINTgenerations = MAX(MAX(res->arbitdistances.mis,
                                                       res->arbitdistances.ins),
                                                   res->arbitdistances.del) - 1;
  int currentMININFINITYINTgeneration = 0;
  GtXdropfrontvalue tmpfront;
  GtXdropscore bigt_tmp;        /* best score T' seen already */
  bool alwaysMININFINITYINT = true;

  gt_assert(ulen != 0 && vlen != 0);
  res->big_t.nextfreeGtXdropscore = 0;
  res->fronts.nextfreeGtXdropfrontvalue = 0;
  /* phase 0 */
  idx =  (GtWord) gt_seqabstract_lcp(forward, useq, vseq,0,0);
  /* alignment already finished */
  if (idx >= ulen || idx >= vlen) {
    lbound =  1L;
    ubound = -1L;
  }
  else {
    lbound = 0;
    ubound = 0;
  }
  tmpfront.row = (GtWord) idx;
  tmpfront.direction = (GtUchar) 0;   /* no predecessor */
  gt_xdrop_frontvalue_set(res,0,0,tmpfront);
  xdropbest->score = bigt_tmp = GT_XDROP_EVAL(idx + idx, 0);
  gt_assert(idx >= 0);
  xdropbest->ivalue = xdropbest->jvalue = (GtUword) idx;
  xdropbest->best_d = currd;
  xdropbest->best_k = 0;
  GT_STOREINARRAY (&res->big_t, GtXdropscore, 10, bigt_tmp);

  /* phase d > 0 */
  while (lbound <= ubound) {
    currd++;
    /* calculate fronts */
    for (k = lbound - 1; k <= ubound + 1; k++) {
      GtWord i = integermin, row;
      /* case 1 : DELETION-EDGE  */
      if (lbound < k &&
          currd - res->arbitdistances.del >= 0 &&
          -(currd - res->arbitdistances.del) <= k - 1 &&
          k - 1 <= currd - res->arbitdistances.del) {
        i = gt_xdrop_frontvalue_get(res, currd - res->arbitdistances.del, k-1)
            + 1;
        tmpfront.direction = GT_XDROP_DELETIONBIT;
      }
      /* case 2: REPLACEMENT-EDGE */
      if (lbound <= k &&
          k <= ubound &&
          currd - res->arbitdistances.mis >= 0 &&
          -(currd - res->arbitdistances.mis) <= k &&
          k <= currd - res->arbitdistances.mis) {
        row = gt_xdrop_frontvalue_get(res, currd - res->arbitdistances.mis, k)
              + 1;
        /* test, if case 1 has happened. */
        if (!(tmpfront.direction & GT_XDROP_DELETIONBIT) || row > i) {
          i = row;
          tmpfront.direction = GT_XDROP_REPLACEMENTBIT;
        }
      }
      /* case 3: INSERTION-EDGE */
      if (k < ubound &&
          currd - res->arbitdistances.ins >= 0 &&
          -(currd - res->arbitdistances.ins) <= k + 1 &&
           k + 1 <= currd - res->arbitdistances.ins) {
        row =
          gt_xdrop_frontvalue_get(res, currd - res->arbitdistances.ins, k+1);
        if (!(tmpfront.direction & (GT_XDROP_DELETIONBIT |
                                    GT_XDROP_REPLACEMENTBIT)) || row > i) {
          i = row;
          tmpfront.direction = GT_XDROP_INSERTIONBIT;
        }
      }
      /* if i = MINUSINFINITYINY or MINUSINFINITYINY + 1 */
      if (i < 0) {
        if (tmpfront.direction == (GtUchar) 0)
          alwaysMININFINITYINT = false;

        tmpfront.row = integermin;
      }
      else {
        GtWord j = i - k;
        const GtWord previousd = currd - dback;

        /* alignment score smaller than T - X */
        if (previousd > 0 &&
            res->big_t.spaceGtXdropscore != NULL &&
            GT_XDROP_EVAL (i + j, currd) <
            res->big_t.spaceGtXdropscore[previousd] - xdropbelowscore) {
          tmpfront.row = integermin;
        }
        else {
          if (k <= -currd || k >= currd ||
              (gt_xdrop_frontvalue_get(res, currd-1, k) < i &&
               i <= MIN(ulen, vlen + k))) {
            if (ulen > i && vlen > j) {
              GtUword lcp;
              gt_assert(forward || (ulen - 1 >= (GtWord) i &&
                                    vlen - 1 >= (GtWord) j));
              lcp = gt_seqabstract_lcp(forward, useq, vseq,i,j);
              i += lcp;
              j += lcp;
            }
            alwaysMININFINITYINT = false;
            tmpfront.row = i;
            if (GT_XDROP_EVAL(i + j, currd) > bigt_tmp) {
              xdropbest->score = bigt_tmp = GT_XDROP_EVAL(i + j, currd);
              gt_assert(i >= 0 && j >= 0);
              xdropbest->ivalue = (GtUword) i;
              xdropbest->jvalue = (GtUword) j;
              xdropbest->best_d = currd;
              xdropbest->best_k = k;
            }
          }
          else {
            alwaysMININFINITYINT = false;
            tmpfront.row = gt_xdrop_frontvalue_get(res,currd-1,k);
          }
        }
      }
      gt_xdrop_frontvalue_set(res, currd, k, tmpfront);
    }
    /* if all front values are integermin, alignment prematurely finished if
       allowedMININFINITYINTgenerations exceeded (full front has already ended
       at currd - currentMININFINITYINTgeneration). */
    if (alwaysMININFINITYINT) {
      currentMININFINITYINTgeneration++;
      if (currentMININFINITYINTgeneration > allowedMININFINITYINTgenerations)
        break;

    }
    else {
      currentMININFINITYINTgeneration = 0;
      alwaysMININFINITYINT = true;
    }
    GT_STOREINARRAY (&res->big_t, GtXdropscore, 10, bigt_tmp);
    /* fill out of bounds values of integermin
       needed for gt_showfrontvalues function */
    for (k = -currd; k < lbound - 1; k++) {
      tmpfront.row = integermin;
      gt_xdrop_frontvalue_set(res,currd,k,tmpfront);
    }
    for (k = ubound + 2; k <= currd; k++) {
      tmpfront.row = integermin;
      gt_xdrop_frontvalue_set(res,currd,k,tmpfront);
    }
    /* alignment finished */
    if (-currd <= end_k && end_k <= currd &&
        gt_xdrop_frontvalue_get(res,currd,end_k) == ulen)
      break;

    /* pruning lower bound
       lbound may decrease by one or increase/stays the same
       l <- min{k:R(d,k) > -inf} */
    for (k = lbound - 1; k <= ubound + 1; k++) {
      if (gt_xdrop_frontvalue_get(res,currd,k) > integermin) {
        lbound = k;
        break;
      }
    }
    /* pruning upper bound
       ubound may increase by one or decrease/stays the same
       u <- max{k:R(d,k) > -inf} */
    for (k = ubound + 1; k >= lbound - 1; k--) {
      if (gt_xdrop_frontvalue_get(res,currd,k) > integermin) {
        ubound = k;
        break;
      }
    }
    /* handling boundaries lower bound */
    for (k = 0; k >= lbound; k--) {
      if (gt_xdrop_frontvalue_get(res,currd,k) == vlen + k) {
        lbound = k;
        break;
      }
    }
    /* handling boundaries upper bound */
    for (k = 0; k <= ubound; k++) {
      if (gt_xdrop_frontvalue_get(res,currd,k) == ulen) {
        ubound = k;
        break;
      }
    }
  }
}
static int gt_seq_iterator_sequence_buffer_next(GtSeqIterator *si,
                                               const GtUchar **sequence,
                                               unsigned long *len,
                                               char **desc,
                                               GtError *err)
{
  GtSeqIteratorSequenceBuffer *seqit;
  GtUchar charcode;
  int retval;
  bool haserr = false, foundseq = false;
  gt_assert(si);
  gt_assert(len && desc);

  seqit = gt_seq_iterator_sequence_buffer_cast(si);
  gt_assert((sequence && seqit->withsequence) || !seqit->withsequence);

  if (seqit->exhausted)
  {
    return 0;
  }
  while (true)
  {
    retval = gt_sequence_buffer_next(seqit->fb,&charcode,err);
    if (retval < 0)
    {
      haserr = true;
      break;
    }
    if (retval == 0)
    {
      seqit->exhausted = true;
      break;
    }
    if (seqit->currentread < seqit->maxread)
    {
      seqit->currentread++;
    }
    if (charcode == (GtUchar) SEPARATOR)
    {
      if (seqit->sequencebuffer.nextfreeGtUchar == 0 && seqit->withsequence)
      {
        gt_error_set(err,"sequence %llu is empty", seqit->unitnum);
        haserr = true;
        break;
      }
      *desc = (char*) gt_desc_buffer_get_next(seqit->descptr);
      *len = seqit->sequencebuffer.nextfreeGtUchar;
      if (seqit->withsequence)
      {
        /* make sure the outgoing sequence is '\0' terminated */
        seqit->sequencebuffer.spaceGtUchar
          [seqit->sequencebuffer.nextfreeGtUchar] = (GtUchar) '\0';
        *sequence = seqit->sequencebuffer.spaceGtUchar;
      }
      seqit->sequencebuffer.nextfreeGtUchar = 0;
      foundseq = true;
      seqit->unitnum++;
      break;
    }
    if (seqit->withsequence)
    {
      GT_STOREINARRAY(&seqit->sequencebuffer, GtUchar,
                   MAX(1024UL, seqit->sequencebuffer.nextfreeGtUchar * 0.5),
                   charcode);
    } else
    {
      seqit->sequencebuffer.nextfreeGtUchar++;
    }
  }
  if (!haserr && seqit->sequencebuffer.nextfreeGtUchar > 0)
  {
    *desc = (char*) gt_desc_buffer_get_next(seqit->descptr);
    if (seqit->withsequence)
    {
      /* make sure the outgoing sequence is '\0' terminated */
      seqit->sequencebuffer.spaceGtUchar
        [seqit->sequencebuffer.nextfreeGtUchar] = (GtUchar) '\0';
      *sequence = seqit->sequencebuffer.spaceGtUchar;
    }
    *len = seqit->sequencebuffer.nextfreeGtUchar;
    foundseq = true;
    seqit->sequencebuffer.nextfreeGtUchar = 0;
  }
  if (haserr)
  {
    return -1;
  }
  if (foundseq)
  {
    return 1;
  }
  return 0;
}
Пример #9
0
void gt_wlis_filter_evaluate(GtArrayGtUword *chain,
                             GtUword *sum_distance_chain,
                             GtUword *sum_aligned_len_chain,
                             GtUword *chain_weighted_score,
                             GtWLisFilterMatches *wlismatches)
{
  GtUword bestchain_idx, *fwd, *bck;

  if (wlismatches->items.nextfreeGtWlisItem == 0)
  {
    return;
  }
  gt_assert((chain == NULL && sum_distance_chain != NULL
                           && sum_aligned_len_chain != NULL
                           && chain_weighted_score != NULL) ||
            (chain != NULL && sum_distance_chain == NULL
                           && sum_aligned_len_chain == NULL
                           && chain_weighted_score == NULL));

  /* sort by query seuqence */
  qsort(wlismatches->items.spaceGtWlisItem,
        (size_t) wlismatches->items.nextfreeGtWlisItem,
        sizeof *wlismatches->items.spaceGtWlisItem,
        gt_alignment_link_compare);

  /* call filter algorithm */
  bestchain_idx = gt_filter_apply(wlismatches);

  /* get the chain by backtracing */
  if (chain == NULL)
  {
    *chain_weighted_score = GT_WLIS_ACC(bestchain_idx).score;
  }
  do {
    if (chain != NULL)
    {
      GT_STOREINARRAY(chain,GtUword,chain->allocatedGtUword * 0.2 + 256,
                      GT_WLIS_ACC(bestchain_idx).oi_di.original_index);
    } else
    {
      gt_assert(sum_distance_chain != NULL && sum_aligned_len_chain != NULL);
      *sum_distance_chain += GT_WLIS_ACC(bestchain_idx).oi_di.distance;
      *sum_aligned_len_chain +=
         gt_wlis_filter_aligned_len(wlismatches->items.spaceGtWlisItem +
                                    bestchain_idx);
    }
    bestchain_idx = GT_WLIS_ACC(bestchain_idx).prev;
  } while (bestchain_idx != GT_WLIS_FILTER_UNDEF(wlismatches));

  if (chain != NULL)
  {
  /* invert the order */
    for (fwd = chain->spaceGtUword, bck = fwd + chain->nextfreeGtUword - 1;
         fwd < bck; fwd++, bck--)
    {
      GtUword tmp = *fwd;
      *fwd = *bck;
      *bck = tmp;
    }
  }
}