Ejemplo n.º 1
0
int gt_sarrquerysubstringmatch(const GtUchar *dbseq,
                               GtUword dblen,
                               const GtUchar *query,
                               GtUword querylen,
                               unsigned int minlength,
                               GtAlphabet *alpha,
                               GtProcessquerymatch processquerymatch,
                               void *processquerymatchinfo,
                               GtLogger *logger,
                               GtError *err)
{
  unsigned int numofchars, recommendedprefixlength;
  bool haserr = false;
  GtEncseq *dbencseq;
  GtEncseqBuilder *eb;

  gt_assert(querylen >= (GtUword) minlength && dblen >= (GtUword) minlength);
  eb = gt_encseq_builder_new(alpha);
  gt_encseq_builder_disable_multiseq_support(eb);
  gt_encseq_builder_disable_description_support(eb);
  gt_encseq_builder_set_logger(eb, logger);
  gt_encseq_builder_add_multiple_encoded(eb,dbseq,dblen);
  dbencseq = gt_encseq_builder_build(eb, err);
  gt_encseq_builder_delete(eb);
  numofchars = gt_alphabet_num_of_chars(alpha);
  recommendedprefixlength
    = gt_recommendedprefixlength(numofchars,dblen,
                                 GT_RECOMMENDED_MULTIPLIER_DEFAULT,
                                 true);
  if (gt_constructsarrandrunmmsearch(dbencseq,
                                     GT_READMODE_FORWARD,
                                     recommendedprefixlength,
                                     1U, /* parts */
                                     0, /* maximumspace */
                                     query,
                                     querylen,
                                     minlength,
                                     processquerymatch,
                                     processquerymatchinfo,
                                     NULL,
                                     false,
                                     logger,
                                     err) != 0)
  {
    haserr = true;
  }
  gt_encseq_delete(dbencseq);
  dbencseq = NULL;
  return haserr ? -1 : 0;
}
Ejemplo n.º 2
0
static int sarrselfsubstringmatch(const GtUchar *dbseq,
                                  GtUword dblen,
                                  const GtUchar *query,
                                  GtUword querylen,
                                  unsigned int minlength,
                                  GtAlphabet *alpha,
                                  Processmaxpairs processmaxpairs,
                                  void *processmaxpairsinfo,
                                  GtLogger *logger,
                                  GtError *err)
{
  Substringmatchinfo ssi;
  unsigned int numofchars, recommendedprefixlength;
  GtEncseqBuilder *eb;
  bool haserr = false;

  eb = gt_encseq_builder_new(alpha);
  gt_encseq_builder_disable_multiseq_support(eb);
  gt_encseq_builder_disable_description_support(eb);
  gt_encseq_builder_set_logger(eb, logger);
  gt_encseq_builder_add_encoded(eb, dbseq, dblen, NULL);
  gt_encseq_builder_add_encoded(eb, query, querylen, NULL);
  ssi.encseq = gt_encseq_builder_build(eb, err);
  gt_encseq_builder_delete(eb);

  ssi.minlength = minlength;
  ssi.processmaxpairs = processmaxpairs;
  ssi.processmaxpairsinfo = processmaxpairsinfo;
  numofchars = gt_alphabet_num_of_chars(alpha);
  recommendedprefixlength
    = gt_recommendedprefixlength(numofchars,
                                 dblen+querylen+1,
                                 GT_RECOMMENDED_MULTIPLIER_DEFAULT,
                                 true);
  if (constructsarrandrunmaxpairs(&ssi,
                                  GT_READMODE_FORWARD,
                                  recommendedprefixlength,
                                  1U, /* parts */
                                  0, /* maximumspace */
                                  NULL,
                                  false,
                                  logger,
                                  err) != 0)
  {
    haserr = true;
  }
  gt_encseq_delete(ssi.encseq);
  ssi.encseq = NULL;
  return haserr ? -1 : 0;
}
Ejemplo n.º 3
0
int gt_determinetyrbckpfxlen(unsigned int *prefixlength,
                          const Tyrindex *tyrindex,
                          const Definedunsignedint *callprefixlength,
                          GtError *err)
{
  gt_error_check(err);
  if (callprefixlength->defined)
  {
    unsigned int maxprefixlen
      = gt_whatisthemaximalprefixlength(tyrindex->alphasize,
                                     (GtUword) tyrindex->numofmers,
                                     0,true);
    if (maxprefixlen > (unsigned int) tyrindex->mersize)
    {
      maxprefixlen = (unsigned int) tyrindex->mersize;
    }
    if (gt_checkprefixlength(maxprefixlen,callprefixlength->valueunsignedint,
                          err) != 0)
    {
      return -1;
    }
    *prefixlength = callprefixlength->valueunsignedint;
  } else
  {
    unsigned int recommendedprefixlength
      = gt_recommendedprefixlength(tyrindex->alphasize,
                                   (GtUword) tyrindex->numofmers,
                                   GT_RECOMMENDED_MULTIPLIER_DEFAULT,
                                   true);
    if (recommendedprefixlength > (unsigned int) tyrindex->mersize)
    {
      recommendedprefixlength = (unsigned int) tyrindex->mersize;
    }
    *prefixlength = recommendedprefixlength;
  }
  return 0;
}
extern int
gt_packedindex_chk_search(int argc, const char *argv[], GtError *err)
{
  struct chkSearchOptions params;
  Suffixarray suffixarray;
  Enumpatterniterator *epi = NULL;
  bool saIsLoaded = false;
  BWTSeq *bwtSeq = NULL;
  GtStr *inputProject = NULL;
  int parsedArgs;
  bool had_err = false;
  BWTSeqExactMatchesIterator EMIter;
  bool EMIterInitialized = false;
  GtLogger *logger = NULL;
  inputProject = gt_str_new();

  do {
    gt_error_check(err);
    {
      bool exitNow = false;
      switch (parseChkBWTOptions(&parsedArgs, argc, argv, &params,
                                 inputProject, err))
      {
      case GT_OPTION_PARSER_OK:
        break;
      case GT_OPTION_PARSER_ERROR:
        had_err = true;
        exitNow = true;
        break;
      case GT_OPTION_PARSER_REQUESTS_EXIT:
        exitNow = true;
        break;
      }
      if (exitNow)
        break;
    }
    gt_str_set(inputProject, argv[parsedArgs]);

    logger = gt_logger_new(params.verboseOutput,
                           GT_LOGGER_DEFLT_PREFIX, stdout);

    bwtSeq = gt_availBWTSeq(&params.idx.final, logger, err);
    if ((had_err = bwtSeq == NULL))
      break;

    {
      enum verifyBWTSeqErrCode retval =
        gt_BWTSeqVerifyIntegrity(bwtSeq, gt_str_get(inputProject), params.flags,
                              params.progressInterval, stderr, logger, err);
      if ((had_err = (retval != VERIFY_BWTSEQ_NO_ERROR)))
      {
        fprintf(stderr, "index integrity check failed: %s\n",
                gt_error_get(err));
        gt_error_set(err, "aborted because of index integrity check fail");
        break;
      }
    }
    if (BWTSeqHasLocateInformation(bwtSeq))
    {
      if ((had_err = !gt_initEmptyEMIterator(&EMIter, bwtSeq)))
      {
        gt_error_set(err, "Cannot create matches iterator for sequence index.");
        break;
      }
      EMIterInitialized = true;
    }
    {
      unsigned long totalLen, dbstart;
      unsigned long trial, patternLen;

      if ((had_err =
           gt_mapsuffixarray(&suffixarray, SARR_SUFTAB | SARR_ESQTAB,
                             gt_str_get(inputProject), NULL, err) != 0))
      {
        gt_error_set(err, "Can't load suffix array project with"
                  " demand for encoded sequence and suffix table files\n");
        break;
      }
      totalLen = gt_encseq_total_length(suffixarray.encseq);
      saIsLoaded = true;
      if ((had_err = (params.minPatLen >= 0L && params.maxPatLen >= 0L
                      && params.minPatLen > params.maxPatLen)))
      {
        gt_error_set(err, "Invalid pattern lengths selected: min=%ld, max=%ld;"
                  " min <= max is required.", params.minPatLen,
                  params.maxPatLen);
        break;
      }
      if (params.minPatLen < 0 || params.maxPatLen < 0)
      {
        unsigned int numofchars
          = gt_alphabet_num_of_chars(
                               gt_encseq_alphabet(suffixarray.encseq));
        if (params.minPatLen < 0)
          params.minPatLen
            = gt_recommendedprefixlength(numofchars,
                                         totalLen,
                                         GT_RECOMMENDED_MULTIPLIER_DEFAULT,
                                         true);
        if (params.maxPatLen < 0)
          params.maxPatLen =
            MAX(params.minPatLen,
                125 * gt_recommendedprefixlength(numofchars,totalLen,
                                         GT_RECOMMENDED_MULTIPLIER_DEFAULT,
                                         true)/100);
        else
          params.maxPatLen = MAX(params.maxPatLen, params.minPatLen);
      }
      fprintf(stderr, "Using patterns of lengths %lu to %lu\n",
              params.minPatLen, params.maxPatLen);
      if ((had_err = totalLen + 1 != BWTSeqLength(bwtSeq)))
      {
        gt_error_set(err, "base suffix array and index have diferrent lengths!"
                          "%lu vs. %lu",  totalLen + 1,
                  BWTSeqLength(bwtSeq));
        break;
      }
      if ((had_err =
           (epi = gt_newenumpatterniterator(params.minPatLen, params.maxPatLen,
                                         suffixarray.encseq,
                                         err)) == NULL))
      {
        fputs("Creation of pattern iterator failed!\n", stderr);
        break;
      }
      for (trial = 0; !had_err && trial < params.numOfSamples; ++trial)
      {
        const GtUchar *pptr = gt_nextEnumpatterniterator(&patternLen, epi);
        GtMMsearchiterator *mmsi =
          gt_mmsearchiterator_new_complete_olain(suffixarray.encseq,
                                            suffixarray.suftab,
                                            0,  /* leftbound */
                                            totalLen, /* rightbound */
                                            0, /* offset */
                                            suffixarray.readmode,
                                            pptr,
                                            patternLen);
        if (BWTSeqHasLocateInformation(bwtSeq))
        {
          if ((had_err = !gt_reinitEMIterator(&EMIter, bwtSeq, pptr, patternLen,
                                           false)))
          {
            fputs("Internal error: failed to reinitialize pattern match"
                  " iterator", stderr);
            abort();
          }
          gt_assert(gt_EMINumMatchesTotal(&EMIter) ==
                    gt_BWTSeqMatchCount(bwtSeq, pptr, patternLen,
                                        false));
          gt_assert(gt_EMINumMatchesTotal(&EMIter)
                      == gt_mmsearchiterator_count(mmsi));
          while (gt_mmsearchiterator_next(&dbstart,mmsi))
          {
            unsigned long matchPos = 0;
            bool match = EMIGetNextMatch(&EMIter, &matchPos, bwtSeq);
            if ((had_err = !match))
            {
              gt_error_set(err,
                           "matches of packedindex expired before mmsearch!");
              break;
            }
            if ((had_err = matchPos != dbstart))
            {
              gt_error_set(err, "packedindex match doesn't equal mmsearch "
                           "match result!\n%lu vs. %lu\n",
                           matchPos, dbstart);
            }
          }
          if (!had_err)
          {
            unsigned long matchPos;
            bool trailingMatch = EMIGetNextMatch(&EMIter, &matchPos, bwtSeq);
            if ((had_err = trailingMatch))
            {
              gt_error_set(err, "matches of mmsearch expired before fmindex!");
              break;
            }
          }
        }
        else
        {
          unsigned long numFMIMatches = gt_BWTSeqMatchCount(bwtSeq, pptr,
                                                         patternLen,
                                                         false),
            numMMSearchMatches = gt_mmsearchiterator_count(mmsi);
          if ((had_err = numFMIMatches != numMMSearchMatches))
          {
            gt_error_set(err, "Number of matches not equal for suffix array ("
                              "%lu) and fmindex (%lu).\n",
                      numFMIMatches, numMMSearchMatches);
          }
        }
        gt_mmsearchiterator_delete(mmsi);
        mmsi = NULL;
        if (params.progressInterval && !((trial + 1) % params.progressInterval))
          putc('.', stderr);
      }
      if (params.progressInterval)
        putc('\n', stderr);
      fprintf(stderr, "Finished %lu of %lu matchings successfully.\n",
              trial, params.numOfSamples);
    }
  } while (0);
  if (EMIterInitialized) gt_destructEMIterator(&EMIter);
  if (saIsLoaded) gt_freesuffixarray(&suffixarray);
  gt_freeEnumpatterniterator(epi);
  if (bwtSeq) gt_deleteBWTSeq(bwtSeq);
  if (logger) gt_logger_delete(logger);
  if (inputProject) gt_str_delete(inputProject);
  return had_err?-1:0;
}