Example #1
0
int gt_callenumselfmatches(const char *indexname,
                           GtReadmode queryreadmode,
                           unsigned int userdefinedleastlength,
                           GtProcessquerymatch processquerymatch,
                           void *processquerymatchinfo,
                           GtLogger *logger,
                           GtError *err)
{
  Suffixarray suffixarray;
  bool haserr = false;

  gt_assert(queryreadmode != GT_READMODE_FORWARD);
  if (gt_mapsuffixarray(&suffixarray,
                        SARR_ESQTAB | SARR_SUFTAB | SARR_SSPTAB,
                        indexname,
                        logger,
                        err) != 0)
  {
    haserr = true;
  } else
  {
    unsigned long seqnum, numofsequences, seqlength, seqstartpos;
    GtQuerymatch *querymatchspaceptr = gt_querymatch_new();
    GtQueryrep queryrep;

    numofsequences = gt_encseq_num_of_sequences(suffixarray.encseq);
    queryrep.sequence = NULL;
    queryrep.reversecopy = false;
    queryrep.encseq = suffixarray.encseq;
    queryrep.readmode = queryreadmode;
    for (seqnum = 0; seqnum < numofsequences; seqnum++)
    {
      seqstartpos = gt_encseq_seqstartpos(suffixarray.encseq, seqnum);
      seqlength = gt_encseq_seqlength(suffixarray.encseq, seqnum);
      if (seqlength >= (unsigned long) userdefinedleastlength)
      {
        queryrep.startpos = seqstartpos;
        queryrep.length = seqlength;
        if (gt_querysubstringmatch(true,
                                   &suffixarray,
                                   (uint64_t) seqnum,
                                   &queryrep,
                                   (unsigned long) userdefinedleastlength,
                                   processquerymatch,
                                   processquerymatchinfo,
                                   querymatchspaceptr,
                                   err) != 0)
        {
          haserr = true;
          break;
        }
      }
    }
    gt_querymatch_delete(querymatchspaceptr);
  }
  gt_freesuffixarray(&suffixarray);
  return haserr ? -1 : 0;
}
Example #2
0
int gt_callenumquerymatches(const char *indexname,
                            const GtStrArray *queryfiles,
                            bool findmums,
                            bool forwardstrand,
                            bool reversestrand,
                            unsigned int userdefinedleastlength,
                            GtProcessquerybeforematching
                              processquerybeforematching,
                            GtProcessquerymatch processquerymatch,
                            void *processquerymatchinfo,
                            GtLogger *logger,
                            GtError *err)
{
  Suffixarray suffixarray;
  bool haserr = false;

  if (gt_mapsuffixarray(&suffixarray,
                        SARR_ESQTAB | SARR_SUFTAB | SARR_SSPTAB,
                        indexname,
                        logger,
                        err) != 0)
  {
    haserr = true;
  } else
  {
    if (gt_callenumquerymatches_withindex(findmums
                                            ? gt_queryuniquematch
                                            : gt_querysubstringmatch,
                                          &suffixarray,
                                          queryfiles,
                                          forwardstrand,
                                          reversestrand,
                                          userdefinedleastlength,
                                          processquerybeforematching,
                                          processquerymatch,
                                          processquerymatchinfo,
                                          err) != 0)
    {
      haserr = true;
    }
  }
  gt_freesuffixarray(&suffixarray);
  return haserr ? -1 : 0;
}
Example #3
0
static int gt_matstat_runner(GT_UNUSED int argc, GT_UNUSED const char **argv,
                             GT_UNUSED int parsed_args,
                             void *tool_arguments, GtError *err)
{
  Gfmsubcallinfo *arguments = tool_arguments;
  Fmindex fmindex;
  Suffixarray suffixarray;
  void *packedindex = NULL;
  GtLogger *logger = NULL;
  bool haserr = false;
  const GtAlphabet *alphabet = NULL;
#ifdef WITHBCKTAB
  unsigned int prefixlength = 0;
#endif
  GtUword totallength;
  bool gt_mapfmindexfail = false;
  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(false, GT_LOGGER_DEFLT_PREFIX, stdout);
  if (arguments->indextype == Fmindextype)
  {
    if (gt_mapfmindex(&fmindex,gt_str_get(arguments->indexname),
                      logger, err) != 0)
    {
      haserr = true;
      gt_mapfmindexfail = true;
    } else
    {
      alphabet = fmindex.alphabet;
    }
    totallength = fmindex.bwtlength-1;
  } else
  {
    unsigned int mappedbits;

    if (arguments->indextype == Esaindextype)
    {
      mappedbits = SARR_ESQTAB | SARR_SUFTAB
#undef WITHBCKTAB
#ifdef WITHBCKTAB
                   | SARR_BCKTAB
#endif
                   ;
    } else
    {
      if (dotestsequence(arguments))
      {
        mappedbits = SARR_ESQTAB;
      } else
      {
        mappedbits = 0;
      }
    }
    if (gt_mapsuffixarray(&suffixarray,
                       mappedbits,
                       gt_str_get(arguments->indexname),
                       logger,
                       err) != 0)
    {
      haserr = true;
      totallength = 0;
    } else
    {
      alphabet = gt_encseq_alphabet(suffixarray.encseq);
#ifdef WITHBCKTAB
      prefixlength = suffixarray.prefixlength;
#endif
      totallength = gt_encseq_total_length(suffixarray.encseq);
    }
    if (!haserr)
    {
      if (arguments->indextype == Packedindextype)
      {
        packedindex =
          gt_loadvoidBWTSeqForSA(gt_str_get(arguments->indexname),
                                 false,
                                 err);
        if (packedindex == NULL)
        {
          haserr = true;
        }
      }
    }
  }
  if (!haserr)
  {
    const void *theindex;
    Greedygmatchforwardfunction gmatchforwardfunction;

    if (arguments->indextype == Fmindextype)
    {
      theindex = (const void *) &fmindex;
      if (arguments->doms)
      {
        gmatchforwardfunction = gt_skfmmstats;
      } else
      {
        gmatchforwardfunction = gt_skfmuniqueforward;
      }
    } else
    {
      if (arguments->indextype == Esaindextype)
      {
        theindex = (const void *) &suffixarray;
        if (arguments->doms)
        {
          gmatchforwardfunction = gt_suffixarraymstats;
        } else
        {
          gmatchforwardfunction = gt_suffixarrayuniqueforward;
        }
      } else
      {
        gt_assert(arguments->indextype == Packedindextype);
        theindex = (const void *) packedindex;
        if (arguments->doms)
        {
          gmatchforwardfunction = gt_voidpackedindexmstatsforward;
        } else
        {
          gmatchforwardfunction = gt_voidpackedindexuniqueforward;
        }
      }
    }
    if (!haserr)
    {
#ifdef WITHBCKTAB
      if (prefixlength > 0 &&
          arguments->indextype == Esaindextype &&
          runsubstringiteration(gmatchforwardfunction,
                                theindex,
                                totallength,
                                suffixarray.bcktab,
                                suffixarray.countspecialcodes,
                                alphabet,
                                prefixlength,
                                arguments->queryfilenames,
                                err) != 0)

      {
        haserr = true;
      }
#endif
      if (!haserr &&
          gt_findsubquerygmatchforward(dotestsequence(arguments)
                                      ? suffixarray.encseq
                                      : NULL,
                                      theindex,
                                      totallength,
                                      gmatchforwardfunction,
                                      alphabet,
                                      arguments->queryfilenames,
                                      arguments->minlength,
                                      arguments->maxlength,
                                      (arguments->showmode & SHOWSEQUENCE)
                                             ? true : false,
                                      (arguments->showmode & SHOWQUERYPOS)
                                             ? true : false,
                                      (arguments->showmode & SHOWSUBJECTPOS)
                                             ? true : false,
                                      err) != 0)
      {
        haserr = true;
      }
    }
  }
  if (arguments->indextype == Fmindextype)
  {
    if (!gt_mapfmindexfail)
    {
      gt_freefmindex(&fmindex);
    }
  } else
  {
    if (arguments->indextype == Packedindextype && packedindex != NULL)
    {
      gt_deletevoidBWTSeq(packedindex);
    }
    gt_freesuffixarray(&suffixarray);
  }
  gt_logger_delete(logger);

  return haserr ? -1 : 0;;
}
extern int
gt_packedindex_chk_search(int argc, const char *argv[], GtError *err)
{
  struct chkSearchOptions params;
  Suffixarray suffixarray;
  Enumpatterniterator *epi = NULL;
  bool saIsLoaded = false;
  BWTSeq *bwtSeq = NULL;
  GtStr *inputProject = NULL;
  int parsedArgs;
  bool had_err = false;
  BWTSeqExactMatchesIterator EMIter;
  bool EMIterInitialized = false;
  GtLogger *logger = NULL;
  inputProject = gt_str_new();

  do {
    gt_error_check(err);
    {
      bool exitNow = false;
      switch (parseChkBWTOptions(&parsedArgs, argc, argv, &params,
                                 inputProject, err))
      {
      case GT_OPTION_PARSER_OK:
        break;
      case GT_OPTION_PARSER_ERROR:
        had_err = true;
        exitNow = true;
        break;
      case GT_OPTION_PARSER_REQUESTS_EXIT:
        exitNow = true;
        break;
      }
      if (exitNow)
        break;
    }
    gt_str_set(inputProject, argv[parsedArgs]);

    logger = gt_logger_new(params.verboseOutput,
                           GT_LOGGER_DEFLT_PREFIX, stdout);

    bwtSeq = gt_availBWTSeq(&params.idx.final, logger, err);
    if ((had_err = bwtSeq == NULL))
      break;

    {
      enum verifyBWTSeqErrCode retval =
        gt_BWTSeqVerifyIntegrity(bwtSeq, gt_str_get(inputProject), params.flags,
                              params.progressInterval, stderr, logger, err);
      if ((had_err = (retval != VERIFY_BWTSEQ_NO_ERROR)))
      {
        fprintf(stderr, "index integrity check failed: %s\n",
                gt_error_get(err));
        gt_error_set(err, "aborted because of index integrity check fail");
        break;
      }
    }
    if (BWTSeqHasLocateInformation(bwtSeq))
    {
      if ((had_err = !gt_initEmptyEMIterator(&EMIter, bwtSeq)))
      {
        gt_error_set(err, "Cannot create matches iterator for sequence index.");
        break;
      }
      EMIterInitialized = true;
    }
    {
      unsigned long totalLen, dbstart;
      unsigned long trial, patternLen;

      if ((had_err =
           gt_mapsuffixarray(&suffixarray, SARR_SUFTAB | SARR_ESQTAB,
                             gt_str_get(inputProject), NULL, err) != 0))
      {
        gt_error_set(err, "Can't load suffix array project with"
                  " demand for encoded sequence and suffix table files\n");
        break;
      }
      totalLen = gt_encseq_total_length(suffixarray.encseq);
      saIsLoaded = true;
      if ((had_err = (params.minPatLen >= 0L && params.maxPatLen >= 0L
                      && params.minPatLen > params.maxPatLen)))
      {
        gt_error_set(err, "Invalid pattern lengths selected: min=%ld, max=%ld;"
                  " min <= max is required.", params.minPatLen,
                  params.maxPatLen);
        break;
      }
      if (params.minPatLen < 0 || params.maxPatLen < 0)
      {
        unsigned int numofchars
          = gt_alphabet_num_of_chars(
                               gt_encseq_alphabet(suffixarray.encseq));
        if (params.minPatLen < 0)
          params.minPatLen
            = gt_recommendedprefixlength(numofchars,
                                         totalLen,
                                         GT_RECOMMENDED_MULTIPLIER_DEFAULT,
                                         true);
        if (params.maxPatLen < 0)
          params.maxPatLen =
            MAX(params.minPatLen,
                125 * gt_recommendedprefixlength(numofchars,totalLen,
                                         GT_RECOMMENDED_MULTIPLIER_DEFAULT,
                                         true)/100);
        else
          params.maxPatLen = MAX(params.maxPatLen, params.minPatLen);
      }
      fprintf(stderr, "Using patterns of lengths %lu to %lu\n",
              params.minPatLen, params.maxPatLen);
      if ((had_err = totalLen + 1 != BWTSeqLength(bwtSeq)))
      {
        gt_error_set(err, "base suffix array and index have diferrent lengths!"
                          "%lu vs. %lu",  totalLen + 1,
                  BWTSeqLength(bwtSeq));
        break;
      }
      if ((had_err =
           (epi = gt_newenumpatterniterator(params.minPatLen, params.maxPatLen,
                                         suffixarray.encseq,
                                         err)) == NULL))
      {
        fputs("Creation of pattern iterator failed!\n", stderr);
        break;
      }
      for (trial = 0; !had_err && trial < params.numOfSamples; ++trial)
      {
        const GtUchar *pptr = gt_nextEnumpatterniterator(&patternLen, epi);
        GtMMsearchiterator *mmsi =
          gt_mmsearchiterator_new_complete_olain(suffixarray.encseq,
                                            suffixarray.suftab,
                                            0,  /* leftbound */
                                            totalLen, /* rightbound */
                                            0, /* offset */
                                            suffixarray.readmode,
                                            pptr,
                                            patternLen);
        if (BWTSeqHasLocateInformation(bwtSeq))
        {
          if ((had_err = !gt_reinitEMIterator(&EMIter, bwtSeq, pptr, patternLen,
                                           false)))
          {
            fputs("Internal error: failed to reinitialize pattern match"
                  " iterator", stderr);
            abort();
          }
          gt_assert(gt_EMINumMatchesTotal(&EMIter) ==
                    gt_BWTSeqMatchCount(bwtSeq, pptr, patternLen,
                                        false));
          gt_assert(gt_EMINumMatchesTotal(&EMIter)
                      == gt_mmsearchiterator_count(mmsi));
          while (gt_mmsearchiterator_next(&dbstart,mmsi))
          {
            unsigned long matchPos = 0;
            bool match = EMIGetNextMatch(&EMIter, &matchPos, bwtSeq);
            if ((had_err = !match))
            {
              gt_error_set(err,
                           "matches of packedindex expired before mmsearch!");
              break;
            }
            if ((had_err = matchPos != dbstart))
            {
              gt_error_set(err, "packedindex match doesn't equal mmsearch "
                           "match result!\n%lu vs. %lu\n",
                           matchPos, dbstart);
            }
          }
          if (!had_err)
          {
            unsigned long matchPos;
            bool trailingMatch = EMIGetNextMatch(&EMIter, &matchPos, bwtSeq);
            if ((had_err = trailingMatch))
            {
              gt_error_set(err, "matches of mmsearch expired before fmindex!");
              break;
            }
          }
        }
        else
        {
          unsigned long numFMIMatches = gt_BWTSeqMatchCount(bwtSeq, pptr,
                                                         patternLen,
                                                         false),
            numMMSearchMatches = gt_mmsearchiterator_count(mmsi);
          if ((had_err = numFMIMatches != numMMSearchMatches))
          {
            gt_error_set(err, "Number of matches not equal for suffix array ("
                              "%lu) and fmindex (%lu).\n",
                      numFMIMatches, numMMSearchMatches);
          }
        }
        gt_mmsearchiterator_delete(mmsi);
        mmsi = NULL;
        if (params.progressInterval && !((trial + 1) % params.progressInterval))
          putc('.', stderr);
      }
      if (params.progressInterval)
        putc('\n', stderr);
      fprintf(stderr, "Finished %lu of %lu matchings successfully.\n",
              trial, params.numOfSamples);
    }
  } while (0);
  if (EMIterInitialized) gt_destructEMIterator(&EMIter);
  if (saIsLoaded) gt_freesuffixarray(&suffixarray);
  gt_freeEnumpatterniterator(epi);
  if (bwtSeq) gt_deleteBWTSeq(bwtSeq);
  if (logger) gt_logger_delete(logger);
  if (inputProject) gt_str_delete(inputProject);
  return had_err?-1:0;
}
Example #5
0
enum verifyBWTSeqErrCode
gt_BWTSeqVerifyIntegrity(BWTSeq *bwtSeq, const char *projectName,
                      int checkFlags,
                      GtUword tickPrint, FILE *fp,
                      GtLogger *verbosity, GtError *err)
{
  Suffixarray suffixArray;
  struct extBitsRetrieval extBits;
  bool suffixArrayIsInitialized = false, extBitsAreInitialized = false;
  enum verifyBWTSeqErrCode retval = VERIFY_BWTSEQ_NO_ERROR;
  do
  {
    GtUword seqLen;
    gt_assert(bwtSeq && projectName && err);
    gt_error_check(err);

    initExtBitsRetrieval(&extBits);
    extBitsAreInitialized = true;

    if (gt_mapsuffixarray(&suffixArray,
                       SARR_SUFTAB | SARR_ESQTAB, projectName, verbosity, err))
    {
      gt_error_set(err, "Cannot load reference suffix array project with"
                    " demand for suffix table file and encoded sequence"
                    " for project: %s", projectName);
      retval = VERIFY_BWTSEQ_REFLOAD_ERROR;
      break;
    }
    suffixArrayIsInitialized = true;
    seqLen = gt_encseq_total_length(suffixArray.encseq) + 1;
    if (BWTSeqLength(bwtSeq) != seqLen)
    {
      gt_error_set(err, "length mismatch for suffix array project %s and "
                "bwt sequence index", projectName);
      retval = VERIFY_BWTSEQ_LENCOMPARE_ERROR;
      break;
    }

    if (checkFlags & VERIFY_BWTSEQ_SUFVAL
        && BWTSeqHasLocateInformation(bwtSeq))
    {
      GtUword i;
      for (i = 0; i < seqLen && retval == VERIFY_BWTSEQ_NO_ERROR; ++i)
      {
        if (gt_BWTSeqPosHasLocateInfo(bwtSeq, i, &extBits))
        {
          GtUword sfxArrayValue = gt_BWTSeqLocateMatch(bwtSeq, i,
                                                             &extBits);
          if (sfxArrayValue != ESASUFFIXPTRGET(suffixArray.suftab,i))
          {
            gt_error_set(err, "Failed suffix array value comparison"
                          " at position "GT_WU": "GT_WU" != "GT_WU"",
                          i, sfxArrayValue,
                          ESASUFFIXPTRGET(suffixArray.suftab,i));
            retval = VERIFY_BWTSEQ_SUFVAL_ERROR;
            break;
          }
        }
        if (tickPrint && !((i + 1) % tickPrint))
          putc('.', fp);
      }
      if (tickPrint)
        putc('\n', fp);
      if (retval != VERIFY_BWTSEQ_NO_ERROR)
        break;
    }
    else if (checkFlags & VERIFY_BWTSEQ_SUFVAL)
    {
      gt_error_set(err, "check of suffix array values was requested,"
                " but index contains no  locate information!");
      retval = VERIFY_BWTSEQ_SUFVAL_ERROR;
      break;
    }
    else if (!(checkFlags & VERIFY_BWTSEQ_SUFVAL)
             && BWTSeqHasLocateInformation(bwtSeq))
    {
      fputs("Not checking suftab values.\n", stderr);
    }
    if (BWTSeqHasLocateInformation(bwtSeq))
    {
      GtUword nextLocate = BWTSeqTerminatorPos(bwtSeq);
      if (suffixArray.longest.defined &&
          suffixArray.longest.valueunsignedlong != nextLocate)
      {
        gt_error_set(err, "terminator/0-rotation position mismatch "GT_WU""
                  " vs. "GT_WU"", suffixArray.longest.valueunsignedlong,
                  nextLocate);
        retval = VERIFY_BWTSEQ_TERMPOS_ERROR;
        break;
      }
      if ((checkFlags & VERIFY_BWTSEQ_LFMAPWALK)
          && (bwtSeq->featureToggles & BWTReversiblySorted))
      {
        GtUword i = seqLen;
        /* handle first symbol specially because the encseq
         * will not return the terminator symbol */
        {
          Symbol sym = BWTSeqGetSym(bwtSeq, nextLocate);
          if (sym != UNDEFBWTCHAR)
          {
            gt_error_set(err, "symbol mismatch at position "GT_WU": "
                      "%d vs. reference symbol %d", i - 1, (int)sym,
                      (int)UNDEFBWTCHAR);
            retval = VERIFY_BWTSEQ_LFMAPWALK_ERROR;
            break;
          }
          --i;
          nextLocate = BWTSeqLFMap(bwtSeq, nextLocate, &extBits);
        }
        while (i > 0)
        {
          Symbol symRef =
                         gt_encseq_get_encoded_char(suffixArray.encseq,
                                                          --i,
                                                          suffixArray.readmode);
          Symbol symCmp = BWTSeqGetSym(bwtSeq, nextLocate);
          if (symCmp != symRef)
          {
            gt_error_set(err, "symbol mismatch at position "GT_WU": "
                      "%d vs. reference symbol %d", i, symCmp, symRef);
            retval = VERIFY_BWTSEQ_LFMAPWALK_ERROR;
            break;
          }
          nextLocate = BWTSeqLFMap(bwtSeq, nextLocate, &extBits);
        }
        if (retval != VERIFY_BWTSEQ_NO_ERROR)
          break;
      }
      else if ((checkFlags & VERIFY_BWTSEQ_LFMAPWALK)
               && !(bwtSeq->featureToggles & BWTReversiblySorted))
      {
        gt_error_set(err, "requested complete backwards regeneration in index"
                  " without regeneration capability");
        retval = VERIFY_BWTSEQ_LFMAPWALK_IMP_ERROR;
        break;
      }
    }
    if (checkFlags & VERIFY_BWTSEQ_CONTEXT)
    {
      BWTSeqContextRetriever *bwtSeqCR =
        gt_BWTSeqCRLoad(bwtSeq, projectName, CTX_MAP_ILOG_AUTOSIZE);
      if (!bwtSeqCR)
      {
        gt_error_set(err, "cannot load BWT sequence context access table"
                  " for project %s", projectName);
        retval = VERIFY_BWTSEQ_CONTEXT_LOADFAIL;
        break;
      }
      fputs("Checking context regeneration.\n", stderr);
      {
        GtUword i, start, subSeqLen,
          maxSubSeqLen = MIN(MAX(MIN_CONTEXT_LEN, seqLen/CONTEXT_FRACTION),
                             MAX_CONTEXT_LEN),
          numTries = MIN(MAX_NUM_CONTEXT_CHECKS,
                         MAX(2, seqLen/CONTEXT_INTERVAL));
        Symbol *contextBuf = gt_malloc(sizeof (Symbol) * MAX_CONTEXT_LEN);
        GtEncseqReader *esr =
           gt_encseq_create_reader_with_readmode(suffixArray.encseq,
                                                 suffixArray.readmode,
                                                 0);
        for (i = 0; i < numTries && retval == VERIFY_BWTSEQ_NO_ERROR; ++i)
        {
          GtUword j, end, inSubSeqLen;
          subSeqLen = random()%maxSubSeqLen + 1;
          start = random()%(seqLen - subSeqLen + 1);
          end = start + subSeqLen;
          inSubSeqLen = subSeqLen - ((end==seqLen)?1:0);
          gt_BWTSeqCRAccessSubseq(bwtSeqCR, start, subSeqLen, contextBuf);
          gt_encseq_reader_reinit_with_readmode(esr, suffixArray.encseq,
                                                suffixArray.readmode, start);
          for (j = 0; j < inSubSeqLen; ++j)
          {
            Symbol symRef = gt_encseq_reader_next_encoded_char(esr);
            Symbol symCmp = contextBuf[j];
            if (symCmp != symRef)
            {
              gt_error_set(err, "symbol mismatch at position "GT_WU": "
                        "%d vs. reference symbol %d", start + j, (int)symCmp,
                        (int)symRef);
              retval = VERIFY_BWTSEQ_CONTEXT_SYMFAIL;
              break;
            }
          }
          while (j < subSeqLen)
          {
            Symbol symRef = UNDEFBWTCHAR;
            Symbol symCmp = contextBuf[j];
            if (symCmp != symRef)
            {
              gt_error_set(err, "symbol mismatch at position "GT_WU": "
                        "%d vs. reference symbol %d", start + j, (int)symCmp,
                        (int)symRef);
              retval = VERIFY_BWTSEQ_CONTEXT_SYMFAIL;
              break;
            }
            ++j;
          }
        }
        if (retval == VERIFY_BWTSEQ_NO_ERROR)
          fputs("Context regeneration completed successfully.\n", stderr);
        gt_encseq_reader_delete(esr);
        gt_free(contextBuf);
      }
      gt_deleteBWTSeqCR(bwtSeqCR);
    }
  } while (0);
  if (suffixArrayIsInitialized) gt_freesuffixarray(&suffixArray);
  if (extBitsAreInitialized) destructExtBitsRetrieval(&extBits);
  return retval;
}