/*
 * Consistency check: re-count the occurrences of the mer that starts at
 * <position> by running an mmsearch over the suffix table stored in
 * <dfsstate>, and compare that count with <countocc>.
 *
 * The mer is first copied, character by character, into
 * dfsstate->currentmer (mersize encoded characters read from
 * dfsstate->encseq in dfsstate->readmode).
 *
 * If the two counts differ, a diagnostic is written to stderr and the
 * process terminates with GT_EXIT_PROGRAMMING_ERROR.
 */
static void checknumberofoccurrences(const TyrDfsstate *dfsstate,
                                     GtUword countocc,
                                     GtUword position)
{
  GtMMsearchiterator *searchiter;
  GtUword charpos = 0, counted;

  /* fill the scratch buffer with the encoded mer */
  while (charpos < dfsstate->mersize)
  {
    dfsstate->currentmer[charpos]
      = gt_encseq_get_encoded_char(dfsstate->encseq,
                                   position + charpos,
                                   dfsstate->readmode);
    charpos++;
  }

  searchiter = gt_mmsearchiterator_new_complete_plain(dfsstate->encseq,
                                                      dfsstate->suftab,
                                                      0,
                                                      dfsstate->totallength,
                                                      0,
                                                      dfsstate->readmode,
                                                      dfsstate->currentmer,
                                                      dfsstate->mersize);
  counted = gt_mmsearchiterator_count(searchiter);
  if (counted == countocc)
  {
    /* counts agree: release the iterator and return normally */
    gt_mmsearchiterator_delete(searchiter);
    return;
  }

  fprintf(stderr, "bfcount = " GT_WU " != " GT_WU " = countocc\n",
          counted, countocc);
  exit(GT_EXIT_PROGRAMMING_ERROR);
}
/*
 * Release a query-substring-match iterator: deletes the embedded mmsearch
 * iterator and sequence iterator, then frees the object itself.
 * Passing NULL is allowed and does nothing.
 */
void gt_querysubstringmatchiterator_delete(GtQuerysubstringmatchiterator *qsmi)
{
  if (qsmi == NULL)
  {
    return;
  }
  gt_mmsearchiterator_delete(qsmi->mmsi);
  gt_seq_iterator_delete(qsmi->seqit);
  gt_free(qsmi);
}
static void gt_querysubstringmatch(bool selfmatch, const GtEncseq *dbencseq, const ESASuffixptr *suftabpart, GtReadmode readmode, GtUword numberofsuffixes, uint64_t queryunitnum, GtQueryrepresentation *queryrep, GtUword minmatchlength, GtProcessquerymatch processquerymatch, void *processquerymatchinfo, GtQuerymatch *querymatchspaceptr) { GtMMsearchiterator *mmsi; GtUword totallength, localqueryoffset = 0; uint64_t localqueryunitnum = queryunitnum; GtQuerysubstring querysubstring; gt_assert(numberofsuffixes > 0); totallength = gt_encseq_total_length(dbencseq); querysubstring.queryrep = queryrep; for (querysubstring.currentoffset = 0; querysubstring.currentoffset <= queryrep->seqlen - minmatchlength; querysubstring.currentoffset++) { GtUword dbstart; mmsi = gt_mmsearchiterator_new(dbencseq, suftabpart, 0, /* leftbound */ numberofsuffixes - 1, /* rightbound */ 0, /* offset */ readmode, &querysubstring, minmatchlength); while (gt_mmsearchiterator_next(&dbstart,mmsi)) { if (gt_mmsearch_isleftmaximal(dbencseq, readmode, dbstart, &querysubstring)) { GtUword dbseqnum, dbseqstartpos, dbseqlen, extend; extend = gt_mmsearch_extendright(dbencseq, mmsi->esr, readmode, totallength, dbstart + minmatchlength, &querysubstring, minmatchlength); if (gt_encseq_has_multiseq_support(dbencseq)) { dbseqnum = gt_encseq_seqnum(dbencseq,dbstart); dbseqstartpos = gt_encseq_seqstartpos(dbencseq,dbseqnum); dbseqlen = gt_encseq_seqlength(dbencseq,dbseqnum); } else { dbseqnum = dbseqstartpos = dbseqlen = 0; } gt_querymatch_init(querymatchspaceptr, minmatchlength + extend, dbstart, dbseqnum, dbstart - dbseqstartpos, dbseqlen, 0, /* score */ 0, /* edist */ selfmatch, localqueryunitnum, minmatchlength + extend, localqueryoffset, queryrep->seqlen); processquerymatch(processquerymatchinfo,querymatchspaceptr); } } gt_mmsearchiterator_delete(mmsi); mmsi = NULL; if (gt_mmsearch_accessquery(queryrep,querysubstring.currentoffset) == (GtUchar) SEPARATOR) { localqueryunitnum++; localqueryoffset = 0; } else { 
localqueryoffset++; } } }
/*
 * Tool entry point: cross-check searches in a packed (BWT) index against
 * mmsearch on the corresponding suffix array.
 *
 * Steps, as performed below:
 *   1. parse options with parseChkBWTOptions(); argv[parsedArgs] is used as
 *      the project name,
 *   2. obtain the BWT sequence index and run gt_BWTSeqVerifyIntegrity(),
 *   3. map the suffix array (suffix table + encoded sequence),
 *   4. determine the pattern length range (negative option values are
 *      replaced by values derived from gt_recommendedprefixlength()),
 *   5. for params.numOfSamples patterns delivered by the pattern iterator,
 *      compare the matches of both indices: position by position if the
 *      BWT index has locate information, otherwise by match count only.
 *
 * Progress and diagnostics are written to stderr. All resources acquired
 * along the way are released before returning.
 *
 * Returns 0 on success and -1 if any step failed; in most failure cases a
 * message is stored in <err>.
 */
extern int gt_packedindex_chk_search(int argc, const char *argv[],
                                     GtError *err)
{
  struct chkSearchOptions params;
  Suffixarray suffixarray;
  Enumpatterniterator *epi = NULL;
  bool saIsLoaded = false;
  BWTSeq *bwtSeq = NULL;
  GtStr *inputProject = NULL;
  int parsedArgs;
  bool had_err = false;
  BWTSeqExactMatchesIterator EMIter;
  bool EMIterInitialized = false;
  GtLogger *logger = NULL;

  inputProject = gt_str_new();

  /* single-pass loop: `break` jumps to the common cleanup code below */
  do
  {
    gt_error_check(err);
    {
      bool exitNow = false;
      switch (parseChkBWTOptions(&parsedArgs, argc, argv, &params,
                                 inputProject, err))
      {
        case GT_OPTION_PARSER_OK:
          break;
        case GT_OPTION_PARSER_ERROR:
          had_err = true;
          exitNow = true;
          break;
        case GT_OPTION_PARSER_REQUESTS_EXIT:
          exitNow = true;
          break;
      }
      if (exitNow)
        break;
    }
    gt_str_set(inputProject, argv[parsedArgs]);

    logger = gt_logger_new(params.verboseOutput,
                           GT_LOGGER_DEFLT_PREFIX, stdout);

    bwtSeq = gt_availBWTSeq(&params.idx.final, logger, err);
    if ((had_err = bwtSeq == NULL))
      break;

    {
      enum verifyBWTSeqErrCode retval =
        gt_BWTSeqVerifyIntegrity(bwtSeq, gt_str_get(inputProject),
                                 params.flags, params.progressInterval,
                                 stderr, logger, err);
      if ((had_err = (retval != VERIFY_BWTSEQ_NO_ERROR)))
      {
        fprintf(stderr, "index integrity check failed: %s\n",
                gt_error_get(err));
        gt_error_set(err, "aborted because of index integrity check fail");
        break;
      }
    }

    if (BWTSeqHasLocateInformation(bwtSeq))
    {
      if ((had_err = !gt_initEmptyEMIterator(&EMIter, bwtSeq)))
      {
        gt_error_set(err,
                     "Cannot create matches iterator for sequence index.");
        break;
      }
      EMIterInitialized = true;
    }

    {
      unsigned long totalLen, dbstart;
      unsigned long trial, patternLen;

      if ((had_err =
           gt_mapsuffixarray(&suffixarray, SARR_SUFTAB | SARR_ESQTAB,
                             gt_str_get(inputProject), NULL, err) != 0))
      {
        gt_error_set(err, "Can't load suffix array project with"
                     " demand for encoded sequence and suffix table files\n");
        break;
      }
      totalLen = gt_encseq_total_length(suffixarray.encseq);
      saIsLoaded = true;

      if ((had_err = (params.minPatLen >= 0L && params.maxPatLen >= 0L
                      && params.minPatLen > params.maxPatLen)))
      {
        gt_error_set(err,
                     "Invalid pattern lengths selected: min=%ld, max=%ld;"
                     " min <= max is required.",
                     params.minPatLen, params.maxPatLen);
        break;
      }
      if (params.minPatLen < 0 || params.maxPatLen < 0)
      {
        unsigned int numofchars
          = gt_alphabet_num_of_chars(gt_encseq_alphabet(suffixarray.encseq));
        if (params.minPatLen < 0)
          params.minPatLen
            = gt_recommendedprefixlength(numofchars, totalLen,
                                         GT_RECOMMENDED_MULTIPLIER_DEFAULT,
                                         true);
        if (params.maxPatLen < 0)
          params.maxPatLen =
            MAX(params.minPatLen,
                125 * gt_recommendedprefixlength(
                        numofchars, totalLen,
                        GT_RECOMMENDED_MULTIPLIER_DEFAULT, true)/100);
        else
          params.maxPatLen = MAX(params.maxPatLen, params.minPatLen);
      }
      /* NOTE(review): the pattern lengths are compared against 0 and
         printed with %ld in the error message above, so they are treated
         as signed long here as well - confirm against the declaration of
         struct chkSearchOptions. */
      fprintf(stderr, "Using patterns of lengths %ld to %ld\n",
              params.minPatLen, params.maxPatLen);

      if ((had_err = totalLen + 1 != BWTSeqLength(bwtSeq)))
      {
        gt_error_set(err, "base suffix array and index have diferrent lengths!"
                     "%lu vs. %lu", totalLen + 1, BWTSeqLength(bwtSeq));
        break;
      }
      if ((had_err =
           (epi = gt_newenumpatterniterator(params.minPatLen,
                                            params.maxPatLen,
                                            suffixarray.encseq,
                                            err)) == NULL))
      {
        fputs("Creation of pattern iterator failed!\n", stderr);
        break;
      }

      for (trial = 0; !had_err && trial < params.numOfSamples; ++trial)
      {
        const GtUchar *pptr = gt_nextEnumpatterniterator(&patternLen, epi);
        GtMMsearchiterator *mmsi =
          gt_mmsearchiterator_new_complete_olain(suffixarray.encseq,
                                                 suffixarray.suftab,
                                                 0, /* leftbound */
                                                 totalLen, /* rightbound */
                                                 0, /* offset */
                                                 suffixarray.readmode,
                                                 pptr,
                                                 patternLen);

        if (BWTSeqHasLocateInformation(bwtSeq))
        {
          if ((had_err = !gt_reinitEMIterator(&EMIter, bwtSeq, pptr,
                                              patternLen, false)))
          {
            fputs("Internal error: failed to reinitialize pattern match"
                  " iterator", stderr);
            abort();
          }
          gt_assert(gt_EMINumMatchesTotal(&EMIter)
                    == gt_BWTSeqMatchCount(bwtSeq, pptr, patternLen, false));
          gt_assert(gt_EMINumMatchesTotal(&EMIter)
                    == gt_mmsearchiterator_count(mmsi));

          /* both iterators must deliver the same positions in lockstep */
          while (gt_mmsearchiterator_next(&dbstart, mmsi))
          {
            unsigned long matchPos = 0;
            bool match = EMIGetNextMatch(&EMIter, &matchPos, bwtSeq);
            if ((had_err = !match))
            {
              gt_error_set(err,
                           "matches of packedindex expired before mmsearch!");
              break;
            }
            if ((had_err = matchPos != dbstart))
            {
              gt_error_set(err, "packedindex match doesn't equal mmsearch "
                           "match result!\n%lu vs. %lu\n",
                           matchPos, dbstart);
              /* stop here: continuing would overwrite had_err in the next
                 iteration and could hide this mismatch */
              break;
            }
          }
          if (!had_err)
          {
            unsigned long matchPos;
            bool trailingMatch = EMIGetNextMatch(&EMIter, &matchPos, bwtSeq);
            if ((had_err = trailingMatch))
            {
              /* no break here: fall through so that mmsi is released
                 below before the loop is left */
              gt_error_set(err,
                           "matches of mmsearch expired before fmindex!");
            }
          }
        }
        else
        {
          unsigned long numFMIMatches
            = gt_BWTSeqMatchCount(bwtSeq, pptr, patternLen, false),
            numMMSearchMatches = gt_mmsearchiterator_count(mmsi);
          if ((had_err = numFMIMatches != numMMSearchMatches))
          {
            /* argument order follows the message text:
               suffix array (mmsearch) count first, fmindex count second */
            gt_error_set(err, "Number of matches not equal for suffix array ("
                         "%lu) and fmindex (%lu).\n",
                         numMMSearchMatches, numFMIMatches);
          }
        }

        gt_mmsearchiterator_delete(mmsi);
        /* leave before ++trial so that the summary below counts only the
           trials that actually succeeded */
        if (had_err)
          break;
        if (params.progressInterval
            && !((trial + 1) % params.progressInterval))
          putc('.', stderr);
      }
      if (params.progressInterval)
        putc('\n', stderr);
      fprintf(stderr, "Finished %lu of %lu matchings successfully.\n",
              trial, params.numOfSamples);
    }
  } while (0);

  /* common cleanup, in reverse order of acquisition where it matters */
  if (EMIterInitialized)
    gt_destructEMIterator(&EMIter);
  if (saIsLoaded)
    gt_freesuffixarray(&suffixarray);
  gt_freeEnumpatterniterator(epi);
  if (bwtSeq)
    gt_deleteBWTSeq(bwtSeq);
  if (logger)
    gt_logger_delete(logger);
  if (inputProject)
    gt_str_delete(inputProject);
  return had_err ? -1 : 0;
}
static int gt_querysubstringmatch_generic( bool selfmatch, const GtEncseq *dbencseq, const ESASuffixptr *suftabpart, GtReadmode readmode, unsigned long numberofsuffixes, uint64_t queryunitnum, const GtQueryrep *queryrep, unsigned long minmatchlength, GtProcessquerymatch processquerymatch, void *processquerymatchinfo, GtQuerymatch *querymatchspaceptr, GtError *err) { GtMMsearchiterator *mmsi; unsigned long totallength, localqueryoffset = 0; uint64_t localqueryunitnum = queryunitnum; GtQuerysubstring querysubstring; bool haserr = false; gt_assert(numberofsuffixes > 0); totallength = gt_encseq_total_length(dbencseq); querysubstring.queryrep = queryrep; for (querysubstring.offset = 0; querysubstring.offset <= queryrep->length - minmatchlength; querysubstring.offset++) { unsigned long dbstart; mmsi = gt_mmsearchiterator_new_generic(dbencseq, suftabpart, 0, /* leftbound */ numberofsuffixes-1, /* rightbound */ 0, /* offset */ readmode, &querysubstring, minmatchlength); while (!haserr && gt_mmsearchiterator_next(&dbstart,mmsi)) { if (gt_mmsearch_isleftmaximal(dbencseq, readmode, dbstart, &querysubstring)) { unsigned long extend = gt_mmsearch_extendright(dbencseq, mmsi->esr, readmode, totallength, dbstart + minmatchlength, &querysubstring, minmatchlength); gt_querymatch_fill(querymatchspaceptr, minmatchlength + extend, dbstart, queryrep->readmode, queryrep->reversecopy, 0, /* score */ 0, /* edist */ selfmatch, localqueryunitnum, minmatchlength + extend, localqueryoffset); if (processquerymatch(processquerymatchinfo, dbencseq, querymatchspaceptr, queryrep->sequence, queryrep->length, err) != 0) { haserr = true; } } } gt_mmsearchiterator_delete(mmsi); mmsi = NULL; if (!haserr) { if (gt_mmsearch_accessquery(queryrep,querysubstring.offset) == (GtUchar) SEPARATOR) { localqueryunitnum++; localqueryoffset = 0; } else { localqueryoffset++; } } } return haserr ? -1 : 0; }