/*
 * Advance <qsmi> to its next left-maximal match.
 *
 * Return values:
 *    0  a match was found; its length is stored in qsmi->matchlength and its
 *       database position is the one delivered through qsmi->dbstart by
 *       gt_mmsearchiterator_next()
 *    1  no query sequence is left
 *   -1  gt_seq_iterator_next() returned a negative value (error)
 */
int gt_querysubstringmatchiterator_next(GtQuerysubstringmatchiterator *qsmi,
                                        GtError *err)
{
  gt_assert(qsmi != NULL);
  for (;;)
  {
    /* A stored query length below the threshold means that a fresh query
       sequence has to be fetched first. */
    if (qsmi->query_seqlen < qsmi->userdefinedleastlength)
    {
      if (qsmi->seqit == NULL)
      {
        /* queries are taken from the encoded sequence in queryrep */
        if (qsmi->queryunitnum == qsmi->query_encseq_numofsequences)
        {
          return 1;
        }
        qsmi->queryrep.startpos = gt_encseq_seqstartpos(qsmi->queryrep.encseq,
                                                        qsmi->queryunitnum);
        qsmi->query_seqlen = gt_encseq_seqlength(qsmi->queryrep.encseq,
                                                 qsmi->queryunitnum);
      } else
      {
        /* queries are delivered by the sequence iterator */
        const int rc = gt_seq_iterator_next(qsmi->seqit,
                                            &qsmi->query_for_seqit,
                                            &qsmi->query_seqlen,
                                            &qsmi->desc,
                                            err);

        if (rc < 0)
        {
          return -1; /* error */
        }
        if (rc == 0)
        {
          return 1; /* no more sequences */
        }
        gt_assert(qsmi->query_seqlen > 0 && qsmi->query_for_seqit != NULL);
        qsmi->queryrep.sequence = qsmi->query_for_seqit;
      }
      gt_assert(qsmi->query_seqlen > 0);
      qsmi->queryrep.seqlen = qsmi->query_seqlen;
      qsmi->querysubstring.currentoffset = 0;
    }

    /* The freshly fetched query is still too short: skip it. */
    if (qsmi->query_seqlen < qsmi->userdefinedleastlength)
    {
      qsmi->query_seqlen = 0;
      qsmi->queryunitnum++;
      continue;
    }

    /* Start a new search for the substring at the current offset. */
    if (!qsmi->mmsi_defined)
    {
      gt_mmsearchiterator_reinit(qsmi->mmsi,
                                 qsmi->dbencseq,
                                 qsmi->suftabpart,
                                 0, /* l */
                                 qsmi->numberofsuffixes - 1, /* r */
                                 0, /* offset */
                                 qsmi->db_readmode,
                                 &qsmi->querysubstring,
                                 qsmi->userdefinedleastlength);
      qsmi->mmsi_defined = true;
      continue;
    }

    /* The search at this offset is exhausted: move on to the next offset,
       or to the next query if no further substring of the required length
       fits into the current one. */
    if (!gt_mmsearchiterator_next(&qsmi->dbstart, qsmi->mmsi))
    {
      qsmi->mmsi_defined = false;
      if (qsmi->querysubstring.currentoffset + qsmi->userdefinedleastlength
            < qsmi->query_seqlen)
      {
        qsmi->querysubstring.currentoffset++;
      } else
      {
        qsmi->query_seqlen = 0;
        qsmi->queryunitnum++;
      }
      continue;
    }

    /* Only left-maximal hits are reported; others are silently skipped. */
    if (gt_mmsearch_isleftmaximal(qsmi->dbencseq,
                                  qsmi->db_readmode,
                                  qsmi->dbstart,
                                  &qsmi->querysubstring))
    {
      const GtUword extend
        = gt_mmsearch_extendright(qsmi->dbencseq,
                                  qsmi->mmsi->esr,
                                  qsmi->db_readmode,
                                  qsmi->totallength,
                                  qsmi->dbstart + qsmi->userdefinedleastlength,
                                  &qsmi->querysubstring,
                                  qsmi->userdefinedleastlength);

      qsmi->matchlength = qsmi->userdefinedleastlength + extend;
      return 0;
    }
  }
}
/*
 * Consistency check of a packed index against the suffix array of the same
 * project.
 *
 * After parsing the options, the function loads the BWT sequence index,
 * verifies its integrity, maps the suffix array (suffix table and encoded
 * sequence) and then runs params.numOfSamples trials.  In every trial a
 * pattern obtained from the pattern enumerator is searched with both the
 * suffix-array based mmsearch iterator and the packed index:
 *   - if the index has locate information, the match positions delivered by
 *     both methods are compared one by one, and both must run out of matches
 *     at the same time;
 *   - otherwise only the numbers of matches are compared.
 *
 * argc/argv  command line, handed to parseChkBWTOptions()
 * err        receives the error message on failure
 *
 * Returns 0 on success (including the case that the option parser requests
 * an exit) and -1 if any step failed.
 */
extern int gt_packedindex_chk_search(int argc, const char *argv[],
                                     GtError *err)
{
  struct chkSearchOptions params;
  Suffixarray suffixarray;
  Enumpatterniterator *epi = NULL;
  bool saIsLoaded = false;
  BWTSeq *bwtSeq = NULL;
  GtStr *inputProject = NULL;
  int parsedArgs;
  bool had_err = false;
  BWTSeqExactMatchesIterator EMIter;
  bool EMIterInitialized = false;
  GtLogger *logger = NULL;

  inputProject = gt_str_new();

  /* The do { ... } while (0) block lets every failure "break" straight to
     the common cleanup code at the end of the function. */
  do
  {
    gt_error_check(err);
    {
      bool exitNow = false;

      switch (parseChkBWTOptions(&parsedArgs, argc, argv, &params,
                                 inputProject, err))
      {
        case GT_OPTION_PARSER_OK:
          break;
        case GT_OPTION_PARSER_ERROR:
          had_err = true;
          exitNow = true;
          break;
        case GT_OPTION_PARSER_REQUESTS_EXIT:
          exitNow = true;
          break;
      }
      if (exitNow)
      {
        break;
      }
    }
    gt_str_set(inputProject, argv[parsedArgs]);
    logger = gt_logger_new(params.verboseOutput, GT_LOGGER_DEFLT_PREFIX,
                           stdout);
    bwtSeq = gt_availBWTSeq(&params.idx.final, logger, err);
    if ((had_err = bwtSeq == NULL))
    {
      break;
    }
    {
      enum verifyBWTSeqErrCode retval
        = gt_BWTSeqVerifyIntegrity(bwtSeq, gt_str_get(inputProject),
                                   params.flags, params.progressInterval,
                                   stderr, logger, err);

      if ((had_err = (retval != VERIFY_BWTSEQ_NO_ERROR)))
      {
        fprintf(stderr, "index integrity check failed: %s\n",
                gt_error_get(err));
        gt_error_set(err, "aborted because of index integrity check fail");
        break;
      }
    }
    if (BWTSeqHasLocateInformation(bwtSeq))
    {
      if ((had_err = !gt_initEmptyEMIterator(&EMIter, bwtSeq)))
      {
        gt_error_set(err,
                     "Cannot create matches iterator for sequence index.");
        break;
      }
      EMIterInitialized = true;
    }
    {
      unsigned long totalLen, dbstart;
      unsigned long trial, patternLen;

      if ((had_err = gt_mapsuffixarray(&suffixarray,
                                       SARR_SUFTAB | SARR_ESQTAB,
                                       gt_str_get(inputProject),
                                       NULL, err) != 0))
      {
        gt_error_set(err, "Can't load suffix array project with"
                     " demand for encoded sequence and suffix table files\n");
        break;
      }
      totalLen = gt_encseq_total_length(suffixarray.encseq);
      saIsLoaded = true;
      if ((had_err = (params.minPatLen >= 0L && params.maxPatLen >= 0L
                      && params.minPatLen > params.maxPatLen)))
      {
        gt_error_set(err, "Invalid pattern lengths selected: min=%ld, max=%ld;"
                     " min <= max is required.", params.minPatLen,
                     params.maxPatLen);
        break;
      }
      /* A negative length means "not specified": derive it from the
         recommended prefix length for this alphabet and sequence length. */
      if (params.minPatLen < 0 || params.maxPatLen < 0)
      {
        unsigned int numofchars
          = gt_alphabet_num_of_chars(gt_encseq_alphabet(suffixarray.encseq));

        if (params.minPatLen < 0)
        {
          params.minPatLen
            = gt_recommendedprefixlength(numofchars, totalLen,
                                         GT_RECOMMENDED_MULTIPLIER_DEFAULT,
                                         true);
        }
        if (params.maxPatLen < 0)
        {
          params.maxPatLen
            = MAX(params.minPatLen,
                  125 * gt_recommendedprefixlength(numofchars,totalLen,
                                     GT_RECOMMENDED_MULTIPLIER_DEFAULT,
                                     true)/100);
        } else
        {
          params.maxPatLen = MAX(params.maxPatLen, params.minPatLen);
        }
      }
      /* The pattern lengths are treated as signed long everywhere else in
         this function (%ld above, comparisons against 0), so print them
         with the matching conversion; both are non-negative here. */
      fprintf(stderr, "Using patterns of lengths %ld to %ld\n",
              params.minPatLen, params.maxPatLen);
      if ((had_err = totalLen + 1 != BWTSeqLength(bwtSeq)))
      {
        gt_error_set(err, "base suffix array and index have diferrent lengths!"
                     "%lu vs. %lu", totalLen + 1, BWTSeqLength(bwtSeq));
        break;
      }
      if ((had_err = (epi = gt_newenumpatterniterator(params.minPatLen,
                                                      params.maxPatLen,
                                                      suffixarray.encseq,
                                                      err)) == NULL))
      {
        fputs("Creation of pattern iterator failed!\n", stderr);
        break;
      }
      for (trial = 0; !had_err && trial < params.numOfSamples; ++trial)
      {
        const GtUchar *pptr = gt_nextEnumpatterniterator(&patternLen, epi);
        GtMMsearchiterator *mmsi
          = gt_mmsearchiterator_new_complete_olain(suffixarray.encseq,
                                                   suffixarray.suftab,
                                                   0, /* leftbound */
                                                   totalLen, /* rightbound */
                                                   0, /* offset */
                                                   suffixarray.readmode,
                                                   pptr,
                                                   patternLen);

        if (BWTSeqHasLocateInformation(bwtSeq))
        {
          if ((had_err = !gt_reinitEMIterator(&EMIter, bwtSeq, pptr,
                                              patternLen, false)))
          {
            fputs("Internal error: failed to reinitialize pattern match"
                  " iterator", stderr);
            abort();
          }
          gt_assert(gt_EMINumMatchesTotal(&EMIter)
                    == gt_BWTSeqMatchCount(bwtSeq, pptr, patternLen, false));
          gt_assert(gt_EMINumMatchesTotal(&EMIter)
                    == gt_mmsearchiterator_count(mmsi));
          while (gt_mmsearchiterator_next(&dbstart,mmsi))
          {
            unsigned long matchPos = 0;
            bool match = EMIGetNextMatch(&EMIter, &matchPos, bwtSeq);

            if ((had_err = !match))
            {
              gt_error_set(err,
                           "matches of packedindex expired before mmsearch!");
              break;
            }
            if ((had_err = matchPos != dbstart))
            {
              gt_error_set(err, "packedindex match doesn't equal mmsearch "
                           "match result!\n%lu vs. %lu\n", matchPos, dbstart);
              /* Stop at the first mismatch: without this break the next
                 iteration would overwrite had_err and could mask the
                 failure. */
              break;
            }
          }
          if (!had_err)
          {
            unsigned long matchPos;
            bool trailingMatch = EMIGetNextMatch(&EMIter, &matchPos, bwtSeq);

            /* No break here: the iterator below still has to be released;
               the loop is left right after that. */
            if ((had_err = trailingMatch))
            {
              gt_error_set(err,
                           "matches of mmsearch expired before fmindex!");
            }
          }
        } else
        {
          unsigned long numFMIMatches
                          = gt_BWTSeqMatchCount(bwtSeq, pptr, patternLen,
                                                false),
                        numMMSearchMatches = gt_mmsearchiterator_count(mmsi);

          if ((had_err = numFMIMatches != numMMSearchMatches))
          {
            /* argument order follows the message: the suffix array count is
               the mmsearch result, the fmindex count the BWT result */
            gt_error_set(err, "Number of matches not equal for suffix array ("
                         "%lu) and fmindex (%lu).\n",
                         numMMSearchMatches, numFMIMatches);
          }
        }
        /* Release the per-trial iterator on every path, also after errors. */
        gt_mmsearchiterator_delete(mmsi);
        mmsi = NULL;
        if (had_err)
        {
          /* leave before ++trial so that trial counts successful trials
             only */
          break;
        }
        if (params.progressInterval && !((trial + 1) % params.progressInterval))
        {
          putc('.', stderr);
        }
      }
      if (params.progressInterval)
      {
        putc('\n', stderr);
      }
      fprintf(stderr, "Finished %lu of %lu matchings successfully.\n",
              trial, params.numOfSamples);
    }
  } while (0);

  /* common cleanup: release only what has actually been set up */
  if (EMIterInitialized)
  {
    gt_destructEMIterator(&EMIter);
  }
  if (saIsLoaded)
  {
    gt_freesuffixarray(&suffixarray);
  }
  gt_freeEnumpatterniterator(epi);
  if (bwtSeq)
  {
    gt_deleteBWTSeq(bwtSeq);
  }
  if (logger)
  {
    gt_logger_delete(logger);
  }
  if (inputProject)
  {
    gt_str_delete(inputProject);
  }
  return had_err?-1:0;
}
static void gt_querysubstringmatch(bool selfmatch, const GtEncseq *dbencseq, const ESASuffixptr *suftabpart, GtReadmode readmode, GtUword numberofsuffixes, uint64_t queryunitnum, GtQueryrepresentation *queryrep, GtUword minmatchlength, GtProcessquerymatch processquerymatch, void *processquerymatchinfo, GtQuerymatch *querymatchspaceptr) { GtMMsearchiterator *mmsi; GtUword totallength, localqueryoffset = 0; uint64_t localqueryunitnum = queryunitnum; GtQuerysubstring querysubstring; gt_assert(numberofsuffixes > 0); totallength = gt_encseq_total_length(dbencseq); querysubstring.queryrep = queryrep; for (querysubstring.currentoffset = 0; querysubstring.currentoffset <= queryrep->seqlen - minmatchlength; querysubstring.currentoffset++) { GtUword dbstart; mmsi = gt_mmsearchiterator_new(dbencseq, suftabpart, 0, /* leftbound */ numberofsuffixes - 1, /* rightbound */ 0, /* offset */ readmode, &querysubstring, minmatchlength); while (gt_mmsearchiterator_next(&dbstart,mmsi)) { if (gt_mmsearch_isleftmaximal(dbencseq, readmode, dbstart, &querysubstring)) { GtUword dbseqnum, dbseqstartpos, dbseqlen, extend; extend = gt_mmsearch_extendright(dbencseq, mmsi->esr, readmode, totallength, dbstart + minmatchlength, &querysubstring, minmatchlength); if (gt_encseq_has_multiseq_support(dbencseq)) { dbseqnum = gt_encseq_seqnum(dbencseq,dbstart); dbseqstartpos = gt_encseq_seqstartpos(dbencseq,dbseqnum); dbseqlen = gt_encseq_seqlength(dbencseq,dbseqnum); } else { dbseqnum = dbseqstartpos = dbseqlen = 0; } gt_querymatch_init(querymatchspaceptr, minmatchlength + extend, dbstart, dbseqnum, dbstart - dbseqstartpos, dbseqlen, 0, /* score */ 0, /* edist */ selfmatch, localqueryunitnum, minmatchlength + extend, localqueryoffset, queryrep->seqlen); processquerymatch(processquerymatchinfo,querymatchspaceptr); } } gt_mmsearchiterator_delete(mmsi); mmsi = NULL; if (gt_mmsearch_accessquery(queryrep,querysubstring.currentoffset) == (GtUchar) SEPARATOR) { localqueryunitnum++; localqueryoffset = 0; } else { 
localqueryoffset++; } } }
static int gt_querysubstringmatch_generic( bool selfmatch, const GtEncseq *dbencseq, const ESASuffixptr *suftabpart, GtReadmode readmode, unsigned long numberofsuffixes, uint64_t queryunitnum, const GtQueryrep *queryrep, unsigned long minmatchlength, GtProcessquerymatch processquerymatch, void *processquerymatchinfo, GtQuerymatch *querymatchspaceptr, GtError *err) { GtMMsearchiterator *mmsi; unsigned long totallength, localqueryoffset = 0; uint64_t localqueryunitnum = queryunitnum; GtQuerysubstring querysubstring; bool haserr = false; gt_assert(numberofsuffixes > 0); totallength = gt_encseq_total_length(dbencseq); querysubstring.queryrep = queryrep; for (querysubstring.offset = 0; querysubstring.offset <= queryrep->length - minmatchlength; querysubstring.offset++) { unsigned long dbstart; mmsi = gt_mmsearchiterator_new_generic(dbencseq, suftabpart, 0, /* leftbound */ numberofsuffixes-1, /* rightbound */ 0, /* offset */ readmode, &querysubstring, minmatchlength); while (!haserr && gt_mmsearchiterator_next(&dbstart,mmsi)) { if (gt_mmsearch_isleftmaximal(dbencseq, readmode, dbstart, &querysubstring)) { unsigned long extend = gt_mmsearch_extendright(dbencseq, mmsi->esr, readmode, totallength, dbstart + minmatchlength, &querysubstring, minmatchlength); gt_querymatch_fill(querymatchspaceptr, minmatchlength + extend, dbstart, queryrep->readmode, queryrep->reversecopy, 0, /* score */ 0, /* edist */ selfmatch, localqueryunitnum, minmatchlength + extend, localqueryoffset); if (processquerymatch(processquerymatchinfo, dbencseq, querymatchspaceptr, queryrep->sequence, queryrep->length, err) != 0) { haserr = true; } } } gt_mmsearchiterator_delete(mmsi); mmsi = NULL; if (!haserr) { if (gt_mmsearch_accessquery(queryrep,querysubstring.offset) == (GtUchar) SEPARATOR) { localqueryunitnum++; localqueryoffset = 0; } else { localqueryoffset++; } } } return haserr ? -1 : 0; }