/*
  Stream in the suffix array for every index name in <indexnametab>.
  The i-th suffix array is stored in suffixarraytable[i] and
  nextpostable[i] is reset to 0. The alphabet size of the first index is
  written to *numofchars. Returns 0 on success; returns -1 as soon as
  streamsuffixarray() reports a failure for one of the indexes.
*/
static int inputthesequences(unsigned int *numofchars,
                             unsigned long *nextpostable,
                             Suffixarray *suffixarraytable,
                             const GtStrArray *indexnametab,
                             unsigned int demand,
                             GtLogger *logger,
                             GtError *err)
{
  unsigned long tabidx = 0;

  gt_error_check(err);
  while (tabidx < gt_str_array_size(indexnametab))
  {
    const char *name = gt_str_array_get(indexnametab, tabidx);

    if (streamsuffixarray(suffixarraytable + tabidx, demand, name, logger,
                          err) != 0)
    {
      return -1;
    }
    if (tabidx == 0)
    {
      /* only the first index determines the reported alphabet size */
      *numofchars = gt_alphabet_num_of_chars(
                          gt_encseq_alphabet(suffixarraytable[tabidx].encseq));
    }
    nextpostable[tabidx] = 0;
    tabidx++;
  }
  return 0;
}
/*
  Load the BWT sequence index named <indexname> and return it as an opaque
  FMindex pointer. The alphabet is taken from the encseq metadata of the
  same index. If <withpckbt> is true and a pck bucket table exists for the
  index, it is mapped and attached to the result; otherwise the bucket
  table pointer is set to NULL. Returns NULL if any step fails; a
  partially built index is deleted before returning.
*/
FMindex *gt_loadvoidBWTSeqForSA(const char *indexname,
                                bool withpckbt,
                                GtError *err)
{
  BWTSeq *bwtseq = NULL;
  bool failed = false;
  GtEncseqMetadata *emd;
  GtAlphabet *alphabet;

  emd = gt_encseq_metadata_new(indexname, err);
  if (emd != NULL)
  {
    alphabet = gt_encseq_metadata_alphabet(emd);
    if (alphabet != NULL)
    {
      bwtseq = gt_loadBWTSeqForSA(indexname,
                                  BWT_ON_BLOCK_ENC,
                                  BWTDEFOPT_MULTI_QUERY,
                                  alphabet,
                                  err);
      if (bwtseq != NULL)
      {
        if (withpckbt && gt_pckbuckettable_exists(indexname))
        {
          unsigned int numofchars = gt_alphabet_num_of_chars(alphabet);

          bwtseq->pckbuckettable
            = gt_pckbuckettable_map(indexname, numofchars, err);
          if (bwtseq->pckbuckettable == NULL)
          {
            gt_assert(gt_error_is_set(err));
            failed = true;
          }
        } else
        {
          bwtseq->pckbuckettable = NULL;
        }
      } else
      {
        gt_assert(gt_error_is_set(err));
        failed = true;
      }
    } else
    {
      gt_assert(gt_error_is_set(err));
      failed = true;
    }
  } else
  {
    gt_assert(gt_error_is_set(err));
    failed = true;
  }
  gt_encseq_metadata_delete(emd);
  if (!failed)
  {
    return (FMindex *) bwtseq;
  }
  /* failure: release whatever part of the index was already loaded */
  if (bwtseq != NULL)
  {
    gt_deletevoidBWTSeq((FMindex *) bwtseq);
  }
  return NULL;
}
Substriter *gt_substriter_new(const GtAlphabet *alphabet,unsigned int qvalue) { Substriter *substriter; ALLOCASSIGNSPACE(substriter,NULL,Substriter,1); substriter->qvalue = qvalue; substriter->numofchars = gt_alphabet_num_of_chars(alphabet); substriter->multimappower = gt_initmultimappower(substriter->numofchars, qvalue); return substriter; }
/*
  Enumerate the k-mer codes of <encseq>, read in <readmode>, and pass each
  code together with its start position and <processkmercodeinfo> to
  <processkmercode>. Nothing is reported if the sequence is shorter than
  <kmersize>. After the last character has been consumed, the window is
  shifted <kmersize> more times with WILDCARD symbols and the resulting
  codes are reported as well.
*/
void getencseqkmers(const GtEncseq *encseq,
                    GtReadmode readmode,
                    unsigned int kmersize,
                    void(*processkmercode)(void *,
                                           unsigned long,
                                           const GtKmercode *),
                    void *processkmercodeinfo)
{
  const unsigned long totallength = gt_encseq_total_length(encseq);
  unsigned long currentposition;
  unsigned int alphasize, padding;
  Kmerstream *stream;
  GtEncseqReader *reader;
  GtUchar charcode;

  if (totallength < (unsigned long) kmersize)
  {
    return;
  }
  alphasize = gt_alphabet_num_of_chars(gt_encseq_alphabet(encseq));
  stream = kmerstream_new(alphasize,kmersize);
  reader = gt_encseq_create_reader_with_readmode(encseq,readmode,0);

  /* fill the window with the first kmersize characters */
  currentposition = 0;
  while (currentposition < (unsigned long) kmersize)
  {
    charcode = gt_encseq_reader_next_encoded_char(reader);
    GT_CHECKENCCHAR(charcode,encseq,currentposition,readmode);
    stream->windowwidth++;
    updatespecialpositions(stream,charcode,false,0);
    stream->cyclicwindow[stream->windowwidth-1] = charcode;
    currentposition++;
  }
  kmerstream_newcode(&stream->currentkmercode,stream);
  processkmercode(processkmercodeinfo,0,&stream->currentkmercode);

  /* slide the window over the remaining characters; the position is
     already kmersize here and ends up equal to totallength */
  while (currentposition < totallength)
  {
    charcode = gt_encseq_reader_next_encoded_char(reader);
    GT_CHECKENCCHAR(charcode,encseq,currentposition,readmode);
    shiftrightwithchar(stream,charcode);
    kmerstream_newcode(&stream->currentkmercode,stream);
    processkmercode(processkmercodeinfo,
                    currentposition + 1 - stream->kmersize,
                    &stream->currentkmercode);
    currentposition++;
  }
  gt_encseq_reader_delete(reader);

  /* pad with wildcards so that the trailing windows are reported too */
  for (padding = 0; padding < kmersize; padding++)
  {
    shiftrightwithchar(stream,(GtUchar) WILDCARD);
    kmerstream_newcode(&stream->currentkmercode,stream);
    processkmercode(processkmercodeinfo,
                    padding + currentposition + 1 - stream->kmersize,
                    &stream->currentkmercode);
  }
  kmerstream_delete(stream);
}
/*
  Allocate a new substring iterator for q-grams of length <qvalue> over
  <alphabet>. The iterator stores the q-value, the alphabet size and the
  multimappower table returned by gt_initmultimappower() for that
  alphabet size and q-value.
*/
Substriter *gt_substriter_new(const GtAlphabet *alphabet,
                              unsigned int qvalue)
{
  Substriter *iter = gt_malloc(sizeof (*iter));

  iter->numofchars = gt_alphabet_num_of_chars(alphabet);
  iter->qvalue = qvalue;
  iter->multimappower = gt_initmultimappower(iter->numofchars, qvalue);
  return iter;
}
static int sarrselfsubstringmatch(const GtUchar *dbseq, GtUword dblen, const GtUchar *query, GtUword querylen, unsigned int minlength, GtAlphabet *alpha, Processmaxpairs processmaxpairs, void *processmaxpairsinfo, GtLogger *logger, GtError *err) { Substringmatchinfo ssi; unsigned int numofchars, recommendedprefixlength; GtEncseqBuilder *eb; bool haserr = false; eb = gt_encseq_builder_new(alpha); gt_encseq_builder_disable_multiseq_support(eb); gt_encseq_builder_disable_description_support(eb); gt_encseq_builder_set_logger(eb, logger); gt_encseq_builder_add_encoded(eb, dbseq, dblen, NULL); gt_encseq_builder_add_encoded(eb, query, querylen, NULL); ssi.encseq = gt_encseq_builder_build(eb, err); gt_encseq_builder_delete(eb); ssi.minlength = minlength; ssi.processmaxpairs = processmaxpairs; ssi.processmaxpairsinfo = processmaxpairsinfo; numofchars = gt_alphabet_num_of_chars(alpha); recommendedprefixlength = gt_recommendedprefixlength(numofchars, dblen+querylen+1, GT_RECOMMENDED_MULTIPLIER_DEFAULT, true); if (constructsarrandrunmaxpairs(&ssi, GT_READMODE_FORWARD, recommendedprefixlength, 1U, /* parts */ 0, /* maximumspace */ NULL, false, logger, err) != 0) { haserr = true; } gt_encseq_delete(ssi.encseq); ssi.encseq = NULL; return haserr ? -1 : 0; }
int gt_sarrquerysubstringmatch(const GtUchar *dbseq, GtUword dblen, const GtUchar *query, GtUword querylen, unsigned int minlength, GtAlphabet *alpha, GtProcessquerymatch processquerymatch, void *processquerymatchinfo, GtLogger *logger, GtError *err) { unsigned int numofchars, recommendedprefixlength; bool haserr = false; GtEncseq *dbencseq; GtEncseqBuilder *eb; gt_assert(querylen >= (GtUword) minlength && dblen >= (GtUword) minlength); eb = gt_encseq_builder_new(alpha); gt_encseq_builder_disable_multiseq_support(eb); gt_encseq_builder_disable_description_support(eb); gt_encseq_builder_set_logger(eb, logger); gt_encseq_builder_add_multiple_encoded(eb,dbseq,dblen); dbencseq = gt_encseq_builder_build(eb, err); gt_encseq_builder_delete(eb); numofchars = gt_alphabet_num_of_chars(alpha); recommendedprefixlength = gt_recommendedprefixlength(numofchars,dblen, GT_RECOMMENDED_MULTIPLIER_DEFAULT, true); if (gt_constructsarrandrunmmsearch(dbencseq, GT_READMODE_FORWARD, recommendedprefixlength, 1U, /* parts */ 0, /* maximumspace */ query, querylen, minlength, processquerymatch, processquerymatchinfo, NULL, false, logger, err) != 0) { haserr = true; } gt_encseq_delete(dbencseq); dbencseq = NULL; return haserr ? -1 : 0; }
/*
  Create an iterator over the k-mer codes of <encseq>, read in <readmode>
  and starting at <startpos>. Reverse read modes require startpos == 0,
  and startpos must be smaller than the total length (both asserted).
  If fewer than <kmersize> characters remain from <startpos>, the iterator
  is returned in the exhausted state without reader or k-mer stream.
  Otherwise the first <kmersize> characters are read into the window of a
  new k-mer stream.
*/
/*@notnull@*/ GtKmercodeiterator *gt_kmercodeiterator_encseq_new(
                                            const GtEncseq *encseq,
                                            GtReadmode readmode,
                                            unsigned int kmersize,
                                            unsigned long startpos)
{
  GtKmercodeiterator *it;
  unsigned int alphasize;
  unsigned long windowend;
  GtUchar cc;

  gt_assert(!GT_ISDIRREVERSE(readmode) || startpos == 0);
  it = gt_malloc(sizeof (*it));
  it->totallength = gt_encseq_total_length(encseq);
  it->startpos = startpos;
  gt_assert(startpos < it->totallength);

  /* fields which are set the same way in both cases */
  it->fb = NULL;
  it->encseq = encseq;

  if (it->totallength - startpos < (unsigned long) kmersize)
  {
    /* too few characters left for a single k-mer */
    it->inputexhausted = true;
    it->esr = NULL;
    it->spwp = NULL;
    return it;
  }

  it->inputexhausted = false;
  it->readmode = readmode;
  it->esr = gt_encseq_create_reader_with_readmode(encseq, readmode, startpos);
  alphasize = gt_alphabet_num_of_chars(gt_encseq_alphabet(encseq));
  it->spwp = kmerstream_new(alphasize,kmersize);
  it->hasprocessedfirst = false;

  /* read the first kmersize characters into the window */
  windowend = startpos + (unsigned long) kmersize;
  it->currentposition = startpos;
  while (it->currentposition < windowend)
  {
    cc = gt_encseq_reader_next_encoded_char(it->esr);
    it->spwp->windowwidth++;
    updatespecialpositions(it->spwp,cc,false,0);
    it->spwp->cyclicwindow[it->spwp->windowwidth-1] = cc;
    it->currentposition++;
  }
  return it;
}
/*
  Create a two-range alphabet encoding from <alpha>. The first range holds
  the regular symbols, each code below the alphabet size mapped to itself.
  The second range holds the single special symbol: WILDCARD is mapped to
  the alphabet size. All other table entries are UNDEF_UCHAR.
*/
MRAEnc *
gt_MRAEncGTAlphaNew(const GtAlphabet *alpha)
{
  const uint32_t numofchars = gt_alphabet_num_of_chars(alpha);
  AlphabetRangeSize symsPerRange[2];
  uint8_t *table;
  MRAEnc *encoding;
  uint32_t sym;

  table = gt_malloc(sizeof (uint8_t) * (UINT8_MAX + 1));
  memset(table, UNDEF_UCHAR, UINT8_MAX+1);

  /* regular symbols: identity mapping */
  for (sym = 0; sym < numofchars; sym++)
  {
    table[sym] = sym;
  }
  symsPerRange[0] = numofchars;

  /* special symbols: only the wildcard */
  table[WILDCARD] = numofchars;
  symsPerRange[1] = 1;

  encoding = gt_newMultiRangeAlphabetEncodingUInt8(2, symsPerRange, table);
  gt_free(table);
  return encoding;
}
/*
  Collect the k-mer codes of length <prefixlength> from the files <encseq>
  was built from (via getfastastreamkmers()) and pass them to
  verifycodelists() together with <encseq>. Returns 0 if both steps
  succeed and -1 otherwise.
*/
int gt_verifymappedstr(const GtEncseq *encseq,
                       unsigned int prefixlength,
                       GtError *err)
{
  GtArrayGtCodetype codeliststream;
  unsigned int alphasize;
  int status = 0;

  gt_error_check(err);
  alphasize = gt_alphabet_num_of_chars(gt_encseq_alphabet(encseq));
  GT_INITARRAY(&codeliststream,GtCodetype);
  /* the verification only runs if the codes could be collected */
  if (getfastastreamkmers(gt_encseq_filenames(encseq),
                          alphasize,
                          prefixlength,
                          gt_alphabet_symbolmap(gt_encseq_alphabet(encseq)),
                          false,
                          &codeliststream,
                          err) != 0 ||
      verifycodelists(encseq,
                      prefixlength,
                      alphasize,
                      &codeliststream,
                      err) != 0)
  {
    status = -1;
  }
  GT_FREEARRAY(&codeliststream,GtCodetype);
  return status;
}
/*
  Parse one score line of a score matrix file from the tokenizer <tz>.
  The first token must be a single character naming the row; it is
  rejected if it was already seen according to <parsed_characters>, and
  marked as seen otherwise. Each following token on the line is parsed as
  an integer score and stored in <sm> for the pair (row character,
  i-th character of <index_to_alpha_char_mapping>). Characters which
  encode to WILDCARD are stored at index num_of_chars.
  Returns 0 on success; on failure <err> is set and a non-zero value is
  returned.
*/
static int parse_score_line(GtScoreMatrix *sm, GtTokenizer *tz,
                            GtArray *index_to_alpha_char_mapping,
                            char *parsed_characters, GtError *err)
{
  unsigned int num_of_chars, i = 0;
  char amino_acid = '\0';
  int score, had_err = 0;
  GtStr *token;
  gt_assert(sm && tz && index_to_alpha_char_mapping);
  gt_error_check(err);
  token = gt_tokenizer_get_token(tz);
  gt_assert(token);
  if (gt_str_length(token) != 1) {
    gt_error_set(err, "illegal character token '%s' on line %lu in file '%s'",
                 gt_str_get(token), gt_tokenizer_get_line_number(tz),
                 gt_tokenizer_get_filename(tz));
    had_err = -1;
  }
  if (!had_err) {
    /* only inspect the character once the token is known to be valid, so
       the first error is not overwritten by a follow-up error */
    amino_acid = gt_str_get(token)[0];
    /* check for character duplications */
    if (parsed_characters[(int) amino_acid]) {
      gt_error_set(err,
                   "multiple character '%c' entry on line %lu in file '%s'",
                   amino_acid, gt_tokenizer_get_line_number(tz),
                   gt_tokenizer_get_filename(tz));
      had_err = -1;
    }
    parsed_characters[(int) amino_acid] = GT_UNDEF_CHAR;
  }
  gt_str_delete(token);
  if (!had_err) {
    num_of_chars = gt_alphabet_num_of_chars(sm->alphabet);
    gt_tokenizer_next_token(tz);
    while ((token = gt_tokenizer_get_token(tz))) {
      unsigned int idx1, idx2;
      /* the tokenizer can return tokens which are empty except for a newline
         -> skip these */
      if (!strcmp(gt_str_get(token), "\n")) {
        gt_str_delete(token);
        gt_tokenizer_next_token(tz);
        if (gt_tokenizer_line_start(tz))
          break;
        continue;
      }
      /* token is not empty -> parse score */
      had_err = gt_parse_int_line(&score, gt_str_get(token),
                                  gt_tokenizer_get_line_number(tz),
                                  gt_tokenizer_get_filename(tz), err);
      if (had_err) {
        /* the token is owned by this function; release it before leaving */
        gt_str_delete(token);
        break;
      }
      idx1 = gt_alphabet_encode(sm->alphabet, amino_acid);
      idx2 = gt_alphabet_encode(sm->alphabet,
                                *(char*)
                                gt_array_get(index_to_alpha_char_mapping, i));
      gt_score_matrix_set_score(sm,
                                idx1 == WILDCARD ? num_of_chars : idx1,
                                idx2 == WILDCARD ? num_of_chars : idx2,
                                score);
      i++;
      gt_str_delete(token);
      gt_tokenizer_next_token(tz);
      if (gt_tokenizer_line_start(tz))
        break;
    }
  }
  return had_err;
}
/*
  Runner of the condenseq compress tool. Loads the encoded sequence named
  by argv[parsed_args], derives defaults for the undefined parameters
  (minalignlength, windowsize, kmersize, initsize), validates them and
  runs the condenseq creator. If no index name was given, the basename of
  the input is used. With the kdb option set, the k-mer database logger
  writes to the file "kmer_db.out".
  Returns 0 on success; on failure <err> is set and a non-zero value is
  returned.
*/
static int gt_condenseq_compress_runner(GT_UNUSED int argc,
                                        const char **argv,
                                        int parsed_args,
                                        void *tool_arguments,
                                        GtError *err)
{
  GtCondenseqCompressArguments *arguments = tool_arguments;
  GtLogger *logger,
           *kdb_logger;
  FILE *kmer_fp = NULL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);
  kdb_logger = gt_logger_new(arguments->kdb, GT_LOGGER_DEFLT_PREFIX, stderr);
  if (arguments->kdb) {
    kmer_fp = gt_fa_fopen("kmer_db.out", "w", err);
    if (kmer_fp == NULL) {
      /* err has been set by gt_fa_fopen(); stop before any further
         function is called with an already set error */
      had_err = -1;
    }
    else {
      gt_logger_set_target(kdb_logger, kmer_fp);
    }
  }

  if (gt_str_length(arguments->indexname) == 0UL) {
    char *basenameptr;
    basenameptr = gt_basename(argv[parsed_args]);
    gt_str_set(arguments->indexname, basenameptr);
    gt_free(basenameptr);
  }

  if (!had_err) {
    GtEncseqLoader *es_l = gt_encseq_loader_new();
    arguments->input_es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->input_es == NULL)
      had_err = -1;
    gt_encseq_loader_delete(es_l);
  }

  if (!had_err) {
    /* derive undefined parameters from the defined ones */
    if (arguments->minalignlength == GT_UNDEF_UWORD)
      arguments->minalignlength = arguments->initsize != GT_UNDEF_UWORD ?
                                  arguments->initsize / (GtUword) 3UL :
                                  GT_UNDEF_UWORD;
    if (arguments->windowsize == GT_UNDEF_UINT)
      arguments->windowsize = arguments->minalignlength != GT_UNDEF_UWORD ?
                              (unsigned int) (arguments->minalignlength / 5U) :
                              GT_UNDEF_UINT;
    if (arguments->windowsize < 4U)
      arguments->windowsize = 4U;
    if (arguments->kmersize == GT_UNDEF_UINT) {
      unsigned int size =
        gt_alphabet_num_of_chars(gt_encseq_alphabet(arguments->input_es));
      /* size^k ~= 100000 */
      gt_safe_assign(arguments->kmersize,
                     gt_round_to_long(gt_log_base(100000.0, (double) size)));
      gt_logger_log(logger, "|A|: %u, k: %u", size, arguments->kmersize);
    }
    if (arguments->windowsize == GT_UNDEF_UINT) {
      arguments->windowsize = 5U * arguments->kmersize;
    }
    if (arguments->minalignlength == GT_UNDEF_UWORD) {
      arguments->minalignlength = (GtUword) (3UL * arguments->windowsize);
    }
    if (arguments->initsize == GT_UNDEF_UWORD) {
      arguments->initsize = (GtUword) (3UL * arguments->minalignlength);
    }
  }

  /* validate the relations kmersize < windowsize <= minalignlength
     <= initsize */
  if (!had_err && arguments->windowsize <= arguments->kmersize) {
    gt_error_set(err, "-windowsize (%u) must be larger -kmersize (%u)!",
                 arguments->windowsize, arguments->kmersize);
    had_err = -1;
  }
  if (!had_err &&
      arguments->minalignlength < (GtUword) arguments->windowsize) {
    gt_error_set(err, "-alignlength (" GT_WU ") must be at least "
                 "-windowsize (%u)!", arguments->minalignlength,
                 arguments->windowsize);
    had_err = -1;
  }
  if (!had_err && (arguments->initsize < arguments->minalignlength)) {
    gt_error_set(err, "-initsize (" GT_WU ") must be at least "
                 "-alignlength (" GT_WU ")!", arguments->initsize,
                 arguments->minalignlength);
    had_err = -1;
  }

  if (!had_err) {
    GtCondenseqCreator *ces_c;

    ces_c = gt_condenseq_creator_new(arguments->initsize,
                                     arguments->minalignlength,
                                     arguments->xdrop,
                                     &(arguments->scores),
                                     arguments->kmersize,
                                     arguments->windowsize,
                                     logger,
                                     err);
    if (ces_c == NULL)
      had_err = -1;
    if (!had_err) {
      if (arguments->cutoff_value == GT_UNDEF_UWORD)
        gt_condenseq_creator_use_mean_cutoff(ces_c);
      else if (arguments->cutoff_value == 0)
        gt_condenseq_creator_disable_cutoff(ces_c);
      else
        gt_condenseq_creator_set_cutoff(ces_c, arguments->cutoff_value);
      gt_condenseq_creator_set_mean_fraction(ces_c, arguments->fraction);
      if (arguments->prune)
        gt_condenseq_creator_disable_prune(ces_c);
      if (arguments->brute)
        gt_condenseq_creator_enable_brute_force(ces_c);
      if (!arguments->diags)
        gt_condenseq_creator_disable_diagonals(ces_c);
      if (arguments->full_diags)
        gt_condenseq_creator_enable_full_diagonals(ces_c);
      if (arguments->clean_percent != GT_UNDEF_UINT)
        gt_condenseq_creator_set_diags_clean_limit(ces_c,
                                                   arguments->clean_percent);

      had_err = gt_condenseq_creator_create(ces_c,
                                            arguments->indexname,
                                            arguments->input_es,
                                            logger, kdb_logger, err);

      gt_condenseq_creator_delete(ces_c);
    }
  }

  gt_logger_delete(logger);
  gt_logger_delete(kdb_logger);
  /* only close the k-mer file if it was actually opened */
  if (kmer_fp != NULL)
    gt_fa_fclose(kmer_fp);
  return had_err;
}
/*
  Consistency check of a packed (BWT based) index against the suffix array
  of the same project. After option parsing the index is loaded and its
  integrity is verified. Then params.numOfSamples patterns are enumerated
  from the encoded sequence, and for each pattern the matches found via
  the packed index are compared with the matches found by the mmsearch
  iterator on the suffix array. If the index has locate information, the
  match positions are compared one by one; otherwise only the match
  counts are compared.
  Returns 0 on success (or if option parsing requests an exit) and -1 on
  error.
*/
extern int gt_packedindex_chk_search(int argc, const char *argv[],
                                     GtError *err)
{
  struct chkSearchOptions params;
  Suffixarray suffixarray;
  Enumpatterniterator *epi = NULL;
  bool saIsLoaded = false;
  BWTSeq *bwtSeq = NULL;
  GtStr *inputProject = NULL;
  int parsedArgs;
  bool had_err = false;
  BWTSeqExactMatchesIterator EMIter;
  bool EMIterInitialized = false;
  GtLogger *logger = NULL;
  inputProject = gt_str_new();

  /* do { ... } while (0) allows leaving via break with common cleanup */
  do {
    gt_error_check(err);
    {
      bool exitNow = false;
      switch (parseChkBWTOptions(&parsedArgs, argc, argv, &params,
                                 inputProject, err))
      {
        case GT_OPTION_PARSER_OK:
          break;
        case GT_OPTION_PARSER_ERROR:
          had_err = true;
          exitNow = true;
          break;
        case GT_OPTION_PARSER_REQUESTS_EXIT:
          exitNow = true;
          break;
      }
      if (exitNow)
        break;
    }
    gt_str_set(inputProject, argv[parsedArgs]);

    logger = gt_logger_new(params.verboseOutput,
                           GT_LOGGER_DEFLT_PREFIX, stdout);

    bwtSeq = gt_availBWTSeq(&params.idx.final, logger, err);
    if ((had_err = bwtSeq == NULL))
      break;

    {
      enum verifyBWTSeqErrCode retval =
        gt_BWTSeqVerifyIntegrity(bwtSeq, gt_str_get(inputProject),
                                 params.flags, params.progressInterval,
                                 stderr, logger, err);
      if ((had_err = (retval != VERIFY_BWTSEQ_NO_ERROR)))
      {
        fprintf(stderr, "index integrity check failed: %s\n",
                gt_error_get(err));
        gt_error_set(err, "aborted because of index integrity check fail");
        break;
      }
    }
    if (BWTSeqHasLocateInformation(bwtSeq))
    {
      if ((had_err = !gt_initEmptyEMIterator(&EMIter, bwtSeq)))
      {
        gt_error_set(err, "Cannot create matches iterator for sequence index.");
        break;
      }
      EMIterInitialized = true;
    }
    {
      unsigned long totalLen, dbstart;
      unsigned long trial, patternLen;

      if ((had_err =
           gt_mapsuffixarray(&suffixarray, SARR_SUFTAB | SARR_ESQTAB,
                             gt_str_get(inputProject), NULL, err) != 0))
      {
        gt_error_set(err, "Can't load suffix array project with"
                     " demand for encoded sequence and suffix table files\n");
        break;
      }
      totalLen = gt_encseq_total_length(suffixarray.encseq);
      saIsLoaded = true;
      if ((had_err = (params.minPatLen >= 0L && params.maxPatLen >= 0L
                      && params.minPatLen > params.maxPatLen)))
      {
        gt_error_set(err, "Invalid pattern lengths selected: min=%ld, max=%ld;"
                     " min <= max is required.", params.minPatLen,
                     params.maxPatLen);
        break;
      }
      /* negative pattern lengths are replaced by values derived from the
         recommended prefix length */
      if (params.minPatLen < 0 || params.maxPatLen < 0)
      {
        unsigned int numofchars
          = gt_alphabet_num_of_chars(
                               gt_encseq_alphabet(suffixarray.encseq));
        if (params.minPatLen < 0)
          params.minPatLen
            = gt_recommendedprefixlength(numofchars, totalLen,
                                         GT_RECOMMENDED_MULTIPLIER_DEFAULT,
                                         true);
        if (params.maxPatLen < 0)
          params.maxPatLen =
            MAX(params.minPatLen,
                125 * gt_recommendedprefixlength(numofchars,totalLen,
                                         GT_RECOMMENDED_MULTIPLIER_DEFAULT,
                                         true)/100);
        else
          params.maxPatLen = MAX(params.maxPatLen, params.minPatLen);
      }
      /* the pattern lengths are signed (see the %ld message above) */
      fprintf(stderr, "Using patterns of lengths %ld to %ld\n",
              params.minPatLen, params.maxPatLen);
      if ((had_err = totalLen + 1 != BWTSeqLength(bwtSeq)))
      {
        gt_error_set(err, "base suffix array and index have diferrent lengths!"
                     "%lu vs. %lu",  totalLen + 1, BWTSeqLength(bwtSeq));
        break;
      }
      if ((had_err =
           (epi = gt_newenumpatterniterator(params.minPatLen, params.maxPatLen,
                                            suffixarray.encseq,
                                            err)) == NULL))
      {
        fputs("Creation of pattern iterator failed!\n", stderr);
        break;
      }
      for (trial = 0; !had_err && trial < params.numOfSamples; ++trial)
      {
        const GtUchar *pptr = gt_nextEnumpatterniterator(&patternLen, epi);
        GtMMsearchiterator *mmsi =
          gt_mmsearchiterator_new_complete_olain(suffixarray.encseq,
                                                 suffixarray.suftab,
                                                 0,  /* leftbound */
                                                 totalLen, /* rightbound */
                                                 0, /* offset */
                                                 suffixarray.readmode,
                                                 pptr,
                                                 patternLen);
        if (BWTSeqHasLocateInformation(bwtSeq))
        {
          if ((had_err = !gt_reinitEMIterator(&EMIter, bwtSeq, pptr, patternLen,
                                              false)))
          {
            fputs("Internal error: failed to reinitialize pattern match"
                  " iterator", stderr);
            abort();
          }
          gt_assert(gt_EMINumMatchesTotal(&EMIter)
                    == gt_BWTSeqMatchCount(bwtSeq, pptr, patternLen, false));
          gt_assert(gt_EMINumMatchesTotal(&EMIter)
                    == gt_mmsearchiterator_count(mmsi));
          while (gt_mmsearchiterator_next(&dbstart,mmsi))
          {
            unsigned long matchPos = 0;
            bool match = EMIGetNextMatch(&EMIter, &matchPos, bwtSeq);
            if ((had_err = !match))
            {
              gt_error_set(err,
                           "matches of packedindex expired before mmsearch!");
              break;
            }
            if ((had_err = matchPos != dbstart))
            {
              gt_error_set(err, "packedindex match doesn't equal mmsearch "
                           "match result!\n%lu vs. %lu\n",
                           matchPos, dbstart);
              /* stop here: otherwise the next iteration would overwrite
                 had_err and the mismatch could go unreported */
              break;
            }
          }
          if (!had_err)
          {
            unsigned long matchPos;
            bool trailingMatch = EMIGetNextMatch(&EMIter, &matchPos, bwtSeq);
            if ((had_err = trailingMatch))
            {
              gt_error_set(err, "matches of mmsearch expired before fmindex!");
              /* release the iterator of this trial before leaving the loop */
              gt_mmsearchiterator_delete(mmsi);
              mmsi = NULL;
              break;
            }
          }
        }
        else
        {
          unsigned long numFMIMatches = gt_BWTSeqMatchCount(bwtSeq, pptr,
                                                            patternLen, false),
            numMMSearchMatches = gt_mmsearchiterator_count(mmsi);
          if ((had_err = numFMIMatches != numMMSearchMatches))
          {
            gt_error_set(err, "Number of matches not equal for suffix array ("
                         "%lu) and fmindex (%lu).\n",
                         numFMIMatches, numMMSearchMatches);
          }
        }
        gt_mmsearchiterator_delete(mmsi);
        mmsi = NULL;
        if (params.progressInterval && !((trial + 1) % params.progressInterval))
          putc('.', stderr);
      }
      if (params.progressInterval)
        putc('\n', stderr);
      fprintf(stderr, "Finished %lu of %lu matchings successfully.\n",
              trial, params.numOfSamples);
    }
  } while (0);
  if (EMIterInitialized) gt_destructEMIterator(&EMIter);
  if (saIsLoaded) gt_freesuffixarray(&suffixarray);
  gt_freeEnumpatterniterator(epi);
  if (bwtSeq) gt_deleteBWTSeq(bwtSeq);
  if (logger) gt_logger_delete(logger);
  if (inputProject) gt_str_delete(inputProject);
  return had_err?-1:0;
}
/*
  Runner of the k-mer database tool. Loads the encoded sequence named by
  argv[parsed_args] and processes it in consecutive intervals of random
  length. Depending on the arguments, the k-mers are stored in a hash map
  (use_hash), or the intervals are added to a k-mer database which, unless
  merge_only or bench is set, is checked against a second database filled
  k-mer by k-mer. With bench set, timing information is shown and a fixed
  number of random accesses is performed afterwards.
  Returns 0 on success; on failure <err> is set and a non-zero value is
  returned.
*/
static int gt_kmer_database_runner(GT_UNUSED int argc, const char **argv,
                                   int parsed_args, void *tool_arguments,
                                   GtError *err)
{
  GtKmerDatabaseArguments *arguments = tool_arguments;
  int had_err = 0;
  GtEncseq *es = NULL;
  GtUword es_length,
          nu_kmer_codes = 0;
  GtKmerDatabase *compare_db = NULL,
                 *db = NULL;
  GtLogger *logger;
  FILE *fp = NULL;
  GtHashmap *kmer_hash = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->use_hash)
    kmer_hash = gt_hashmap_new(GT_HASH_DIRECT, NULL,
                               (GtFree) gt_kmer_database_delete_hash_value);
  if (arguments->bench)
    timer = gt_timer_new_with_progress_description("loading encoded sequence");

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  if (arguments->verbose && gt_str_length(arguments->print_filename) > 0UL) {
    fp = gt_fa_fopen(gt_str_get(arguments->print_filename), "w", err);
    if (fp == NULL) {
      /* err has been set by gt_fa_fopen(); skip all further work */
      had_err = -1;
    }
    else {
      gt_logger_set_target(logger, fp);
    }
  }

  if (!had_err) {
    GtEncseqLoader *es_l;
    if (arguments->bench)
      gt_timer_start(timer);
    es_l = gt_encseq_loader_new();
    es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->bench)
      gt_timer_show_progress(timer, "saving kmers (+iterating over file)",
                             stdout);
    if (es == NULL) {
      had_err = -1;
    }
    gt_encseq_loader_delete(es_l);
  }
  if (!had_err) {
    es_length = gt_encseq_total_length(es);
    if (es_length < (GtUword) arguments->kmersize) {
      gt_error_set(err, "Input is too short for used kmersize. File length: "
                   GT_WU " kmersize: %u", es_length, arguments->kmersize);
      had_err = -1;
    }
  }
  if (!had_err) {
    GtAlphabet *alphabet;
    alphabet = gt_encseq_alphabet(es);
    if (arguments->bench)
      nu_kmer_codes = gt_power_for_small_exponents(
                                            gt_alphabet_num_of_chars(alphabet),
                                            arguments->kmersize);
    if (!arguments->merge_only && !arguments->use_hash && !arguments->bench) {
      compare_db = gt_kmer_database_new(gt_alphabet_num_of_chars(alphabet),
                                        arguments->kmersize,
                                        arguments->sb_size, es);
    }
    if (!arguments->use_hash) {
      db = gt_kmer_database_new(gt_alphabet_num_of_chars(alphabet),
                                arguments->kmersize,
                                arguments->sb_size, es);
      if (arguments->cutoff) {
        if (arguments->mean_cutoff)
          gt_kmer_database_use_mean_cutoff(db, (GtUword) 2,
                                           arguments->cutoff_value);
        else
          gt_kmer_database_set_cutoff(db, arguments->cutoff_value);
        if (!arguments->prune)
          gt_kmer_database_set_prune(db);
      }
    }
  }

  if (!had_err) {
    GtUword startpos = 0,
            endpos;
    GtKmercodeiterator *iter;
    const GtKmercode *kmercode = NULL;
    iter = gt_kmercodeiterator_encseq_new(es, GT_READMODE_FORWARD,
                                          arguments->kmersize, 0);
    /* process the sequence in consecutive intervals of random length */
    while (!had_err && startpos < es_length - (arguments->kmersize - 1)) {
      GtUword startpos_add_kmer = startpos;
      if (arguments->merge_only) {
        endpos = startpos + (arguments->kmersize - 1) +
                 (gt_rand_max((arguments->sb_size - 1) * 2));
        if (endpos > es_length)
          endpos = es_length;
      }
      else {
        endpos = startpos + (arguments->kmersize - 1) +
                 (gt_rand_max(arguments->sb_size - 1));
      }
      gt_kmercodeiterator_reset(iter, GT_READMODE_FORWARD, startpos);
      while ((kmercode = gt_kmercodeiterator_encseq_next(iter)) != NULL &&
             startpos_add_kmer <= endpos - (arguments->kmersize - 1)) {
        if (!arguments->merge_only && !arguments->use_hash &&
            !kmercode->definedspecialposition && !arguments->bench) {
          gt_kmer_database_add_kmer(compare_db, kmercode->code,
                                    startpos_add_kmer);
        }
        if (arguments->use_hash && !kmercode->definedspecialposition) {
          gt_kmer_database_add_to_hash(kmer_hash, kmercode->code,
                                       startpos_add_kmer);
        }
        startpos_add_kmer++;
      }
      if (!arguments->use_hash) {
        gt_kmer_database_add_interval(db, startpos, endpos);
        gt_kmer_database_print_buffer(db, logger);
        if (!arguments->bench)
          had_err = gt_kmer_database_check_consistency(db, err);
      }
      startpos = endpos + 1;
    }
    if (!arguments->use_hash) {
      gt_kmer_database_flush(db);
      gt_kmer_database_print_buffer(db, logger);
      if (!had_err && !arguments->bench)
        had_err = gt_kmer_database_check_consistency(db, err);
      if (!arguments->merge_only && !had_err && !arguments->bench)
        had_err = gt_kmer_database_check_consistency(compare_db, err);
      if (!arguments->merge_only && !arguments->bench)
        gt_kmer_database_print(compare_db, logger, true);
      if (!arguments->merge_only && !had_err && !arguments->bench)
        had_err = gt_kmer_database_compare(compare_db, db, err);
      gt_kmer_database_print(db, logger, true);
    }
    gt_kmercodeiterator_delete(iter);
  }

  if (arguments->bench) {
    /* the random access benchmark needs the structures built above, so it
       is skipped if an error occurred */
    if (!had_err) {
      GtKmerStartpos pos;
      GtArrayGtUword *pos_hash;
      GtUword rand_access = (GtUword) 50000000,
              rand_code,
              i,
              sum = 0;
      gt_timer_show_progress(timer, "random access", stdout);
      for (i = 0; i < rand_access; i++) {
        rand_code = gt_rand_max(nu_kmer_codes - 1);
        if (arguments->use_hash) {
          pos_hash = gt_hashmap_get(kmer_hash, (const void *) rand_code);
          if (pos_hash != NULL)
            sum += pos_hash->spaceGtUword[pos_hash->nextfreeGtUword - 1];
        }
        else {
          pos = gt_kmer_database_get_startpos(db, rand_code);
          if (pos.no_positions > 0)
            sum += pos.startpos[pos.no_positions - 1];
        }
      }
      printf("sum: " GT_WU "\n", sum);

      gt_timer_show_progress(timer, "", stdout);
      gt_timer_stop(timer);
    }
    gt_timer_delete(timer);
  }
  if (arguments->use_hash)
    gt_hashmap_delete(kmer_hash);
  gt_encseq_delete(es);
  if (!arguments->use_hash)
    gt_kmer_database_delete(db);
  if (!arguments->merge_only && !arguments->bench)
    gt_kmer_database_delete(compare_db);
  gt_logger_delete(logger);
  gt_fa_fclose(fp);

  return had_err;
}
static int enumeratelcpintervals(const char *inputindex, Sequentialsuffixarrayreader *ssar, const char *storeindex, bool storecounts, GtUword mersize, GtUword minocc, GtUword maxocc, bool performtest, GtLogger *logger, GtError *err) { TyrDfsstate *state; bool haserr = false; unsigned int alphasize; gt_error_check(err); state = gt_malloc(sizeof (*state)); GT_INITARRAY(&state->occdistribution,Countwithpositions); state->esrspace = gt_encseq_create_reader_with_readmode( gt_encseqSequentialsuffixarrayreader(ssar), gt_readmodeSequentialsuffixarrayreader(ssar), 0); state->mersize = (GtUword) mersize; state->encseq = gt_encseqSequentialsuffixarrayreader(ssar); alphasize = gt_alphabet_num_of_chars(gt_encseq_alphabet(state->encseq)); state->readmode = gt_readmodeSequentialsuffixarrayreader(ssar); state->storecounts = storecounts; state->minocc = minocc; state->maxocc = maxocc; state->totallength = gt_encseq_total_length(state->encseq); state->performtest = performtest; state->countoutputmers = 0; state->merindexfpout = NULL; state->countsfilefpout = NULL; GT_INITARRAY(&state->largecounts,Largecount); if (strlen(storeindex) == 0) { state->sizeofbuffer = 0; state->bytebuffer = NULL; } else { state->sizeofbuffer = MERBYTES(mersize); state->bytebuffer = gt_malloc(sizeof *state->bytebuffer * state->sizeofbuffer); } if (performtest) { state->currentmer = gt_malloc(sizeof *state->currentmer * state->mersize); state->suftab = gt_suftabSequentialsuffixarrayreader(ssar); } else { state->currentmer = NULL; state->suftab = NULL; } if (state->mersize > state->totallength) { gt_error_set(err,"mersize "GT_WU" > "GT_WU" = totallength not allowed", state->mersize, state->totallength); haserr = true; } else { if (strlen(storeindex) == 0) { state->processoccurrencecount = adddistpos2distribution; } else { state->merindexfpout = gt_fa_fopen_with_suffix(storeindex,MERSUFFIX, "wb",err); if (state->merindexfpout == NULL) { haserr = true; } else { if (state->storecounts) { state->countsfilefpout = 
gt_fa_fopen_with_suffix(storeindex,COUNTSSUFFIX,"wb",err); if (state->countsfilefpout == NULL) { haserr = true; } } } state->processoccurrencecount = outputsortedstring2index; } if (!haserr) { if (gt_depthfirstesa(ssar, tyr_allocateDfsinfo, tyr_freeDfsinfo, tyr_processleafedge, NULL, tyr_processcompletenode, tyr_assignleftmostleaf, tyr_assignrightmostleaf, (Dfsstate*) state, logger, err) != 0) { haserr = true; } if (strlen(storeindex) == 0) { showfinalstatistics(state,inputindex,logger); } } if (!haserr) { if (state->countsfilefpout != NULL) { gt_logger_log(logger,"write "GT_WU" mercounts > "GT_WU " to file \"%s%s\"", state->largecounts.nextfreeLargecount, (GtUword) MAXSMALLMERCOUNT, storeindex, COUNTSSUFFIX); gt_xfwrite(state->largecounts.spaceLargecount, sizeof (Largecount), (size_t) state->largecounts.nextfreeLargecount, state->countsfilefpout); } } if (!haserr) { gt_logger_log(logger,"number of "GT_WU"-mers in index: "GT_WU"", mersize, state->countoutputmers); gt_logger_log(logger,"index size: %.2f megabytes\n", GT_MEGABYTES(state->countoutputmers * state->sizeofbuffer + sizeof (GtUword) * EXTRAINTEGERS)); } } /* now out EXTRAINTEGERS integer values */ if (!haserr && state->merindexfpout != NULL) { outputbytewiseUlongvalue(state->merindexfpout, (GtUword) state->mersize); outputbytewiseUlongvalue(state->merindexfpout,(GtUword) alphasize); } gt_fa_xfclose(state->merindexfpout); gt_fa_xfclose(state->countsfilefpout); GT_FREEARRAY(&state->occdistribution,Countwithpositions); gt_free(state->currentmer); gt_free(state->bytebuffer); GT_FREEARRAY(&state->largecounts,Largecount); gt_encseq_reader_delete(state->esrspace); gt_free(state); return haserr ? -1 : 0; }
static int gt_encseq_info_runner(GT_UNUSED int argc, const char **argv, int parsed_args, void *tool_arguments, GtError *err) { GtEncseqInfoArguments *arguments = tool_arguments; int had_err = 0; GtAlphabet *alpha; const GtUchar *chars; gt_error_check(err); gt_assert(arguments); if (arguments->nomap) { GtEncseqMetadata *emd = gt_encseq_metadata_new(argv[parsed_args], err); if (!emd) had_err = -1; if (!had_err) { if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_version(emd)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_metadata_is64bit(emd) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_total_length(emd)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_sequences(emd)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_files(emd)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_metadata_min_seq_length(emd), gt_encseq_metadata_max_seq_length(emd)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_metadata_accesstype(emd))); alpha = gt_encseq_metadata_alphabet(emd); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), 
(char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } } gt_encseq_metadata_delete(emd); } else { GtEncseqLoader *encseq_loader; GtEncseq *encseq; encseq_loader = gt_encseq_loader_new(); if (arguments->mirror) gt_encseq_loader_mirror(encseq_loader); if (!(encseq = gt_encseq_loader_load(encseq_loader, argv[parsed_args], err))) had_err = -1; if (!had_err) { const GtStrArray *filenames; GtUword i; if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_version(encseq)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_is_64_bit(encseq) ? 
"yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "compressed size: "); gt_file_xprintf(arguments->outfp, ""GT_WU" bytes\n", gt_encseq_sizeofrep(encseq)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_sequences(encseq)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_files(encseq)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_min_seq_length(encseq), gt_encseq_max_seq_length(encseq)); filenames = gt_encseq_filenames(encseq); gt_file_xprintf(arguments->outfp, "original filenames:\n"); for (i = 0; i < gt_str_array_size(filenames); i++) { gt_file_xprintf(arguments->outfp, "\t%s ("GT_WU" characters)\n", gt_str_array_get(filenames, i), (GtUword) gt_encseq_effective_filelength(encseq, i)); } alpha = gt_encseq_alphabet(encseq); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), (char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } gt_file_xprintf(arguments->outfp, "character distribution:\n"); for (i = 0; i < gt_alphabet_num_of_chars(alpha); i++) { GtUword cc; 
cc = gt_encseq_charcount(encseq, gt_alphabet_encode(alpha, chars[i])); gt_file_xprintf(arguments->outfp, "\t%c: "GT_WU" (%.2f%%)\n", (char) chars[i], cc, (cc /(double) (gt_encseq_total_length(encseq) - gt_encseq_num_of_sequences(encseq)+1))*100); } gt_file_xprintf(arguments->outfp, "number of wildcards: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_wildcards(encseq), gt_encseq_realwildcardranges(encseq)); gt_file_xprintf(arguments->outfp, "number of special characters: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_specialcharacters(encseq), gt_encseq_realspecialranges(encseq)); gt_file_xprintf(arguments->outfp, "length of longest non-special " "character stretch: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_lengthoflongestnonspecial(encseq)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_accesstype_get(encseq))); gt_file_xprintf(arguments->outfp, "bits used per character: "); gt_file_xprintf(arguments->outfp, "%f\n", (double) ((uint64_t) CHAR_BIT * (uint64_t) gt_encseq_sizeofrep(encseq)) / (double) gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "has special ranges: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_specialranges(encseq) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "has description support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_description_support(encseq) ? "yes" : "no"); if (gt_encseq_has_description_support(encseq)) { gt_file_xprintf(arguments->outfp, "length of longest description: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_max_desc_length(encseq)); } gt_file_xprintf(arguments->outfp, "has multiple sequence support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_multiseq_support(encseq) ? "yes" : "no"); } gt_encseq_delete(encseq); gt_encseq_loader_delete(encseq_loader); } return had_err; }
/*
  Fill <fmindex> from the files belonging to <indexname>:
    1. scan <indexname>FMASCIIFILESUFFIX with scanfmafileviafileptr,
    2. obtain the bwt encoding via mapbwtencoding,
    3. initialise the special-position table bookkeeping and take a
       reference to the alphabet of the bwt encoding,
    4. compute the derived key values and fill the map specification from
       <indexname>FMDATAFILESUFFIX.
  Each step is only attempted if all previous ones succeeded. On any failure
  gt_freefmindex is applied to <fmindex>.

  Returns 0 on success, -1 on failure.
*/
int gt_mapfmindex (Fmindex *fmindex,const char *indexname,
                   GtLogger *logger,GtError *err)
{
  GtSpecialcharinfo specialinfo;
  bool withindexpos = true;
  bool failed;
  FILE *fmafile;

  gt_error_check(err);
  fmindex->mappedptr = NULL;
  fmindex->bwtformatching = NULL;
  fmindex->alphabet = NULL;

  /* step 1: the ascii description file; it is closed again right after the
     scan, whether or not opening or scanning succeeded */
  fmafile = gt_fa_fopen_with_suffix(indexname,FMASCIIFILESUFFIX,"rb",err);
  failed = (fmafile == NULL ||
            scanfmafileviafileptr(fmindex,
                                  &specialinfo,
                                  &withindexpos,
                                  indexname,
                                  fmafile,
                                  logger,
                                  err) != 0);
  gt_fa_xfclose(fmafile);

  /* step 2: the bwt encoding */
  if (!failed)
  {
    fmindex->bwtformatching = mapbwtencoding(indexname,logger,err);
    failed = (fmindex->bwtformatching == NULL);
  }

  /* step 3: special positions bookkeeping and alphabet reference */
  if (!failed)
  {
    fmindex->specpos.nextfreeGtPairBwtidx
      = (unsigned long) gt_determinenumberofspecialstostore(&specialinfo);
    fmindex->specpos.spaceGtPairBwtidx = NULL;
    fmindex->specpos.allocatedGtPairBwtidx = 0;
    fmindex->alphabet
      = gt_alphabet_ref(gt_encseq_alphabet(fmindex->bwtformatching));
    failed = (fmindex->alphabet == NULL);
  }

  /* step 4: derived values and the data file */
  if (!failed)
  {
    GtStr *datafilename;

    gt_computefmkeyvalues (fmindex,
                           &specialinfo,
                           fmindex->bwtlength,
                           fmindex->log2bsize,
                           fmindex->log2markdist,
                           gt_alphabet_num_of_chars(fmindex->alphabet),
                           fmindex->suffixlength,
                           withindexpos);
    datafilename = gt_str_new_cstr(indexname);
    gt_str_append_cstr(datafilename,FMDATAFILESUFFIX);
    failed = (gt_fillfmmapspecstartptr(fmindex,withindexpos,
                                       datafilename,err) != 0);
    gt_str_delete(datafilename);
  }

  if (!failed)
  {
    return 0;
  }
  gt_freefmindex(fmindex);
  return -1;
}
/*
  Parse the <numofbytes> bytes at <filecontents> and encode them in place
  according to the symbol map of <alphabet>.

  Lines beginning with '>' are skipped; every such line except the first one
  encountered contributes a single SEPARATOR symbol. On all other lines
  whitespace is dropped and each remaining byte is replaced by its code from
  the symbol map. Because the encoded data is never longer than the input,
  it is written back into <filecontents>, which is therefore modified.

  While parsing, the function counts the occurrences of each non-special
  symbol (charcount), counts the special symbols (specialcharacters) and
  records each maximal run of consecutive special symbols as one entry of
  specialranges (start position and length in the encoded sequence).

  Returns a new GtBareEncseq whose <sequence> member points to
  <filecontents> (no copy is made), or NULL if a byte maps to UNDEFCHAR; in
  that case <err> is set and the partially built object is released with
  gt_bare_encseq_delete.
*/
GtBareEncseq *gt_bare_encseq_parse_new(GtUchar *filecontents,size_t numofbytes,
                                       const GtAlphabet *alphabet,
                                       GtError *err)
{
  GtUchar *writeptr = filecontents, *readptr = filecontents;
  const GtUchar *endptr = filecontents + numofbytes;
  bool firstline = true, haserr = false;
  GtUword lastspecialrange_length = 0; /* length of the currently open run */
  GtBareSpecialrange *srptr = NULL;    /* the currently open run, if any */
  GtBareEncseq *bare_encseq = gt_malloc(sizeof *bare_encseq);
  const GtUchar *smap = gt_alphabet_symbolmap(alphabet);

  bare_encseq->specialcharacters = 0;
  bare_encseq->numofchars = (GtUword) gt_alphabet_num_of_chars(alphabet);
  bare_encseq->charcount = gt_calloc((size_t) bare_encseq->numofchars,
                                     sizeof *bare_encseq->charcount);
  GT_INITARRAY(&bare_encseq->specialranges,GtBareSpecialrange);
  while (!haserr && readptr < endptr)
  {
    if (*readptr == '>')
    {
      if (!firstline)
      {
        /* a header other than the first one separates two sequences */
        if (lastspecialrange_length == 0)
        {
          GT_GETNEXTFREEINARRAY(srptr,&bare_encseq->specialranges,
                                GtBareSpecialrange,128UL);
          srptr->start = (GtUword) (writeptr - filecontents);
        }
        lastspecialrange_length++;
        *writeptr++ = SEPARATOR;
        bare_encseq->specialcharacters++;
      } else
      {
        firstline = false;
      }
      /* skip the remainder of the header line */
      while (readptr < endptr && *readptr != '\n')
      {
        readptr++;
      }
    } else
    {
      while (readptr < endptr && *readptr != '\n')
      {
        if (!isspace(*readptr))
        {
          GtUchar cc = smap[*readptr];

          if (cc == UNDEFCHAR)
          {
            gt_error_set(err,"illegal input characters %c\n",*readptr);
            haserr = true;
            break;
          }
          if (ISSPECIAL(cc))
          {
            /* open a new run unless the previous symbol was special too */
            if (lastspecialrange_length == 0)
            {
              GT_GETNEXTFREEINARRAY(srptr,&bare_encseq->specialranges,
                                    GtBareSpecialrange,128UL);
              srptr->start = (GtUword) (writeptr - filecontents);
            }
            lastspecialrange_length++;
            bare_encseq->specialcharacters++;
          } else
          {
            gt_assert((GtUword) cc < bare_encseq->numofchars);
            bare_encseq->charcount[(int) cc]++;
            /* a non-special symbol closes the open run, if there is one */
            if (lastspecialrange_length > 0)
            {
              gt_assert(srptr != NULL);
              srptr->length = lastspecialrange_length;
            }
            lastspecialrange_length = 0;
          }
          *writeptr++ = cc;
        }
        readptr++;
      }
    }
    /* Step over the newline that ended the line. If the input ends without
       a trailing newline, readptr already equals endptr and must not be
       advanced: forming a pointer beyond one-past-the-end is undefined. */
    if (readptr < endptr)
    {
      readptr++;
    }
  }
  /* close a run that extends to the end of the encoded sequence */
  if (lastspecialrange_length > 0)
  {
    gt_assert(srptr != NULL);
    srptr->length = lastspecialrange_length;
  }
  bare_encseq->sequence = filecontents;
  bare_encseq->totallength = (GtUword) (writeptr - filecontents);
  if (haserr)
  {
    gt_bare_encseq_delete(bare_encseq);
    return NULL;
  }
  return bare_encseq;
}