/*
 * Collects the k-mer codes of <encseq> with collectkmercode() and compares
 * them against <codeliststream> via comparecodelists().
 * Returns 0 if comparecodelists() returns 0, -1 otherwise.
 */
static int verifycodelists(const GtEncseq *encseq,
                           unsigned int kmersize,
                           unsigned int numofchars,
                           const GtArrayGtCodetype *codeliststream,
                           GtError *err)
{
  GtArrayGtCodetype codes_from_encseq;
  const GtUchar *alphachars;
  GtUword seqlen;
  int cmpresult;

  gt_error_check(err);
  seqlen = gt_encseq_total_length(encseq);
  alphachars = gt_alphabet_characters(gt_encseq_alphabet(encseq));
  GT_INITARRAY(&codes_from_encseq,GtCodetype);
  collectkmercode(&codes_from_encseq, encseq, kmersize, numofchars, seqlen);
  cmpresult = comparecodelists(codeliststream,
                               &codes_from_encseq,
                               kmersize,
                               numofchars,
                               (const char *) alphachars,
                               err);
  /* the temporary list is released regardless of the comparison result */
  GT_FREEARRAY(&codes_from_encseq,GtCodetype);
  return cmpresult != 0 ? -1 : 0;
}
/*
 * Streams one suffix array per index name in <indexnametab> into
 * <suffixarraytable> and resets the matching entry of <nextpostable> to 0.
 * <*numofchars> is taken from the alphabet of the first index only.
 * Returns 0 on success and -1 as soon as streamsuffixarray() fails.
 */
static int inputthesequences(unsigned int *numofchars,
                             unsigned long *nextpostable,
                             Suffixarray *suffixarraytable,
                             const GtStrArray *indexnametab,
                             unsigned int demand,
                             GtLogger *logger,
                             GtError *err)
{
  unsigned long i = 0;

  gt_error_check(err);
  while (i < gt_str_array_size(indexnametab))
  {
    const char *name = gt_str_array_get(indexnametab,i);
    Suffixarray *sa = suffixarraytable + i;

    if (streamsuffixarray(sa, demand, name, logger, err) != 0)
    {
      return -1;
    }
    if (i == 0)
    {
      *numofchars = gt_alphabet_num_of_chars(gt_encseq_alphabet(sa->encseq));
    }
    nextpostable[i] = 0;
    i++;
  }
  return 0;
}
/*
 * Loads the encoded sequence stored under <seqfile>, optionally mirrors it
 * (DNA alphabets only) and hands it to output_sequence().
 * Returns 0 on success, a non-zero error code otherwise (<err> is set by the
 * failing step).
 */
static int decode_sequence_file(const char *seqfile,
                                GtEncseqDecodeArguments *args,
                                GtError *err)
{
  GtEncseqLoader *loader;
  GtEncseq *encseq;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(seqfile);

  loader = gt_encseq_loader_new();
  if (gt_encseq_options_lossless_value(args->eopts))
  {
    gt_encseq_loader_require_lossless_support(loader);
  }

  encseq = gt_encseq_loader_load(loader, seqfile, err);
  if (encseq == NULL)
  {
    had_err = -1;
  }

  if (!had_err && gt_encseq_options_mirrored_value(args->eopts))
  {
    if (gt_alphabet_is_dna(gt_encseq_alphabet(encseq)))
    {
      had_err = gt_encseq_mirror(encseq, err);
    }
    else
    {
      gt_error_set(err, "mirroring is only defined on DNA sequences");
      had_err = -1;
    }
  }

  if (!had_err)
  {
    had_err = output_sequence(encseq, args, seqfile, err);
  }

  /* released on every path, including a failed load (encseq == NULL) */
  gt_encseq_delete(encseq);
  gt_encseq_loader_delete(loader);
  return had_err;
}
/*
 * Lua binding: takes the encseq at stack index 1, acquires a new reference
 * to its alphabet and pushes it with gt_lua_alphabet_push(). Returns 1.
 */
static int encseq_lua_alphabet(lua_State *L)
{
  GtEncseq **encseq_ptr = check_encseq(L, 1);
  GtAlphabet *alphabet;

  gt_assert(*encseq_ptr);
  alphabet = gt_alphabet_ref(gt_encseq_alphabet(*encseq_ptr));
  gt_lua_alphabet_push(L, alphabet);
  return 1;
}
/*
 * Builds a new GtSeq for sequence number <idx> of <bs>, using the sequence
 * string, its length and the alphabet of the underlying encseq, and attaches
 * the sequence description to it. <idx> must be a valid sequence number.
 */
GtSeq* gt_bioseq_get_seq(GtBioseq *bs, GtUword idx)
{
  GtSeq *result;

  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));

  /* NOTE(review): gt_seq_new_own() presumably takes ownership of the string
     returned by gt_bioseq_get_sequence() - confirm against its contract */
  result = gt_seq_new_own(gt_bioseq_get_sequence(bs, idx),
                          gt_bioseq_get_sequence_length(bs, idx),
                          gt_encseq_alphabet(bs->encseq));
  gt_seq_set_description(result, gt_bioseq_get_description(bs, idx));
  return result;
}
/*
 * Slides a window of <kmersize> characters over <encseq> (read according to
 * <readmode>) and calls <processkmercode> with <processkmercodeinfo>, the
 * window start position and the current k-mer code for every window.
 * After the last real character, <kmersize> further windows are produced by
 * shifting in WILDCARD characters. Nothing is reported if the sequence is
 * shorter than <kmersize>.
 */
void getencseqkmers(const GtEncseq *encseq,
                    GtReadmode readmode,
                    unsigned int kmersize,
                    void(*processkmercode)(void *,
                                           unsigned long,
                                           const GtKmercode *),
                    void *processkmercodeinfo)
{
  unsigned long pos, totallength;
  unsigned int numofchars, extra;
  Kmerstream *stream;
  GtEncseqReader *reader;
  GtUchar cc;

  totallength = gt_encseq_total_length(encseq);
  if (totallength < (unsigned long) kmersize)
  {
    return;
  }
  numofchars = gt_alphabet_num_of_chars(gt_encseq_alphabet(encseq));
  stream = kmerstream_new(numofchars,kmersize);
  reader = gt_encseq_create_reader_with_readmode(encseq,readmode,0);

  /* phase 1: fill the window with the first kmersize characters */
  pos = 0;
  while (pos < (unsigned long) kmersize)
  {
    cc = gt_encseq_reader_next_encoded_char(reader);
    GT_CHECKENCCHAR(cc,encseq,pos,readmode);
    stream->windowwidth++;
    updatespecialpositions(stream,cc,false,0);
    stream->cyclicwindow[stream->windowwidth-1] = cc;
    pos++;
  }
  kmerstream_newcode(&stream->currentkmercode,stream);
  processkmercode(processkmercodeinfo,0,&stream->currentkmercode);

  /* phase 2: shift in the remaining characters; pos == kmersize here */
  for (/* Nothing */; pos < totallength; pos++)
  {
    cc = gt_encseq_reader_next_encoded_char(reader);
    GT_CHECKENCCHAR(cc,encseq,pos,readmode);
    shiftrightwithchar(stream,cc);
    kmerstream_newcode(&stream->currentkmercode,stream);
    processkmercode(processkmercodeinfo,
                    pos + 1 - stream->kmersize,
                    &stream->currentkmercode);
  }
  gt_encseq_reader_delete(reader);

  /* phase 3: pad with wildcards; pos == totallength here */
  for (extra = 0; extra < kmersize; extra++)
  {
    shiftrightwithchar(stream,(GtUchar) WILDCARD);
    kmerstream_newcode(&stream->currentkmercode,stream);
    processkmercode(processkmercodeinfo,
                    extra + pos + 1 - stream->kmersize,
                    &stream->currentkmercode);
  }
  kmerstream_delete(stream);
}
GtQuerysubstringmatchiterator *gt_querysubstringmatchiterator_new( const GtEncseq *dbencseq, GtUword totallength, const ESASuffixptr *suftabpart, GtReadmode db_readmode, GtUword numberofsuffixes, const GtStrArray *query_files, const GtEncseq *query_encseq, GtReadmode query_readmode, unsigned int userdefinedleastlength, GtError *err) { GtQuerysubstringmatchiterator *qsmi = gt_malloc(sizeof *qsmi); qsmi->dbencseq = dbencseq; qsmi->suftabpart = suftabpart; qsmi->db_readmode = db_readmode; qsmi->numberofsuffixes = numberofsuffixes; qsmi->totallength = totallength; qsmi->userdefinedleastlength = (GtUword) userdefinedleastlength; qsmi->queryunitnum = 0; qsmi->desc = NULL; qsmi->query_for_seqit = NULL; qsmi->query_seqlen = 0; qsmi->queryrep.sequence = NULL; qsmi->queryrep.encseq = query_encseq; qsmi->queryrep.readmode = query_readmode; qsmi->queryrep.startpos = 0; qsmi->dbstart = 0; qsmi->matchlength = 0; qsmi->querysubstring.queryrep = &qsmi->queryrep; qsmi->mmsi = gt_mmsearchiterator_new_empty(); qsmi->mmsi_defined = false; if (query_files == NULL || gt_str_array_size(query_files) == 0) { gt_assert(query_encseq != NULL); qsmi->seqit = NULL; qsmi->query_encseq_numofsequences = (uint64_t) gt_encseq_num_of_sequences(query_encseq); } else { gt_assert(query_encseq == NULL); qsmi->seqit = gt_seq_iterator_sequence_buffer_new(query_files, err); if (qsmi->seqit == NULL) { gt_querysubstringmatchiterator_delete(qsmi); return NULL; } gt_seq_iterator_set_symbolmap(qsmi->seqit, gt_alphabet_symbolmap(gt_encseq_alphabet(dbencseq))); } return qsmi; }
/*
 * Test driver for the merger trie: streams the encoded sequence of index
 * <indexname>, builds a trie with maketrie() and then either checks it
 * (<onlyins>, only when compiled with WITHTRIEIDENT) or successively deletes
 * the smallest entries. Returns -1 if the index cannot be streamed, 0
 * otherwise.
 */
int gt_test_trieins(bool onlyins,const char *indexname,GtError *err)
{
  Suffixarray suffixarray;
  Mergertrierep trierep;
  const GtUchar *characters;
  unsigned long totallength;

  gt_error_check(err);
  if (streamsuffixarray(&suffixarray,
                        SARR_ESQTAB,
                        indexname,
                        NULL,
                        err) != 0)
  {
    gt_freesuffixarray(&suffixarray);
    return -1;
  }
  totallength = gt_encseq_total_length(suffixarray.encseq);

  trierep.encseqreadinfo = gt_malloc(sizeof *trierep.encseqreadinfo);
  trierep.encseqreadinfo->encseqptr = suffixarray.encseq;
  trierep.encseqreadinfo->readmode = suffixarray.readmode;
  characters
    = gt_alphabet_characters(gt_encseq_alphabet(suffixarray.encseq));
  gt_mergertrie_initnodetable(&trierep,totallength,1U);
  maketrie(&trierep,characters,totallength);
  if (onlyins)
  {
#ifdef WITHTRIEIDENT
#ifdef WITHTRIESHOW
    showtrie(&trierep,characters);
#endif
    checktrie(&trierep,totallength+1,totallength,err);
#endif
  } else
  {
#ifdef WITHTRIEIDENT
#ifdef WITHTRIESHOW
    showallnoderelations(trierep.root);
#endif
#endif
    successivelydeletesmallest(&trierep,totallength,characters,err);
  }
  gt_mergertrie_delete(&trierep);
  gt_freesuffixarray(&suffixarray);
  return 0;
}
/*
 * Creates a GtCondenseq for the original encoded sequence <orig_es>: starts
 * from an empty object with the alphabet of <orig_es>, records the number of
 * sequences and the total length, fills the ssptab and processes the
 * descriptions (reported via <logger>).
 */
GtCondenseq *gt_condenseq_new(const GtEncseq *orig_es, GtLogger *logger)
{
  GtCondenseq *cs = condenseq_new_empty(gt_encseq_alphabet(orig_es));

  /* orig_num_seq is assigned before condenseq_fill_tab() is called,
     exactly as in the original statement order */
  cs->orig_num_seq = gt_encseq_num_of_sequences(orig_es);
  cs->ssptab = condenseq_fill_tab(cs, orig_es);
  cs->orig_length = gt_encseq_total_length(orig_es);

  condenseq_process_descriptions(cs, orig_es, logger);
  return cs;
}
/*
 * Builds a new GtSeq holding the inclusive range [<start>, <end>] of sequence
 * number <idx> of <bs>, and attaches the description of that sequence.
 *
 * Preconditions (checked with gt_assert): <bs> is not NULL, <idx> is a valid
 * sequence number, <end> >= <start>, and the range is not longer than the
 * sequence it is taken from.
 */
GtSeq* gt_bioseq_get_seq_range(GtBioseq *bs, GtUword idx, GtUword start,
                               GtUword end)
{
  GtSeq *seq;
  GtUword rangelen;

  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));
  gt_assert(end >= start);
  rangelen = end - start + 1;
  /* the previous check used '>' here, which rejected every valid range */
  gt_assert(rangelen <= gt_encseq_seqlength(bs->encseq, idx));
  seq = gt_seq_new_own(gt_bioseq_get_sequence_range(bs, idx, start, end),
                       rangelen,
                       gt_encseq_alphabet(bs->encseq));
  gt_seq_set_description(seq, gt_bioseq_get_description(bs, idx));
  return seq;
}
/*
 * For DNA alphabets only: concatenates all sequences of <bs> and prints the
 * GC-content of the concatenation to <outfp> via gt_gc_content_show().
 * Does nothing for other alphabets.
 */
void gt_bioseq_show_gc_content(GtBioseq *bs, GtFile *outfp)
{
  GtUword seqnum, GT_UNUSED expectedlen;
  GtStr *concat;

  gt_assert(bs);
  if (!gt_alphabet_is_dna(gt_encseq_alphabet(bs->encseq)))
  {
    return;
  }

  concat = gt_str_new();
  /* total length minus the (number of sequences - 1) positions that are
     not part of any sequence */
  expectedlen = gt_encseq_total_length(bs->encseq) -
                gt_encseq_num_of_sequences(bs->encseq) + 1;
  seqnum = 0;
  while (seqnum < gt_encseq_num_of_sequences(bs->encseq))
  {
    char *seqstring = gt_bioseq_get_sequence(bs, seqnum);

    gt_str_append_cstr(concat, seqstring);
    gt_free(seqstring);
    seqnum++;
  }
  gt_assert(gt_str_length(concat) == expectedlen);
  gt_file_xprintf(outfp, "showing GC-content for sequence file \"%s\"\n",
                  gt_str_get(bs->sequence_file));
  gt_gc_content_show(gt_str_get(concat), gt_str_length(concat),
                     gt_encseq_alphabet(bs->encseq), outfp);
  gt_str_delete(concat);
}
/*
 * Verifies the k-mer codes (k = <prefixlength>) of <encseq>: the codes are
 * first collected from the original sequence files with
 * getfastastreamkmers() and then compared by verifycodelists().
 * Returns 0 if both steps succeed, -1 otherwise.
 */
int gt_verifymappedstr(const GtEncseq *encseq,
                       unsigned int prefixlength,
                       GtError *err)
{
  const GtAlphabet *alphabet;
  GtArrayGtCodetype streamcodes;
  unsigned int numofchars;
  int retcode = 0;

  gt_error_check(err);
  alphabet = gt_encseq_alphabet(encseq);
  numofchars = gt_alphabet_num_of_chars(alphabet);
  GT_INITARRAY(&streamcodes,GtCodetype);
  /* the comparison only runs when the codes could be collected */
  if (getfastastreamkmers(gt_encseq_filenames(encseq),
                          numofchars,
                          prefixlength,
                          gt_alphabet_symbolmap(alphabet),
                          false,
                          &streamcodes,
                          err) != 0 ||
      verifycodelists(encseq,
                      prefixlength,
                      numofchars,
                      &streamcodes,
                      err) != 0)
  {
    retcode = -1;
  }
  GT_FREEARRAY(&streamcodes,GtCodetype);
  return retcode;
}
/*
 * Creates a k-mer code iterator over <encseq>, starting at <startpos> and
 * reading according to <readmode>. If fewer than <kmersize> characters are
 * left from <startpos>, the iterator is created in the exhausted state
 * without reader and k-mer stream. Otherwise the first <kmersize> characters
 * are read into the window of the k-mer stream.
 * A reverse read direction requires <startpos> == 0.
 */
/*@notnull@*/ GtKmercodeiterator *gt_kmercodeiterator_encseq_new(
                                            const GtEncseq *encseq,
                                            GtReadmode readmode,
                                            unsigned int kmersize,
                                            unsigned long startpos)
{
  GtKmercodeiterator *it;
  unsigned int numofchars;
  GtUchar cc;

  gt_assert(!GT_ISDIRREVERSE(readmode) || startpos == 0);
  it = gt_malloc(sizeof (*it));
  it->totallength = gt_encseq_total_length(encseq);
  it->startpos = startpos;
  gt_assert(startpos < it->totallength);

  /* common to both states */
  it->fb = NULL;
  it->encseq = encseq;

  if (it->totallength - startpos < (unsigned long) kmersize)
  {
    it->inputexhausted = true;
    it->esr = NULL;
    it->spwp = NULL;
    return it;
  }

  it->inputexhausted = false;
  it->readmode = readmode;
  it->esr = gt_encseq_create_reader_with_readmode(encseq,
                                                  readmode,
                                                  startpos);
  numofchars = gt_alphabet_num_of_chars(gt_encseq_alphabet(encseq));
  it->spwp = kmerstream_new(numofchars,kmersize);
  it->hasprocessedfirst = false;

  /* read the first kmersize characters into the window */
  it->currentposition = startpos;
  while (it->currentposition < startpos+(unsigned long) kmersize)
  {
    cc = gt_encseq_reader_next_encoded_char(it->esr);
    it->spwp->windowwidth++;
    updatespecialpositions(it->spwp,cc,false,0);
    it->spwp->cyclicwindow[it->spwp->windowwidth-1] = cc;
    it->currentposition++;
  }
  return it;
}
/*
 * Iterates over all sequences of the single query file in <queryfilenames>
 * and adds the result of gt_esa2shulengthquery() for each of them to
 * <*totalgmatchlength>. The queries are mapped with the alphabet of the
 * suffix array's encseq. Returns 0 on success, -1 if the sequence iterator
 * cannot be created or reports an error.
 */
int gt_esa2shulengthqueryfiles(unsigned long *totalgmatchlength,
                               const Suffixarray *suffixarray,
                               const GtStrArray *queryfilenames,
                               GtError *err)
{
  GtSeqIterator *seqit;
  const GtUchar *query;
  unsigned long querylen;
  char *desc = NULL;
  int retval;
  GtAlphabet *alphabet;

  gt_error_check(err);
  alphabet = gt_encseq_alphabet(suffixarray->encseq);
  gt_assert(gt_str_array_size(queryfilenames) == 1UL);
  seqit = gt_seq_iterator_sequence_buffer_new(queryfilenames, err);
  if (seqit == NULL)
  {
    return -1;
  }
  gt_seq_iterator_set_symbolmap(seqit, gt_alphabet_symbolmap(alphabet));
  /* > 0: next sequence delivered, 0: end of input, < 0: error */
  while ((retval = gt_seq_iterator_next(seqit,
                                        &query,
                                        &querylen,
                                        &desc,
                                        err)) > 0)
  {
    *totalgmatchlength += gt_esa2shulengthquery(suffixarray,query,querylen);
  }
  gt_seq_iterator_delete(seqit);
  return retval < 0 ? -1 : 0;
}
/*
 * Writes <wlen> characters of <encseq>, starting at position <start> and read
 * according to <readmode>, to <fpout> as pretty-printed symbols.
 * A line break is emitted after every <width> symbols; a SEPARATOR is shown
 * as "\n>\n" and restarts the column count. The output always ends with a
 * newline. <width> must be positive; nothing is written for <wlen> == 0.
 */
void gt_encseq2symbolstring(FILE *fpout,
                            const GtEncseq *encseq,
                            GtReadmode readmode,
                            unsigned long start,
                            unsigned long wlen,
                            unsigned long width)
{
  unsigned long j, idx, lastpos;
  GtUchar currentchar;
  GtEncseqReader *esr;
  const GtAlphabet *alpha;

  gt_assert(width > 0);
  if (wlen == 0)
  {
    /* start + wlen - 1 would wrap around and the loop below would never
       meet its termination condition inside the requested window */
    return;
  }
  esr = gt_encseq_create_reader_with_readmode(encseq, readmode, start);
  lastpos = start + wlen - 1;
  alpha = gt_encseq_alphabet(encseq);
  for (idx = start, j = 0; /* Nothing */ ; idx++)
  {
    currentchar = gt_encseq_reader_next_encoded_char(esr);
    if (currentchar == (GtUchar) SEPARATOR)
    {
      fprintf(fpout,"\n>\n");
      j = 0;
    } else
    {
      gt_alphabet_echo_pretty_symbol(alpha,fpout,currentchar);
    }
    if (idx == lastpos)
    {
      fprintf(fpout,"\n");
      break;
    }
    if (currentchar != (GtUchar) SEPARATOR)
    {
      j++;
      if (j >= width)
      {
        fprintf(fpout,"\n");
        j = 0;
      }
    }
  }
  gt_encseq_reader_delete(esr);
}
/*
 * Writes the <wlen> characters of <encseq> starting at <start> (forward
 * direction) to <fpout> as pretty-printed symbols. Every character in the
 * range must be a non-special character.
 */
void gt_fprintfencseq(FILE *fpout,
                      const GtEncseq *encseq,
                      unsigned long start,
                      unsigned long wlen)
{
  const GtAlphabet *alphabet = gt_encseq_alphabet(encseq);
  const unsigned long endpos = start + wlen;
  unsigned long pos = start;

  while (pos < endpos)
  {
    GtUchar cc = gt_encseq_get_encoded_char(encseq, pos,
                                            GT_READMODE_FORWARD);

    gt_assert(ISNOTSPECIAL(cc));
    gt_alphabet_echo_pretty_symbol(alphabet,fpout,cc);
    pos++;
  }
}
/*
 * Creates a wavelet tree over <encseq>. The tree keeps its own references to
 * the encseq and its alphabet. The uncompressed bit vector is only a
 * temporary: it is filled, turned into a compressed bitsequence and freed
 * again before the tree is returned.
 */
GtWtree* gt_wtree_encseq_new(GtEncseq *encseq)
{
  /* sample rate for compressed bitseq */
  const unsigned int samplerate = 32U;
  const size_t bits_per_word = sizeof (GtBitsequence) * CHAR_BIT;
  GtWtree *wtree = gt_wtree_create(gt_wtree_encseq_class());
  GtWtreeEncseq *enc = gt_wtree_encseq_cast(wtree);

  enc->encseq = gt_encseq_ref(encseq);
  enc->alpha = gt_alphabet_ref(gt_encseq_alphabet(encseq));
  /* encoded chars + WC given by gt_alphabet_size,
     we have to encode UNDEFCHAR and SEPARATOR too */
  enc->alpha_size = gt_alphabet_size(enc->alpha) + 2;
  wtree->members->num_of_symbols = (GtUword) enc->alpha_size;
  /* levels in tree: \lceil log_2(\sigma)\rceil */
  enc->levels = gt_determinebitspervalue((GtUword) enc->alpha_size);
  enc->root_fo = gt_wtree_encseq_fill_offset_new();
  enc->current_fo = enc->root_fo;
  wtree->members->length = gt_encseq_total_length(encseq);
  /* each level has number of symbols bits */
  enc->num_of_bits = enc->levels * wtree->members->length;

  /* number of GtBitsequence words, rounded up */
  enc->bits_size = enc->num_of_bits / bits_per_word;
  if (enc->num_of_bits % bits_per_word != 0)
  {
    enc->bits_size++;
  }

  enc->bits = gt_calloc((size_t) enc->bits_size, sizeof (GtBitsequence));
  enc->node_start = 0;
  gt_wtree_encseq_fill_bits(enc);
  enc->c_bits = gt_compressed_bitsequence_new(enc->bits,
                                              samplerate,
                                              enc->num_of_bits);
  gt_free(enc->bits);
  enc->bits = NULL;
  return wtree;
}
/*
 * Runner of the seed_extend tool.
 *
 * Loads encseq A (and encseq B if a query index name is given, otherwise B is
 * a second reference to A), derives the extension parameters from the tool
 * arguments, runs gt_diagbandseed_run() and, if showtime is enabled, prints
 * the overall running time.
 *
 * <tool_arguments> must point to a GtSeedExtendArguments object.
 * Returns 0 on success and a non-zero value on error (<err> is set by the
 * failing step).
 */
static int gt_seed_extend_runner(GT_UNUSED int argc,
                                 GT_UNUSED const char **argv,
                                 GT_UNUSED int parsed_args,
                                 void *tool_arguments,
                                 GtError *err)
{
  GtSeedExtendArguments *arguments = tool_arguments;
  GtEncseqLoader *encseq_loader = NULL;
  GtEncseq *aencseq = NULL, *bencseq = NULL;
  GtGreedyextendmatchinfo *grextinfo = NULL;
  GtXdropmatchinfo *xdropinfo = NULL;
  GtQuerymatchoutoptions *querymatchoutopt = NULL;
  GtTimer *seedextendtimer = NULL;
  GtExtendCharAccess cam = GT_EXTEND_CHAR_ACCESS_ANY;
  GtUword errorpercentage = 0UL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments != NULL);
  gt_assert(arguments->se_minidentity >= GT_EXTEND_MIN_IDENTITY_PERCENTAGE &&
            arguments->se_minidentity <= 100UL);

  /* Calculate error percentage from minidentity */
  errorpercentage = 100UL - arguments->se_minidentity;

  /* Measure whole running time */
  if (arguments->benchmark || arguments->verbose) {
    gt_showtime_enable();
  }
  if (gt_showtime_enabled()) {
    seedextendtimer = gt_timer_new();
    gt_timer_start(seedextendtimer);
  }

  /* Load encseq A */
  encseq_loader = gt_encseq_loader_new();
  gt_encseq_loader_enable_autosupport(encseq_loader);
  aencseq = gt_encseq_loader_load(encseq_loader,
                                  gt_str_get(arguments->dbs_indexname),
                                  err);
  if (aencseq == NULL)
    had_err = -1;

  /* If there is a 2nd read set: Load encseq B */
  if (!had_err) {
    if (strcmp(gt_str_get(arguments->dbs_queryname), "") != 0) {
      bencseq = gt_encseq_loader_load(encseq_loader,
                                      gt_str_get(arguments->dbs_queryname),
                                      err);
    } else {
      /* no query index given: compare encseq A against itself */
      bencseq = gt_encseq_ref(aencseq);
    }
    if (bencseq == NULL) {
      had_err = -1;
      gt_encseq_delete(aencseq);
    }
  }
  gt_encseq_loader_delete(encseq_loader);

  /* set character access method */
  if (!had_err && (gt_option_is_set(arguments->se_option_greedy) ||
                   gt_option_is_set(arguments->se_option_xdrop) ||
                   arguments->se_alignmentwidth > 0))
  {
    cam = gt_greedy_extend_char_access(gt_str_get
                                       (arguments->se_char_access_mode),
                                       err);
    if ((int) cam == -1) {
      /* every failure branch from here on releases both encseqs itself,
         because the regular clean up only runs on success */
      had_err = -1;
      gt_encseq_delete(aencseq);
      gt_encseq_delete(bencseq);
    }
  }

  /* Use bias dependent parameters, adapted from E. Myers' DALIGNER */
  if (!had_err && arguments->bias_parameters) {
    const GtAlphabet *alpha = gt_encseq_alphabet(aencseq);
    /* looked up by bias_index, which is derived from the AT/CG ratio below */
    const double bias_factor[10] = {.690, .690, .690, .690, .780,
                                    .850, .900, .933, .966, 1.000};

    if (gt_alphabet_is_dna(alpha)) {
      GtUword at, cg;
      at = gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 'a'));
      at += gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 't'));
      cg = gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 'c'));
      cg += gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 'g'));
      if (at + cg > 0) {
        /* ratio is at most 0.5, hence bias_index is at most 9 */
        const double ratio = (double)MIN(at, cg) / (at + cg);
        int bias_index = (int)MAX(0.0, (ratio + 0.025) * 20.0 - 1.0);
        gt_assert(bias_index < 10);
        /* overrides the user supplied values of these two parameters */
        arguments->se_maxalilendiff = 30;
        arguments->se_perc_match_hist = (GtUword)(100.0 - errorpercentage *
                                                  bias_factor[bias_index]);
        if (arguments->verbose) {
          printf("# Base ratio = %4.2lf -> percmathistory = "GT_WU"\n",
                 ratio, arguments->se_perc_match_hist);
        }
      } else {
        /* no a/c/g/t characters at all */
        had_err = -1;
      }
    } else {
      had_err = -1;
    }
    if (had_err) {
      gt_error_set(err, "option \"-bias-parameters\" can only be applied to "
                   "the DNA alphabet");
      gt_encseq_delete(aencseq);
      gt_encseq_delete(bencseq);
    }
  }

  /* Prepare options for greedy extension */
  if (!had_err && gt_option_is_set(arguments->se_option_greedy)) {
    grextinfo = gt_greedy_extend_matchinfo_new(errorpercentage,
                                               arguments->se_maxalilendiff,
                                               arguments->se_historysize,
                                               arguments->se_perc_match_hist,
                                               arguments->se_alignlength,
                                               cam,
                                               arguments->se_extendgreedy);
    if (arguments->benchmark) {
      gt_greedy_extend_matchinfo_silent_set(grextinfo);
    }
  }

  /* Prepare options for xdrop extension */
  if (!had_err && gt_option_is_set(arguments->se_option_xdrop)) {
    xdropinfo = gt_xdrop_matchinfo_new(arguments->se_alignlength,
                                       errorpercentage,
                                       arguments->se_xdropbelowscore,
                                       arguments->se_extendxdrop);
    if (arguments->benchmark) {
      gt_xdrop_matchinfo_silent_set(xdropinfo);
    }
  }

  /* Prepare output options */
  if (!had_err && (arguments->se_alignmentwidth > 0 ||
                   gt_option_is_set(arguments->se_option_xdrop)))
  {
    querymatchoutopt
      = gt_querymatchoutoptions_new(arguments->se_alignmentwidth);

    if (gt_option_is_set(arguments->se_option_xdrop) ||
        gt_option_is_set(arguments->se_option_greedy))
    {
      /* without greedy extension a fixed sensitivity of 100 is used */
      const GtUword sensitivity = gt_option_is_set(arguments->se_option_greedy)
                                    ? arguments->se_extendgreedy : 100;

      gt_querymatchoutoptions_extend(querymatchoutopt,
                                     errorpercentage,
                                     arguments->se_maxalilendiff,
                                     arguments->se_historysize,
                                     arguments->se_perc_match_hist,
                                     cam,
                                     sensitivity);
    }
  }

  /* Start algorithm */
  if (!had_err) {
    GtDiagbandseed dbsarguments;
    dbsarguments.errorpercentage = errorpercentage;
    dbsarguments.userdefinedleastlength = arguments->se_alignlength;
    dbsarguments.seedlength = arguments->dbs_seedlength;
    dbsarguments.logdiagbandwidth = arguments->dbs_logdiagbandwidth;
    dbsarguments.mincoverage = arguments->dbs_mincoverage;
    dbsarguments.maxfreq = arguments->dbs_maxfreq;
    dbsarguments.memlimit = arguments->dbs_memlimit;
    dbsarguments.mirror = arguments->mirror;
    dbsarguments.overlappingseeds = arguments->overlappingseeds;
    dbsarguments.verify = arguments->dbs_verify;
    dbsarguments.verbose = arguments->verbose;
    dbsarguments.debug_kmer = arguments->dbs_debug_kmer;
    dbsarguments.debug_seedpair = arguments->dbs_debug_seedpair;
    dbsarguments.seed_display = arguments->seed_display;
    dbsarguments.extendgreedyinfo = grextinfo;
    dbsarguments.extendxdropinfo = xdropinfo;
    dbsarguments.querymatchoutopt = querymatchoutopt;

    had_err = gt_diagbandseed_run(aencseq, bencseq, &dbsarguments, err);

    /* clean up */
    gt_encseq_delete(aencseq);
    gt_encseq_delete(bencseq);
    /* the delete conditions mirror the conditions under which the objects
       were created above */
    if (gt_option_is_set(arguments->se_option_greedy)) {
      gt_greedy_extend_matchinfo_delete(grextinfo);
    }
    if (gt_option_is_set(arguments->se_option_xdrop)) {
      gt_xdrop_matchinfo_delete(xdropinfo);
    }
    if (arguments->se_alignmentwidth > 0 ||
        gt_option_is_set(arguments->se_option_xdrop)) {
      gt_querymatchoutoptions_delete(querymatchoutopt);
    }
  }

  if (gt_showtime_enabled()) {
    /* the timing line is only printed for successful runs */
    if (!had_err) {
      char *keystring
        = gt_seed_extend_params_keystring(gt_option_is_set(arguments->
                                                           se_option_greedy),
                                          gt_option_is_set(arguments->
                                                           se_option_xdrop),
                                          arguments->dbs_seedlength,
                                          arguments->se_alignlength,
                                          arguments->se_minidentity,
                                          arguments->se_maxalilendiff,
                                          arguments->se_perc_match_hist,
                                          arguments->se_extendgreedy,
                                          arguments->se_extendxdrop,
                                          arguments->se_xdropbelowscore);
      printf("# TIME seedextend-%s", keystring);
      gt_free(keystring);
      gt_timer_show_formatted(seedextendtimer,
                              " overall " GT_WD ".%06ld\n",
                              stdout);
    }
    gt_timer_delete(seedextendtimer);
  }
  return had_err;
}
/*
 * Returns the alphabet of the encoded sequence underlying <bs>, exactly as
 * delivered by gt_encseq_alphabet() (no additional reference is acquired
 * here).
 */
GtAlphabet* gt_bioseq_get_alphabet(GtBioseq *bs)
{
  GtAlphabet *alphabet;

  gt_assert(bs);
  alphabet = gt_encseq_alphabet(bs->encseq);
  return alphabet;
}
/*
 * Runner for the matching statistics / unique forward match tools.
 *
 * Depending on arguments->indextype the index is mapped as FM-index,
 * enhanced suffix array or packed index. The matching function is chosen
 * according to the index type and arguments->doms, and all query files are
 * processed with gt_findsubquerygmatchforward().
 *
 * <tool_arguments> must point to a Gfmsubcallinfo object.
 * Returns 0 on success and -1 on error.
 */
static int gt_matstat_runner(GT_UNUSED int argc,
                             GT_UNUSED const char **argv,
                             GT_UNUSED int parsed_args,
                             void *tool_arguments,
                             GtError *err)
{
  Gfmsubcallinfo *arguments = tool_arguments;
  Fmindex fmindex;
  Suffixarray suffixarray;
  void *packedindex = NULL;
  GtLogger *logger = NULL;
  bool haserr = false;
  const GtAlphabet *alphabet = NULL;
#ifdef WITHBCKTAB
  unsigned int prefixlength = 0;
#endif
  GtUword totallength;
  bool gt_mapfmindexfail = false;

  gt_error_check(err);
  gt_assert(arguments);
  logger = gt_logger_new(false, GT_LOGGER_DEFLT_PREFIX, stdout);
  if (arguments->indextype == Fmindextype)
  {
    if (gt_mapfmindex(&fmindex,gt_str_get(arguments->indexname),
                      logger, err) != 0)
    {
      haserr = true;
      gt_mapfmindexfail = true;
      /* fmindex was not mapped, so none of its fields may be read */
      totallength = 0;
    } else
    {
      alphabet = fmindex.alphabet;
      totallength = fmindex.bwtlength-1;
    }
  } else
  {
    unsigned int mappedbits;

    if (arguments->indextype == Esaindextype)
    {
      /* the #undef switches off every WITHBCKTAB section from here to the
         end of the translation unit, including the one directly below */
      mappedbits = SARR_ESQTAB | SARR_SUFTAB
#undef WITHBCKTAB
#ifdef WITHBCKTAB
                   | SARR_BCKTAB
#endif
                   ;
    } else
    {
      if (dotestsequence(arguments))
      {
        mappedbits = SARR_ESQTAB;
      } else
      {
        mappedbits = 0;
      }
    }
    if (gt_mapsuffixarray(&suffixarray,
                          mappedbits,
                          gt_str_get(arguments->indexname),
                          logger,
                          err) != 0)
    {
      haserr = true;
      totallength = 0;
    } else
    {
      alphabet = gt_encseq_alphabet(suffixarray.encseq);
#ifdef WITHBCKTAB
      prefixlength = suffixarray.prefixlength;
#endif
      totallength = gt_encseq_total_length(suffixarray.encseq);
    }
    if (!haserr)
    {
      if (arguments->indextype == Packedindextype)
      {
        packedindex = gt_loadvoidBWTSeqForSA(gt_str_get(arguments->indexname),
                                             false,
                                             err);
        if (packedindex == NULL)
        {
          haserr = true;
        }
      }
    }
  }
  if (!haserr)
  {
    const void *theindex;
    Greedygmatchforwardfunction gmatchforwardfunction;

    /* select the index object and the matching function for it */
    if (arguments->indextype == Fmindextype)
    {
      theindex = (const void *) &fmindex;
      if (arguments->doms)
      {
        gmatchforwardfunction = gt_skfmmstats;
      } else
      {
        gmatchforwardfunction = gt_skfmuniqueforward;
      }
    } else
    {
      if (arguments->indextype == Esaindextype)
      {
        theindex = (const void *) &suffixarray;
        if (arguments->doms)
        {
          gmatchforwardfunction = gt_suffixarraymstats;
        } else
        {
          gmatchforwardfunction = gt_suffixarrayuniqueforward;
        }
      } else
      {
        gt_assert(arguments->indextype == Packedindextype);
        theindex = (const void *) packedindex;
        if (arguments->doms)
        {
          gmatchforwardfunction = gt_voidpackedindexmstatsforward;
        } else
        {
          gmatchforwardfunction = gt_voidpackedindexuniqueforward;
        }
      }
    }
    if (!haserr)
    {
#ifdef WITHBCKTAB
      if (prefixlength > 0 &&
          arguments->indextype == Esaindextype &&
          runsubstringiteration(gmatchforwardfunction,
                                theindex,
                                totallength,
                                suffixarray.bcktab,
                                suffixarray.countspecialcodes,
                                alphabet,
                                prefixlength,
                                arguments->queryfilenames,
                                err) != 0)
      {
        haserr = true;
      }
#endif
      if (!haserr &&
          gt_findsubquerygmatchforward(dotestsequence(arguments)
                                         ? suffixarray.encseq
                                         : NULL,
                                       theindex,
                                       totallength,
                                       gmatchforwardfunction,
                                       alphabet,
                                       arguments->queryfilenames,
                                       arguments->minlength,
                                       arguments->maxlength,
                                       (arguments->showmode & SHOWSEQUENCE)
                                         ? true : false,
                                       (arguments->showmode & SHOWQUERYPOS)
                                         ? true : false,
                                       (arguments->showmode & SHOWSUBJECTPOS)
                                         ? true : false,
                                       err) != 0)
      {
        haserr = true;
      }
    }
  }
  if (arguments->indextype == Fmindextype)
  {
    /* only a successfully mapped FM-index is freed */
    if (!gt_mapfmindexfail)
    {
      gt_freefmindex(&fmindex);
    }
  } else
  {
    if (arguments->indextype == Packedindextype && packedindex != NULL)
    {
      gt_deletevoidBWTSeq(packedindex);
    }
    gt_freesuffixarray(&suffixarray);
  }
  gt_logger_delete(logger);
  return haserr ? -1 : 0;
}
/*
 * Self test for maximal pair computation.
 *
 * Loads the encoded sequence of index <indexname> and, <samples> times,
 * draws two substrings of length at most <substringlength> (capped at half
 * the total length), computes their matches of length >= <minlength> with
 * gt_sarrquerysubstringmatch() and sarrselfsubstringmatch(), and compares
 * the two sorted result lists. A mismatch is reported on stdout and
 * terminates the program with GT_EXIT_PROGRAMMING_ERROR.
 *
 * Returns 0 on success and -1 if the encseq cannot be loaded or one of the
 * match computations fails. All per-sample resources are released on the
 * error paths as well.
 */
int gt_testmaxpairs(const char *indexname,
                    GtUword samples,
                    unsigned int minlength,
                    GtUword substringlength,
                    GtLogger *logger,
                    GtError *err)
{
  GtEncseq *encseq;
  GtUword totallength = 0, dblen, querylen;
  GtUchar *dbseq = NULL, *query = NULL;
  bool haserr = false;
  GtUword s;
  GtArray *tabmaxquerymatches;
  Maxmatchselfinfo maxmatchselfinfo;
  GtEncseqLoader *el;

  gt_logger_log(logger,"draw "GT_WU" samples",samples);

  el = gt_encseq_loader_new();
  gt_encseq_loader_do_not_require_des_tab(el);
  gt_encseq_loader_do_not_require_ssp_tab(el);
  gt_encseq_loader_do_not_require_sds_tab(el);
  gt_encseq_loader_set_logger(el, logger);
  encseq = gt_encseq_loader_load(el, indexname, err);
  gt_encseq_loader_delete(el);
  if (encseq == NULL)
  {
    haserr = true;
  } else
  {
    totallength = gt_encseq_total_length(encseq);
  }
  if (!haserr)
  {
    if (substringlength > totallength/2)
    {
      substringlength = totallength/2;
    }
    dbseq = gt_malloc(sizeof *dbseq * substringlength);
    query = gt_malloc(sizeof *query * substringlength);
  }
  for (s=0; s<samples && !haserr; s++)
  {
    dblen = samplesubstring(dbseq,encseq,substringlength);
    querylen = samplesubstring(query,encseq,substringlength);
    gt_logger_log(logger,"run query match for dblen="GT_WU""
                         ",querylen= "GT_WU", minlength=%u",
                         dblen,
                         querylen,
                         minlength);
    tabmaxquerymatches = gt_array_new(sizeof (Substringmatch));
    if (gt_sarrquerysubstringmatch(dbseq,
                                   dblen,
                                   query,
                                   (GtUword) querylen,
                                   minlength,
                                   gt_encseq_alphabet(encseq),
                                   storemaxmatchquery,
                                   tabmaxquerymatches,
                                   logger,
                                   err) != 0)
    {
      haserr = true;
      /* nothing else has been allocated for this sample yet */
      gt_array_delete(tabmaxquerymatches);
      break;
    }
    gt_logger_log(logger,"run self match for dblen="GT_WU""
                         ",querylen= "GT_WU", minlength=%u",
                         dblen,
                         querylen,
                         minlength);
    maxmatchselfinfo.results = gt_array_new(sizeof (Substringmatch));
    maxmatchselfinfo.dblen = dblen;
    maxmatchselfinfo.querylen = querylen;
    maxmatchselfinfo.querymarkpos
      = sequence2markpositions(&maxmatchselfinfo.numofquerysequences,
                               query,querylen);
    if (sarrselfsubstringmatch(dbseq,
                               dblen,
                               query,
                               (GtUword) querylen,
                               minlength,
                               gt_encseq_alphabet(encseq),
                               storemaxmatchself,
                               &maxmatchselfinfo,
                               logger,
                               err) != 0)
    {
      haserr = true;
      /* release everything allocated for this sample before leaving */
      gt_free(maxmatchselfinfo.querymarkpos);
      gt_array_delete(tabmaxquerymatches);
      gt_array_delete(maxmatchselfinfo.results);
      break;
    }
    gt_array_sort(tabmaxquerymatches,orderSubstringmatch);
    gt_array_sort(maxmatchselfinfo.results,orderSubstringmatch);
    if (!gt_array_equal(tabmaxquerymatches,maxmatchselfinfo.results,
                        orderSubstringmatch))
    {
      const GtUword width = 60UL;
      printf("failure for query of length "GT_WU"\n",(GtUword) querylen);
      printf("querymatches\n");
      (void) gt_array_iterate(tabmaxquerymatches,showSubstringmatch,NULL,
                              err);
      printf("dbmatches\n");
      (void) gt_array_iterate(maxmatchselfinfo.results,showSubstringmatch,
                              NULL,err);
      gt_symbolstring2fasta(stdout,"dbseq",
                            gt_encseq_alphabet(encseq),
                            dbseq,
                            (GtUword) dblen,
                            width);
      gt_symbolstring2fasta(stdout,"queryseq",
                            gt_encseq_alphabet(encseq),
                            query,
                            (GtUword) querylen,
                            width);
      exit(GT_EXIT_PROGRAMMING_ERROR);
    }
    gt_free(maxmatchselfinfo.querymarkpos);
    printf("# numberofmatches="GT_WU"\n",gt_array_size(tabmaxquerymatches));
    gt_array_delete(tabmaxquerymatches);
    gt_array_delete(maxmatchselfinfo.results);
  }
  gt_free(dbseq);
  gt_free(query);
  gt_encseq_delete(encseq);
  encseq = NULL;
  return haserr ? -1 : 0;
}
/*
 * Runner of the condenseq compress tool.
 *
 * Loads the encoded sequence named by argv[parsed_args], derives the
 * parameters that were left undefined (kmersize, windowsize, minalignlength,
 * initsize) from the ones that are set, validates their relations and
 * creates the condenseq index named arguments->indexname (defaults to the
 * basename of the input).
 * With arguments->kdb the k-mer database log is written to "kmer_db.out".
 *
 * <tool_arguments> must point to a GtCondenseqCompressArguments object.
 * Returns 0 on success and a non-zero value on error.
 */
static int gt_condenseq_compress_runner(GT_UNUSED int argc,
                                        const char **argv,
                                        int parsed_args,
                                        void *tool_arguments,
                                        GtError *err)
{
  GtCondenseqCompressArguments *arguments = tool_arguments;
  GtLogger *logger,
           *kdb_logger;
  FILE *kmer_fp = NULL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);
  kdb_logger = gt_logger_new(arguments->kdb, GT_LOGGER_DEFLT_PREFIX, stderr);
  if (arguments->kdb) {
    kmer_fp = gt_fa_fopen("kmer_db.out", "w", err);
    if (kmer_fp == NULL) {
      /* do not hand a NULL stream to the logger and do not continue with
         an already set error object */
      had_err = -1;
    }
    else {
      gt_logger_set_target(kdb_logger, kmer_fp);
    }
  }

  if (gt_str_length(arguments->indexname) == 0UL) {
    char *basenameptr;
    basenameptr = gt_basename(argv[parsed_args]);
    gt_str_set(arguments->indexname, basenameptr);
    gt_free(basenameptr);
  }

  if (!had_err) {
    GtEncseqLoader *es_l = gt_encseq_loader_new();
    arguments->input_es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->input_es == NULL)
      had_err = -1;
    gt_encseq_loader_delete(es_l);
  }

  if (!had_err) {
    /* derive undefined values top-down from the defined ones:
       initsize -> minalignlength -> windowsize */
    if (arguments->minalignlength == GT_UNDEF_UWORD)
      arguments->minalignlength = arguments->initsize != GT_UNDEF_UWORD ?
                                    arguments->initsize / (GtUword) 3UL :
                                    GT_UNDEF_UWORD;
    if (arguments->windowsize == GT_UNDEF_UINT)
      arguments->windowsize = arguments->minalignlength != GT_UNDEF_UWORD ?
                                (unsigned int)
                                  (arguments->minalignlength / 5U) :
                                GT_UNDEF_UINT;
    if (arguments->windowsize < 4U)
      arguments->windowsize = 4U;
    if (arguments->kmersize == GT_UNDEF_UINT) {
      unsigned int size =
        gt_alphabet_num_of_chars(gt_encseq_alphabet(arguments->input_es));
      /* size^k ~= 100000 */
      gt_safe_assign(arguments->kmersize,
                     gt_round_to_long(gt_log_base(100000.0, (double) size)));
      gt_logger_log(logger, "|A|: %u, k: %u",
                    size, arguments->kmersize);
    }
    /* derive the values that are still undefined bottom-up:
       kmersize -> windowsize -> minalignlength -> initsize */
    if (arguments->windowsize == GT_UNDEF_UINT) {
      arguments->windowsize = 5U * arguments->kmersize;
    }
    if (arguments->minalignlength == GT_UNDEF_UWORD) {
      arguments->minalignlength = (GtUword) (3UL * arguments->windowsize);
    }
    if (arguments->initsize == GT_UNDEF_UWORD) {
      arguments->initsize = (GtUword) (3UL * arguments->minalignlength);
    }
  }

  if (!had_err && arguments->windowsize <= arguments->kmersize) {
    gt_error_set(err, "-windowsize (%u) must be larger -kmersize (%u)!",
                 arguments->windowsize, arguments->kmersize);
    had_err = -1;
  }
  if (!had_err &&
      arguments->minalignlength < (GtUword) arguments->windowsize) {
    gt_error_set(err, "-alignlength (" GT_WU ") must be at least "
                 "-windowsize (%u)!", arguments->minalignlength,
                 arguments->windowsize);
    had_err = -1;
  }
  if (!had_err && (arguments->initsize < arguments->minalignlength)) {
    gt_error_set(err, "-initsize (" GT_WU ") must be at least "
                 "-alignlength (" GT_WU ")!", arguments->initsize,
                 arguments->minalignlength);
    had_err = -1;
  }

  if (!had_err) {
    GtCondenseqCreator *ces_c;

    ces_c = gt_condenseq_creator_new(arguments->initsize,
                                     arguments->minalignlength,
                                     arguments->xdrop,
                                     &(arguments->scores),
                                     arguments->kmersize,
                                     arguments->windowsize,
                                     logger,
                                     err);
    if (ces_c == NULL) {
      had_err = -1;
    }
    else {
      if (arguments->cutoff_value == GT_UNDEF_UWORD)
        gt_condenseq_creator_use_mean_cutoff(ces_c);
      else if (arguments->cutoff_value == 0)
        gt_condenseq_creator_disable_cutoff(ces_c);
      else
        gt_condenseq_creator_set_cutoff(ces_c, arguments->cutoff_value);
      gt_condenseq_creator_set_mean_fraction(ces_c, arguments->fraction);
      if (arguments->prune)
        gt_condenseq_creator_disable_prune(ces_c);
      if (arguments->brute)
        gt_condenseq_creator_enable_brute_force(ces_c);
      if (!arguments->diags)
        gt_condenseq_creator_disable_diagonals(ces_c);
      if (arguments->full_diags)
        gt_condenseq_creator_enable_full_diagonals(ces_c);
      if (arguments->clean_percent != GT_UNDEF_UINT)
        gt_condenseq_creator_set_diags_clean_limit(ces_c,
                                                   arguments->clean_percent);

      had_err = gt_condenseq_creator_create(ces_c,
                                            arguments->indexname,
                                            arguments->input_es,
                                            logger, kdb_logger, err);

      gt_condenseq_creator_delete(ces_c);
    }
  }

  gt_logger_delete(logger);
  gt_logger_delete(kdb_logger);
  /* only close a stream that was actually opened */
  if (kmer_fp != NULL)
    gt_fa_fclose(kmer_fp);

  return had_err;
}
/* Matches all query sequences from idxlocalioptions->queryfiles against the
   index named idxlocalioptions->indexname.
   - doonline: the encoded sequence is loaded and every query is processed
     with gt_multiapplysmithwaterman().
   - otherwise: a generic index is loaded and every query is processed with
     gt_indexbasedlocali().
   - docompare: both methods are run, matches are stored instead of shown,
     and gt_checkandresetstorematch() is called after every query.
   Returns 0 on success and -1 if loading the sequence/index, creating the
   query iterator or reading a query failed (err is set by the failing
   call). */
int gt_runidxlocali(const IdxlocaliOptions *idxlocalioptions,GtError *err)
{
  Genericindex *genericindex = NULL;
  bool haserr = false;
  GtLogger *logger;
  const GtEncseq *encseq = NULL;

  logger = gt_logger_new(idxlocalioptions->verbose,
                         GT_LOGGER_DEFLT_PREFIX, stdout);
  if (idxlocalioptions->doonline)
  {
    GtEncseqLoader *el;

    el = gt_encseq_loader_new();
    gt_encseq_loader_require_multiseq_support(el);
    gt_encseq_loader_drop_description_support(el);
    gt_encseq_loader_set_logger(el, logger);
    encseq = gt_encseq_loader_load(el,
                                   gt_str_get(idxlocalioptions->indexname),
                                   err);
    gt_encseq_loader_delete(el);
    if (encseq == NULL)
    {
      haserr = true;
    }
  } else
  {
    genericindex = genericindex_new(gt_str_get(idxlocalioptions->indexname),
                                    idxlocalioptions->withesa,
                                    idxlocalioptions->withesa ||
                                    idxlocalioptions->docompare,
                                    false,
                                    true,
                                    0,
                                    logger,
                                    err);
    if (genericindex == NULL)
    {
      haserr = true;
    } else
    {
      encseq = genericindex_getencseq(genericindex);
    }
  }
  if (!haserr)
  {
    GtSeqIterator *seqit;
    const GtUchar *query;
    unsigned long querylen;
    char *desc = NULL;
    int retval;
    Limdfsresources *limdfsresources = NULL;
    const AbstractDfstransformer *dfst;
    SWdpresource *swdpresource = NULL;
    Showmatchinfo showmatchinfo;
    ProcessIdxMatch processmatch;
    GtAlphabet *a;
    void *processmatchinfoonline, *processmatchinfooffline;
    Storematchinfo storeonline, storeoffline;

    a = gt_encseq_alphabet(encseq);
    if (idxlocalioptions->docompare)
    {
      /* collect the matches of both methods so they can be compared */
      processmatch = storematch;
      gt_initstorematch(&storeonline,encseq);
      gt_initstorematch(&storeoffline,encseq);
      processmatchinfoonline = &storeonline;
      processmatchinfooffline = &storeoffline;
    } else
    {
      processmatch = showmatch;
      showmatchinfo.encseq = encseq;
      showmatchinfo.characters = gt_alphabet_characters(a);
      showmatchinfo.wildcardshow = gt_alphabet_wildcard_show(a);
      showmatchinfo.showalignment = idxlocalioptions->showalignment;
      processmatchinfoonline = processmatchinfooffline = &showmatchinfo;
    }
    if (idxlocalioptions->doonline || idxlocalioptions->docompare)
    {
      swdpresource = gt_newSWdpresource(idxlocalioptions->matchscore,
                                        idxlocalioptions->mismatchscore,
                                        idxlocalioptions->gapextend,
                                        idxlocalioptions->threshold,
                                        idxlocalioptions->showalignment,
                                        processmatch,
                                        processmatchinfoonline);
    }
    dfst = gt_locali_AbstractDfstransformer();
    if (!idxlocalioptions->doonline || idxlocalioptions->docompare)
    {
      gt_assert(genericindex != NULL);
      limdfsresources = gt_newLimdfsresources(genericindex,
                                              true,
                                              0,
                                              0,    /* maxpathlength */
                                              true, /* keepexpandedonstack */
                                              processmatch,
                                              processmatchinfooffline,
                                              NULL, /* processresult */
                                              NULL, /* processresult info */
                                              dfst);
    }
    seqit = gt_seq_iterator_sequence_buffer_new(idxlocalioptions->queryfiles,
                                                err);
    if (seqit == NULL)
    {
      haserr = true;
    } else
    {
      gt_seq_iterator_set_symbolmap(seqit, gt_alphabet_symbolmap(a));
      /* queryunit doubles as the running query number */
      for (showmatchinfo.queryunit = 0; /* Nothing */;
           showmatchinfo.queryunit++)
      {
        retval = gt_seq_iterator_next(seqit, &query, &querylen, &desc, err);
        if (retval < 0)
        {
          haserr = true;
          break;
        }
        if (retval == 0)
        {
          break;
        }
        printf("process sequence " Formatuint64_t " of length %lu\n",
               PRINTuint64_tcast(showmatchinfo.queryunit),querylen);
        if (idxlocalioptions->doonline || idxlocalioptions->docompare)
        {
          gt_multiapplysmithwaterman(swdpresource,encseq,query,querylen);
        }
        if (!idxlocalioptions->doonline || idxlocalioptions->docompare)
        {
          gt_indexbasedlocali(limdfsresources,
                              idxlocalioptions->matchscore,
                              idxlocalioptions->mismatchscore,
                              idxlocalioptions->gapstart,
                              idxlocalioptions->gapextend,
                              idxlocalioptions->threshold,
                              query,
                              querylen,
                              dfst);
        }
        if (idxlocalioptions->docompare)
        {
          gt_checkandresetstorematch(showmatchinfo.queryunit,
                                     &storeonline,&storeoffline);
        }
      }
    }
    /* Release the search resources on every path, including the one where
       the query iterator could not be created; otherwise they would leak. */
    if (limdfsresources != NULL)
    {
      gt_freeLimdfsresources(&limdfsresources,dfst);
    }
    if (swdpresource != NULL)
    {
      gt_freeSWdpresource(swdpresource);
      swdpresource = NULL;
    }
    if (seqit != NULL)
    {
      gt_seq_iterator_delete(seqit);
    }
    if (idxlocalioptions->docompare)
    {
      gt_freestorematch(&storeonline);
      gt_freestorematch(&storeoffline);
    }
  }
  if (genericindex == NULL)
  {
    /* the encoded sequence was loaded directly and is deleted here */
    gt_encseq_delete((GtEncseq *) encseq);
    encseq = NULL;
  } else
  {
    genericindex_delete(genericindex);
  }
  gt_logger_delete(logger);
  logger = NULL;
  return haserr ? -1 : 0;
}
/* For every query sequence in arguments->queryname, sums the values returned
   by gt_pck_getShuStringLength() over all suffixes of the query against the
   packed index of arguments->indexname, and counts the query positions
   holding the encoded symbols 'c' or 'g'.
   With arguments->shulen_only the sum is printed; otherwise the per-position
   average and the c/g fraction are passed to gt_divergence() and the result
   of gt_calculateKr() is printed.
   Returns 0 on success and a non-zero value on failure: the index or the
   queries cannot be loaded, the alphabet is not DNA, or reading a query
   fails. */
int gt_genomediff_pck_shu_simple(GtLogger *logger,
                                 const GtGenomediffArguments *arguments,
                                 GtError *err)
{
  int had_err = 0;
  int retval;
  GtSeqIterator *queries = NULL;
  const GtUchar *symbolmap, *currentQuery;
  const GtAlphabet *alphabet;
  GtUchar c_sym = 0, g_sym = 0;
  uint64_t queryNo;
  char *description = NULL;
  unsigned long queryLength, subjectLength = 0, currentSuffix;
  double avgShuLength, currentShuLength = 0.0, gc_query;
  const FMindex *subjectindex = NULL;
  Genericindex *genericindexSubject;
  const GtEncseq *encseq = NULL;
  double *ln_n_fac;

  /* get the precalculation of ln(n!) for 0<n<max_ln_n_fac */
  ln_n_fac = gt_get_ln_n_fac(arguments->max_ln_n_fac);
  gt_log_log("ln(max_ln_n_fac!) = %f\n", ln_n_fac[arguments->max_ln_n_fac]);

  genericindexSubject = genericindex_new(gt_str_get(arguments->indexname),
                                         arguments->with_esa,
                                         true,
                                         false,
                                         true,
                                         arguments->user_max_depth,
                                         logger,
                                         err);
  if (genericindexSubject == NULL)
  {
    had_err = 1;
  } else
  {
    encseq = genericindex_getencseq(genericindexSubject);
  }

  if (!had_err)
  {
    subjectLength = genericindex_get_totallength(genericindexSubject) - 1;
    subjectindex = genericindex_get_packedindex(genericindexSubject);
    queries = gt_seqiterator_sequence_buffer_new(arguments->queryname, err);
    if (queries == NULL)
    {
      /* report the failure instead of only asserting on it */
      had_err = -1;
    }
  }

  if (!had_err)
  {
    alphabet = gt_encseq_alphabet(encseq);
    /* makes assumption that alphabet is dna, it has to calculate the gc! */
    if (!gt_alphabet_is_dna(alphabet))
    {
      /* a non-zero return value must come with a set error object */
      gt_error_set(err, "Sequences need to be dna");
      had_err = 1;
    } else
    {
      symbolmap = gt_alphabet_symbolmap(alphabet);
      gt_seqiterator_set_symbolmap(queries, symbolmap);
      c_sym = gt_alphabet_encode(alphabet, 'c');
      g_sym = gt_alphabet_encode(alphabet, 'g');
    }
  }

  for (queryNo = 0; !had_err; queryNo++)
  {
    retval = gt_seqiterator_next(queries,
                                 &currentQuery,
                                 &queryLength,
                                 &description,
                                 err);
    if (retval != 1)
    {
      if (retval < 0)
      {
        /* iterator failure: propagate it rather than returning success */
        gt_free(description);
        had_err = -1;
      }
      break;
    }
    gt_logger_log(logger, "found query of length: %lu", queryLength);
    avgShuLength = 0.0;
    gc_query = 0.0;
    for (currentSuffix = 0; currentSuffix < queryLength; currentSuffix++)
    {
      currentShuLength = (double) gt_pck_getShuStringLength(
                                             subjectindex,
                                             &currentQuery[currentSuffix],
                                             queryLength - currentSuffix);
      avgShuLength += currentShuLength;
      if (currentQuery[currentSuffix] == c_sym ||
          currentQuery[currentSuffix] == g_sym)
      {
        gc_query++;
      }
    }
    if (arguments->shulen_only)
    {
      printf("# Query %d sum of shulen:\n %.0f\n",
             (int) queryNo, avgShuLength);
    } else
    {
      double div, kr;

      avgShuLength /= (double) queryLength;
      gc_query /= (double) queryLength;
      gt_logger_log(logger, "Query %d has an average SHUstring length "
                            "of\n# shulength: %f",
                    (int) queryNo, avgShuLength);
      gt_logger_log(logger, "Query description: %s", description);
      gt_log_log("Query (i): %s", description);
      /* XXX: add error checks */
      gt_logger_log(logger, "shulen:\n%f", avgShuLength);
      gt_log_log("shu: %f, gc: %f, len: %lu",
                 avgShuLength, gc_query, subjectLength);
      div = gt_divergence(arguments->divergence_rel_err,
                          arguments->divergence_abs_err,
                          arguments->divergence_m,
                          arguments->divergence_threshold,
                          avgShuLength,
                          subjectLength,
                          gc_query,
                          ln_n_fac,
                          arguments->max_ln_n_fac);
      gt_logger_log(logger, "divergence:\n%f", div);
      kr = gt_calculateKr(div);
      printf("# Kr:\n%f\n", kr);
    }
  }
  gt_free(ln_n_fac);
  gt_seqiterator_delete(queries);
  genericindex_delete(genericindexSubject);
  return had_err;
}
/* Fills fmindex from the index files belonging to indexname:
   1. scans the file with suffix FMASCIIFILESUFFIX,
   2. maps the BWT encoding via mapbwtencoding(),
   3. takes a reference to the alphabet of the mapped encoding,
   4. computes the key values and maps the file with suffix
      FMDATAFILESUFFIX via gt_fillfmmapspecstartptr().
   The steps run in this order and stop at the first failure.
   Returns 0 on success; on failure gt_freefmindex(fmindex) is called and
   -1 is returned. */
int gt_mapfmindex (Fmindex *fmindex,const char *indexname,
                   GtLogger *logger,GtError *err)
{
  GtSpecialcharinfo specialcharinfo;
  bool storeindexpos = true;
  bool ok;
  FILE *fpin;

  gt_error_check(err);
  fmindex->mappedptr = NULL;
  fmindex->bwtformatching = NULL;
  fmindex->alphabet = NULL;

  fpin = gt_fa_fopen_with_suffix(indexname,FMASCIIFILESUFFIX,"rb",err);
  ok = fpin != NULL &&
       scanfmafileviafileptr(fmindex,
                             &specialcharinfo,
                             &storeindexpos,
                             indexname,
                             fpin,
                             logger,
                             err) == 0;
  /* closed in any case, exactly once, before the next step */
  gt_fa_xfclose(fpin);

  if (ok)
  {
    fmindex->bwtformatching = mapbwtencoding(indexname,logger,err);
    ok = fmindex->bwtformatching != NULL;
  }
  if (ok)
  {
    fmindex->specpos.nextfreeGtPairBwtidx
      = (unsigned long) gt_determinenumberofspecialstostore(&specialcharinfo);
    fmindex->specpos.spaceGtPairBwtidx = NULL;
    fmindex->specpos.allocatedGtPairBwtidx = 0;
    fmindex->alphabet
      = gt_alphabet_ref(gt_encseq_alphabet(fmindex->bwtformatching));
    ok = fmindex->alphabet != NULL;
  }
  if (ok)
  {
    GtStr *datafilename;

    gt_computefmkeyvalues (fmindex,
                           &specialcharinfo,
                           fmindex->bwtlength,
                           fmindex->log2bsize,
                           fmindex->log2markdist,
                           gt_alphabet_num_of_chars(fmindex->alphabet),
                           fmindex->suffixlength,
                           storeindexpos);
    datafilename = gt_str_new_cstr(indexname);
    gt_str_append_cstr(datafilename,FMDATAFILESUFFIX);
    ok = gt_fillfmmapspecstartptr(fmindex,storeindexpos,datafilename,
                                  err) == 0;
    gt_str_delete(datafilename);
  }
  if (!ok)
  {
    gt_freefmindex(fmindex);
    return -1;
  }
  return 0;
}
/* Advances the iterator to the next pair (sequence of es1, sequence of es2)
   whose alignment computed by gt_swalign() has a length of at least min_len
   and an evaluation (gt_alignment_eval()) of at most max_edist.
   Pairs are visited with the es2 sequence number running fastest.
   On success a new match is stored in *match and GT_MATCHER_STATUS_OK is
   returned; GT_MATCHER_STATUS_END is returned once all pairs were visited. */
static GtMatchIteratorStatus gt_match_iterator_sw_next(GtMatchIterator *mi,
                                                GT_UNUSED GtMatch **match,
                                                GT_UNUSED GtError *err)
{
  GtMatchIteratorSW *mis;
  GtSeq *seq_a, *seq_b;
  char *a, *b;
  const char *adesc, *bdesc;
  GtAlignment *ali = NULL;
  unsigned long seqlen_a, seqlen_b, seqpos;
  GtRange arng, brng;
  bool found = false;

  gt_assert(mi && match);
  mis = gt_match_iterator_sw_cast(mi);

  while (!found) {
    /* step to the next pair, except for the very first alignment */
    if (!mis->pvt->firstali)
      mis->pvt->seqno_es2++;
    if (mis->pvt->seqno_es2 == gt_encseq_num_of_sequences(mis->pvt->es2)) {
      mis->pvt->seqno_es1++;
      if (mis->pvt->seqno_es1 == gt_encseq_num_of_sequences(mis->pvt->es1))
        return GT_MATCHER_STATUS_END;
      mis->pvt->seqno_es2 = 0;
    }
    seqlen_a = gt_encseq_seqlength(mis->pvt->es1, mis->pvt->seqno_es1);
    seqlen_b = gt_encseq_seqlength(mis->pvt->es2, mis->pvt->seqno_es2);

    /* XXX: reuse buffers for performance improvement */
    a = gt_malloc(seqlen_a * sizeof (char));
    seqpos = gt_encseq_seqstartpos(mis->pvt->es1, mis->pvt->seqno_es1);
    gt_encseq_extract_decoded(mis->pvt->es1, a, seqpos,
                              seqpos + seqlen_a - 1);
    b = gt_malloc(seqlen_b * sizeof (char));
    seqpos = gt_encseq_seqstartpos(mis->pvt->es2, mis->pvt->seqno_es2);
    /* the second sequence must be decoded from es2: its start position and
       length were taken from es2, not es1 */
    gt_encseq_extract_decoded(mis->pvt->es2, b, seqpos,
                              seqpos + seqlen_b - 1);

    seq_a = gt_seq_new(a, seqlen_a, gt_encseq_alphabet(mis->pvt->es1));
    seq_b = gt_seq_new(b, seqlen_b, gt_encseq_alphabet(mis->pvt->es2));
    ali = gt_swalign(seq_a, seq_b, mis->pvt->sf);
    mis->pvt->firstali = false;

    if (ali && gt_alignment_get_length(ali) >= mis->pvt->min_len
          && gt_alignment_eval(ali) <= mis->pvt->max_edist) {
      found = true;
      arng = gt_alignment_get_urange(ali);
      brng = gt_alignment_get_vrange(ali);
      /* seqlen_a/seqlen_b are overwritten with the description lengths */
      adesc = gt_encseq_description(mis->pvt->es1, &seqlen_a,
                                    mis->pvt->seqno_es1);
      bdesc = gt_encseq_description(mis->pvt->es2, &seqlen_b,
                                    mis->pvt->seqno_es2);
      *match = gt_match_sw_new("", "",
                               mis->pvt->seqno_es1,
                               mis->pvt->seqno_es2,
                               gt_alignment_get_length(ali),
                               gt_alignment_eval(ali),
                               arng.start, brng.start,
                               arng.end, brng.end,
                               GT_MATCH_DIRECT);
      gt_match_set_seqid1_nt(*match, adesc, seqlen_a);
      gt_match_set_seqid2_nt(*match, bdesc, seqlen_b);
    }
    /* per-pair resources are released whether or not the pair matched */
    gt_alignment_delete(ali);
    gt_seq_delete(seq_a);
    gt_seq_delete(seq_b);
    gt_free(a);
    gt_free(b);
  }
  return GT_MATCHER_STATUS_OK;
}
/* Tool runner: loads the encoded sequence argv[parsed_args] and feeds its
   kmers either into a hash map (arguments->use_hash) or into a
   GtKmerDatabase in intervals of random length. Unless merge_only, use_hash
   or bench is set, a second database is filled kmer by kmer and compared
   with the interval-built one. With arguments->bench the consistency checks
   are skipped and 50000000 random accesses are timed instead.
   Returns 0 on success and a negative value on failure: the log file or the
   encoded sequence cannot be opened, the input is shorter than kmersize, or
   a consistency/compare check fails. */
static int gt_kmer_database_runner(GT_UNUSED int argc, const char **argv,
                                   int parsed_args, void *tool_arguments,
                                   GtError *err)
{
  GtKmerDatabaseArguments *arguments = tool_arguments;
  int had_err = 0;
  GtEncseq *es = NULL;
  GtUword es_length = 0,
          nu_kmer_codes = 0;
  GtKmerDatabase *compare_db = NULL,
                 *db = NULL;
  GtLogger *logger;
  FILE *fp = NULL;
  GtHashmap *kmer_hash = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->use_hash)
    kmer_hash = gt_hashmap_new(GT_HASH_DIRECT, NULL,
                               (GtFree) gt_kmer_database_delete_hash_value);
  if (arguments->bench)
    timer = gt_timer_new_with_progress_description("loading encoded sequence");

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  if (arguments->verbose && gt_str_length(arguments->print_filename) > 0UL) {
    fp = gt_fa_fopen(gt_str_get(arguments->print_filename), "w", err);
    if (fp == NULL)
      had_err = -1; /* do not hand a NULL target to the logger */
    else
      gt_logger_set_target(logger, fp);
  }

  if (!had_err) {
    GtEncseqLoader *es_l;
    if (arguments->bench)
      gt_timer_start(timer);
    es_l = gt_encseq_loader_new();
    es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->bench)
      gt_timer_show_progress(timer, "saving kmers (+iterating over file)",
                             stdout);
    if (es == NULL) {
      had_err = -1;
    }
    gt_encseq_loader_delete(es_l);
  }
  if (!had_err) {
    es_length = gt_encseq_total_length(es);
    if (es_length < (GtUword) arguments->kmersize) {
      gt_error_set(err, "Input is too short for used kmersize. File length: "
                   GT_WU " kmersize: %u", es_length, arguments->kmersize);
      had_err = -1;
    }
  }
  if (!had_err) {
    GtAlphabet *alphabet;
    alphabet = gt_encseq_alphabet(es);
    if (arguments->bench)
      nu_kmer_codes = gt_power_for_small_exponents(
                                            gt_alphabet_num_of_chars(alphabet),
                                            arguments->kmersize);
    if (!arguments->merge_only && !arguments->use_hash && !arguments->bench) {
      compare_db = gt_kmer_database_new(gt_alphabet_num_of_chars(alphabet),
                                        arguments->kmersize,
                                        arguments->sb_size, es);
    }
    if (!arguments->use_hash) {
      db = gt_kmer_database_new(gt_alphabet_num_of_chars(alphabet),
                                arguments->kmersize,
                                arguments->sb_size, es);
      if (arguments->cutoff) {
        if (arguments->mean_cutoff)
          gt_kmer_database_use_mean_cutoff(db, (GtUword) 2,
                                           arguments->cutoff_value);
        else
          gt_kmer_database_set_cutoff(db, arguments->cutoff_value);
        if (!arguments->prune)
          gt_kmer_database_set_prune(db);
      }
    }
  }

  if (!had_err) {
    GtUword startpos = 0,
            endpos;
    GtKmercodeiterator *iter;
    const GtKmercode *kmercode = NULL;
    iter = gt_kmercodeiterator_encseq_new(es, GT_READMODE_FORWARD,
                                          arguments->kmersize, 0);
    while (!had_err && startpos < es_length - (arguments->kmersize - 1)) {
      GtUword startpos_add_kmer = startpos;
      /* pick a random interval end; only merge_only clamps it to the
         sequence length */
      if (arguments->merge_only) {
        endpos = startpos + (arguments->kmersize - 1) +
                 (gt_rand_max((arguments->sb_size - 1) * 2));
        if (endpos > es_length)
          endpos = es_length;
      }
      else {
        endpos = startpos + (arguments->kmersize - 1) +
                 (gt_rand_max(arguments->sb_size - 1));
      }
      gt_kmercodeiterator_reset(iter, GT_READMODE_FORWARD, startpos);
      while ((kmercode = gt_kmercodeiterator_encseq_next(iter)) != NULL &&
             startpos_add_kmer <= endpos - (arguments->kmersize - 1)) {
        if (!arguments->merge_only && !arguments->use_hash &&
            !kmercode->definedspecialposition && !arguments->bench) {
          gt_kmer_database_add_kmer(compare_db, kmercode->code,
                                    startpos_add_kmer);
        }
        if (arguments->use_hash && !kmercode->definedspecialposition) {
          gt_kmer_database_add_to_hash(kmer_hash, kmercode->code,
                                       startpos_add_kmer);
        }
        startpos_add_kmer++;
      }
      if (!arguments->use_hash) {
        gt_kmer_database_add_interval(db, startpos, endpos);
        gt_kmer_database_print_buffer(db, logger);
        if (!arguments->bench)
          had_err = gt_kmer_database_check_consistency(db, err);
      }
      startpos = endpos + 1;
    }
    if (!arguments->use_hash) {
      gt_kmer_database_flush(db);
      gt_kmer_database_print_buffer(db, logger);
      if (!had_err && !arguments->bench)
        had_err = gt_kmer_database_check_consistency(db, err);
      if (!arguments->merge_only && !had_err && !arguments->bench)
        had_err = gt_kmer_database_check_consistency(compare_db, err);
      if (!arguments->merge_only && !arguments->bench)
        gt_kmer_database_print(compare_db, logger, true);
      if (!arguments->merge_only && !had_err && !arguments->bench)
        had_err = gt_kmer_database_compare(compare_db, db, err);
      gt_kmer_database_print(db, logger, true);
    }
    gt_kmercodeiterator_delete(iter);
  }

  if (arguments->bench) {
    /* the random-access benchmark needs the filled database/hash; after an
       error db may be NULL and nu_kmer_codes is 0, so it is skipped */
    if (!had_err) {
      GtKmerStartpos pos;
      GtArrayGtUword *pos_hash;
      GtUword rand_access = (GtUword) 50000000,
              rand_code,
              i,
              sum = 0;
      gt_timer_show_progress(timer, "random access", stdout);
      for (i = 0; i < rand_access; i++) {
        rand_code = gt_rand_max(nu_kmer_codes - 1);
        if (arguments->use_hash) {
          pos_hash = gt_hashmap_get(kmer_hash, (const void *) rand_code);
          if (pos_hash != NULL)
            sum += pos_hash->spaceGtUword[pos_hash->nextfreeGtUword - 1];
        }
        else {
          pos = gt_kmer_database_get_startpos(db, rand_code);
          if (pos.no_positions > 0)
            sum += pos.startpos[pos.no_positions - 1];
        }
      }
      printf("sum: " GT_WU "\n", sum);
      gt_timer_show_progress(timer, "", stdout);
      gt_timer_stop(timer);
    }
    gt_timer_delete(timer);
  }
  if (arguments->use_hash)
    gt_hashmap_delete(kmer_hash);
  gt_encseq_delete(es);
  if (!arguments->use_hash)
    gt_kmer_database_delete(db);
  if (!arguments->merge_only && !arguments->bench)
    gt_kmer_database_delete(compare_db);
  gt_logger_delete(logger);
  gt_fa_fclose(fp);

  return had_err;
}
/* Reads a condenseq data structure from file.
   Loads the encoded sequence indexname as the unique sequence, reads the
   remaining data from the file with suffix GT_CONDENSEQ_FILE_SUFFIX via
   condenseq_io(), and then stores, for every unique entry, the indices of
   the link entries that refer to it.
   Returns the new object, or NULL on failure: the encoded sequence or the
   condenseq file cannot be read, or there are more link elements than fit
   into a uint32_t. */
GtCondenseq *gt_condenseq_new_from_file(const char *indexname,
                                        GtLogger *logger, GtError *err)
{
  int had_err = 0;
  FILE* fp;
  GtEncseqLoader *esl;
  GtEncseq *unique_es;
  GtCondenseq *condenseq = NULL;

  /*load unique_es*/
  esl = gt_encseq_loader_new();
  unique_es = gt_encseq_loader_load(esl, indexname, err);
  /* the loader is no longer needed, whether or not loading succeeded */
  gt_encseq_loader_delete(esl);
  if (!unique_es)
    had_err = -1;

  if (!had_err) {
    condenseq = condenseq_new_empty(gt_encseq_alphabet(unique_es));
    condenseq->filename = gt_cstr_dup(indexname);
    condenseq->unique_es = unique_es;

    fp = gt_fa_fopen_with_suffix(indexname, GT_CONDENSEQ_FILE_SUFFIX,
                                 "rb", err);
    if (fp == NULL) {
      had_err = -1;
    }
    else {
      had_err = condenseq_io(condenseq, fp, gt_io_error_fread, err);
      /* close the file on the failure path too */
      gt_fa_fclose(fp);
      if (!had_err) {
        GtUword i;
        gt_assert(condenseq->uniques);
        gt_assert(condenseq->links);
        /*create link array for each unique entry*/
        for (i = 0; i < condenseq->udb_nelems; i++) {
          GT_INITARRAY(&(condenseq->uniques[i].links),uint32_t);
        }
        /* check for overflows: link indices are stored as uint32_t */
        if (condenseq->ldb_nelems > (GtUword) ((uint32_t) 0 - (uint32_t) 1)) {
          gt_error_set(err, "Overflow, to many link-elements. Can't be stored");
          had_err = -1;
        }
        /* iterate through link entries and store ids in corresponding unique
           entry array */
        for (i = 0; !had_err && (GtUword) i < condenseq->ldb_nelems; i++) {
          GtUword uid = condenseq->links[i].unique_id;
          gt_assert(uid < condenseq->udb_nelems);
          GT_STOREINARRAY(&(condenseq->uniques[uid].links),
                          uint32_t,
                          10,
                          (uint32_t) i);
        }
      }
    }
  }
  if (!had_err) {
    gt_assert(condenseq != NULL);
    if (condenseq->id_len != GT_UNDEF_UWORD)
      gt_logger_log(logger, "IDs const len: " GT_WU, condenseq->id_len);
    else
      gt_logger_log(logger, "using sdstab to access IDs");
  }
  if (had_err) {
    gt_condenseq_delete(condenseq);
    condenseq = NULL;
  }
  return (condenseq);
}
/* Tool runner: loads the encoded sequence argv[parsed_args] and runs the
   available checks on it, in this order: start positions, consistency for
   each applicable read mode, special ranges, mark positions, min/max and,
   if arguments->prefixlength > 0, gt_verifymappedstr().
   Read modes other than forward and reverse are only checked for DNA
   alphabets. Returns 0 if all checks pass, a negative value otherwise. */
static int gt_encseq_check_runner(GT_UNUSED int argc, const char **argv,
                                  int parsed_args, void *tool_arguments,
                                  GtError *err)
{
  GtEncseqCheckArguments *arguments = tool_arguments;
  GtEncseqLoader *loader;
  GtEncseq *encseq;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  loader = gt_encseq_loader_new();
  encseq = gt_encseq_loader_load(loader, argv[parsed_args], err);
  if (encseq == NULL) {
    had_err = -1;
  }
  else {
    int mode;

    gt_encseq_check_startpositions(encseq);
    for (mode = 0; mode < 4 && !had_err; mode++) {
      const GtReadmode readmode = (GtReadmode) mode;

      /* skip read modes that are not applicable to this alphabet */
      if (!gt_alphabet_is_dna(gt_encseq_alphabet(encseq))
            && readmode != GT_READMODE_FORWARD
            && readmode != GT_READMODE_REVERSE) {
        continue;
      }
      if (gt_encseq_check_consistency(encseq,
                                      gt_encseq_filenames(encseq),
                                      readmode,
                                      arguments->scantrials,
                                      arguments->multicharcmptrials,
                                      gt_encseq_has_multiseq_support(encseq),
                                      err) != 0) {
        had_err = -1;
      }
    }
    if (!had_err) {
      gt_encseq_check_specialranges(encseq);
      gt_encseq_check_markpos(encseq);
      had_err = gt_encseq_check_minmax(encseq, err);
    }
    if (!had_err && arguments->prefixlength > 0
          && gt_verifymappedstr(encseq, arguments->prefixlength, err) != 0) {
      had_err = -1;
    }
  }
  gt_encseq_delete(encseq);
  gt_encseq_loader_delete(loader);
  return had_err;
}
/* Checks a packed (BWT) index against the suffix array of the same project.
   After parsing the options and verifying the index integrity, it draws
   params.numOfSamples patterns from the encoded sequence. Each pattern is
   searched in the suffix array (mmsearch) and in the packed index. If the
   index has locate information the match positions are compared one by one,
   otherwise only the match counts are compared.
   Progress and a summary of the trials that completed successfully are
   written to stderr.
   Returns 0 on success (or if option parsing requests exit) and -1 on any
   failure. */
extern int gt_packedindex_chk_search(int argc, const char *argv[],
                                     GtError *err)
{
  struct chkSearchOptions params;
  Suffixarray suffixarray;
  Enumpatterniterator *epi = NULL;
  bool saIsLoaded = false;
  BWTSeq *bwtSeq = NULL;
  GtStr *inputProject = NULL;
  int parsedArgs;
  bool had_err = false;
  BWTSeqExactMatchesIterator EMIter;
  bool EMIterInitialized = false;
  GtLogger *logger = NULL;

  inputProject = gt_str_new();

  /* single-pass loop: "break" jumps to the common cleanup below */
  do {
    gt_error_check(err);
    {
      bool exitNow = false;
      switch (parseChkBWTOptions(&parsedArgs, argc, argv, &params,
                                 inputProject, err))
      {
        case GT_OPTION_PARSER_OK:
          break;
        case GT_OPTION_PARSER_ERROR:
          had_err = true;
          exitNow = true;
          break;
        case GT_OPTION_PARSER_REQUESTS_EXIT:
          exitNow = true;
          break;
      }
      if (exitNow)
        break;
    }
    gt_str_set(inputProject, argv[parsedArgs]);

    logger = gt_logger_new(params.verboseOutput,
                           GT_LOGGER_DEFLT_PREFIX, stdout);

    bwtSeq = gt_availBWTSeq(&params.idx.final, logger, err);
    if ((had_err = bwtSeq == NULL))
      break;

    {
      enum verifyBWTSeqErrCode retval =
        gt_BWTSeqVerifyIntegrity(bwtSeq, gt_str_get(inputProject),
                                 params.flags, params.progressInterval,
                                 stderr, logger, err);
      if ((had_err = (retval != VERIFY_BWTSEQ_NO_ERROR)))
      {
        fprintf(stderr, "index integrity check failed: %s\n",
                gt_error_get(err));
        gt_error_set(err, "aborted because of index integrity check fail");
        break;
      }
    }
    if (BWTSeqHasLocateInformation(bwtSeq))
    {
      if ((had_err = !gt_initEmptyEMIterator(&EMIter, bwtSeq)))
      {
        gt_error_set(err, "Cannot create matches iterator for sequence index.");
        break;
      }
      EMIterInitialized = true;
    }
    {
      unsigned long totalLen, dbstart;
      unsigned long trial, patternLen;

      if ((had_err =
           gt_mapsuffixarray(&suffixarray, SARR_SUFTAB | SARR_ESQTAB,
                             gt_str_get(inputProject), NULL, err) != 0))
      {
        gt_error_set(err, "Can't load suffix array project with"
                  " demand for encoded sequence and suffix table files\n");
        break;
      }
      totalLen = gt_encseq_total_length(suffixarray.encseq);
      saIsLoaded = true;
      if ((had_err = (params.minPatLen >= 0L && params.maxPatLen >= 0L
                      && params.minPatLen > params.maxPatLen)))
      {
        gt_error_set(err, "Invalid pattern lengths selected: min=%ld, max=%ld;"
                  " min <= max is required.", params.minPatLen,
                  params.maxPatLen);
        break;
      }
      /* negative lengths mean "not given": derive them from the recommended
         prefix length */
      if (params.minPatLen < 0 || params.maxPatLen < 0)
      {
        unsigned int numofchars
          = gt_alphabet_num_of_chars(gt_encseq_alphabet(suffixarray.encseq));
        if (params.minPatLen < 0)
          params.minPatLen
            = gt_recommendedprefixlength(numofchars, totalLen,
                                         GT_RECOMMENDED_MULTIPLIER_DEFAULT,
                                         true);
        if (params.maxPatLen < 0)
          params.maxPatLen =
            MAX(params.minPatLen,
                125 * gt_recommendedprefixlength(numofchars,totalLen,
                                         GT_RECOMMENDED_MULTIPLIER_DEFAULT,
                                         true)/100);
        else
          params.maxPatLen = MAX(params.maxPatLen, params.minPatLen);
      }
      /* the lengths are compared against 0 and printed with %ld above, so
         they are printed as signed values here as well */
      fprintf(stderr, "Using patterns of lengths %ld to %ld\n",
              params.minPatLen, params.maxPatLen);
      if ((had_err = totalLen + 1 != BWTSeqLength(bwtSeq)))
      {
        gt_error_set(err, "base suffix array and index have diferrent lengths!"
                          "%lu vs. %lu",  totalLen + 1,
                     BWTSeqLength(bwtSeq));
        break;
      }
      if ((had_err =
           (epi = gt_newenumpatterniterator(params.minPatLen,
                                            params.maxPatLen,
                                            suffixarray.encseq,
                                            err)) == NULL))
      {
        fputs("Creation of pattern iterator failed!\n", stderr);
        break;
      }
      for (trial = 0; !had_err && trial < params.numOfSamples; ++trial)
      {
        const GtUchar *pptr = gt_nextEnumpatterniterator(&patternLen, epi);
        GtMMsearchiterator *mmsi =
          gt_mmsearchiterator_new_complete_olain(suffixarray.encseq,
                                                 suffixarray.suftab,
                                                 0,  /* leftbound */
                                                 totalLen, /* rightbound */
                                                 0, /* offset */
                                                 suffixarray.readmode,
                                                 pptr,
                                                 patternLen);
        if (BWTSeqHasLocateInformation(bwtSeq))
        {
          if ((had_err = !gt_reinitEMIterator(&EMIter, bwtSeq, pptr,
                                              patternLen, false)))
          {
            fputs("Internal error: failed to reinitialize pattern match"
                  " iterator", stderr);
            abort();
          }
          gt_assert(gt_EMINumMatchesTotal(&EMIter)
                    == gt_BWTSeqMatchCount(bwtSeq, pptr, patternLen, false));
          gt_assert(gt_EMINumMatchesTotal(&EMIter)
                    == gt_mmsearchiterator_count(mmsi));
          while (gt_mmsearchiterator_next(&dbstart,mmsi))
          {
            unsigned long matchPos = 0;
            bool match = EMIGetNextMatch(&EMIter, &matchPos, bwtSeq);
            if ((had_err = !match))
            {
              gt_error_set(err,
                           "matches of packedindex expired before mmsearch!");
              break;
            }
            if ((had_err = matchPos != dbstart))
            {
              gt_error_set(err, "packedindex match doesn't equal mmsearch "
                           "match result!\n%lu vs. %lu\n", matchPos, dbstart);
              /* stop here: another iteration would overwrite had_err */
              break;
            }
          }
          if (!had_err)
          {
            unsigned long matchPos;
            bool trailingMatch = EMIGetNextMatch(&EMIter, &matchPos, bwtSeq);
            if ((had_err = trailingMatch))
            {
              gt_error_set(err,
                           "matches of mmsearch expired before fmindex!");
            }
          }
        }
        else
        {
          unsigned long numFMIMatches
            = gt_BWTSeqMatchCount(bwtSeq, pptr, patternLen, false),
            numMMSearchMatches = gt_mmsearchiterator_count(mmsi);
          if ((had_err = numFMIMatches != numMMSearchMatches))
          {
            /* the suffix array count comes from mmsearch, so it goes first */
            gt_error_set(err, "Number of matches not equal for suffix array ("
                         "%lu) and fmindex (%lu).\n",
                         numMMSearchMatches, numFMIMatches);
          }
        }
        /* the search iterator is released on every path, including errors */
        gt_mmsearchiterator_delete(mmsi);
        mmsi = NULL;
        if (had_err)
          break; /* leaves trial at the number of successful trials */
        if (params.progressInterval && !((trial + 1) % params.progressInterval))
          putc('.', stderr);
      }
      if (params.progressInterval)
        putc('\n', stderr);
      fprintf(stderr, "Finished %lu of %lu matchings successfully.\n",
              trial, params.numOfSamples);
    }
  } while (0);

  if (EMIterInitialized)
    gt_destructEMIterator(&EMIter);
  if (saIsLoaded)
    gt_freesuffixarray(&suffixarray);
  gt_freeEnumpatterniterator(epi);
  if (bwtSeq)
    gt_deleteBWTSeq(bwtSeq);
  if (logger)
    gt_logger_delete(logger);
  if (inputProject)
    gt_str_delete(inputProject);
  return had_err?-1:0;
}