/*
  Leaf-edge callback: for a leaf reached at depth <fd> that belongs to
  sequence <seqnum> at relative position <relpos>, record the sequence number
  in the W set (if the suffix starts the sequence) and/or in the L set (if the
  prefix of length <fd> spans the complete sequence). Leaves shallower than
  state->minmatchlength are ignored. On the first edge of a node, the current
  fill level of the W set is remembered in <finfo>. Always returns 0.
*/
static int processleafedge_spmsk(bool firstedge,
                                 GtUword fd,
                                 GtBUinfo_spmsk *finfo,
                                 GtUword seqnum,
                                 GtUword relpos,
                                 GtBUstate_spmsk *state,
                                 GT_UNUSED GtError *err)
{
  /* too shallow to be of interest: nothing to record */
  if (fd < state->minmatchlength)
  {
    return 0;
  }
  if (firstedge)
  {
    gt_assert(finfo != NULL);
    /* remember where the entries of this node begin in the W set */
    finfo->firstinW = state->Wset.nextfreeGtUlong;
  }
  if (relpos == 0)
  {
    GT_STOREINARRAY(&state->Wset,GtUlong,128,seqnum);
  }
  if (gt_encseq_seqlength(state->encseq,seqnum) == relpos + fd)
  {
    GT_STOREINARRAY(&state->Lset,GtUlong,128,seqnum);
  }
  return 0;
}
Pckbuckettable *gt_pckbuckettable_new(const FMindex *fmindex, unsigned int numofchars, unsigned long totallength, unsigned int maxdepth) { GtArrayPckbck_Boundsatdepth stack; Pckbck_Boundsatdepth parent, child; unsigned long rangesize, idx, *rangeOccs; Pckbuckettable *pckbt; Mbtab *tmpmbtab; GT_INITARRAY(&stack,Pckbck_Boundsatdepth); child.lowerbound = 0; child.upperbound = totallength+1; child.depth = 0; child.code = (GtCodetype) 0; GT_STOREINARRAY(&stack,Pckbck_Boundsatdepth,128,child); rangeOccs = gt_malloc(sizeof (*rangeOccs) * GT_MULT2(numofchars)); tmpmbtab = gt_malloc(sizeof (*tmpmbtab) * numofchars); pckbt = pckbuckettable_allocandinittable(numofchars,maxdepth,true); while (stack.nextfreePckbck_Boundsatdepth > 0) { parent = stack.spacePckbck_Boundsatdepth[--stack.nextfreePckbck_Boundsatdepth]; gt_assert(parent.lowerbound < parent.upperbound); rangesize = gt_bwtrangesplitallwithoutspecial(tmpmbtab, rangeOccs, fmindex, parent.lowerbound, parent.upperbound); gt_assert(rangesize <= (unsigned long) numofchars); for (idx = 0; idx < rangesize; idx++) { child.lowerbound = tmpmbtab[idx].lowerbound; child.upperbound = tmpmbtab[idx].upperbound; child.depth = parent.depth + 1; gt_assert(child.depth <= maxdepth); child.code = parent.code * numofchars + idx; pckbuckettable_storeBoundsatdepth(pckbt,&child); if (child.depth < maxdepth) { if (child.lowerbound + 1 < child.upperbound) { GT_STOREINARRAY(&stack,Pckbck_Boundsatdepth,128,child); } else { pckbuckettable_followleafedge(pckbt,fmindex,&child); } } } } GT_FREEARRAY(&stack,Pckbck_Boundsatdepth); gt_free(rangeOccs); gt_free(tmpmbtab); printf("filled: %lu (%.2f)\n",pckbt->numofvalues, (double) pckbt->numofvalues/pckbt->maxnumofvalues); return pckbt; }
/*
  K-mer callback: <processinfo> is the GtArrayGtCodetype collecting the codes;
  the code of the reported k-mer is appended to it.
*/
static void outkmeroccurrence(void *processinfo, const GtKmercode *kmercode)
{
  GtArrayGtCodetype *collected = processinfo;

  GT_STOREINARRAY(collected,GtCodetype,1024,kmercode->code);
}
/*
  Append <position> to the position list stored in <hash> under the key
  <kmercode>. A missing list is allocated, initialised and registered in the
  hash first. An empty list grows by 20 entries, a non-empty one by a tenth of
  its currently allocated size.
*/
static void gt_kmer_database_add_to_hash(GtHashmap *hash,
                                         GtCodetype kmercode,
                                         GtUword position)
{
  void *key = (void *) kmercode;
  GtArrayGtUword *positions = (GtArrayGtUword *) gt_hashmap_get(hash, key);

  if (positions == NULL)
  {
    /* first occurrence of this k-mer: create and register its list */
    positions = gt_malloc(sizeof (*positions));
    GT_INITARRAY(positions, GtUword);
    gt_hashmap_add(hash, key, (void *) positions);
  }
  if (positions->allocatedGtUword != 0)
  {
    GT_STOREINARRAY(positions, GtUword, positions->allocatedGtUword * 0.1,
                    position);
  } else
  {
    GT_STOREINARRAY(positions, GtUword, (GtUword) 20, position);
  }
}
/*
  Compute the q-gram code (reading forward, special-filled via
  qgram2codefillspecial()) for every start position 0..stringtotallength
  inclusive of <encseq> and append each code to <codelist>.
*/
static void collectkmercode(GtArrayGtCodetype *codelist,
                            const GtEncseq *encseq,
                            unsigned int kmersize,
                            unsigned int numofchars,
                            GtUword stringtotallength)
{
  GtUword startpos = 0;

  /* note: the position stringtotallength itself is visited as well */
  while (startpos <= stringtotallength)
  {
    const GtCodetype current = qgram2codefillspecial(numofchars,
                                                     kmersize,
                                                     encseq,
                                                     GT_READMODE_FORWARD,
                                                     startpos,
                                                     stringtotallength);

    GT_STOREINARRAY(codelist,GtCodetype,1024,current);
    startpos++;
  }
}
/*
  Read a condenseq data structure from file.

  Loads the encoded sequence of unique parts via <indexname>, then reads the
  condenseq data from <indexname> with GT_CONDENSEQ_FILE_SUFFIX appended.
  Afterwards the per-unique link arrays are rebuilt: for every link entry its
  index is appended to the link array of the unique entry it refers to.

  Returns the new GtCondenseq (owned by the caller), or NULL if an error
  occurred, in which case <err> is set by the failing step. On success an
  informational message about ID access is written to <logger>.

  The encseq loader and the opened file are released on every path,
  including the error paths.
*/
GtCondenseq *gt_condenseq_new_from_file(const char *indexname,
                                        GtLogger *logger, GtError *err)
{
  int had_err = 0;
  FILE *fp;
  GtEncseqLoader *esl;
  GtEncseq *unique_es;
  GtCondenseq *condenseq = NULL;

  /* load unique_es; the loader is no longer needed once the load attempt is
     over, so release it regardless of the outcome */
  esl = gt_encseq_loader_new();
  unique_es = gt_encseq_loader_load(esl, indexname, err);
  gt_encseq_loader_delete(esl);
  if (!unique_es)
    had_err = -1;

  if (!had_err) {
    condenseq = condenseq_new_empty(gt_encseq_alphabet(unique_es));
    condenseq->filename = gt_cstr_dup(indexname);
    /* from here on unique_es belongs to condenseq */
    condenseq->unique_es = unique_es;

    fp = gt_fa_fopen_with_suffix(indexname, GT_CONDENSEQ_FILE_SUFFIX,
                                 "rb", err);
    if (fp == NULL) {
      had_err = -1;
    }
    else {
      had_err = condenseq_io(condenseq, fp, gt_io_error_fread, err);
      /* close the file whether or not reading succeeded */
      gt_fa_fclose(fp);
    }
  }

  if (!had_err) {
    GtUword i;
    gt_assert(condenseq->uniques);
    gt_assert(condenseq->links);

    /* create link array for each unique entry */
    for (i = 0; i < condenseq->udb_nelems; i++) {
      GT_INITARRAY(&(condenseq->uniques[i].links),uint32_t);
    }
    /* check for overflows: link indices are stored as uint32_t, so the
       number of links must not exceed the largest uint32_t value */
    if (condenseq->ldb_nelems > (GtUword) ((uint32_t) 0 - (uint32_t) 1)) {
      gt_error_set(err, "Overflow, to many link-elements. Can't be stored");
      had_err = -1;
    }
    /* iterate through link entries and store ids in corresponding unique
       entry array */
    for (i = 0; !had_err && (GtUword) i < condenseq->ldb_nelems; i++) {
      GtUword uid = condenseq->links[i].unique_id;
      gt_assert(uid < condenseq->udb_nelems);
      GT_STOREINARRAY(&(condenseq->uniques[uid].links),
                      uint32_t,
                      10,
                      (uint32_t) i);
    }
  }

  if (!had_err) {
    gt_assert(condenseq != NULL);
    if (condenseq->id_len != GT_UNDEF_UWORD)
      gt_logger_log(logger, "IDs const len: " GT_WU, condenseq->id_len);
    else
      gt_logger_log(logger, "using sdstab to access IDs");
  }
  if (had_err) {
    gt_condenseq_delete(condenseq);
    condenseq = NULL;
  }
  return (condenseq);
}
/*
  Compute an x-drop extension between <useq> and <vseq> for arbitrary
  match/mismatch/insertion/deletion distances (taken from
  res->arbitdistances).

  The function works on fronts: for a distance d (<currd>) and a diagonal k a
  front value (row plus direction bits) is stored via
  gt_xdrop_frontvalue_set(). Phase 0 stores the length of the common prefix
  returned by gt_seqabstract_lcp(); each later phase increases d by one and
  derives new front values from deletion, replacement and insertion edges.

  forward         - passed through to gt_seqabstract_lcp()
  xdropbest       - output: best score seen, its end coordinates
                    (ivalue, jvalue), and the distance (best_d) and diagonal
                    (best_k) at which it was reached
  res             - working storage; res->big_t and res->fronts are reset
                    (their nextfree counters are set to 0) and refilled
  useq, vseq      - the two sequences; both must be non-empty (asserted)
  xdropbelowscore - front values whose score falls below
                    big_t[currd - dback] - xdropbelowscore are set to
                    integermin, i.e. dropped
*/
void gt_evalxdroparbitscoresextend(bool forward,
                                   GtXdropbest *xdropbest,
                                   GtXdropresources *res,
                                   const GtSeqabstract *useq,
                                   const GtSeqabstract *vseq,
                                   GtXdropscore xdropbelowscore)
{
  const GtWord ulen = (GtWord) gt_seqabstract_length(useq),
               vlen = (GtWord) gt_seqabstract_length(vseq),
               end_k = (GtWord) ulen - vlen, /* diagonal of endpoint (ulen, vlen) */
               integermax = (GtWord) MAX(ulen, vlen),
               integermin = -integermax, /* serves as "minus infinity" row */
               dback = GT_XDROP_SETDBACK(xdropbelowscore);
  GtWord idx,
         lbound, /* diagonal lower bound */
         ubound, /* diagonal upper bound */
         currd = 0, /* distance */
         k; /* lbound - 1 <= k <= ubound + 1 */
  /* The following expression computes the maximal allowed number of
     consecutive generations in which all front values equal minus
     infinity. */
  const int allowedMININFINITYINTgenerations = MAX(MAX(res->arbitdistances.mis, res->arbitdistances.ins), res->arbitdistances.del) - 1;
  int currentMININFINITYINTgeneration = 0;
  GtXdropfrontvalue tmpfront;
  GtXdropscore bigt_tmp; /* best score T' seen already */
  bool alwaysMININFINITYINT = true;

  gt_assert(ulen != 0 && vlen != 0);
  /* discard whatever a previous call left in the working arrays */
  res->big_t.nextfreeGtXdropscore = 0;
  res->fronts.nextfreeGtXdropfrontvalue = 0;
  /* phase 0 */
  idx = (GtWord) gt_seqabstract_lcp(forward, useq, vseq,0,0);
  /* alignment already finished: the empty range lbound > ubound makes the
     main loop below terminate immediately */
  if (idx >= ulen || idx >= vlen) {
    lbound = 1L;
    ubound = -1L;
  } else {
    lbound = 0;
    ubound = 0;
  }
  tmpfront.row = (GtWord) idx;
  tmpfront.direction = (GtUchar) 0; /* no predecessor */
  gt_xdrop_frontvalue_set(res,0,0,tmpfront);
  xdropbest->score = bigt_tmp = GT_XDROP_EVAL(idx + idx, 0);
  gt_assert(idx >= 0);
  xdropbest->ivalue = xdropbest->jvalue = (GtUword) idx;
  xdropbest->best_d = currd;
  xdropbest->best_k = 0;
  GT_STOREINARRAY (&res->big_t, GtXdropscore, 10, bigt_tmp);
  /* phase d > 0 */
  while (lbound <= ubound) {
    currd++;
    /* calculate fronts */
    for (k = lbound - 1; k <= ubound + 1; k++) {
      GtWord i = integermin, row;
      /* NOTE(review): tmpfront.direction is not reset here, so the tests
         below may see the direction left over from the previous diagonal;
         confirm that this is intended. */
      /* case 1 : DELETION-EDGE */
      if (lbound < k && currd - res->arbitdistances.del >= 0 && -(currd - res->arbitdistances.del) <= k - 1 &&
          k - 1 <= currd - res->arbitdistances.del) {
        i = gt_xdrop_frontvalue_get(res, currd - res->arbitdistances.del, k-1) + 1;
        tmpfront.direction = GT_XDROP_DELETIONBIT;
      }
      /* case 2: REPLACEMENT-EDGE */
      if (lbound <= k && k <= ubound && currd - res->arbitdistances.mis >= 0 && -(currd - res->arbitdistances.mis) <= k && k <= currd - res->arbitdistances.mis) {
        row = gt_xdrop_frontvalue_get(res, currd - res->arbitdistances.mis, k) + 1;
        /* test, if case 1 has happened. */
        if (!(tmpfront.direction & GT_XDROP_DELETIONBIT) || row > i) {
          i = row;
          tmpfront.direction = GT_XDROP_REPLACEMENTBIT;
        }
      }
      /* case 3: INSERTION-EDGE */
      if (k < ubound && currd - res->arbitdistances.ins >= 0 && -(currd - res->arbitdistances.ins) <= k + 1 && k + 1 <= currd - res->arbitdistances.ins) {
        row = gt_xdrop_frontvalue_get(res, currd - res->arbitdistances.ins, k+1);
        if (!(tmpfront.direction & (GT_XDROP_DELETIONBIT | GT_XDROP_REPLACEMENTBIT)) || row > i) {
          i = row;
          tmpfront.direction = GT_XDROP_INSERTIONBIT;
        }
      }
      /* if i = minus infinity or minus infinity + 1 */
      if (i < 0) {
        if (tmpfront.direction == (GtUchar) 0)
          alwaysMININFINITYINT = false;
        tmpfront.row = integermin;
      } else {
        GtWord j = i - k;
        const GtWord previousd = currd - dback;
        /* alignment score smaller than T - X */
        if (previousd > 0 && res->big_t.spaceGtXdropscore != NULL && GT_XDROP_EVAL (i + j, currd) < res->big_t.spaceGtXdropscore[previousd] - xdropbelowscore) {
          tmpfront.row = integermin;
        } else {
          if (k <= -currd || k >= currd || (gt_xdrop_frontvalue_get(res, currd-1, k) < i && i <= MIN(ulen, vlen + k))) {
            /* slide along the diagonal over the common prefix, if both
               sequences still have symbols left */
            if (ulen > i && vlen > j) {
              GtUword lcp;
              gt_assert(forward || (ulen - 1 >= (GtWord) i && vlen - 1 >= (GtWord) j));
              lcp = gt_seqabstract_lcp(forward, useq, vseq,i,j);
              i += lcp;
              j += lcp;
            }
            alwaysMININFINITYINT = false;
            tmpfront.row = i;
            /* new best score: remember where it was reached */
            if (GT_XDROP_EVAL(i + j, currd) > bigt_tmp) {
              xdropbest->score = bigt_tmp = GT_XDROP_EVAL(i + j, currd);
              gt_assert(i >= 0 && j >= 0);
              xdropbest->ivalue = (GtUword) i;
              xdropbest->jvalue = (GtUword) j;
              xdropbest->best_d = currd;
              xdropbest->best_k = k;
            }
          } else {
            /* no improvement: carry over the row of the previous front */
            alwaysMININFINITYINT = false;
            tmpfront.row = gt_xdrop_frontvalue_get(res,currd-1,k);
          }
        }
      }
      gt_xdrop_frontvalue_set(res, currd, k, tmpfront);
    }
    /* if all front values are integermin, alignment prematurely finished
       if allowedMININFINITYINTgenerations exceeded
       (full front has already ended at
       currd - currentMININFINITYINTgeneration). */
    if (alwaysMININFINITYINT) {
      currentMININFINITYINTgeneration++;
      if (currentMININFINITYINTgeneration > allowedMININFINITYINTgenerations)
        break;
    } else {
      currentMININFINITYINTgeneration = 0;
      alwaysMININFINITYINT = true;
    }
    GT_STOREINARRAY (&res->big_t, GtXdropscore, 10, bigt_tmp);
    /* fill out of bounds values of integermin
       needed for gt_showfrontvalues function */
    for (k = -currd; k < lbound - 1; k++) {
      tmpfront.row = integermin;
      gt_xdrop_frontvalue_set(res,currd,k,tmpfront);
    }
    for (k = ubound + 2; k <= currd; k++) {
      tmpfront.row = integermin;
      gt_xdrop_frontvalue_set(res,currd,k,tmpfront);
    }
    /* alignment finished */
    if (-currd <= end_k && end_k <= currd && gt_xdrop_frontvalue_get(res,currd,end_k) == ulen)
      break;
    /* pruning lower bound
       lbound may decrease by one or increase/stays the same
       l <- min{k:R(d,k) > -inf} */
    for (k = lbound - 1; k <= ubound + 1; k++) {
      if (gt_xdrop_frontvalue_get(res,currd,k) > integermin) {
        lbound = k;
        break;
      }
    }
    /* pruning upper bound
       ubound may increase by one or decrease/stays the same
       u <- max{k:R(d,k) > -inf} */
    for (k = ubound + 1; k >= lbound - 1; k--) {
      if (gt_xdrop_frontvalue_get(res,currd,k) > integermin) {
        ubound = k;
        break;
      }
    }
    /* handling boundaries lower bound */
    for (k = 0; k >= lbound; k--) {
      if (gt_xdrop_frontvalue_get(res,currd,k) == vlen + k) {
        lbound = k;
        break;
      }
    }
    /* handling boundaries upper bound */
    for (k = 0; k <= ubound; k++) {
      if (gt_xdrop_frontvalue_get(res,currd,k) == ulen) {
        ubound = k;
        break;
      }
    }
  }
}
/*
  Deliver the next sequence of the iterator <si>.

  Symbols are read from the underlying sequence buffer until a SEPARATOR or
  the end of input is reached. On success *len receives the sequence length
  and *desc the next description from the description buffer. If the iterator
  was set up with sequence output, the symbols are collected in an internal
  buffer, which is '\0'-terminated and handed out through *sequence; the
  buffer is reused by the next call. Without sequence output the symbols are
  only counted.

  Returns 1 if a sequence was delivered, 0 if no sequence is left, and -1 on
  error (<err> is set either by the sequence buffer or, for an empty sequence
  when sequence output is enabled, by this function).
*/
static int gt_seq_iterator_sequence_buffer_next(GtSeqIterator *si,
                                                const GtUchar **sequence,
                                                unsigned long *len,
                                                char **desc,
                                                GtError *err)
{
  GtSeqIteratorSequenceBuffer *seqit;
  GtUchar charcode;
  int retval;
  bool haserr = false, foundseq = false;

  gt_assert(si);
  gt_assert(len && desc);
  seqit = gt_seq_iterator_sequence_buffer_cast(si);
  gt_assert((sequence && seqit->withsequence) || !seqit->withsequence);
  if (seqit->exhausted)
  {
    return 0;
  }
  while (true)
  {
    retval = gt_sequence_buffer_next(seqit->fb,&charcode,err);
    if (retval < 0)
    {
      haserr = true;
      break;
    }
    if (retval == 0)
    {
      seqit->exhausted = true;
      break;
    }
    if (seqit->currentread < seqit->maxread)
    {
      seqit->currentread++;
    }
    if (charcode == (GtUchar) SEPARATOR)
    {
      if (seqit->sequencebuffer.nextfreeGtUchar == 0 && seqit->withsequence)
      {
        gt_error_set(err,"sequence %llu is empty", seqit->unitnum);
        haserr = true;
        break;
      }
      *desc = (char*) gt_desc_buffer_get_next(seqit->descptr);
      *len = seqit->sequencebuffer.nextfreeGtUchar;
      if (seqit->withsequence)
      {
        /* make sure the outgoing sequence is '\0' terminated; the terminator
           is appended through GT_STOREINARRAY so that the buffer grows if
           the sequence fills it exactly (a plain write at index nextfree
           would then be one past the allocated space). *sequence is set
           afterwards, since growing may move the buffer. */
        GT_STOREINARRAY(&seqit->sequencebuffer, GtUchar, 1UL,
                        (GtUchar) '\0');
        *sequence = seqit->sequencebuffer.spaceGtUchar;
      }
      seqit->sequencebuffer.nextfreeGtUchar = 0;
      foundseq = true;
      seqit->unitnum++;
      break;
    }
    if (seqit->withsequence)
    {
      GT_STOREINARRAY(&seqit->sequencebuffer, GtUchar,
                      MAX(1024UL,
                          seqit->sequencebuffer.nextfreeGtUchar * 0.5),
                      charcode);
    } else
    {
      /* no sequence requested: only count the symbols */
      seqit->sequencebuffer.nextfreeGtUchar++;
    }
  }
  /* input ended without a trailing separator: deliver what was collected */
  if (!haserr && seqit->sequencebuffer.nextfreeGtUchar > 0)
  {
    *desc = (char*) gt_desc_buffer_get_next(seqit->descptr);
    /* take the length before the terminator is appended */
    *len = seqit->sequencebuffer.nextfreeGtUchar;
    if (seqit->withsequence)
    {
      /* '\0'-terminate with a capacity check, see above */
      GT_STOREINARRAY(&seqit->sequencebuffer, GtUchar, 1UL,
                      (GtUchar) '\0');
      *sequence = seqit->sequencebuffer.spaceGtUchar;
    }
    foundseq = true;
    seqit->sequencebuffer.nextfreeGtUchar = 0;
  }
  if (haserr)
  {
    return -1;
  }
  if (foundseq)
  {
    return 1;
  }
  return 0;
}
/*
  Evaluate the matches collected in <wlismatches>: sort them, run the filter
  algorithm and backtrace the best chain from the item it returns.

  The function works in exactly one of two modes (asserted):
  - <chain> != NULL and the three counters NULL: the original indexes of the
    chain members are appended to <chain> and the appended part is brought
    into forward order by reversing the array contents.
  - <chain> == NULL and the three counters != NULL: *chain_weighted_score is
    set to the score of the best chain end, and the distances and aligned
    lengths of the chain members are added to *sum_distance_chain and
    *sum_aligned_len_chain.

  Nothing happens if <wlismatches> holds no items.
*/
void gt_wlis_filter_evaluate(GtArrayGtUword *chain,
                             GtUword *sum_distance_chain,
                             GtUword *sum_aligned_len_chain,
                             GtUword *chain_weighted_score,
                             GtWLisFilterMatches *wlismatches)
{
  GtUword current, left, right;

  if (wlismatches->items.nextfreeGtWlisItem == 0)
  {
    return;
  }
  gt_assert((chain == NULL && sum_distance_chain != NULL &&
             sum_aligned_len_chain != NULL && chain_weighted_score != NULL) ||
            (chain != NULL && sum_distance_chain == NULL &&
             sum_aligned_len_chain == NULL && chain_weighted_score == NULL));

  /* sort by query sequence */
  qsort(wlismatches->items.spaceGtWlisItem,
        (size_t) wlismatches->items.nextfreeGtWlisItem,
        sizeof *wlismatches->items.spaceGtWlisItem,
        gt_alignment_link_compare);

  /* run the filter algorithm; it yields the last item of the best chain */
  current = gt_filter_apply(wlismatches);

  if (chain == NULL)
  {
    /* statistics mode: accumulate while walking the chain backwards */
    *chain_weighted_score = GT_WLIS_ACC(current).score;
    do
    {
      gt_assert(sum_distance_chain != NULL && sum_aligned_len_chain != NULL);
      *sum_distance_chain += GT_WLIS_ACC(current).oi_di.distance;
      *sum_aligned_len_chain +=
        gt_wlis_filter_aligned_len(wlismatches->items.spaceGtWlisItem +
                                   current);
      current = GT_WLIS_ACC(current).prev;
    } while (current != GT_WLIS_FILTER_UNDEF(wlismatches));
    return;
  }

  /* chain mode: collect the original indexes from the chain end backwards */
  do
  {
    GT_STOREINARRAY(chain,GtUword,chain->allocatedGtUword * 0.2 + 256,
                    GT_WLIS_ACC(current).oi_di.original_index);
    current = GT_WLIS_ACC(current).prev;
  } while (current != GT_WLIS_FILTER_UNDEF(wlismatches));

  /* invert the order; at least one element was stored above */
  left = 0;
  right = chain->nextfreeGtUword - 1;
  while (left < right)
  {
    const GtUword swapped = chain->spaceGtUword[left];

    chain->spaceGtUword[left] = chain->spaceGtUword[right];
    chain->spaceGtUword[right] = swapped;
    left++;
    right--;
  }
}