/* Self-match callback reporting suffix-prefix matches on stdout.
   <pos1> and <pos2> are absolute positions in the encoded sequence wrapped by
   <genericencseq>; they are first ordered such that pos1 <= pos2. A line
   "<a> <b> <matchlen>" is printed if the match starts at the first position
   of one sequence and ends at the last position of the other one; <a> is the
   number of the sequence whose suffix is matched, <b> the number of the
   sequence whose prefix is matched. <info> and <err> are not used.
   Always returns 0. */
static int gt_simplesuffixprefixmatchoutput(GT_UNUSED void *info,
                                            const GtGenericEncseq
                                              *genericencseq,
                                            GtUword matchlen,
                                            GtUword pos1,
                                            GtUword pos2,
                                            GT_UNUSED GtError *err)
{
  const GtEncseq *encseq;
  GtUword seqnum1, seqnum2, seqstartpos, relpos1, relpos2;

  if (pos1 > pos2)
  {
    const GtUword larger = pos1;

    pos1 = pos2;
    pos2 = larger;
  }
  gt_assert(genericencseq != NULL && genericencseq->hasencseq);
  encseq = genericencseq->seqptr.encseq;

  /* translate both absolute positions into (sequence number, offset) */
  seqnum1 = gt_encseq_seqnum(encseq, pos1);
  seqstartpos = gt_encseq_seqstartpos(encseq, seqnum1);
  gt_assert(seqstartpos <= pos1);
  relpos1 = pos1 - seqstartpos;

  seqnum2 = gt_encseq_seqnum(encseq, pos2);
  seqstartpos = gt_encseq_seqstartpos(encseq, seqnum2);
  gt_assert(seqstartpos <= pos2);
  relpos2 = pos2 - seqstartpos;

  if (relpos1 == 0)
  {
    /* prefix of sequence 1 versus suffix of sequence 2 */
    if (relpos2 + matchlen == gt_encseq_seqlength(encseq, seqnum2))
    {
      printf(""GT_WU" "GT_WU" "GT_WU"\n",seqnum2,seqnum1,matchlen);
    }
    return 0;
  }
  /* prefix of sequence 2 versus suffix of sequence 1 */
  if (relpos2 == 0 &&
      relpos1 + matchlen == gt_encseq_seqlength(encseq, seqnum1))
  {
    printf(""GT_WU" "GT_WU" "GT_WU"\n",seqnum1,seqnum2,matchlen);
  }
  return 0;
}
static void showmatch(void *processinfo,const GtIdxMatch *match) { Showmatchinfo *showmatchinfo = (Showmatchinfo *) processinfo; unsigned long seqnum; unsigned long relpos; if (match->dbabsolute) { unsigned long seqstartpos; seqnum = gt_encseq_seqnum(showmatchinfo->encseq, match->dbstartpos); seqstartpos = gt_encseq_seqstartpos(showmatchinfo->encseq, seqnum); gt_assert(seqstartpos <= match->dbstartpos); relpos = match->dbstartpos - seqstartpos; } else { relpos = match->dbstartpos; seqnum = match->dbseqnum; } printf("%lu\t%lu\t",seqnum,relpos); printf("%lu\t",match->dblen); printf("\t" Formatuint64_t "\t%lu\t%lu\t%lu\n", PRINTuint64_tcast(showmatchinfo->queryunit), match->querystartpos, match->querylen, match->distance); if (showmatchinfo->showalignment) { gt_alignment_show_with_mapped_chars( (const GtAlignment *) match->alignment, showmatchinfo->characters, showmatchinfo->wildcardshow, stdout); } }
/* Copy the encoded characters of the unique element <id> of <cs> that cover
   the original positions <frompos>..<topos> into <buffer>. <frompos> must lie
   inside the unique element; the range is clipped at the end of the element.
   Returns the number of characters extracted. */
static GtUword condenseq_unique_extract_encoded(const GtCondenseq *cs,
                                                GtUword id,
                                                GtUchar *buffer,
                                                GtUword frompos,
                                                GtUword topos)
{
  const GtCondenseqUnique unique = cs->uniques[id];
  GtUword startoffset, first, available, wanted, count;

  gt_assert(unique.orig_startpos <= frompos);
  startoffset = frompos - unique.orig_startpos;
  gt_assert(startoffset < unique.len);

  /* position of the first requested character inside cs->unique_es */
  first = gt_encseq_seqstartpos(cs->unique_es, id) + startoffset;
  available = unique.len - startoffset;
  wanted = topos - frompos + 1;
  count = (available < wanted) ? available : wanted;

  gt_encseq_extract_encoded(cs->unique_es, buffer, first, first + count - 1);
  return count;
}
/* Write sequence <seqnum> of <encseq> to stdout as one FASTA record: the
   separator character, the description (only if <encseq> has description
   support), a newline, the decoded sequence on a single line and a final
   newline. */
static void gt_seqorder_output(unsigned long seqnum, GtEncseq *encseq)
{
  const unsigned long startpos = gt_encseq_seqstartpos(encseq, seqnum);
  unsigned long remaining = gt_encseq_seqlength(encseq, seqnum);
  GtEncseqReader *esr;

  gt_xfputc(GT_FASTA_SEPARATOR, stdout);
  if (gt_encseq_has_description_support(encseq))
  {
    unsigned long desclen = 0;
    const char *desc = gt_encseq_description(encseq, &desclen, seqnum);

    gt_xfwrite(desc, (size_t)1, (size_t)desclen, stdout);
  }
  gt_xfputc('\n', stdout);

  esr = gt_encseq_create_reader_with_readmode(encseq, GT_READMODE_FORWARD,
                                              startpos);
  for (; remaining > 0; remaining--)
  {
    gt_xfputc(gt_encseq_reader_next_decoded_char(esr), stdout);
  }
  gt_encseq_reader_delete(esr);
  gt_xfputc('\n', stdout);
}
/* Self-match callback for exact matches of length <len> at the absolute
   positions <pos1> and <pos2> of <encseq>. The smaller position is used as
   database position, the larger one is converted into a position relative to
   its sequence and used as query position. <info> must point to a
   GtQuerymatch, which is filled and then handed to gt_querymatch_output();
   the return value of that call is returned. */
static int gt_simpleexactselfmatchoutput(void *info,
                                         const GtEncseq *encseq,
                                         unsigned long len,
                                         unsigned long pos1,
                                         unsigned long pos2,
                                         GT_UNUSED GtError *err)
{
  GtQuerymatch *querymatch = (GtQuerymatch *) info;
  unsigned long queryseqnum, seqstartpos, seqlength;

  if (pos2 < pos1)
  {
    const unsigned long smaller = pos2;

    pos2 = pos1;
    pos1 = smaller;
  }
  queryseqnum = gt_encseq_seqnum(encseq,pos2);
  seqstartpos = gt_encseq_seqstartpos(encseq, queryseqnum);
  seqlength = gt_encseq_seqlength(encseq, queryseqnum);
  gt_assert(pos2 >= seqstartpos);

  gt_querymatch_fill(querymatch,
                     len,                     /* database match length */
                     pos1,                    /* database start */
                     GT_READMODE_FORWARD,
                     false,
                     0,
                     0,
                     true,
                     (uint64_t) queryseqnum,
                     len,                     /* query match length */
                     pos2 - seqstartpos);     /* query start in sequence */
  return gt_querymatch_output(info, encseq, querymatch, NULL, seqlength, err);
}
/* Reconstruct the encoded characters of link <id> of <cs> covering the
   original positions <frompos>..<topos> and store them in <buffer>. The
   characters are produced by applying the editscript of the link to the
   referenced region of cs->unique_es. <frompos> must lie inside the link;
   the range is clipped at the end of the link. Returns the number of
   characters written. */
static GtUword condenseq_link_extract_encoded(const GtCondenseq *cs,
                                              GtUword id,
                                              GtUchar *buffer,
                                              GtUword frompos,
                                              GtUword topos)
{
  const GtCondenseqLink link = cs->links[id];
  GtUword unique_startpos, startoffset, targetlength, endpos, written;

  gt_assert(link.orig_startpos <= frompos);
  unique_startpos = gt_encseq_seqstartpos(cs->unique_es, link.unique_id);
  startoffset = frompos - link.orig_startpos;
  gt_assert(startoffset < link.len);

  /* startoffset and endpos are offsets relative to the start of the link */
  targetlength = topos - frompos + 1;
  if (link.len - startoffset < targetlength)
  {
    endpos = link.len - 1;
  } else
  {
    endpos = startoffset + targetlength - 1;
  }
  written = gt_editscript_get_sub_sequence_v(link.editscript,
                                             cs->unique_es,
                                             unique_startpos +
                                               link.unique_offset,
                                             GT_READMODE_FORWARD,
                                             startoffset,
                                             endpos,
                                             buffer);
  gt_assert(written == endpos - startoffset + 1);
  return written;
}
/* Build the table of sequence separator positions of <orig_es> for
   <condenseq>. Returns NULL if there is at most one original sequence,
   otherwise a new GtIntset containing, for every sequence except the first,
   the position directly before its start position. */
static GtIntset *condenseq_fill_tab(GtCondenseq *condenseq,
                                    const GtEncseq *orig_es)
{
  GtIntset *ssptab;
  GtUword max, idx;

  if (condenseq->orig_num_seq <= (GtUword) 1)
  {
    return NULL;
  }
  max = gt_encseq_seqstartpos(orig_es, condenseq->orig_num_seq - 1);
  /* only the internal separators are stored, the end is explicit */
  ssptab = gt_intset_best_new(max - 1, condenseq->orig_num_seq - 1);
  idx = (GtUword) 1;
  while (idx < condenseq->orig_num_seq)
  {
    GtUword pos = gt_encseq_seqstartpos(orig_es, idx) - 1;

    gt_assert(pos != 0);
    gt_intset_add(ssptab, pos);
    ++idx;
  }
  return ssptab;
}
/* Map the suffix array <indexname> and use every sequence of its encoded
   sequence with a length of at least <userdefinedleastlength> as a query
   (read in <queryreadmode>, which must not be GT_READMODE_FORWARD) against
   the index itself. Each match is passed to <processquerymatch> together
   with <processquerymatchinfo>. Returns 0 on success and -1 if mapping the
   index or the matching of some sequence fails. */
int gt_callenumselfmatches(const char *indexname,
                           GtReadmode queryreadmode,
                           unsigned int userdefinedleastlength,
                           GtProcessquerymatch processquerymatch,
                           void *processquerymatchinfo,
                           GtLogger *logger,
                           GtError *err)
{
  Suffixarray suffixarray;
  bool haserr = false;

  gt_assert(queryreadmode != GT_READMODE_FORWARD);
  if (gt_mapsuffixarray(&suffixarray,
                        SARR_ESQTAB | SARR_SUFTAB | SARR_SSPTAB,
                        indexname,
                        logger,
                        err) == 0)
  {
    const unsigned long leastlength = (unsigned long) userdefinedleastlength;
    unsigned long seqnum = 0, numofsequences;
    GtQuerymatch *querymatchspaceptr = gt_querymatch_new();
    GtQueryrep queryrep;

    numofsequences = gt_encseq_num_of_sequences(suffixarray.encseq);
    queryrep.sequence = NULL;
    queryrep.reversecopy = false;
    queryrep.encseq = suffixarray.encseq;
    queryrep.readmode = queryreadmode;
    while (!haserr && seqnum < numofsequences)
    {
      unsigned long seqstartpos, seqlength;

      seqstartpos = gt_encseq_seqstartpos(suffixarray.encseq, seqnum);
      seqlength = gt_encseq_seqlength(suffixarray.encseq, seqnum);
      /* sequences shorter than the least length cannot contain a match */
      if (seqlength >= leastlength)
      {
        queryrep.startpos = seqstartpos;
        queryrep.length = seqlength;
        if (gt_querysubstringmatch(true,
                                   &suffixarray,
                                   (uint64_t) seqnum,
                                   &queryrep,
                                   leastlength,
                                   processquerymatch,
                                   processquerymatchinfo,
                                   querymatchspaceptr,
                                   err) != 0)
        {
          haserr = true;
        }
      }
      seqnum++;
    }
    gt_querymatch_delete(querymatchspaceptr);
  } else
  {
    haserr = true;
  }
  /* called on the failure path as well */
  gt_freesuffixarray(&suffixarray);
  return haserr ? -1 : 0;
}
/* Self-match callback reporting suffix-prefix matches on stdout.
   <pos1> and <pos2> are absolute positions in <encseq>; they are first
   ordered such that pos1 <= pos2. A line "<a> <b> <matchlen>" is printed if
   the match starts at the first position of one sequence and ends at the last
   position of the other one; <a> is the number of the sequence whose suffix
   is matched, <b> the number of the sequence whose prefix is matched.
   <info> and <err> are not used. Always returns 0. */
static int gt_simplesuffixprefixmatchoutput(GT_UNUSED void *info,
                                            const GtEncseq *encseq,
                                            unsigned long matchlen,
                                            unsigned long pos1,
                                            unsigned long pos2,
                                            GT_UNUSED GtError *err)
{
  unsigned long seqnum1, seqnum2, seqstartpos, relpos1, relpos2;

  if (pos2 < pos1)
  {
    const unsigned long smaller = pos2;

    pos2 = pos1;
    pos1 = smaller;
  }
  /* translate both absolute positions into (sequence number, offset) */
  seqnum1 = gt_encseq_seqnum(encseq,pos1);
  seqstartpos = gt_encseq_seqstartpos(encseq, seqnum1);
  gt_assert(seqstartpos <= pos1);
  relpos1 = pos1 - seqstartpos;

  seqnum2 = gt_encseq_seqnum(encseq,pos2);
  seqstartpos = gt_encseq_seqstartpos(encseq, seqnum2);
  gt_assert(seqstartpos <= pos2);
  relpos2 = pos2 - seqstartpos;

  if (relpos1 != 0)
  {
    /* only a prefix of sequence 2 can match a suffix of sequence 1 */
    if (relpos2 == 0 &&
        relpos1 + matchlen == gt_encseq_seqlength(encseq,seqnum1))
    {
      printf("%lu %lu %lu\n",seqnum1,seqnum2,matchlen);
    }
  } else
  {
    /* prefix of sequence 1 versus suffix of sequence 2 */
    if (relpos2 + matchlen == gt_encseq_seqlength(encseq,seqnum2))
    {
      printf("%lu %lu %lu\n",seqnum2,seqnum1,matchlen);
    }
  }
  return 0;
}
/* Return the encoded character at offset <position> of sequence number
   <index> in <bs>. <index> must be smaller than the number of sequences. */
GtUchar gt_bioseq_get_encoded_char(const GtBioseq *bs, GtUword index,
                                   GtUword position)
{
  gt_assert(bs);
  gt_assert(index < gt_encseq_num_of_sequences(bs->encseq));
  /* the encseq is addressed by absolute positions */
  return gt_encseq_get_encoded_char(bs->encseq,
                                    gt_encseq_seqstartpos(bs->encseq, index)
                                      + position,
                                    GT_READMODE_FORWARD);
}
/* Store the encoded characters at the offsets <start>..<end> (both
   inclusive) of sequence number <idx> of <bs> in <out>. Requires
   <start> <= <end> and <idx> smaller than the number of sequences. */
void gt_bioseq_get_encoded_sequence_range(const GtBioseq *bs, GtUchar *out,
                                          GtUword idx, GtUword start,
                                          GtUword end)
{
  GtUword seqoffset;

  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq) && end >= start);
  seqoffset = gt_encseq_seqstartpos(bs->encseq, idx);
  gt_encseq_extract_encoded(bs->encseq, out, seqoffset + start,
                            seqoffset + end);
}
/* Store all encoded characters of sequence number <idx> of <bs> in <out>.
   <idx> must be smaller than the number of sequences. */
void gt_bioseq_get_encoded_sequence(const GtBioseq *bs, GtUchar *out,
                                    GtUword idx)
{
  GtUword first, last;

  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));
  first = gt_encseq_seqstartpos(bs->encseq, idx);
  last = first + gt_encseq_seqlength(bs->encseq, idx) - 1;
  gt_encseq_extract_encoded(bs->encseq, out, first, last);
}
/* Lua C function. Argument 1 is obtained via check_encseq(), argument 2 is a
   number interpreted as sequence number, which must be smaller than the
   number of sequences (otherwise an argument error is raised). Pushes the
   start position of that sequence and returns 1 (one result). */
static int encseq_lua_seqstartpos(lua_State *L)
{
  GtEncseq **encseq = check_encseq(L, 1);
  GtUword seqnum = luaL_checknumber(L, 2);

  luaL_argcheck(L, seqnum < gt_encseq_num_of_sequences(*encseq), 2,
                "cannot exceed number of sequences");
  lua_pushnumber(L, gt_encseq_seqstartpos(*encseq, seqnum));
  return 1;
}
/* Return a newly allocated buffer with the decoded characters at the offsets
   <start>..<end> (both inclusive) of sequence number <idx> of <bs>. The
   buffer has exactly end - start + 1 bytes; no space for a terminating
   '\0' is allocated. */
char* gt_bioseq_get_sequence_range(const GtBioseq *bs, GtUword idx,
                                   GtUword start, GtUword end)
{
  GtUword seqoffset;
  char *decoded;

  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq) && end >= start);
  decoded = gt_malloc((end - start + 1) * sizeof (char));
  seqoffset = gt_encseq_seqstartpos(bs->encseq, idx);
  gt_encseq_extract_decoded(bs->encseq, decoded, seqoffset + start,
                            seqoffset + end);
  return decoded;
}
/* Return a newly allocated buffer with all decoded characters of sequence
   number <idx> of <bs>. The buffer has exactly as many bytes as the sequence
   is long; no space for a terminating '\0' is allocated. */
char* gt_bioseq_get_sequence(const GtBioseq *bs, GtUword idx)
{
  GtUword seqlength, first;
  char *decoded;

  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));
  seqlength = gt_encseq_seqlength(bs->encseq, idx);
  decoded = gt_calloc(seqlength, sizeof (char));
  first = gt_encseq_seqstartpos(bs->encseq, idx);
  gt_encseq_extract_decoded(bs->encseq, decoded, first,
                            first + seqlength - 1);
  return decoded;
}
/* Return true iff sequence number <idx> of <bioseq> contains at least one
   wildcard position. The scan stops at the first wildcard found. */
bool gt_bioseq_seq_has_wildcards(const GtBioseq* bioseq, GtUword idx)
{
  const GtUword length = gt_encseq_seqlength(bioseq->encseq, idx);
  const GtUword seqstart = gt_encseq_seqstartpos(bioseq->encseq, idx);
  GtUword offset;

  for (offset = 0; offset < length; ++offset) {
    if (gt_encseq_position_is_wildcard(bioseq->encseq, seqstart + offset,
                                       GT_READMODE_FORWARD)) {
      return true;
    }
  }
  return false;
}
/* Store the start position of every sequence of <encseq> in
   <suffixsortspace> (entry i holds the start of sequence i) and sort these
   entries with gt_sortallsuffixesfromstart() in forward read mode, using the
   default suffix sorting strategy. */
static void gt_seqorder_sort(GtSuffixsortspace *suffixsortspace,
                             GtEncseq *encseq)
{
  Sfxstrategy sfxstrategy;
  unsigned long seqnum, numofseqs;

  defaultsfxstrategy(&sfxstrategy, false);
  numofseqs = gt_encseq_num_of_sequences(encseq);
  seqnum = 0;
  while (seqnum < numofseqs)
  {
    gt_suffixsortspace_setdirect(suffixsortspace, seqnum,
                                 gt_encseq_seqstartpos(encseq, seqnum));
    seqnum++;
  }
  gt_sortallsuffixesfromstart(suffixsortspace,
                              numofseqs,
                              encseq,
                              GT_READMODE_FORWARD,
                              NULL,
                              0,
                              &sfxstrategy,
                              NULL,
                              NULL,
                              NULL);
}
/* Find the first sequence of <esc> whose description is matched by the
   pattern <seqid> (via gt_grep()). On success 0 is returned, <*filenum> is
   set to the number of the file containing the sequence and <*seqnum> to the
   number of the sequence relative to the first sequence of that file.
   Results are stored in (and first looked up from) esc->grep_cache, which is
   created on demand. If gt_grep() fails or no description matches, a
   non-zero value is returned; in the latter case <err> is set. */
static int gt_encseq_col_do_grep_desc(GtEncseqCol *esc, GtUword *filenum,
                                      GtUword *seqnum, GtStr *seqid,
                                      GtError *err)
{
  const GtSeqInfo *cached;
  bool match = false;
  int had_err = 0;
  GtUword j;

  gt_error_check(err);
  gt_assert(esc && filenum && seqnum && seqid);

  /* create the cache lazily */
  if (!esc->grep_cache)
    esc->grep_cache = gt_seq_info_cache_new();

  /* answer from the cache if possible */
  cached = gt_seq_info_cache_get(esc->grep_cache, gt_str_get(seqid));
  if (cached) {
    *filenum = cached->filenum;
    *seqnum = cached->seqnum;
    return 0;
  }

  for (j = 0; !had_err && j < gt_encseq_num_of_sequences(esc->encseq); j++) {
    GtUword desc_len;
    const char *desc = gt_encseq_description(esc->encseq, &desc_len, j);
    /* copy the description into a zero-filled buffer one byte larger, so
       that gt_grep() gets a '\0'-terminated string */
    char *buf = gt_calloc(desc_len + 1, sizeof (char));

    memcpy(buf, desc, desc_len * sizeof (char));
    had_err = gt_grep(&match, gt_str_get(seqid), buf, err);
    gt_free(buf);
    if (had_err || !match)
      continue;
    {
      GtSeqInfo seq_info;

      seq_info.filenum = gt_encseq_filenum(esc->encseq,
                                           gt_encseq_seqstartpos(esc->encseq,
                                                                 j));
      *filenum = seq_info.filenum;
      seq_info.seqnum = j - gt_encseq_filenum_first_seqnum(esc->encseq,
                                                           *filenum);
      *seqnum = seq_info.seqnum;
      gt_seq_info_cache_add(esc->grep_cache, gt_str_get(seqid), &seq_info);
      break;
    }
  }
  if (!had_err && !match) {
    gt_error_set(err, "no description matched sequence ID '%s'",
                 gt_str_get(seqid));
    had_err = -1;
  }
  return had_err;
}
/* Write sequence <seqnum> of <encseq> (or a part of it) in FASTA format to
   <fpout>, with lines of the sequence formatted according to <linewidth> by
   gt_encseq2symbolstring().
   If <fastakeyquery> is not NULL and not COMPLETE, the header line is
   ">" followed by the key, the from and the to position of the query and the
   sequence description, and only the queried range is written; otherwise the
   header is ">" plus the description and the whole sequence is written.
   The complete record, including the key/range part of the header, goes to
   <fpout>. <err> is not used; the function always returns 0. */
static int giextract_encodedseq2fasta(FILE *fpout,
                                      const GtEncseq *encseq,
                                      unsigned long seqnum,
                                      const Fastakeyquery *fastakeyquery,
                                      unsigned long linewidth,
                                      GT_UNUSED GtError *err)
{
  const char *desc;
  unsigned long desclen, seqstartpos, seqlength, frompos, outlen;
  const bool partial = fastakeyquery != NULL && !COMPLETE(fastakeyquery);

  desc = gt_encseq_description(encseq, &desclen, seqnum);
  gt_xfputc('>',fpout);
  if (partial)
  {
    /* must be written to fpout like the rest of the header; writing it to
       stdout would tear the header apart if fpout is a different stream */
    fprintf(fpout,"%s %lu %lu ",fastakeyquery->fastakey,
                                fastakeyquery->frompos,
                                fastakeyquery->topos);
  }
  gt_xfwrite(desc,sizeof *desc,(size_t) desclen,fpout);
  gt_xfputc('\n',fpout);

  seqstartpos = gt_encseq_seqstartpos(encseq, seqnum);
  seqlength = gt_encseq_seqlength(encseq, seqnum);
  if (partial)
  {
    /* NOTE(review): the query positions look 1-based and inclusive (hence
       the -1 and +1) - confirm against the code filling Fastakeyquery */
    frompos = fastakeyquery->frompos-1;
    outlen = fastakeyquery->topos - fastakeyquery->frompos + 1;
  } else
  {
    frompos = 0;
    outlen = seqlength;
  }
  gt_encseq2symbolstring(fpout,
                         encseq,
                         GT_READMODE_FORWARD,
                         seqstartpos + frompos,
                         outlen,
                         linewidth);
  return 0;
}
/* Return a newly allocated, zero-initialised buffer of end - start + 1 bytes
   filled with the decoded characters at the offsets <start>..<end> (both
   inclusive) of the <seqnum>-th sequence of file <filenum> in <sc>.
   No space for a terminating '\0' is allocated. */
static char* gt_encseq_col_get_sequence(const GtSeqCol *sc, GtUword filenum,
                                        GtUword seqnum, GtUword start,
                                        GtUword end)
{
  GtEncseqCol *esc = gt_encseq_col_cast(sc);
  GtUword encseq_seqnum, seqoffset;
  char *decoded;

  gt_assert(esc && filenum < gt_encseq_num_of_files(esc->encseq));
  /* translate the file-relative sequence number into a global one */
  encseq_seqnum = gt_encseq_filenum_first_seqnum(esc->encseq, filenum)
                  + seqnum;
  gt_assert(encseq_seqnum < gt_encseq_num_of_sequences(esc->encseq));
  gt_assert(start <= end);
  seqoffset = gt_encseq_seqstartpos(esc->encseq, encseq_seqnum);
  decoded = gt_calloc(end - start + 1, sizeof (char));
  gt_encseq_extract_decoded(esc->encseq, decoded, seqoffset + start,
                            seqoffset + end);
  return decoded;
}
/* Consistency check (by assertions only) for a reverse match; does nothing
   unless <readmode> is GT_READMODE_REVERSE. <pos1> is an absolute position,
   <pos2> is relative to the start of sequence <seqnum2>. It is asserted that
   the <len> characters starting at <pos1> are not special and equal the
   reversed <len> characters starting at <pos2>, and that the match cannot be
   extended: the character following the first instance and the character
   preceding the second instance differ or are special (the ends of the
   encoded sequence count as SEPARATOR). */
static void verifymatch(const GtEncseq *encseq,
                        GtUword len,
                        GtUword pos1,
                        uint64_t seqnum2,
                        GtUword pos2,
                        GtReadmode readmode)
{
  GtUword offset, totallength;
  GtUchar cc1, cc2;

  if (readmode != GT_READMODE_REVERSE)
  {
    return;
  }
  totallength = gt_encseq_total_length(encseq);
  /* make pos2 absolute */
  pos2 += gt_encseq_seqstartpos(encseq, seqnum2);
  for (offset = 0; offset < len; offset++)
  {
    gt_assert(pos1 + len - 1 < totallength);
    gt_assert(pos2 + len - 1 < totallength);
    cc1 = gt_encseq_get_encoded_char(encseq,pos1+offset,GT_READMODE_FORWARD);
    cc2 = gt_encseq_get_encoded_char(encseq,pos2+len-1-offset,
                                     GT_READMODE_FORWARD);
    gt_assert(cc1 == cc2 && ISNOTSPECIAL(cc1));
  }
  /* character to the right of the first instance */
  cc1 = SEPARATOR;
  if (pos1 + len < totallength)
  {
    cc1 = gt_encseq_get_encoded_char(encseq,pos1+len,GT_READMODE_FORWARD);
  }
  /* character to the left of the second instance */
  cc2 = SEPARATOR;
  if (pos2 > 0)
  {
    cc2 = gt_encseq_get_encoded_char(encseq,pos2-1,GT_READMODE_FORWARD);
  }
  gt_assert(cc1 != cc2 || ISSPECIAL(cc1));
}
/* Prepare state->sspbittab and determine the length of the shortest sequence.
   The bit table gets totallength + 1 bits; the bit at the position directly
   before the start of every sequence except the first one is set, as well as
   the bit at position <totallength>. state->shortest receives the smallest
   distance between two consecutive marked positions, i.e. the length of the
   shortest sequence. Reads state->encseq and state->nofsequences. */
static void prepare_sspbittab_and_shortest(unsigned long totallength,
                                           ContfindBUstate *state)
{
  unsigned long length, lastseqstart = 0, i, ssp;

  GT_INITBITTAB(state->sspbittab, totallength + 1);
  state->shortest = totallength;
  /* written as i < nofsequences: the equivalent i <= nofsequences - 1 wraps
     around in unsigned arithmetic if nofsequences is 0 */
  for (i = 1UL; i < state->nofsequences; i++)
  {
    /* separator position directly before the start of sequence i */
    ssp = gt_encseq_seqstartpos(state->encseq, i) - 1;
    GT_SETIBIT(state->sspbittab, ssp);
    length = ssp - lastseqstart;
    lastseqstart = ssp + 1;
    if (length < state->shortest)
    {
      state->shortest = length;
    }
  }
  /* the last sequence ends at totallength */
  GT_SETIBIT(state->sspbittab, totallength);
  length = totallength - lastseqstart;
  if (length < state->shortest)
  {
    state->shortest = length;
  }
}
/* Look up the sequence identified by the MD5 sequence id <md5_seqid> (which
   must carry the MD5 prefix) in <sc> and store a newly allocated,
   zero-initialised buffer of end - start + 1 bytes with the decoded
   characters at the offsets <start>..<end> (both inclusive) in <*seq>.
   Returns 0 on success. Returns -1 and sets <err> if the id is at least
   GT_MD5_SEQID_TOTAL_LEN characters long but has no separator at the
   expected position, or if no sequence with the given hash is known. In the
   error case <*seq> is not touched.
   The hash buffer is always initialised before it is used for the lookup,
   also for ids shorter than GT_MD5_SEQID_TOTAL_LEN, and no lookup is done
   after a malformed id has been detected. */
static int gt_encseq_col_md5_to_seq(GtSeqCol *sc, char **seq,
                                    GtUword start, GtUword end,
                                    GtStr *md5_seqid, GtError *err)
{
  GtUword seqnum, startpos;
  char seqid[GT_MD5_SEQID_HASH_LEN + 1] = { '\0' };
  const char *cstrseqid;
  GtEncseqCol *esc;

  esc = gt_encseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(esc && seq && start <= end && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  cstrseqid = gt_str_get(md5_seqid);
  if (gt_str_length(md5_seqid) >= GT_MD5_SEQID_TOTAL_LEN &&
      cstrseqid[GT_MD5_SEQID_TOTAL_LEN-1] != GT_MD5_SEQID_SEPARATOR) {
    gt_error_set(err, "MD5 sequence id %s not terminated with '%c'",
                 gt_str_get(md5_seqid), GT_MD5_SEQID_SEPARATOR);
    return -1;
  }
  /* Extract the hash following the prefix. strncpy() stops at the end of a
     short id, so at most GT_MD5_SEQID_HASH_LEN characters are copied; the
     length check keeps the source pointer inside the string even if the
     prefix assertion above is compiled out. */
  if (gt_str_length(md5_seqid) >= GT_MD5_SEQID_PREFIX_LEN) {
    strncpy(seqid, cstrseqid + GT_MD5_SEQID_PREFIX_LEN,
            GT_MD5_SEQID_HASH_LEN);
    seqid[GT_MD5_SEQID_HASH_LEN] = '\0';
  }

  seqnum = gt_md5_tab_map(esc->md5_tab, seqid);
  if (seqnum == GT_UNDEF_UWORD) {
    gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
    return -1;
  }
  startpos = gt_encseq_seqstartpos(esc->encseq, seqnum);
  *seq = gt_calloc(end - start + 1, sizeof (char));
  gt_encseq_extract_decoded(esc->encseq, *seq, startpos + start,
                            startpos + end);
  return 0;
}
/* Call applysmithwaterman() for <query> of length <querylen> against every
   sequence of <encseq>, one after the other. For each sequence its number,
   its start position and the position directly behind its last character
   are passed. */
void gt_multiapplysmithwaterman(SWdpresource *dpresource,
                                const GtEncseq *encseq,
                                const GtUchar *query,
                                GtUword querylen)
{
  const GtUword numofdbsequences = gt_encseq_num_of_sequences(encseq);
  GtUword seqnum = 0;

  while (seqnum < numofdbsequences)
  {
    const GtUword seqstartpos = gt_encseq_seqstartpos(encseq, seqnum);
    const GtUword seqlength = gt_encseq_seqlength(encseq, seqnum);

    applysmithwaterman(dpresource,
                       encseq,
                       seqnum,
                       seqstartpos,
                       seqstartpos + seqlength,
                       query,
                       querylen);
    seqnum++;
  }
}
/* Self-match callback for exact matches of length <len> at the absolute
   positions <pos1> and <pos2> of the encoded sequence wrapped by
   <genericencseq> (which must contain an encseq). The smaller position is
   used as database position, the larger one is converted into a position
   relative to its sequence and used as query position. <info> must point to
   a GtQuerymatch, which is filled and then handed to gt_querymatch_output();
   the return value of that call is returned. */
static int gt_simpleexactselfmatchoutput(void *info,
                                         const GtGenericEncseq *genericencseq,
                                         GtUword len,
                                         GtUword pos1,
                                         GtUword pos2,
                                         GT_UNUSED GtError *err)
{
  GtQuerymatch *querymatch = (GtQuerymatch *) info;
  const GtEncseq *encseq;
  GtUword queryseqnum, seqstartpos, seqlength;

  if (pos2 < pos1)
  {
    const GtUword smaller = pos2;

    pos2 = pos1;
    pos1 = smaller;
  }
  gt_assert(genericencseq != NULL && genericencseq->hasencseq);
  encseq = genericencseq->seqptr.encseq;

  queryseqnum = gt_encseq_seqnum(encseq,pos2);
  seqstartpos = gt_encseq_seqstartpos(encseq, queryseqnum);
  seqlength = gt_encseq_seqlength(encseq, queryseqnum);
  gt_assert(pos2 >= seqstartpos);

  gt_querymatch_fill(querymatch,
                     len,                     /* database match length */
                     pos1,                    /* database start */
                     GT_READMODE_FORWARD,
                     false,
                     0,
                     0,
                     true,
                     (uint64_t) queryseqnum,
                     len,                     /* query match length */
                     pos2 - seqstartpos);     /* query start in sequence */
  return gt_querymatch_output(info, encseq, querymatch, NULL, seqlength, err);
}
/* Decode <encseq> to stdout according to <args>.
   Mode "fasta": writes one FASTA record per selected sequence. The selection
   is a single sequence (args->seq), a range of sequence numbers
   (args->seqrng, both ends inclusive) or, if neither is defined, all
   sequences. If the read mode args->rm is a reverse direction, the i-th
   record is taken from the i-th sequence counted from the end. Without
   description support the header is "sequence <i>".
   Mode "concat": writes the positions args->rng (or everything, if no range
   is defined) as one line, replacing SEPARATOR characters by the first
   character of args->sepchar.
   <filename> is only used for the warning about missing description support.
   Returns 0 on success and -1 (with <err> set) if a requested sequence number
   or range end is out of bounds. */
static int output_sequence(GtEncseq *encseq, GtEncseqDecodeArguments *args,
                           const char *filename, GtError *err)
{
  const char *mode;
  GtUword i, j, sfrom, sto;
  int had_err = 0;
  bool has_desc;
  GtEncseqReader *esr;

  gt_assert(encseq);
  has_desc = gt_encseq_has_description_support(encseq);
  if (!has_desc)
    gt_warning("Missing description support for file %s", filename);
  mode = gt_str_get(args->mode);

  if (strcmp(mode, "fasta") == 0) {
    const GtUword numofseqs = gt_encseq_num_of_sequences(encseq);
    const bool reversed = GT_ISDIRREVERSE(args->rm) ? true : false;

    if (args->seq != GT_UNDEF_UWORD) {
      /* a single sequence was requested */
      if (args->seq >= numofseqs) {
        gt_error_set(err,
                     "requested sequence "GT_WU" exceeds number of sequences "
                     "("GT_WU")", args->seq, numofseqs);
        return -1;
      }
      sfrom = args->seq;
      sto = args->seq + 1;
    } else if (args->seqrng.start != GT_UNDEF_UWORD
                 && args->seqrng.end != GT_UNDEF_UWORD) {
      /* a range of sequences was requested */
      if (args->seqrng.start >= numofseqs || args->seqrng.end >= numofseqs) {
        gt_error_set(err,
                     "range "GT_WU"-"GT_WU" includes a sequence number "
                     "exceeding the total number of sequences ("GT_WU")",
                     args->seqrng.start, args->seqrng.end, numofseqs);
        return -1;
      }
      sfrom = args->seqrng.start;
      sto = args->seqrng.end + 1;
    } else {
      /* all sequences */
      sfrom = 0;
      sto = numofseqs;
    }

    for (i = sfrom; i < sto; i++) {
      /* XXX: maybe make this distinction in the functions via readmode? */
      const GtUword srcseq = reversed ? numofseqs - 1 - i : i;
      GtUword desclen, startpos, len;
      char buf[BUFSIZ];
      const char *desc = NULL;

      len = gt_encseq_seqlength(encseq, srcseq);
      startpos = gt_encseq_seqstartpos(encseq, srcseq);
      if (reversed) {
        /* mirror the start position for the reversed coordinate system */
        startpos = gt_encseq_total_length(encseq) - (startpos + len);
      }
      if (has_desc) {
        desc = gt_encseq_description(encseq, &desclen, srcseq);
      } else {
        (void) snprintf(buf, BUFSIZ, "sequence "GT_WU"", i);
        desclen = strlen(buf);
        desc = buf;
      }
      gt_assert(desc);

      /* output description */
      gt_xfputc(GT_FASTA_SEPARATOR, stdout);
      gt_xfwrite(desc, 1, desclen, stdout);
      gt_xfputc('\n', stdout);

      /* XXX: make this more efficient by writing in a buffer first and then
         showing the result */
      if (args->singlechars) {
        for (j = 0; j < len; j++) {
          gt_xfputc(gt_encseq_get_decoded_char(encseq, startpos + j,
                                               args->rm),
                    stdout);
        }
      } else {
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm,
                                                    startpos);
        for (j = 0; j < len; j++) {
          gt_xfputc(gt_encseq_reader_next_decoded_char(esr), stdout);
        }
        gt_encseq_reader_delete(esr);
      }
      gt_xfputc('\n', stdout);
    }
  }

  if (strcmp(mode, "concat") == 0) {
    GtUword from = 0,
            to = gt_encseq_total_length(encseq) - 1;

    if (args->rng.start != GT_UNDEF_UWORD
          && args->rng.end != GT_UNDEF_UWORD) {
      if (args->rng.end > to) {
        had_err = -1;
        gt_error_set(err,
                     "end of range ("GT_WU") exceeds encoded sequence length "
                     "("GT_WU")", args->rng.end, to);
      } else {
        from = args->rng.start;
        to = args->rng.end;
      }
    }
    if (!had_err) {
      if (args->singlechars) {
        for (j = from; j <= to; j++) {
          char cc = gt_encseq_get_decoded_char(encseq, j, args->rm);

          if (cc == (char) SEPARATOR)
            cc = gt_str_get(args->sepchar)[0];
          gt_xfputc(cc, stdout);
        }
      } else {
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm, from);
        if (esr) {
          for (j = from; j <= to; j++) {
            char cc = gt_encseq_reader_next_decoded_char(esr);

            if (cc == (char) SEPARATOR)
              cc = gt_str_get(args->sepchar)[0];
            gt_xfputc(cc, stdout);
          }
          gt_encseq_reader_delete(esr);
        }
      }
      gt_xfputc('\n', stdout);
    }
  }
  return had_err;
}
static int gt_simplexdropselfmatchoutput(void *info, const GtGenericEncseq *genericencseq, GtUword len, GtUword pos1, GtUword pos2, GtError *err) { GtXdropmatchinfo *xdropmatchinfo = (GtXdropmatchinfo *) info; GtXdropscore score; GtUword dbseqnum, dbseqstartpos, dbseqlength, dbstart, dblen, querystart, queryseqnum, querylen, queryseqlength, queryseqstartpos, dbtotallength; const GtEncseq *encseq; gt_assert(genericencseq != NULL && genericencseq->hasencseq); encseq = genericencseq->seqptr.encseq; dbtotallength = gt_encseq_total_length(encseq); if (pos1 > pos2) { GtUword tmp = pos1; pos1 = pos2; pos2 = tmp; } dbseqnum = gt_encseq_seqnum(encseq,pos1), dbseqstartpos = gt_encseq_seqstartpos(encseq,dbseqnum), dbseqlength = gt_encseq_seqlength(encseq,dbseqnum); if (pos2 < dbseqstartpos + dbseqlength) { queryseqnum = dbseqnum; queryseqstartpos = dbseqstartpos; queryseqlength = dbseqlength; } else { queryseqnum = gt_encseq_seqnum(encseq,pos2); gt_assert(dbseqnum < queryseqnum); queryseqstartpos = gt_encseq_seqstartpos(encseq,queryseqnum); queryseqlength = gt_encseq_seqlength(encseq,queryseqnum); } if (pos1 > 0 && pos2 > 0) { gt_assert(pos1 >= dbseqstartpos && pos2 >= queryseqstartpos); gt_seqabstract_reinit_encseq(xdropmatchinfo->useq,encseq, pos1 - dbseqstartpos,0); gt_seqabstract_reinit_encseq(xdropmatchinfo->vseq,encseq, pos2 - queryseqstartpos,0); gt_evalxdroparbitscoresextend(false, &xdropmatchinfo->best_left, xdropmatchinfo->res, xdropmatchinfo->useq, xdropmatchinfo->vseq, pos1, pos2, xdropmatchinfo->belowscore); } else { xdropmatchinfo->best_left.ivalue = 0; xdropmatchinfo->best_left.jvalue = 0; xdropmatchinfo->best_left.score = 0; } if (pos1 + len < dbtotallength && pos2 + len < dbtotallength) { const GtUword seqend1 = dbseqstartpos + dbseqlength; const GtUword seqend2 = queryseqstartpos + queryseqlength; gt_assert(seqend1 >= pos1 + len && seqend2 >= pos2 + len); gt_seqabstract_reinit_encseq(xdropmatchinfo->useq, encseq,seqend1 - (pos1 + len),0); 
gt_seqabstract_reinit_encseq(xdropmatchinfo->vseq, encseq,seqend2 - (pos2 + len),0); gt_evalxdroparbitscoresextend(true, &xdropmatchinfo->best_right, xdropmatchinfo->res, xdropmatchinfo->useq, xdropmatchinfo->vseq, pos1 + len, pos2 + len, xdropmatchinfo->belowscore); } else { xdropmatchinfo->best_right.ivalue = 0; xdropmatchinfo->best_right.jvalue = 0; xdropmatchinfo->best_right.score = 0; } gt_assert(pos1 >= (GtUword) xdropmatchinfo->best_left.ivalue && pos2 >= (GtUword) xdropmatchinfo->best_left.jvalue); querystart = pos2 - xdropmatchinfo->best_left.jvalue; gt_assert(querystart >= queryseqstartpos); dblen = len + xdropmatchinfo->best_left.ivalue + xdropmatchinfo->best_right.ivalue; dbstart = pos1 - xdropmatchinfo->best_left.ivalue; querylen = len + xdropmatchinfo->best_left.jvalue + xdropmatchinfo->best_right.jvalue, score = (GtXdropscore) len * xdropmatchinfo->arbitscores.mat + xdropmatchinfo->best_left.score + xdropmatchinfo->best_right.score; gt_seqabstract_reinit_encseq(xdropmatchinfo->useq, encseq, dblen, dbstart); gt_seqabstract_reinit_encseq(xdropmatchinfo->vseq, encseq, querylen, querystart); gt_querymatch_fill(xdropmatchinfo->querymatchspaceptr, dblen, dbstart, GT_READMODE_FORWARD, false, score, greedyunitedist(xdropmatchinfo->frontresource, xdropmatchinfo->useq,xdropmatchinfo->vseq), true, (uint64_t) queryseqnum, querylen, querystart - queryseqstartpos); return gt_querymatch_output(info, encseq, xdropmatchinfo->querymatchspaceptr, NULL, gt_encseq_seqlength(encseq, queryseqnum), err); }
static void gt_querysubstringmatch(bool selfmatch, const GtEncseq *dbencseq, const ESASuffixptr *suftabpart, GtReadmode readmode, GtUword numberofsuffixes, uint64_t queryunitnum, GtQueryrepresentation *queryrep, GtUword minmatchlength, GtProcessquerymatch processquerymatch, void *processquerymatchinfo, GtQuerymatch *querymatchspaceptr) { GtMMsearchiterator *mmsi; GtUword totallength, localqueryoffset = 0; uint64_t localqueryunitnum = queryunitnum; GtQuerysubstring querysubstring; gt_assert(numberofsuffixes > 0); totallength = gt_encseq_total_length(dbencseq); querysubstring.queryrep = queryrep; for (querysubstring.currentoffset = 0; querysubstring.currentoffset <= queryrep->seqlen - minmatchlength; querysubstring.currentoffset++) { GtUword dbstart; mmsi = gt_mmsearchiterator_new(dbencseq, suftabpart, 0, /* leftbound */ numberofsuffixes - 1, /* rightbound */ 0, /* offset */ readmode, &querysubstring, minmatchlength); while (gt_mmsearchiterator_next(&dbstart,mmsi)) { if (gt_mmsearch_isleftmaximal(dbencseq, readmode, dbstart, &querysubstring)) { GtUword dbseqnum, dbseqstartpos, dbseqlen, extend; extend = gt_mmsearch_extendright(dbencseq, mmsi->esr, readmode, totallength, dbstart + minmatchlength, &querysubstring, minmatchlength); if (gt_encseq_has_multiseq_support(dbencseq)) { dbseqnum = gt_encseq_seqnum(dbencseq,dbstart); dbseqstartpos = gt_encseq_seqstartpos(dbencseq,dbseqnum); dbseqlen = gt_encseq_seqlength(dbencseq,dbseqnum); } else { dbseqnum = dbseqstartpos = dbseqlen = 0; } gt_querymatch_init(querymatchspaceptr, minmatchlength + extend, dbstart, dbseqnum, dbstart - dbseqstartpos, dbseqlen, 0, /* score */ 0, /* edist */ selfmatch, localqueryunitnum, minmatchlength + extend, localqueryoffset, queryrep->seqlen); processquerymatch(processquerymatchinfo,querymatchspaceptr); } } gt_mmsearchiterator_delete(mmsi); mmsi = NULL; if (gt_mmsearch_accessquery(queryrep,querysubstring.currentoffset) == (GtUchar) SEPARATOR) { localqueryunitnum++; localqueryoffset = 0; } else { 
localqueryoffset++; } } }
/* For every offset of the query <queryrep> at which a substring of length
   <minmatchlength> can start, call gt_suffixarrayfindmums() on <suffixarray>.
   If this delivers a database position (i.e. not ULONG_MAX) with a match of
   length at least <minmatchlength> that is left maximal, the match is stored
   in <querymatchspaceptr> and passed to <processquerymatch> together with
   <processquerymatchinfo>.
   <selfmatch> must be false and the query must have at least
   <minmatchlength> characters. <queryunitnum> is the number of the first
   query unit; each time the query character at the current offset is a
   SEPARATOR, the unit number is incremented and the unit-relative offset is
   reset. */
void gt_queryuniquematch(bool selfmatch,
                         const Suffixarray *suffixarray,
                         uint64_t queryunitnum,
                         GtQueryrepresentation *queryrep,
                         GtUword minmatchlength,
                         GtProcessquerymatch processquerymatch,
                         void *processquerymatchinfo,
                         GtQuerymatch *querymatchspaceptr)
{
  const GtUword totallength = gt_encseq_total_length(suffixarray->encseq);
  GtUword offset, localqueryoffset = 0;
  uint64_t localqueryunitnum = queryunitnum;

  gt_assert(!selfmatch && queryrep->seqlen >= minmatchlength);
  for (offset = 0; offset <= queryrep->seqlen - minmatchlength; offset++)
  {
    GtUword dbstart;
    const GtUword matchlen
      = gt_suffixarrayfindmums(suffixarray,
                               0,
                               0, /* leftbound */
                               totallength, /* rightbound */
                               &dbstart,
                               queryrep->sequence + offset,
                               queryrep->sequence + queryrep->seqlen);

    if (dbstart != ULONG_MAX &&
        matchlen >= minmatchlength &&
        gt_mum_isleftmaximal(suffixarray->encseq,
                             suffixarray->readmode,
                             dbstart,
                             offset,
                             queryrep->sequence))
    {
      GtUword dbseqnum, dbseqstartpos, dbseqlen;

      dbseqnum = gt_encseq_seqnum(suffixarray->encseq,dbstart);
      dbseqstartpos = gt_encseq_seqstartpos(suffixarray->encseq,dbseqnum);
      dbseqlen = gt_encseq_seqlength(suffixarray->encseq,dbseqnum);
      gt_querymatch_init(querymatchspaceptr,
                         matchlen,
                         dbstart,
                         dbseqnum,
                         dbstart - dbseqstartpos,
                         dbseqlen,
                         0, /* score */
                         0, /* edist */
                         selfmatch,
                         localqueryunitnum,
                         matchlen,
                         localqueryoffset,
                         queryrep->seqlen);
      processquerymatch(processquerymatchinfo,querymatchspaceptr);
    }
    /* keep track of query unit and offset relative to that unit */
    if (queryrep->sequence[offset] != (GtUchar) SEPARATOR)
    {
      localqueryoffset++;
    } else
    {
      localqueryunitnum++;
      localqueryoffset = 0;
    }
  }
}
/* Advance <qsmi> to the next match.
   Returns 0 if a match was found; then qsmi->dbstart holds its database
   position and qsmi->matchlength its length. Returns 1 if there are no more
   query sequences and -1 if reading the next query sequence from the
   sequence iterator failed (see <err>).
   Queries come either from qsmi->seqit or, if that is NULL, from the
   sequences of qsmi->queryrep.encseq. qsmi->query_seqlen being smaller than
   qsmi->userdefinedleastlength (it is reset to 0 when a query is finished)
   signals that the next query has to be fetched. */
int gt_querysubstringmatchiterator_next(GtQuerysubstringmatchiterator *qsmi,
                                        GtError *err)
{
  gt_assert(qsmi != NULL);
  for (;;)
  {
    if (qsmi->query_seqlen < qsmi->userdefinedleastlength)
    {
      /* fetch the next query sequence */
      if (qsmi->seqit == NULL)
      {
        if (qsmi->queryunitnum == qsmi->query_encseq_numofsequences)
        {
          return 1;
        }
        qsmi->queryrep.startpos
          = gt_encseq_seqstartpos(qsmi->queryrep.encseq, qsmi->queryunitnum);
        qsmi->query_seqlen
          = gt_encseq_seqlength(qsmi->queryrep.encseq, qsmi->queryunitnum);
      } else
      {
        int retval = gt_seq_iterator_next(qsmi->seqit,
                                          &qsmi->query_for_seqit,
                                          &qsmi->query_seqlen,
                                          &qsmi->desc,
                                          err);

        if (retval < 0)
        {
          return -1; /* error */
        }
        if (retval == 0)
        {
          return 1; /* no more sequences */
        }
        gt_assert(qsmi->query_seqlen > 0 && qsmi->query_for_seqit != NULL);
        qsmi->queryrep.sequence = qsmi->query_for_seqit;
      }
      gt_assert(qsmi->query_seqlen > 0);
      qsmi->queryrep.seqlen = qsmi->query_seqlen;
      qsmi->querysubstring.currentoffset = 0;
    }

    if (qsmi->query_seqlen < qsmi->userdefinedleastlength)
    {
      /* the query just fetched is too short: skip it */
      qsmi->query_seqlen = 0;
      qsmi->queryunitnum++;
      continue;
    }

    if (!qsmi->mmsi_defined)
    {
      /* start the search for the substring at the current query offset */
      gt_mmsearchiterator_reinit(qsmi->mmsi,
                                 qsmi->dbencseq,
                                 qsmi->suftabpart,
                                 0, /* l */
                                 qsmi->numberofsuffixes - 1, /* r */
                                 0, /* offset */
                                 qsmi->db_readmode,
                                 &qsmi->querysubstring,
                                 qsmi->userdefinedleastlength);
      qsmi->mmsi_defined = true;
      continue;
    }

    if (gt_mmsearchiterator_next(&qsmi->dbstart,qsmi->mmsi))
    {
      if (gt_mmsearch_isleftmaximal(qsmi->dbencseq,
                                    qsmi->db_readmode,
                                    qsmi->dbstart,
                                    &qsmi->querysubstring))
      {
        GtUword extend
          = gt_mmsearch_extendright(qsmi->dbencseq,
                                    qsmi->mmsi->esr,
                                    qsmi->db_readmode,
                                    qsmi->totallength,
                                    qsmi->dbstart +
                                      qsmi->userdefinedleastlength,
                                    &qsmi->querysubstring,
                                    qsmi->userdefinedleastlength);

        qsmi->matchlength = qsmi->userdefinedleastlength + extend;
        return 0;
      }
      /* hit is not left maximal: try the next one */
      continue;
    }

    /* no more hits at this query offset: move on to the next offset or, if
       no further substring of the least length fits, to the next query */
    qsmi->mmsi_defined = false;
    if (qsmi->querysubstring.currentoffset + qsmi->userdefinedleastlength
          < qsmi->query_seqlen)
    {
      qsmi->querysubstring.currentoffset++;
    } else
    {
      qsmi->query_seqlen = 0;
      qsmi->queryunitnum++;
    }
  }
}