/* Determine the length of the sequence identified by seqid and store it in
   *length.

   The lookup strategy depends on the configuration of rm:
   - raw sequence:       the stored raw length is reported directly
   - MD5-prefixed seqid: lookup via the MD5 fingerprint in the sequence
                         collection
   - usedesc:            seqid is mapped to a (filenum, seqnum) pair first
   - matchdesc:          descriptions are searched for seqid
   - useseqno:           seqid must have the form "seqX", X being a sequence
                         number in the encoded sequence
   - otherwise:          the length of sequence 0 in file 0 is reported

   Returns 0 on success. On failure a non-zero value is returned; for the
   "seqX" strategy err is set here, the other strategies receive err and
   are presumably responsible for setting it themselves. */
int gt_region_mapping_get_sequence_length(GtRegionMapping *rm,
                                          unsigned long *length, GtStr *seqid,
                                          GtError *err)
{
  unsigned long filenum, seqnum;
  GT_UNUSED GtRange range;
  int had_err;
  gt_error_check(err);
  gt_assert(rm && seqid);
  if (rm->userawseq) {
    /* the length belongs into the out parameter; returning it would make
       callers interpret every non-zero raw length as an error code */
    *length = rm->rawlength;
    return 0;
  }
  had_err = update_seq_col_if_necessary(rm, seqid, err);
  if (!had_err) {
    if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
      had_err = gt_seq_col_md5_to_sequence_length(rm->seq_col, length, seqid,
                                                  err);
    }
    else if (rm->usedesc) {
      gt_assert(rm->seqid2seqnum_mapping);
      had_err = gt_seqid2seqnum_mapping_map(rm->seqid2seqnum_mapping,
                                            gt_str_get(seqid), &range, &seqnum,
                                            &filenum, NULL, err);
      if (!had_err) {
        *length = gt_seq_col_get_sequence_length(rm->seq_col, filenum,
                                                 seqnum);
      }
    }
    else if (rm->matchdesc) {
      had_err = gt_seq_col_grep_desc_sequence_length(rm->seq_col, length,
                                                     seqid, err);
    }
    else if (rm->useseqno) {
      unsigned long seqno = GT_UNDEF_ULONG;
      gt_assert(rm->encseq);
      if (1 != sscanf(gt_str_get(seqid), "seq%lu", &seqno)) {
        gt_error_set(err, "seqid '%s' does not have the form 'seqX' "
                          "where X is a sequence number in the encoded "
                          "sequence", gt_str_get(seqid));
        had_err = -1;
      }
      gt_assert(had_err || seqno != GT_UNDEF_ULONG);
      /* reject sequence numbers beyond the end of the encoded sequence */
      if (!had_err && seqno >= gt_encseq_num_of_sequences(rm->encseq)) {
        gt_error_set(err, "trying to access sequence %lu, but encoded "
                          "sequence contains only %lu sequences",
                          seqno, gt_encseq_num_of_sequences(rm->encseq));
        had_err = -1;
      }
      if (!had_err) {
        *length = gt_encseq_seqlength(rm->encseq, seqno);
      }
    }
    else {
      /* no mapping strategy configured: fall back to the first sequence */
      *length = gt_seq_col_get_sequence_length(rm->seq_col, 0, 0);
    }
  }
  return had_err;
}
/* Write the sequence numbers 0 .. n-1 of encseq into seqnums (n being the
   number of sequences in encseq) and sort that array with gt_qsort_r(),
   using cmpfunc as comparator and encseq as its data argument.
   seqnums must provide room for n entries. */
static void gt_seqorder_get_hdrsorted_seqnums(const GtEncseq *encseq,
                                              GtUword *seqnums,
                                              GtCompareWithData cmpfunc)
{
  GtUword seqnum = 0UL;

  gt_assert(encseq != NULL);
  /* start from the identity order */
  while (seqnum < gt_encseq_num_of_sequences(encseq)) {
    seqnums[seqnum] = seqnum;
    seqnum++;
  }
  (void) gt_qsort_r(seqnums,
                    gt_encseq_num_of_sequences(encseq),
                    sizeof (GtUword),
                    (void*) encseq,
                    cmpfunc);
}
/* Store the start position of every sequence of encseq in suffixsortspace
   (entry i receives the start position of sequence i) and then sort these
   entries with gt_sortallsuffixesfromstart() in forward read mode, using the
   default suffix sorting strategy. */
static void gt_seqorder_sort(GtSuffixsortspace *suffixsortspace,
                             GtEncseq *encseq)
{
  Sfxstrategy strategy;
  unsigned long seqnum = 0;

  defaultsfxstrategy(&strategy, false);
  while (seqnum < gt_encseq_num_of_sequences(encseq)) {
    gt_suffixsortspace_setdirect(suffixsortspace, seqnum,
                                 gt_encseq_seqstartpos(encseq, seqnum));
    seqnum++;
  }
  gt_sortallsuffixesfromstart(suffixsortspace,
                              gt_encseq_num_of_sequences(encseq),
                              encseq,
                              GT_READMODE_FORWARD,
                              NULL,
                              0,
                              &strategy,
                              NULL,
                              NULL,
                              NULL);
}
/* Create a new GtBioseq for sequence_file. The file name "-" selects
   standard input; any other name must refer to an existing file.
   recreate is handed through to bioseq_fill().
   Returns the new object, or NULL on failure (a missing file is reported
   via err here; bioseq_fill() receives err for its own failures). */
static GtBioseq* bioseq_new_with_recreate_and_type(GtStr *sequence_file,
                                                   bool recreate,
                                                   GtError *err)
{
  GtBioseq *bioseq;
  const char *path;

  gt_error_check(err);
  bioseq = gt_calloc(1, sizeof *bioseq);
  path = gt_str_get(sequence_file);

  if (strcmp(path, "-") == 0)
    bioseq->use_stdin = true;

  if (!bioseq->use_stdin && !gt_file_exists(path)) {
    gt_error_set(err, "sequence file \"%s\" does not exist or is not readable",
                 path);
    gt_bioseq_delete(bioseq);
    return NULL;
  }

  bioseq->sequence_file = gt_str_ref(sequence_file);
  if (bioseq_fill(bioseq, recreate, err)) {
    gt_bioseq_delete(bioseq);
    return NULL;
  }

  gt_assert(bioseq->encseq);
  /* one description slot per sequence; all slots start out as NULL */
  bioseq->descriptions = gt_calloc(gt_encseq_num_of_sequences(bioseq->encseq),
                                   sizeof (char*));
  return bioseq;
}
/* Load the encoded sequence stored under indexname, map the accompanying
   key table (file suffix GT_KEYSTABFILESUFFIX) and run
   itersearchoverallkeys() for the keys listed in fileofkeystoextract.
   linewidth is passed on unchanged to itersearchoverallkeys().
   Returns 0 on success and -1 if any step fails. */
int gt_extractkeysfromfastaindex(const char *indexname,
                                 const GtStr *fileofkeystoextract,
                                 unsigned long linewidth, GtError *err)
{
  GtEncseqLoader *loader;
  GtEncseq *encseq;
  unsigned long numofdbsequences, keysize, keytablength;
  char *keytab;
  int retval, status = 0;

  loader = gt_encseq_loader_new();
  encseq = gt_encseq_loader_load(loader, indexname, err);
  gt_encseq_loader_delete(loader);
  if (encseq == NULL)
    return -1;

  numofdbsequences = gt_encseq_num_of_sequences(encseq);
  retval = readkeysize(indexname, err);
  if (retval < 0) {
    gt_encseq_delete(encseq);
    return -1;
  }
  keysize = (unsigned long) retval;

  /* expected table size: one leading byte plus (keysize + 1) bytes per
     sequence */
  keytablength = 1UL + numofdbsequences * (keysize + 1);
  keytab = gt_fa_mmap_check_size_with_suffix(indexname,
                                             GT_KEYSTABFILESUFFIX,
                                             keytablength,
                                             sizeof (GtUchar),
                                             err);
  if (keytab == NULL ||
      itersearchoverallkeys(encseq, keytab, numofdbsequences, keysize,
                            fileofkeystoextract, linewidth, err) != 0) {
    status = -1;
  }
  /* also reached with keytab == NULL when the mapping failed */
  gt_fa_xmunmap(keytab);
  gt_encseq_delete(encseq);
  return status;
}
/* Lua binding: pushes the number of sequences of the encoded sequence given
   as argument 1 and returns 1 (the number of pushed results). */
static int encseq_lua_num_of_sequences(lua_State *L)
{
  GtEncseq **encseq_ptr = check_encseq(L, 1);

  lua_pushnumber(L, gt_encseq_num_of_sequences(*encseq_ptr));
  return 1;
}
/* Prepare storematch for use with encseq: remember the encoded sequence and
   set up the hasmatch bit table via GT_INITBITTAB, sized by the number of
   sequences in encseq. */
void gt_initstorematch(Storematchinfo *storematch, const GtEncseq *encseq)
{
  unsigned long seqcount;

  storematch->encseq = encseq;
  seqcount = gt_encseq_num_of_sequences(encseq);
  GT_INITBITTAB(storematch->hasmatch, seqcount);
}
/* Map the suffix array stored under indexname and match every sequence of
   its encoded sequence that is at least userdefinedleastlength long against
   the index itself, read in queryreadmode (which must not be
   GT_READMODE_FORWARD). Matches are handed to processquerymatch together
   with processquerymatchinfo.
   Returns 0 on success and -1 if mapping or matching fails. */
int gt_callenumselfmatches(const char *indexname,
                           GtReadmode queryreadmode,
                           unsigned int userdefinedleastlength,
                           GtProcessquerymatch processquerymatch,
                           void *processquerymatchinfo,
                           GtLogger *logger,
                           GtError *err)
{
  Suffixarray suffixarray;
  int status = 0;

  gt_assert(queryreadmode != GT_READMODE_FORWARD);
  if (gt_mapsuffixarray(&suffixarray,
                        SARR_ESQTAB | SARR_SUFTAB | SARR_SSPTAB,
                        indexname,
                        logger,
                        err) != 0) {
    status = -1;
  }
  else {
    const unsigned long minlength = (unsigned long) userdefinedleastlength;
    unsigned long seqnum, numofsequences;
    GtQuerymatch *querymatchspace = gt_querymatch_new();
    GtQueryrep queryrep;

    numofsequences = gt_encseq_num_of_sequences(suffixarray.encseq);
    queryrep.sequence = NULL;
    queryrep.reversecopy = false;
    queryrep.encseq = suffixarray.encseq;
    queryrep.readmode = queryreadmode;

    /* the loop ends at the first sequence for which matching fails */
    for (seqnum = 0; status == 0 && seqnum < numofsequences; seqnum++) {
      unsigned long seqstartpos, seqlength;

      seqstartpos = gt_encseq_seqstartpos(suffixarray.encseq, seqnum);
      seqlength = gt_encseq_seqlength(suffixarray.encseq, seqnum);
      if (seqlength < minlength)
        continue; /* too short to produce a match of the required length */

      queryrep.startpos = seqstartpos;
      queryrep.length = seqlength;
      if (gt_querysubstringmatch(true,
                                 &suffixarray,
                                 (uint64_t) seqnum,
                                 &queryrep,
                                 minlength,
                                 processquerymatch,
                                 processquerymatchinfo,
                                 querymatchspace,
                                 err) != 0) {
        status = -1;
      }
    }
    gt_querymatch_delete(querymatchspace);
  }
  /* executed on the success and on the failure path alike */
  gt_freesuffixarray(&suffixarray);
  return status;
}
/* Return the encoded character at offset position within sequence number
   index of bs, read in forward direction. index must be smaller than the
   number of sequences in bs. */
GtUchar gt_bioseq_get_encoded_char(const GtBioseq *bs, GtUword index,
                                   GtUword position)
{
  GtUword abspos;

  gt_assert(bs);
  gt_assert(index < gt_encseq_num_of_sequences(bs->encseq));
  /* translate the sequence-relative offset into an encseq position */
  abspos = gt_encseq_seqstartpos(bs->encseq, index) + position;
  return gt_encseq_get_encoded_char(bs->encseq, abspos, GT_READMODE_FORWARD);
}
/* Build a new GtSeq for sequence number idx of bs. The sequence buffer
   obtained from gt_bioseq_get_sequence() is handed to gt_seq_new_own(), and
   the description of the sequence is attached to the result. */
GtSeq* gt_bioseq_get_seq(GtBioseq *bs, GtUword idx)
{
  GtSeq *result;

  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));

  result = gt_seq_new_own(gt_bioseq_get_sequence(bs, idx),
                          gt_bioseq_get_sequence_length(bs, idx),
                          gt_encseq_alphabet(bs->encseq));
  gt_seq_set_description(result, gt_bioseq_get_description(bs, idx));
  return result;
}
/* Return the number of sequences contained in file number filenum of the
   encoded sequence behind sc. For the last file this is the total number of
   sequences minus the first sequence number of that file; for every other
   file it is the distance between its first sequence number and that of the
   following file. */
static GtUword gt_encseq_col_num_of_seqs(const GtSeqCol *sc, GtUword filenum)
{
  GtEncseqCol *esc;
  GtUword first_seqnum, next_first_seqnum;

  /* XXX cache function evaluated values */
  esc = gt_encseq_col_cast(sc);
  gt_assert(esc && filenum < gt_encseq_num_of_files(esc->encseq));

  /* single file: all sequences belong to it */
  if (gt_encseq_num_of_files(esc->encseq) == 1 && filenum == 0)
    return gt_encseq_num_of_sequences(esc->encseq);

  /* last file: everything from its first sequence up to the end */
  if (filenum == gt_encseq_num_of_files(esc->encseq) - 1) {
    return (gt_encseq_num_of_sequences(esc->encseq) -
            gt_encseq_filenum_first_seqnum(esc->encseq, filenum));
  }

  gt_assert(filenum < gt_encseq_num_of_files(esc->encseq) - 1);
  first_seqnum = gt_encseq_filenum_first_seqnum(esc->encseq, filenum);
  next_first_seqnum = gt_encseq_filenum_first_seqnum(esc->encseq,
                                                     filenum + 1);
  return next_first_seqnum - first_seqnum;
}
/* Extract the complete encoded sequence number idx of bs into out.
   The extracted range runs from the start position of the sequence to
   start position + sequence length - 1 (inclusive bounds as passed to
   gt_encseq_extract_encoded()). */
void gt_bioseq_get_encoded_sequence(const GtBioseq *bs, GtUchar *out,
                                    GtUword idx)
{
  GtUword first, last;

  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));

  first = gt_encseq_seqstartpos(bs->encseq, idx);
  last = first + gt_encseq_seqlength(bs->encseq, idx) - 1;
  gt_encseq_extract_encoded(bs->encseq, out, first, last);
}
/* Extract the encoded characters start .. end (offsets relative to the
   beginning of sequence number idx of bs) into out. end must not be smaller
   than start. */
void gt_bioseq_get_encoded_sequence_range(const GtBioseq *bs, GtUchar *out,
                                          GtUword idx, GtUword start,
                                          GtUword end)
{
  GtUword offset;

  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq) && end >= start);

  /* shift the sequence-relative bounds to encseq positions */
  offset = gt_encseq_seqstartpos(bs->encseq, idx);
  gt_encseq_extract_encoded(bs->encseq, out, offset + start, offset + end);
}
/* Lua binding: argument 1 is an encoded sequence, argument 2 a sequence
   number. Raises a Lua argument error unless the sequence number is smaller
   than the number of sequences; otherwise pushes the start position of that
   sequence and returns 1. */
static int encseq_lua_seqstartpos(lua_State *L)
{
  GtEncseq **encseq_ptr = check_encseq(L, 1);
  GtUword seqnum = luaL_checknumber(L, 2);

  luaL_argcheck(L, seqnum < gt_encseq_num_of_sequences(*encseq_ptr), 2,
                "cannot exceed number of sequences");
  lua_pushnumber(L, gt_encseq_seqstartpos(*encseq_ptr, seqnum));
  return 1;
}
/* Initialise suffixsortspace with the sequence start positions of encseq
   (via gt_suffixsortspace_init_seqstartpos()) and sort these entries with
   gt_sortallsuffixesfromstart() in forward read mode, using the default
   suffix sorting strategy. */
static void gt_seqorder_sort(GtSuffixsortspace *suffixsortspace,
                             const GtEncseq *encseq)
{
  Sfxstrategy strategy;

  defaultsfxstrategy(&strategy, false);
  gt_suffixsortspace_init_seqstartpos(suffixsortspace, encseq);
  gt_sortallsuffixesfromstart(suffixsortspace,
                              gt_encseq_num_of_sequences(encseq),
                              encseq,
                              GT_READMODE_FORWARD,
                              NULL,
                              0,
                              &strategy,
                              NULL,
                              NULL,
                              NULL);
}
GtQuerysubstringmatchiterator *gt_querysubstringmatchiterator_new( const GtEncseq *dbencseq, GtUword totallength, const ESASuffixptr *suftabpart, GtReadmode db_readmode, GtUword numberofsuffixes, const GtStrArray *query_files, const GtEncseq *query_encseq, GtReadmode query_readmode, unsigned int userdefinedleastlength, GtError *err) { GtQuerysubstringmatchiterator *qsmi = gt_malloc(sizeof *qsmi); qsmi->dbencseq = dbencseq; qsmi->suftabpart = suftabpart; qsmi->db_readmode = db_readmode; qsmi->numberofsuffixes = numberofsuffixes; qsmi->totallength = totallength; qsmi->userdefinedleastlength = (GtUword) userdefinedleastlength; qsmi->queryunitnum = 0; qsmi->desc = NULL; qsmi->query_for_seqit = NULL; qsmi->query_seqlen = 0; qsmi->queryrep.sequence = NULL; qsmi->queryrep.encseq = query_encseq; qsmi->queryrep.readmode = query_readmode; qsmi->queryrep.startpos = 0; qsmi->dbstart = 0; qsmi->matchlength = 0; qsmi->querysubstring.queryrep = &qsmi->queryrep; qsmi->mmsi = gt_mmsearchiterator_new_empty(); qsmi->mmsi_defined = false; if (query_files == NULL || gt_str_array_size(query_files) == 0) { gt_assert(query_encseq != NULL); qsmi->seqit = NULL; qsmi->query_encseq_numofsequences = (uint64_t) gt_encseq_num_of_sequences(query_encseq); } else { gt_assert(query_encseq == NULL); qsmi->seqit = gt_seq_iterator_sequence_buffer_new(query_files, err); if (qsmi->seqit == NULL) { gt_querysubstringmatchiterator_delete(qsmi); return NULL; } gt_seq_iterator_set_symbolmap(qsmi->seqit, gt_alphabet_symbolmap(gt_encseq_alphabet(dbencseq))); } return qsmi; }
/* Return a newly allocated buffer holding the decoded characters
   start .. end (offsets relative to the beginning of sequence number idx of
   bs). The buffer has exactly end - start + 1 bytes; no additional byte is
   reserved for a terminator. The caller owns the returned buffer. */
char* gt_bioseq_get_sequence_range(const GtBioseq *bs, GtUword idx,
                                   GtUword start, GtUword end)
{
  char *buffer;
  GtUword offset;

  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq) && end >= start);

  buffer = gt_malloc((end - start + 1) * sizeof (char));
  offset = gt_encseq_seqstartpos(bs->encseq, idx);
  gt_encseq_extract_decoded(bs->encseq, buffer, offset + start, offset + end);
  return buffer;
}
/* Lua binding: argument 1 is an encoded sequence, argument 2 a sequence
   number. Raises a Lua argument error unless the sequence number is smaller
   than the number of sequences; otherwise pushes the description of that
   sequence as a string of its reported length and returns 1. */
static int encseq_lua_description(lua_State *L)
{
  GtEncseq **encseq_ptr = check_encseq(L, 1);
  GtUword seqnum = luaL_checknumber(L, 2);
  GtUword desclen;
  const char *desc;

  luaL_argcheck(L, seqnum < gt_encseq_num_of_sequences(*encseq_ptr), 2,
                "cannot exceed number of sequences");
  desc = gt_encseq_description(*encseq_ptr, &desclen, seqnum);
  /* push with explicit length instead of relying on a terminator */
  lua_pushlstring(L, desc, desclen);
  return 1;
}
/* Create a new GtCondenseq based on the original encoded sequence orig_es:
   the alphabet, the number of sequences and the total length are taken from
   orig_es, ssptab is filled by condenseq_fill_tab() and the descriptions are
   handled by condenseq_process_descriptions(), which also receives logger. */
GtCondenseq *gt_condenseq_new(const GtEncseq *orig_es, GtLogger *logger)
{
  GtCondenseq *ces = condenseq_new_empty(gt_encseq_alphabet(orig_es));

  ces->orig_num_seq = gt_encseq_num_of_sequences(orig_es);
  /* the sequence count is stored before the table is filled */
  ces->ssptab = condenseq_fill_tab(ces, orig_es);
  ces->orig_length = gt_encseq_total_length(orig_es);
  condenseq_process_descriptions(ces, orig_es, logger);
  return ces;
}
/* Return a newly allocated buffer holding the decoded sequence number idx
   of bs. The buffer is allocated with exactly as many bytes as the sequence
   is long; no additional byte is reserved for a terminator. The caller owns
   the returned buffer. */
char* gt_bioseq_get_sequence(const GtBioseq *bs, GtUword idx)
{
  char *buffer;
  GtUword first, last;

  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));

  buffer = gt_calloc(gt_encseq_seqlength(bs->encseq, idx), sizeof (char));
  first = gt_encseq_seqstartpos(bs->encseq, idx);
  last = first + gt_encseq_seqlength(bs->encseq, idx) - 1;
  gt_encseq_extract_decoded(bs->encseq, buffer, first, last);
  return buffer;
}
/* Run the bottom-up traversal for containment detection over the suffix
   array read by ssar and return the counter accumulated in the traversal
   state.

   contained:        bit sequence handed to the traversal via the state
   firstrevcompl:    stored in the traversal state
   read_length:      0 selects the variable-length traversal
                     (gt_esa_bottomup_rdjcv), any other value the
                     fixed-length traversal (gt_esa_bottomup_rdjce)
   show_progressbar: if true, a progress bar over the total length of the
                     encoded sequence is shown during the traversal */
unsigned long gt_contfind_bottomup(Sequentialsuffixarrayreader *ssar,
                                   bool show_progressbar,
                                   GtBitsequence *contained,
                                   unsigned long firstrevcompl,
                                   unsigned long read_length /* 0 = variable */)
{
  const bool variable_length = (read_length == 0);
  ContfindBUstate state;
  unsigned long totallength;
  GT_UNUSED int retval;

  gt_assert(ssar != NULL);
  gt_assert(contained != NULL);

  state.contained = contained;
  state.encseq = gt_encseqSequentialsuffixarrayreader(ssar);
  totallength = gt_encseq_total_length(state.encseq);
  state.nofsequences = gt_encseq_num_of_sequences(state.encseq);

  if (variable_length) {
    prepare_sspbittab_and_shortest(totallength, &state);
  }
  else {
    /* fixed-length reads: consecutive reads start read_length + 1 apart */
    state.shortest = read_length;
    state.spacing = read_length + 1;
  }

  state.show_progressbar = show_progressbar;
  state.csize = 0;
  state.cmin = 0;
  state.firstrevcompl = firstrevcompl;
  state.counter = 0;

  if (show_progressbar) {
    state.progress = 0;
    gt_progressbar_start(&(state.progress), (unsigned long long)totallength);
  }

  if (variable_length)
    retval = gt_esa_bottomup_rdjcv(ssar, &state, NULL);
  else
    retval = gt_esa_bottomup_rdjce(ssar, &state, NULL);
  gt_assert(retval == 0);

  if (show_progressbar)
    gt_progressbar_stop();
  /* sspbittab is only released in the variable-length case */
  if (variable_length)
    gt_free(state.sspbittab);
  return state.counter;
}
/* Build a new GtSeq covering the characters start .. end (offsets relative
   to the beginning of sequence number idx of bs). The range buffer obtained
   from gt_bioseq_get_sequence_range() is handed to gt_seq_new_own(), and the
   description of the sequence is attached to the result.

   Preconditions (checked by assertions): idx is a valid sequence number,
   end >= start, and the range is not longer than the sequence. */
GtSeq* gt_bioseq_get_seq_range(GtBioseq *bs, GtUword idx, GtUword start,
                               GtUword end)
{
  GtSeq *seq;
  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));
  gt_assert(end >= start);
  /* a range can be at most as long as the sequence it is taken from; the
     comparison used to be inverted and rejected every valid range */
  gt_assert(end - start + 1 <= gt_encseq_seqlength(bs->encseq, idx));
  seq = gt_seq_new_own(gt_bioseq_get_sequence_range(bs, idx, start, end),
                       end - start + 1, gt_encseq_alphabet(bs->encseq));
  gt_seq_set_description(seq, gt_bioseq_get_description(bs, idx));
  return seq;
}
/* If the alphabet of bs is a DNA alphabet, concatenate all sequences of bs
   and write their GC-content to outfp via gt_gc_content_show(), preceded by
   a header line naming the sequence file. For other alphabets nothing is
   written. */
void gt_bioseq_show_gc_content(GtBioseq *bs, GtFile *outfp)
{
  gt_assert(bs);
  if (gt_alphabet_is_dna(gt_encseq_alphabet(bs->encseq))) {
    GtUword i, GT_UNUSED purecharlen;
    GtStr *str = gt_str_new();
    /* total length minus the gaps between consecutive sequences
       (one fewer than the number of sequences) */
    purecharlen = gt_encseq_total_length(bs->encseq)
                  - gt_encseq_num_of_sequences(bs->encseq) + 1;
    for (i = 0; i < gt_encseq_num_of_sequences(bs->encseq); i++) {
      char *tmp;
      tmp = gt_bioseq_get_sequence(bs, i);
      /* the buffer from gt_bioseq_get_sequence() holds exactly the sequence
         characters without a terminator, so append it by explicit length */
      gt_str_append_cstr_nt(str, tmp, gt_encseq_seqlength(bs->encseq, i));
      gt_free(tmp);
    }
    gt_assert(gt_str_length(str) == purecharlen);
    gt_file_xprintf(outfp, "showing GC-content for sequence file \"%s\"\n",
                    gt_str_get(bs->sequence_file));
    gt_gc_content_show(gt_str_get(str), gt_str_length(str),
                       gt_encseq_alphabet(bs->encseq), outfp);
    gt_str_delete(str);
  }
}
/* Return a copy (made with gt_cstr_dup_nt()) of the description of sequence
   number seqnum within file number filenum of the encoded sequence behind
   sc. The description is expected to be non-empty. */
static char* gt_encseq_col_get_description(const GtSeqCol *sc, GtUword filenum,
                                           GtUword seqnum)
{
  GtEncseqCol *esc = gt_encseq_col_cast(sc);
  GtUword global_seqnum, desclen;
  const char *desc;

  gt_assert(esc && filenum < gt_encseq_num_of_files(esc->encseq));
  /* convert the file-relative sequence number to an encseq-wide one */
  global_seqnum = gt_encseq_filenum_first_seqnum(esc->encseq, filenum)
                  + seqnum;
  gt_assert(global_seqnum < gt_encseq_num_of_sequences(esc->encseq));

  desc = gt_encseq_description(esc->encseq, &desclen, global_seqnum);
  gt_assert(desc && desclen > 0);
  return gt_cstr_dup_nt(desc, desclen);
}
/* Return the description of sequence number idx of bs as a
   '\0'-terminated string. The string is created on first request and kept
   in bs->descriptions[idx], so later requests return the stored copy; the
   stored copies are released in gt_bioseq_delete(). */
const char* gt_bioseq_get_description(GtBioseq *bs, GtUword idx)
{
  char *cached;

  gt_assert(bs && bs->encseq);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));

  cached = bs->descriptions[idx];
  if (cached == NULL) {
    GtUword desclen;
    const char *raw = gt_encseq_description(bs->encseq, &desclen, idx);

    /* zero-filled buffer with one extra byte guarantees termination */
    cached = gt_calloc(desclen + 1, sizeof (char));
    strncpy(cached, raw, desclen);
    bs->descriptions[idx] = cached;
  }
  return (const char*) cached;
}
/* Release bs together with everything it holds: the sequence file name, the
   MD5 table, all stored descriptions and the encoded sequence.
   Passing NULL is allowed and does nothing. */
void gt_bioseq_delete(GtBioseq *bs)
{
  if (bs == NULL)
    return;

  gt_str_delete(bs->sequence_file);
  gt_md5_tab_delete(bs->md5_tab);

  if (bs->descriptions != NULL) {
    GtUword idx = 0;

    /* free each stored description before the table itself */
    while (idx < gt_encseq_num_of_sequences(bs->encseq)) {
      gt_free(bs->descriptions[idx]);
      idx++;
    }
    gt_free(bs->descriptions);
  }

  gt_encseq_delete(bs->encseq);
  gt_free(bs);
}
/* Find the first sequence of esc whose description matches seqid (used as
   the pattern for gt_grep()) and store its file number in *filenum and its
   file-relative sequence number in *seqnum.

   Results are kept in esc->grep_cache (created on demand), so a seqid that
   was resolved before is answered from the cache.
   Returns 0 on success. If gt_grep() fails its error code is returned; if
   no description matches, err is set and -1 is returned. */
static int gt_encseq_col_do_grep_desc(GtEncseqCol *esc, GtUword *filenum,
                                      GtUword *seqnum, GtStr *seqid,
                                      GtError *err)
{
  const GtSeqInfo *cached;
  GtSeqInfo seq_info;
  GtUword idx;
  bool match = false;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(esc && filenum && seqnum && seqid);

  /* create cache */
  if (!esc->grep_cache)
    esc->grep_cache = gt_seq_info_cache_new();

  /* try to read from cache */
  cached = gt_seq_info_cache_get(esc->grep_cache, gt_str_get(seqid));
  if (cached) {
    *filenum = cached->filenum;
    *seqnum = cached->seqnum;
    return 0;
  }

  for (idx = 0;
       !had_err && idx < gt_encseq_num_of_sequences(esc->encseq);
       idx++) {
    const char *desc;
    char *buf;
    GtUword desc_len;

    /* gt_grep() gets a '\0'-terminated copy of the description */
    desc = gt_encseq_description(esc->encseq, &desc_len, idx);
    buf = gt_calloc(desc_len + 1, sizeof (char));
    memcpy(buf, desc, desc_len * sizeof (char));
    had_err = gt_grep(&match, gt_str_get(seqid), buf, err);
    gt_free(buf);

    if (had_err || !match)
      continue;

    /* first hit: report it, remember it in the cache and stop searching */
    seq_info.filenum = gt_encseq_filenum(esc->encseq,
                                         gt_encseq_seqstartpos(esc->encseq,
                                                               idx));
    *filenum = seq_info.filenum;
    seq_info.seqnum = idx - gt_encseq_filenum_first_seqnum(esc->encseq,
                                                           *filenum);
    *seqnum = seq_info.seqnum;
    gt_seq_info_cache_add(esc->grep_cache, gt_str_get(seqid), &seq_info);
    break;
  }

  if (!had_err && !match) {
    gt_error_set(err, "no description matched sequence ID '%s'",
                 gt_str_get(seqid));
    had_err = -1;
  }
  return had_err;
}
/* Write statistics about bs to outfp: the sequence file name, the number of
   sequences, the total length (total length of the encoded sequence minus
   one per gap between consecutive sequences) and the length of every single
   sequence, numbered from 1. */
void gt_bioseq_show_stat(GtBioseq *bs, GtFile *outfp)
{
  GtUword seqnum, num_of_seqs;

  gt_assert(bs);
  num_of_seqs = gt_bioseq_number_of_sequences(bs);

  gt_file_xprintf(outfp, "showing statistics for sequence file \"%s\"\n",
                  gt_str_get(bs->sequence_file));
  gt_file_xprintf(outfp, "number of sequences: "GT_WU"\n", num_of_seqs);
  gt_file_xprintf(outfp, "total length: "GT_WU"\n",
                  gt_encseq_total_length(bs->encseq)
                  - gt_encseq_num_of_sequences(bs->encseq) + 1);

  seqnum = 0;
  while (seqnum < num_of_seqs) {
    gt_file_xprintf(outfp, "sequence #"GT_WU" length: "GT_WU"\n", seqnum+1,
                    gt_bioseq_get_sequence_length(bs, seqnum));
    seqnum++;
  }
}
static void showprjinfo(FILE *outprj, GtReadmode readmode, const GtEncseq *encseq, GtUword numberofallsortedsuffixes, unsigned int prefixlength, GtUword numoflargelcpvalues, double averagelcp, GtUword maxbranchdepth, const Definedunsignedlong *longest) { GtUword totallength; GtUword numofsequences; totallength = gt_encseq_total_length(encseq); fprintf(outprj,"totallength="GT_WU"\n",totallength); PRJSPECIALOUT(specialcharacters); PRJSPECIALOUT(specialranges); PRJSPECIALOUT(realspecialranges); PRJSPECIALOUT(lengthofspecialprefix); PRJSPECIALOUT(lengthofspecialsuffix); PRJSPECIALOUT(wildcards); PRJSPECIALOUT(wildcardranges); PRJSPECIALOUT(realwildcardranges); PRJSPECIALOUT(lengthofwildcardprefix); PRJSPECIALOUT(lengthofwildcardsuffix); numofsequences = gt_encseq_num_of_sequences(encseq); fprintf(outprj,"numofsequences="GT_WU"\n",numofsequences); fprintf(outprj,"numofdbsequences="GT_WU"\n",numofsequences); fprintf(outprj,"numofquerysequences=0\n"); fprintf(outprj,"numberofallsortedsuffixes="GT_WU"\n", numberofallsortedsuffixes); if (longest->defined) { fprintf(outprj,"longest="GT_WU"\n",longest->valueunsignedlong); } fprintf(outprj,"prefixlength=%u\n",prefixlength); fprintf(outprj,"largelcpvalues="GT_WU"\n",numoflargelcpvalues); fprintf(outprj,"averagelcp=%.2f\n",averagelcp); fprintf(outprj,"maxbranchdepth="GT_WU"\n",maxbranchdepth); fprintf(outprj,"integersize=%u\n", (unsigned int) (sizeof (GtUword) * CHAR_BIT)); fprintf(outprj,"littleendian=%c\n",gt_is_little_endian() ? '1' : '0'); fprintf(outprj,"readmode=%u\n",(unsigned int) readmode); fprintf(outprj,"mirrored=%c\n", gt_encseq_is_mirrored(encseq) ? '1' : '0'); }
/* Store the GC-content of sequence seq_idx in gc_content[seq_idx].
   With with_special the GC count is divided by the full length of the
   sequence in encseq; otherwise it is divided by gc_count + at_count. */
static inline void calculate_gc(const GtEncseq *encseq,
                                double *gc_content,
                                bool with_special,
                                unsigned long seq_idx,
                                unsigned long gc_count,
                                unsigned long at_count)
{
  double denominator;

  if (with_special) {
    gt_assert(seq_idx < gt_encseq_num_of_sequences(encseq));
    denominator = (double) gt_encseq_seqlength(encseq, seq_idx);
  }
  else {
    denominator = (double) (gc_count + at_count);
  }
  gc_content[seq_idx] = (double) gc_count / denominator;
}