/* Returns a newly allocated buffer with the decoded characters of sequence
   <idx> in <bs>, restricted to the sequence-relative positions <start>..<end>
   (both inclusive). The buffer holds exactly end - start + 1 characters and no
   terminating NUL is written. It comes from gt_malloc() and is handed to the
   caller. */
char* gt_bioseq_get_sequence_range(const GtBioseq *bs, GtUword idx,
                                   GtUword start, GtUword end)
{
  GtUword seq_offset,
          range_length;
  char *buffer;

  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq) && end >= start);

  range_length = end - start + 1;
  /* absolute position of the sequence's first character in the encseq */
  seq_offset = gt_encseq_seqstartpos(bs->encseq, idx);
  buffer = gt_malloc(range_length * sizeof (char));
  gt_encseq_extract_decoded(bs->encseq, buffer, seq_offset + start,
                            seq_offset + end);
  return buffer;
}
/* Returns a newly allocated buffer with all decoded characters of sequence
   <idx> in <bs>. The buffer has exactly as many bytes as the sequence is long
   (no terminating NUL is appended). It comes from gt_calloc() and is handed
   to the caller. */
char* gt_bioseq_get_sequence(const GtBioseq *bs, GtUword idx)
{
  GtUword seq_length,
          first_pos;
  char *buffer;

  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));

  seq_length = gt_encseq_seqlength(bs->encseq, idx);
  buffer = gt_calloc(seq_length, sizeof (char));
  first_pos = gt_encseq_seqstartpos(bs->encseq, idx);
  /* decode the closed interval [first_pos, first_pos + seq_length - 1] */
  gt_encseq_extract_decoded(bs->encseq, buffer, first_pos,
                            first_pos + seq_length - 1);
  return buffer;
}
/* Lua binding: takes an encoded sequence (argument 1) and two positions
   <from> (argument 2) and <to> (argument 3), and pushes the decoded characters
   of the closed interval [from, to] as one Lua string. Raises a Lua argument
   error if from > to or if <to> is not smaller than the total length of the
   encoded sequence. Returns 1, the number of values pushed. */
static int encseq_lua_extract_decoded(lua_State *L)
{
  GtEncseq **encseq;
  GtUword from, to, length;
  char *buffer;

  encseq = check_encseq(L, 1);
  from = luaL_checknumber(L, 2);
  to = luaL_checknumber(L, 3);
  luaL_argcheck(L, from <= to, 2, "must be <= range endposition");
  luaL_argcheck(L, to < gt_encseq_total_length(*encseq), 3,
                "cannot exceed total length of encoded sequence");

  length = to - from + 1;
  buffer = gt_malloc(length * sizeof (char));
  gt_encseq_extract_decoded(*encseq, buffer, from, to);
  /* the temporary buffer is released right after pushing the string */
  lua_pushlstring(L, buffer, length);
  gt_free(buffer);
  return 1;
}
/* Returns a newly allocated buffer with the decoded characters <start>..<end>
   (sequence-relative, both inclusive) of the <seqnum>-th sequence of file
   <filenum> in the encoded sequence collection <sc>. The buffer holds
   end - start + 1 bytes, comes from gt_calloc() and is handed to the caller. */
static char* gt_encseq_col_get_sequence(const GtSeqCol *sc, GtUword filenum,
                                        GtUword seqnum, GtUword start,
                                        GtUword end)
{
  GtEncseqCol *esc;
  GtUword global_seqnum,
          seq_offset;
  char *buffer;

  esc = gt_encseq_col_cast(sc);
  gt_assert(esc && filenum < gt_encseq_num_of_files(esc->encseq));

  /* translate the (file, sequence-in-file) pair into an encseq-wide number */
  global_seqnum = gt_encseq_filenum_first_seqnum(esc->encseq, filenum) + seqnum;
  gt_assert(encseq_seqnum < gt_encseq_num_of_sequences(esc->encseq));
  gt_assert(start <= end);

  buffer = gt_calloc(end - start + 1, sizeof (char));
  seq_offset = gt_encseq_seqstartpos(esc->encseq, global_seqnum);
  gt_encseq_extract_decoded(esc->encseq, buffer, seq_offset + start,
                            seq_offset + end);
  return buffer;
}
/* Looks up the sequence identified by the MD5 sequence id <md5_seqid> in the
   encoded sequence collection <sc> and stores a newly allocated buffer with
   its decoded characters <start>..<end> (sequence-relative, both inclusive)
   in <*seq>. The buffer holds end - start + 1 bytes and comes from
   gt_calloc().

   Returns 0 on success. Returns -1 and sets <err> if the id is long enough
   to carry a hash but is not terminated by GT_MD5_SEQID_SEPARATOR, or if no
   sequence with that hash is known; <*seq> is left untouched in that case.

   The hash is looked up only after it has actually been copied out of the
   id. Ids that are too short to contain a complete hash are reported as
   "not found" instead of passing an uninitialized buffer to the MD5 table. */
static int gt_encseq_col_md5_to_seq(GtSeqCol *sc, char **seq, GtUword start,
                                    GtUword end, GtStr *md5_seqid,
                                    GtError *err)
{
  GtUword seqnum = GT_UNDEF_UWORD,
          startpos;
  char seqid[GT_MD5_SEQID_HASH_LEN + 1];
  GtEncseqCol *esc;

  esc = gt_encseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(esc && seq && start <= end && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  if (gt_str_length(md5_seqid) >= GT_MD5_SEQID_TOTAL_LEN) {
    const char *cstrseqid = gt_str_get(md5_seqid);
    if (cstrseqid[GT_MD5_SEQID_TOTAL_LEN-1] != GT_MD5_SEQID_SEPARATOR) {
      gt_error_set(err, "MD5 sequence id %s not terminated with '%c'",
                   gt_str_get(md5_seqid), GT_MD5_SEQID_SEPARATOR);
      /* stop here: there is no valid hash to look up, and the error object
         must not be set a second time further down */
      return -1;
    }
    /* copy only the hash part (between prefix and separator) and terminate
       it explicitly, since strncpy() does not do so here */
    strncpy(seqid, cstrseqid + GT_MD5_SEQID_PREFIX_LEN,
            GT_MD5_SEQID_HASH_LEN);
    seqid[GT_MD5_SEQID_HASH_LEN] = '\0';
    seqnum = gt_md5_tab_map(esc->md5_tab, seqid);
  }

  if (seqnum == GT_UNDEF_UWORD) {
    gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
    return -1;
  }

  startpos = gt_encseq_seqstartpos(esc->encseq, seqnum);
  *seq = gt_calloc(end - start + 1, sizeof (char));
  gt_encseq_extract_decoded(esc->encseq, *seq, startpos + start,
                            startpos + end);
  return 0;
}
/* Compares all pairs of reads (u, v) with u.seqnum <= v.seqnum in <encseq> and
   searches for overlaps between them, either approximately (<use_dp>, using
   <max_error>) or exactly (optionally using precomputed KMP values if
   <use_kmp>). Results are reported through <proc>/<proc_a> with <procdata>,
   which are stored in the search state by GT_RDJ_PAIRWISE_INIT_STRUCT_DATA.

   If <revcompl> is set, only the first half of the sequences is iterated and
   each v is additionally compared using the sequence numbered
   (total number of sequences - j - 1), which the code treats as the reverse
   complement of sequence j.

   <cntreads_in> may supply an existing bit table of contained reads; otherwise
   one is allocated unless <m> is GT_OVLFIND_SPM. With <cntfilter>, reads
   already marked in that table are skipped. The table is passed back via
   <cntreads_out> if that is non-NULL, otherwise it is freed if it was
   allocated here. <nofreads>, if non-NULL, receives the number of reads
   iterated over. */
static inline void rdj_pairwise_generic(bool use_dp, GtOvlfindMode m, GtEncseq *encseq, bool revcompl, bool show_progressbar, bool use_kmp, double max_error, GtUword min_length, bool find_nonmaximal, GtSpmproc proc, GtSpmprocA proc_a, void* procdata, bool cntfilter, GtBitsequence *cntreads_in, GtBitsequence **cntreads_out, GtUword *nofreads)
{
  GtContfind containment_status;
  GtBitsequence *cntreads = NULL;
  GtUint64 progress = 0;
  GtUword i, j, startpos, v_seqnum, nofsequences, n;
  struct Read u, v;
  struct Data d;
  gt_kmp_t** kmp_values = NULL;
  GT_RDJ_PAIRWISE_INIT_STRUCT_DATA(d, proc, proc_a, procdata, &u, &v, 0);
  gt_assert(encseq != NULL);
  d.mode = m;
  /* with containment filtering, searching "all" is narrowed to proper
     suffix-prefix matches */
  if ((m == GT_OVLFIND_ALL) && cntfilter)
    d.mode = GT_OVLFIND_PROPER_SPM;
  n = gt_encseq_num_of_sequences(encseq);
  /* KMP values are prepared for all sequences, before n is possibly halved */
  if (use_kmp)
    kmp_values = prepare_kmp_values(encseq, n);
  nofsequences = n;
  /* with reverse complements only the first half is iterated directly */
  if (revcompl)
    n = n >> 1;
  if (cntreads_in != NULL)
    cntreads = cntreads_in;
  else if (m != GT_OVLFIND_SPM)
    GT_INITBITTAB(cntreads, n);
  if (show_progressbar)
    gt_progressbar_start(&progress, (GtUint64)n * ((GtUint64)n - 1ULL) / 2ULL);
  for (i = 0; i < n; i++)
  {
    /* decode read u into a NUL-terminated buffer of its own */
    u.seqnum = i;
    u.direct = true;
    u.len = gt_encseq_seqlength(encseq, i);
    u.seq = gt_malloc(sizeof (char) * (u.len + 1));
    startpos = gt_encseq_seqstartpos(encseq, i);
    gt_encseq_extract_decoded(encseq, u.seq, startpos, startpos + u.len - 1);
    u.seq[u.len] = '\0';
    if (use_kmp)
    {
      gt_assert(kmp_values != NULL);
      u.pi = kmp_values[i];
    }
    for (j = i; j < n; j++)
    {
      if (cntfilter)
      {
        gt_assert(cntreads != NULL);
        /* u is marked: no further v needs to be tried for it */
        if ((bool)GT_ISIBITSET(cntreads, i))
          break;
        /* v is marked: skip just this pair */
        if ((bool)GT_ISIBITSET(cntreads, j))
          continue;
      }
      v.seqnum = j;
      /* find overlaps using direct v */
      v.direct = true;
      v.len = gt_encseq_seqlength(encseq, j);
      v.seq = gt_malloc(sizeof (char) * (v.len + 1));
      startpos = gt_encseq_seqstartpos(encseq, j);
      gt_encseq_extract_decoded(encseq, v.seq, startpos, startpos + v.len - 1);
      v.seq[v.len] = '\0';
      if (use_kmp)
      {
        gt_assert(kmp_values != NULL);
        v.pi = kmp_values[j];
      }
      containment_status = use_dp ?
        find_approx_overlaps(&d, max_error, min_length, find_nonmaximal) :
        find_exact_overlaps(&d, use_kmp, min_length, find_nonmaximal);
      if (m != GT_OVLFIND_SPM)
        mark_contained(containment_status, u.seqnum, v.seqnum, cntreads);
      /* find overlaps using reverse complement of v */
      if (revcompl)
      {
        v_seqnum = nofsequences - j - 1;
        v.direct = false;
        /* same length as the direct read, so v.seq (including the NUL written
           above) can be reused as is */
        gt_assert(gt_encseq_seqlength(encseq, j) == gt_encseq_seqlength(encseq, v_seqnum));
        startpos = gt_encseq_seqstartpos(encseq, v_seqnum);
        gt_encseq_extract_decoded(encseq, v.seq, startpos, startpos + v.len - 1);
        if (use_kmp)
        {
          gt_assert(kmp_values != NULL);
          v.pi = kmp_values[v_seqnum];
        }
        containment_status = use_dp ? find_approx_overlaps(&d, max_error, min_length, find_nonmaximal) : find_exact_overlaps(&d, use_kmp, min_length, find_nonmaximal);
        /* note: v.seqnum is still j here, not v_seqnum */
        if (m != GT_OVLFIND_SPM)
          mark_contained(containment_status, u.seqnum, v.seqnum, cntreads);
      }
      gt_free(v.seq);
      progress++;
    }
    gt_free(u.seq);
  }
  /* hand the bit table to the caller, or release it if it was created here */
  if (cntreads_out != NULL)
    *cntreads_out = cntreads;
  else if (cntreads_in == NULL)
    gt_free(cntreads);
  if (nofreads != NULL)
    *nofreads = n;
  /* n was halved for revcompl; double it again to cover all prepared values */
  if (use_kmp)
    free_kmp_values(kmp_values, revcompl ? n << 1 : n);
  if (show_progressbar)
    gt_progressbar_stop();
}
/* Advances the Smith-Waterman match iterator <mi> to the next pair of
   sequences (one from each of the iterator's two encoded sequences) whose
   alignment is at least min_len long and has an evaluation of at most
   max_edist. For every candidate pair both sequences are decoded into
   temporary buffers, aligned with gt_swalign() and released again.

   On success a new match object describing the alignment is stored in
   <*match> and GT_MATCHER_STATUS_OK is returned. Once all pairs have been
   tried, GT_MATCHER_STATUS_END is returned and <*match> is not written.
   <err> is not used. */
static GtMatchIteratorStatus gt_match_iterator_sw_next(GtMatchIterator *mi,
                                                   GT_UNUSED GtMatch **match,
                                                   GT_UNUSED GtError *err)
{
  GtMatchIteratorSW *mis;
  GtSeq *seq_a, *seq_b;
  char *a, *b;
  const char *adesc, *bdesc;
  GtAlignment *ali = NULL;
  unsigned long seqlen_a, seqlen_b, seqpos;
  GtRange arng, brng;
  gt_assert(mi && match);
  mis = gt_match_iterator_sw_cast(mi);

  while (true) {
    /* step to the next pair, except on the very first call; the second
       sequence number runs fastest */
    if (!mis->pvt->firstali)
      mis->pvt->seqno_es2++;
    if (mis->pvt->seqno_es2 == gt_encseq_num_of_sequences(mis->pvt->es2)) {
      mis->pvt->seqno_es1++;
      if (mis->pvt->seqno_es1 == gt_encseq_num_of_sequences(mis->pvt->es1))
        return GT_MATCHER_STATUS_END;
      mis->pvt->seqno_es2 = 0;
    }
    seqlen_a = gt_encseq_seqlength(mis->pvt->es1, mis->pvt->seqno_es1);
    seqlen_b = gt_encseq_seqlength(mis->pvt->es2, mis->pvt->seqno_es2);
    /* XXX: reuse buffers for performance improvement */
    a = gt_malloc(seqlen_a * sizeof (char));
    seqpos = gt_encseq_seqstartpos(mis->pvt->es1, mis->pvt->seqno_es1);
    gt_encseq_extract_decoded(mis->pvt->es1, a, seqpos, seqpos + seqlen_a - 1);
    b = gt_malloc(seqlen_b * sizeof (char));
    seqpos = gt_encseq_seqstartpos(mis->pvt->es2, mis->pvt->seqno_es2);
    /* b must be decoded from es2: its start position, length and alphabet
       all refer to es2 */
    gt_encseq_extract_decoded(mis->pvt->es2, b, seqpos, seqpos + seqlen_b - 1);
    seq_a = gt_seq_new(a, seqlen_a, gt_encseq_alphabet(mis->pvt->es1));
    seq_b = gt_seq_new(b, seqlen_b, gt_encseq_alphabet(mis->pvt->es2));
    ali = gt_swalign(seq_a, seq_b, mis->pvt->sf);
    mis->pvt->firstali = false;
    if (ali && gt_alignment_get_length(ali) >= mis->pvt->min_len
          && gt_alignment_eval(ali) <= mis->pvt->max_edist) {
      /* accepted: keep ali, the sequences and buffers for the code below */
      break;
    }
    /* rejected: release everything belonging to this pair and try the next */
    gt_alignment_delete(ali);
    gt_seq_delete(seq_a);
    gt_seq_delete(seq_b);
    gt_free(a);
    gt_free(b);
  }

  arng = gt_alignment_get_urange(ali);
  brng = gt_alignment_get_vrange(ali);
  /* seqlen_a/seqlen_b are reused from here on: they now receive the lengths
     of the sequence descriptions */
  adesc = gt_encseq_description(mis->pvt->es1, &seqlen_a,
                                mis->pvt->seqno_es1);
  bdesc = gt_encseq_description(mis->pvt->es2, &seqlen_b,
                                mis->pvt->seqno_es2);
  /* the match is created with empty sequence ids; the real ones are set
     afterwards together with their explicit lengths */
  *match = gt_match_sw_new("", "",
                           mis->pvt->seqno_es1,
                           mis->pvt->seqno_es2,
                           gt_alignment_get_length(ali),
                           gt_alignment_eval(ali),
                           arng.start, brng.start,
                           arng.end, brng.end,
                           GT_MATCH_DIRECT);
  gt_match_set_seqid1_nt(*match, adesc, seqlen_a);
  gt_match_set_seqid2_nt(*match, bdesc, seqlen_b);
  gt_alignment_delete(ali);
  gt_seq_delete(seq_a);
  gt_seq_delete(seq_b);
  gt_free(a);
  gt_free(b);
  return GT_MATCHER_STATUS_OK;
}
/* Extracts the part <start>..<end> of the sequence identified by <seqid>
   according to the region mapping <rm> and stores a newly allocated buffer
   with it in <*seq>. <start> and <end> are converted to 0-based positions
   (by subtracting 1, or the offset delivered by the seqid-to-seqnum mapping)
   before the underlying sequence source is queried.

   The sequence source is chosen in this order: raw sequence, MD5 sequence
   id, description-based mapping (usedesc), description matching (matchdesc),
   sequence number in an encoded sequence (useseqno, ids of the form "seqX"),
   and finally the first sequence of the sequence collection.

   Returns 0 on success; otherwise a non-zero value is returned and <err> is
   set. */
int gt_region_mapping_get_sequence(GtRegionMapping *rm, char **seq,
                                   GtStr *seqid, unsigned long start,
                                   unsigned long end, GtError *err)
{
  int had_err = 0;
  unsigned long offset = 1;
  GtRange range = {GT_UNDEF_ULONG, GT_UNDEF_ULONG};
  gt_error_check(err);
  gt_assert(rm && seq && seqid && gt_str_length(seqid) > 0);

  /* handle rawseq access first */
  if (rm->userawseq) {
    gt_assert(!rm->seqid2seqnum_mapping);
    *seq = gt_calloc(end - start + 1, sizeof (char));
    strncpy(*seq, rm->rawseq + start - 1, (end - start + 1) * sizeof (char));
    return 0;
  }

  /* make sure that correct sequence is loaded */
  had_err = update_seq_col_if_necessary(rm, seqid, err);
  if (had_err)
    return had_err;

  /* MD5 sequence id */
  if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
    return gt_seq_col_md5_to_seq(rm->seq_col, seq, start - offset,
                                 end - offset, seqid, err);
  }

  /* ``regular'' sequence ID */
  gt_assert(!rm->usedesc || rm->seqid2seqnum_mapping);
  gt_assert(rm->mapping || rm->seq_col);

  if (rm->usedesc) {
    unsigned long seqnum, filenum;
    gt_assert(rm->seqid2seqnum_mapping);
    range.start = start;
    range.end = end;
    had_err = gt_seqid2seqnum_mapping_map(rm->seqid2seqnum_mapping,
                                          gt_str_get(seqid), &range, &seqnum,
                                          &filenum, &offset, err);
    if (had_err)
      return had_err;
    if (range.end != GT_UNDEF_ULONG && range.start != GT_UNDEF_ULONG &&
        range.end >= gt_seq_col_get_sequence_length(rm->seq_col, filenum,
                                                    seqnum) + offset) {
      gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                        "``%s'' which is not covered by that sequence (with "
                        "boundaries %lu-%lu). Has the sequence-region "
                        "to sequence mapping been defined correctly?",
                   start, end, gt_str_get(seqid), range.start, range.end);
      return -1;
    }
    *seq = gt_seq_col_get_sequence(rm->seq_col, filenum, seqnum,
                                   start - offset, end - offset);
    return 0;
  }

  if (rm->matchdesc) {
    gt_assert(!rm->seqid2seqnum_mapping);
    gt_assert(rm->seq_col);
    return gt_seq_col_grep_desc(rm->seq_col, seq, start - 1, end - 1, seqid,
                                err);
  }

  if (rm->useseqno) {
    unsigned long seqno = GT_UNDEF_ULONG,
                  seqlength,
                  seqstartpos;
    gt_assert(rm->encseq);
    if (1 != sscanf(gt_str_get(seqid), "seq%lu", &seqno)) {
      gt_error_set(err, "seqid '%s' does not have the form 'seqX' "
                        "where X is a sequence number in the encoded "
                        "sequence", gt_str_get(seqid));
      had_err = -1;
    }
    gt_assert(had_err || seqno != GT_UNDEF_ULONG);
    if (had_err)
      return had_err;
    if (seqno >= gt_encseq_num_of_sequences(rm->encseq)) {
      gt_error_set(err, "trying to access sequence %lu, but encoded "
                        "sequence contains only %lu sequences",
                   seqno, gt_encseq_num_of_sequences(rm->encseq));
      return -1;
    }
    seqlength = gt_encseq_seqlength(rm->encseq, seqno);
    if (start > seqlength || end > seqlength) {
      gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                        "``%s'' which is not covered by that sequence (only "
                        "%lu characters in size). Has the sequence-region "
                        "to sequence mapping been defined correctly?",
                   start, end, gt_str_get(seqid), seqlength);
      return -1;
    }
    *seq = gt_calloc(end - start + 1, sizeof (char));
    seqstartpos = gt_encseq_seqstartpos(rm->encseq, seqno);
    gt_encseq_extract_decoded(rm->encseq, *seq, seqstartpos + start - 1,
                              seqstartpos + end - 1);
    return 0;
  }

  if (rm->userawseq) {
    gt_assert(!rm->seqid2seqnum_mapping);
    *seq = gt_calloc(end - start + 1, sizeof (char));
    strncpy(*seq, rm->rawseq + start - 1, (end - start + 1) * sizeof (char));
    return had_err;
  }

  /* fallback: use the first sequence of the first file in the collection */
  gt_assert(rm->seq_col);
  {
    unsigned long seqlength = gt_seq_col_get_sequence_length(rm->seq_col,
                                                             0, 0);
    if (start > seqlength || end > seqlength) {
      gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                        "``%s'' which is not covered by that sequence (only "
                        "%lu characters in size). Has the sequence-region "
                        "to sequence mapping been defined correctly?",
                   start, end, gt_str_get(seqid), seqlength);
      return -1;
    }
    *seq = gt_seq_col_get_sequence(rm->seq_col, 0, 0, start - offset,
                                   end - offset);
  }
  return had_err;
}