/* Look up the MD5 fingerprint <md5> in the MD5 table belonging to <bs> and
   return whatever gt_md5_tab_map() yields for it.
   The table is obtained from the underlying encoded sequence the first time
   it is needed and is kept in <bs> for later calls.
   <bs> and <md5> must be non-NULL and the encoded sequence of <bs> must have
   MD5 support (asserted). */
GtUword gt_bioseq_md5_to_index(GtBioseq *bs, const char *md5)
{
  gt_assert(bs && md5 && gt_encseq_has_md5_support(bs->encseq));

  /* Fast path: the table has already been fetched by an earlier call. */
  if (bs->md5_tab != NULL)
    return gt_md5_tab_map(bs->md5_tab, md5);

  /* First use: fetch the table and remember it in <bs>. */
  bs->md5_tab = gt_encseq_get_md5_tab(bs->encseq, NULL);
  return gt_md5_tab_map(bs->md5_tab, md5);
}
/* Store in <len> the length of the sequence identified by <md5_seqid>.
   <md5_seqid> must carry the MD5 sequence id prefix (asserted); everything
   following the prefix is handed to the MD5 table lookup unchanged.
   Returns 0 on success. If the lookup yields GT_UNDEF_UWORD, <err> is set,
   <len> is left untouched and -1 is returned. */
int gt_encseq_col_md5_to_sequence_length(GtSeqCol *sc, GtUword *len,
                                         GtStr *md5_seqid, GtError *err)
{
  GtEncseqCol *esc = gt_encseq_col_cast(sc);
  GtUword seqnum;
  const char *key;

  gt_error_check(err);
  gt_assert(esc && len && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  /* Skip the prefix; the remainder of the id is the lookup key. */
  key = gt_str_get(md5_seqid) + GT_MD5_SEQID_PREFIX_LEN;
  seqnum = gt_md5_tab_map(esc->md5_tab, key);

  if (seqnum == GT_UNDEF_UWORD) {
    gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
    return -1;
  }

  gt_assert(seqnum < gt_encseq_num_of_sequences(esc->encseq));
  *len = gt_encseq_seqlength(esc->encseq, seqnum);
  return 0;
}
/* Extract the part of the sequence identified by <md5_seqid> that lies
   between the sequence-relative positions <start> and <end> and store it in
   a buffer newly allocated with gt_calloc(), returned via <seq>.
   The buffer has room for exactly <end> - <start> + 1 characters; no extra
   byte is reserved for a terminator.
   NOTE(review): the caller presumably owns and frees <*seq> -- confirm.

   <md5_seqid> must carry the MD5 sequence id prefix (asserted). The hash is
   only extracted if the id is at least GT_MD5_SEQID_TOTAL_LEN characters long
   and has GT_MD5_SEQID_SEPARATOR directly after the hash.

   Returns 0 on success. Returns -1 with <err> set, and without touching
   <seq>, if the separator is missing, if the id is too short to contain a
   complete hash, or if the hash is not found in the MD5 table. */
static int gt_encseq_col_md5_to_seq(GtSeqCol *sc, char **seq, GtUword start,
                                    GtUword end, GtStr *md5_seqid,
                                    GtError *err)
{
  GtUword seqnum = GT_UNDEF_UWORD;
  char seqid[GT_MD5_SEQID_HASH_LEN + 1];
  int had_err = 0;
  GtEncseqCol *esc;
  esc = gt_encseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(esc && seq && start <= end && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  if (gt_str_length(md5_seqid) >= GT_MD5_SEQID_TOTAL_LEN) {
    const char *cstrseqid = gt_str_get(md5_seqid);
    if (cstrseqid[GT_MD5_SEQID_TOTAL_LEN-1] != GT_MD5_SEQID_SEPARATOR) {
      gt_error_set(err, "MD5 sequence id %s not terminated with '%c'",
                   gt_str_get(md5_seqid), GT_MD5_SEQID_SEPARATOR);
      /* Stop here: continuing would look up a hash that was never copied
         and could set a second error on top of this one. */
      return -1;
    }
    /* The length check above guarantees GT_MD5_SEQID_HASH_LEN characters
       after the prefix; terminate explicitly since strncpy() does not. */
    strncpy(seqid, cstrseqid + GT_MD5_SEQID_PREFIX_LEN,
            GT_MD5_SEQID_HASH_LEN);
    seqid[GT_MD5_SEQID_HASH_LEN] = '\0';
    /* Only query the table once <seqid> holds a valid hash. */
    seqnum = gt_md5_tab_map(esc->md5_tab, seqid);
  }
  /* For ids shorter than GT_MD5_SEQID_TOTAL_LEN no lookup is performed and
     <seqnum> stays GT_UNDEF_UWORD, which is reported as "not found" below. */

  if (seqnum != GT_UNDEF_UWORD) {
    /* NOTE(review): <seqlength> is fetched but not used; presumably it was
       meant for a range check on <start>/<end> -- confirm. */
    GtUword startpos = gt_encseq_seqstartpos(esc->encseq, seqnum),
            GT_UNUSED seqlength = gt_encseq_seqlength(esc->encseq, seqnum);
    *seq = gt_calloc(end - start + 1, sizeof (char));
    /* <start> and <end> are relative to the sequence, the extraction call
       takes positions offset by the sequence start position. */
    gt_encseq_extract_decoded(esc->encseq, *seq, startpos + start,
                              startpos + end);
  } else {
    gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
    had_err = -1;
  }
  return had_err;
}
/* Append the description of the sequence identified by <md5_seqid> to
   <desc>.

   <md5_seqid> must carry the MD5 sequence id prefix (asserted). The hash is
   only extracted if the id is at least GT_MD5_SEQID_TOTAL_LEN characters long
   and has GT_MD5_SEQID_SEPARATOR directly after the hash.

   Returns 0 on success. Returns -1 with <err> set, and without modifying
   <desc>, if the separator is missing, if the id is too short to contain a
   complete hash, or if the hash is not found in the MD5 table. */
static int gt_encseq_col_md5_to_description(GtSeqCol *sc, GtStr *desc,
                                            GtStr *md5_seqid, GtError *err)
{
  GtUword seqnum = GT_UNDEF_UWORD;
  char seqid[GT_MD5_SEQID_HASH_LEN + 1];
  int had_err = 0;
  GtEncseqCol *esc;
  esc = gt_encseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(esc && desc && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  if (gt_str_length(md5_seqid) >= GT_MD5_SEQID_TOTAL_LEN) {
    const char *cstrseqid = gt_str_get(md5_seqid);
    if (cstrseqid[GT_MD5_SEQID_TOTAL_LEN-1] != GT_MD5_SEQID_SEPARATOR) {
      gt_error_set(err, "MD5 sequence id %s not terminated with '%c'",
                   gt_str_get(md5_seqid), GT_MD5_SEQID_SEPARATOR);
      /* Stop here: continuing would look up a hash that was never copied
         and could set a second error on top of this one. */
      return -1;
    }
    /* The length check above guarantees GT_MD5_SEQID_HASH_LEN characters
       after the prefix; terminate explicitly since strncpy() does not. */
    strncpy(seqid, cstrseqid + GT_MD5_SEQID_PREFIX_LEN,
            GT_MD5_SEQID_HASH_LEN);
    seqid[GT_MD5_SEQID_HASH_LEN] = '\0';
    /* Only query the table once <seqid> holds a valid hash. */
    seqnum = gt_md5_tab_map(esc->md5_tab, seqid);
  }
  /* For ids shorter than GT_MD5_SEQID_TOTAL_LEN no lookup is performed and
     <seqnum> stays GT_UNDEF_UWORD, which is reported as "not found" below. */

  if (seqnum != GT_UNDEF_UWORD) {
    const char *cdesc;
    GtUword desc_len;
    gt_assert(seqnum < gt_encseq_num_of_sequences(esc->encseq));
    cdesc = gt_encseq_description(esc->encseq, &desc_len, seqnum);
    /* The description comes with an explicit length, so it is appended with
       the length-taking variant rather than as a C string. */
    gt_str_append_cstr_nt(desc, cdesc, desc_len);
  } else {
    gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
    had_err = -1;
  }
  return had_err;
}