예제 #1
0
/* Maps the MD5 string <md5> to the value returned by gt_md5_tab_map() for
   the MD5 table belonging to the encoded sequence of <bs> (presumably the
   index of the matching sequence -- confirm against gt_md5_tab_map()).
   <bs> and <md5> must not be NULL and the underlying encseq must have MD5
   support; this is asserted. The MD5 table is requested from the encseq on
   the first call only and kept in <bs->md5_tab> for later calls. */
GtUword gt_bioseq_md5_to_index(GtBioseq *bs, const char *md5)
{
  gt_assert(bs && md5 && gt_encseq_has_md5_support(bs->encseq));

  /* fetch the table lazily, subsequent calls reuse the stored one */
  if (bs->md5_tab == NULL)
    bs->md5_tab = gt_encseq_get_md5_tab(bs->encseq, NULL);

  return gt_md5_tab_map(bs->md5_tab, md5);
}
예제 #2
0
/* Looks up the sequence identified by the MD5 sequence id <md5_seqid> in the
   MD5 table of the encseq collection <sc> and stores its length in <*len>.
   <md5_seqid> must carry the MD5 seqid prefix (asserted).
   Returns 0 on success. If the lookup yields GT_UNDEF_UWORD, <err> is set,
   -1 is returned and <*len> is left untouched.
   NOTE(review): everything following the prefix is handed to the lookup,
   including any separator/suffix that may trail the hash -- confirm that
   gt_md5_tab_map() copes with that. */
int gt_encseq_col_md5_to_sequence_length(GtSeqCol *sc, GtUword *len,
                                         GtStr *md5_seqid, GtError *err)
{
  GtEncseqCol *esc = gt_encseq_col_cast(sc);
  const char *md5_hash;
  GtUword idx;

  gt_error_check(err);
  gt_assert(esc && len && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  /* skip the prefix, the remainder is used as lookup key */
  md5_hash = gt_str_get(md5_seqid) + GT_MD5_SEQID_PREFIX_LEN;
  idx = gt_md5_tab_map(esc->md5_tab, md5_hash);

  if (idx == GT_UNDEF_UWORD) {
    gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
    return -1;
  }

  gt_assert(idx < gt_encseq_num_of_sequences(esc->encseq));
  *len = gt_encseq_seqlength(esc->encseq, idx);
  return 0;
}
예제 #3
0
/* Extracts the decoded characters at positions <start>..<end> (inclusive,
   relative to the start of the sequence) of the sequence identified by the
   MD5 sequence id <md5_seqid> from the encseq collection <sc>.

   On success a buffer of <end> - <start> + 1 bytes is allocated with
   gt_calloc(), filled via gt_encseq_extract_decoded() and stored in <*seq>;
   0 is returned. No extra byte for a terminating NUL is reserved, and the
   caller presumably owns the buffer afterwards (TODO confirm).

   On failure <err> is set, -1 is returned and <*seq> is left untouched.
   Failure cases: the id is at least GT_MD5_SEQID_TOTAL_LEN characters long
   but has no GT_MD5_SEQID_SEPARATOR directly after the hash, or the hash is
   not found in the MD5 table.

   <md5_seqid> must carry the MD5 seqid prefix and <start> must not exceed
   <end> (both asserted).
   NOTE(review): <start>/<end> are not checked against the length of the
   sequence. */
static int gt_encseq_col_md5_to_seq(GtSeqCol *sc, char **seq,
                                    GtUword start, GtUword end,
                                    GtStr *md5_seqid, GtError *err)
{
  GtUword seqnum = GT_UNDEF_UWORD;
  char seqid[GT_MD5_SEQID_HASH_LEN + 1];
  const char *cstrseqid;
  int had_err = 0;
  GtEncseqCol *esc;
  esc = gt_encseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(esc && seq && start <= end && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));
  cstrseqid = gt_str_get(md5_seqid);
  /* an id long enough to continue past the hash must have the separator
     right behind it */
  if (gt_str_length(md5_seqid) >= GT_MD5_SEQID_TOTAL_LEN &&
      cstrseqid[GT_MD5_SEQID_TOTAL_LEN-1] != GT_MD5_SEQID_SEPARATOR) {
    gt_error_set(err, "MD5 sequence id %s not terminated with '%c'",
                 gt_str_get(md5_seqid), GT_MD5_SEQID_SEPARATOR);
    had_err = -1;
  }
  /* only query the table for well-formed ids; this also keeps an already
     reported error from being overwritten */
  if (!had_err) {
    /* Always fill <seqid> before the lookup. strncpy() stops at the NUL of
       the source and zero-pads, so ids shorter than the full length are
       copied safely as well (the asserted prefix guarantees that the source
       pointer lies within the string). */
    strncpy(seqid, cstrseqid + GT_MD5_SEQID_PREFIX_LEN,
            GT_MD5_SEQID_HASH_LEN);
    seqid[GT_MD5_SEQID_HASH_LEN] = '\0';
    seqnum = gt_md5_tab_map(esc->md5_tab, seqid);
    if (seqnum != GT_UNDEF_UWORD) {
      GtUword startpos = gt_encseq_seqstartpos(esc->encseq, seqnum),
                    GT_UNUSED seqlength = gt_encseq_seqlength(esc->encseq,
                                                              seqnum);
      *seq = gt_calloc(end - start + 1, sizeof (char));
      /* translate sequence-relative coordinates to encseq positions */
      gt_encseq_extract_decoded(esc->encseq, (char*) *seq, startpos + start,
                                startpos + end);
    } else {
      gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
      had_err = -1;
    }
  }
  return had_err;
}
예제 #4
0
/* Appends the description of the sequence identified by the MD5 sequence id
   <md5_seqid> in the encseq collection <sc> to <desc>.

   Returns 0 on success. On failure <err> is set, -1 is returned and <desc>
   is left unchanged. Failure cases: the id is at least
   GT_MD5_SEQID_TOTAL_LEN characters long but has no GT_MD5_SEQID_SEPARATOR
   directly after the hash, or the hash is not found in the MD5 table.

   <md5_seqid> must carry the MD5 seqid prefix (asserted). */
static int gt_encseq_col_md5_to_description(GtSeqCol *sc, GtStr *desc,
                                            GtStr *md5_seqid, GtError *err)
{
  GtUword seqnum = GT_UNDEF_UWORD;
  char seqid[GT_MD5_SEQID_HASH_LEN + 1];
  const char *cstrseqid;
  int had_err = 0;
  GtEncseqCol *esc;
  esc = gt_encseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(esc && desc && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));
  cstrseqid = gt_str_get(md5_seqid);
  /* an id long enough to continue past the hash must have the separator
     right behind it */
  if (gt_str_length(md5_seqid) >= GT_MD5_SEQID_TOTAL_LEN &&
      cstrseqid[GT_MD5_SEQID_TOTAL_LEN-1] != GT_MD5_SEQID_SEPARATOR) {
    gt_error_set(err, "MD5 sequence id %s not terminated with '%c'",
                 gt_str_get(md5_seqid), GT_MD5_SEQID_SEPARATOR);
    had_err = -1;
  }
  /* only query the table for well-formed ids; this also keeps an already
     reported error from being overwritten */
  if (!had_err) {
    /* Always fill <seqid> before the lookup. strncpy() stops at the NUL of
       the source and zero-pads, so ids shorter than the full length are
       copied safely as well (the asserted prefix guarantees that the source
       pointer lies within the string). */
    strncpy(seqid, cstrseqid + GT_MD5_SEQID_PREFIX_LEN,
            GT_MD5_SEQID_HASH_LEN);
    seqid[GT_MD5_SEQID_HASH_LEN] = '\0';
    seqnum = gt_md5_tab_map(esc->md5_tab, seqid);
    if (seqnum != GT_UNDEF_UWORD) {
      const char *cdesc;
      GtUword desc_len;
      gt_assert(seqnum < gt_encseq_num_of_sequences(esc->encseq));
      cdesc = gt_encseq_description(esc->encseq, &desc_len, seqnum);
      /* the length is passed explicitly, so <cdesc> is not required to be
         NUL-terminated here */
      gt_str_append_cstr_nt(desc, cdesc, desc_len);
    } else {
      gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
      had_err = -1;
    }
  }
  return had_err;
}