/* Replace an MD5 sequence ID on <gn> by a regular one.
   If the sequence ID of <gn> does not carry the MD5 prefix, nothing is done.
   Otherwise the description belonging to the ID is requested from
   <region_mapping>, gt_regular_seqid_save() turns it into the new sequence ID,
   and that ID is applied: for feature nodes via m2i_change_seqid() during
   gt_feature_node_traverse_children(), for all other nodes directly via
   gt_genome_node_change_seqid().
   Returns 0 on success; a non-zero value from the description lookup or the
   traversal is passed through unchanged. */
static int md5_to_seqid(GtGenomeNode *gn, GtRegionMapping *region_mapping,
                        GtError *err)
{
  GtStr *current_seqid,
        *description,
        *regular_seqid;
  int rval;
  gt_error_check(err);
  gt_assert(gn && region_mapping);

  current_seqid = gt_genome_node_get_seqid(gn);
  if (!gt_md5_seqid_has_prefix(gt_str_get(current_seqid)))
    return 0; /* not an MD5 seqid -> leave the node untouched */

  description = gt_str_new();
  rval = gt_region_mapping_get_description(region_mapping, description,
                                           current_seqid, err);
  if (rval) {
    gt_str_delete(description);
    return rval;
  }

  regular_seqid = gt_str_new();
  gt_regular_seqid_save(regular_seqid, description);
  if (!gt_feature_node_try_cast(gn))
    gt_genome_node_change_seqid(gn, regular_seqid);
  else {
    M2IChangeSeqidInfo change_info;
    change_info.new_seqid = regular_seqid;
    change_info.region_mapping = region_mapping;
    rval = gt_feature_node_traverse_children((GtFeatureNode*) gn, &change_info,
                                             m2i_change_seqid, true, err);
  }
  gt_str_delete(regular_seqid);
  gt_str_delete(description);
  return rval;
}
Example #2
0
/* Determine the length of the sequence denoted by <seqid> and store it in
   <length>.
   The lookup strategy depends on how <rm> is configured:
   - raw sequence: the stored raw length is reported,
   - MD5 sequence ID: lookup via the sequence collection's MD5 mapping,
   - usedesc: <seqid> is mapped to a file/sequence number first,
   - matchdesc: the descriptions in the sequence collection are searched,
   - useseqno: <seqid> must have the form 'seqX', X being a sequence number
     in the encoded sequence,
   - otherwise: the first sequence of the first file is used.
   Returns 0 on success. On failure a non-zero value is returned, <err> is set
   by this function or the failing callee, and <length> is left unmodified by
   this function. */
int gt_region_mapping_get_sequence_length(GtRegionMapping *rm,
                                          unsigned long *length, GtStr *seqid,
                                          GtError *err)
{
  unsigned long filenum, seqnum;
  int had_err;
  GT_UNUSED GtRange range;
  gt_error_check(err);
  gt_assert(rm && length && seqid);
  if (rm->userawseq) {
    /* the length goes into the output parameter; the return value is
       reserved for the error status */
    *length = rm->rawlength;
    return 0;
  }
  had_err = update_seq_col_if_necessary(rm, seqid, err);
  if (!had_err) {
    if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
      had_err = gt_seq_col_md5_to_sequence_length(rm->seq_col, length, seqid,
                                                  err);
    }
    else if (rm->usedesc) {
      gt_assert(rm->seqid2seqnum_mapping);
      /* NOTE(review): <range> is handed over without being initialized, while
         gt_region_mapping_get_description() passes NULL at the same position
         -- confirm which one the mapping lookup expects */
      had_err = gt_seqid2seqnum_mapping_map(rm->seqid2seqnum_mapping,
                                            gt_str_get(seqid), &range, &seqnum,
                                            &filenum, NULL, err);
      if (!had_err)
        *length = gt_seq_col_get_sequence_length(rm->seq_col, filenum, seqnum);
    }
    else if (rm->matchdesc) {
      had_err = gt_seq_col_grep_desc_sequence_length(rm->seq_col, length,
                                                     seqid, err);
    }
    else if (rm->useseqno) {
      unsigned long seqno = GT_UNDEF_ULONG;
      gt_assert(rm->encseq);
      if (1 != sscanf(gt_str_get(seqid), "seq%lu", &seqno)) {
        gt_error_set(err, "seqid '%s' does not have the form 'seqX' "
                          "where X is a sequence number in the encoded "
                          "sequence", gt_str_get(seqid));
        had_err = -1;
      }
      gt_assert(had_err || seqno != GT_UNDEF_ULONG);
      if (!had_err && seqno >= gt_encseq_num_of_sequences(rm->encseq)) {
          gt_error_set(err, "trying to access sequence %lu, but encoded "
                            "sequence contains only %lu sequences",
                            seqno, gt_encseq_num_of_sequences(rm->encseq));
          had_err = -1;
      }
      if (!had_err) {
        *length = gt_encseq_seqlength(rm->encseq, seqno);
      }
    }
    else
      *length = gt_seq_col_get_sequence_length(rm->seq_col, 0, 0);
  }
  return had_err;
}
Example #3
0
/* Look up the sequence identified by <md5_seqid> in the bioseq collection
   behind <sc> and store its length in <len>.
   Returns 0 on success; if md5_to_index() fails, its return value is passed
   on and <len> is not written. */
int gt_bioseq_col_md5_to_sequence_length(GtSeqCol *sc, GtUword *len,
                                         GtStr *md5_seqid, GtError *err)
{
  GtBioseqCol *bsc = gt_bioseq_col_cast(sc);
  GtBioseq *hit = NULL;
  GtUword hit_idx = GT_UNDEF_UWORD;
  int rval;
  gt_error_check(err);
  gt_assert(bsc && len && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  rval = md5_to_index(&hit, &hit_idx, bsc, md5_seqid, err);
  if (rval)
    return rval;

  gt_assert(hit_idx != GT_UNDEF_UWORD);
  *len = gt_bioseq_get_sequence_length(hit, hit_idx);
  return 0;
}
Example #4
0
/* Look up the sequence identified by <md5_seqid> in the bioseq collection
   behind <sc> and append its description to <desc>.
   Returns 0 on success; if md5_to_index() fails, its return value is passed
   on and <desc> is left as it was. */
static int gt_bioseq_col_md5_to_description(GtSeqCol *sc, GtStr *desc,
                                            GtStr *md5_seqid, GtError *err)
{
  GtBioseqCol *bsc = gt_bioseq_col_cast(sc);
  GtBioseq *hit = NULL;
  GtUword hit_idx = GT_UNDEF_UWORD;
  int rval;
  gt_error_check(err);
  gt_assert(bsc && desc && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  rval = md5_to_index(&hit, &hit_idx, bsc, md5_seqid, err);
  if (rval)
    return rval;

  gt_assert(hit_idx != GT_UNDEF_UWORD);
  gt_str_append_cstr(desc, gt_bioseq_get_description(hit, hit_idx));
  return 0;
}
Example #5
0
/* Look up the sequence identified by <md5_seqid> in the bioseq collection
   behind <sc> and store the result of gt_bioseq_get_sequence_range() for the
   positions <start>..<end> in <seq>.
   Returns 0 on success; if md5_to_index() fails, its return value is passed
   on and <seq> is not written. */
static int gt_bioseq_col_md5_to_seq(GtSeqCol *sc, char **seq,
                                    GtUword start, GtUword end,
                                    GtStr *md5_seqid, GtError *err)
{
  GtBioseqCol *bsc = gt_bioseq_col_cast(sc);
  GtBioseq *hit = NULL;
  GtUword hit_idx = GT_UNDEF_UWORD;
  int rval;
  gt_error_check(err);
  gt_assert(bsc && seq && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  rval = md5_to_index(&hit, &hit_idx, bsc, md5_seqid, err);
  if (rval)
    return rval;

  gt_assert(hit_idx != GT_UNDEF_UWORD);
  *seq = gt_bioseq_get_sequence_range(hit, hit_idx, start, end);
  return 0;
}
Example #6
0
/* Map <md5_seqid> (everything after the MD5 prefix is handed to the MD5
   table) to a sequence number of the encoded sequence behind <sc> and store
   the length of that sequence in <len>.
   Returns 0 on success, -1 (with <err> set) if the MD5 table does not know
   the ID. */
int gt_encseq_col_md5_to_sequence_length(GtSeqCol *sc, GtUword *len,
                                         GtStr *md5_seqid, GtError *err)
{
  GtEncseqCol *esc = gt_encseq_col_cast(sc);
  GtUword hit_idx;
  gt_error_check(err);
  gt_assert(esc && len && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  /* skip the prefix, the table is keyed by what follows it */
  hit_idx = gt_md5_tab_map(esc->md5_tab,
                           gt_str_get(md5_seqid) + GT_MD5_SEQID_PREFIX_LEN);
  if (hit_idx == GT_UNDEF_UWORD) {
    gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
    return -1;
  }

  gt_assert(hit_idx < gt_encseq_num_of_sequences(esc->encseq));
  *len = gt_encseq_seqlength(esc->encseq, hit_idx);
  return 0;
}
Example #7
0
/* Extract the positions <start>..<end> (relative to the sequence start) of
   the sequence identified by <md5_seqid> from the encoded sequence behind
   <sc>. On success, <seq> points to a newly allocated buffer holding
   <end> - <start> + 1 decoded characters (no room for a terminator is
   allocated); the caller takes ownership of it.
   Returns 0 on success. Returns -1 with <err> set if the ID is long enough to
   contain a separator but does not have one at the expected position, or if
   the MD5 table does not know the hash. In both error cases <seq> is not
   written. */
static int gt_encseq_col_md5_to_seq(GtSeqCol *sc, char **seq,
                                    GtUword start, GtUword end,
                                    GtStr *md5_seqid, GtError *err)
{
  GtUword seqnum = GT_UNDEF_UWORD;
  char seqid[GT_MD5_SEQID_HASH_LEN + 1];
  const char *cstrseqid;
  GtEncseqCol *esc;
  esc = gt_encseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(esc && seq && start <= end && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));
  cstrseqid = gt_str_get(md5_seqid);
  if (gt_str_length(md5_seqid) >= GT_MD5_SEQID_TOTAL_LEN &&
      cstrseqid[GT_MD5_SEQID_TOTAL_LEN-1] != GT_MD5_SEQID_SEPARATOR) {
    gt_error_set(err, "MD5 sequence id %s not terminated with '%c'",
                 gt_str_get(md5_seqid), GT_MD5_SEQID_SEPARATOR);
    /* stop here: a lookup would neither have a valid key nor be allowed to
       report a second error */
    return -1;
  }
  /* Always fill the key buffer before the lookup. For IDs without separator
     and description part, strncpy() stops at the terminator of the ID, so at
     most GT_MD5_SEQID_HASH_LEN characters following the prefix are used. */
  strncpy(seqid, cstrseqid + GT_MD5_SEQID_PREFIX_LEN, GT_MD5_SEQID_HASH_LEN);
  seqid[GT_MD5_SEQID_HASH_LEN] = '\0';
  seqnum = gt_md5_tab_map(esc->md5_tab, seqid);
  if (seqnum == GT_UNDEF_UWORD) {
    gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
    return -1;
  }
  {
    GtUword startpos = gt_encseq_seqstartpos(esc->encseq, seqnum),
                  GT_UNUSED seqlength = gt_encseq_seqlength(esc->encseq,
                                                            seqnum);
    /* NOTE(review): <start>/<end> are not validated against seqlength here --
       confirm that callers guarantee a range inside the sequence */
    *seq = gt_calloc(end - start + 1, sizeof (char));
    gt_encseq_extract_decoded(esc->encseq, (char*) *seq, startpos + start,
                              startpos + end);
  }
  return 0;
}
Example #8
0
/* Append the description of the sequence identified by <md5_seqid> in the
   encoded sequence behind <sc> to <desc>.
   Returns 0 on success. Returns -1 with <err> set if the ID is long enough to
   contain a separator but does not have one at the expected position, or if
   the MD5 table does not know the hash. In both error cases <desc> is left
   as it was. */
static int gt_encseq_col_md5_to_description(GtSeqCol *sc, GtStr *desc,
                                            GtStr *md5_seqid, GtError *err)
{
  GtUword seqnum = GT_UNDEF_UWORD;
  char seqid[GT_MD5_SEQID_HASH_LEN + 1];
  const char *cstrseqid,
             *cdesc;
  GtUword desc_len;
  GtEncseqCol *esc;
  esc = gt_encseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(esc && desc && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));
  cstrseqid = gt_str_get(md5_seqid);
  if (gt_str_length(md5_seqid) >= GT_MD5_SEQID_TOTAL_LEN &&
      cstrseqid[GT_MD5_SEQID_TOTAL_LEN-1] != GT_MD5_SEQID_SEPARATOR) {
    gt_error_set(err, "MD5 sequence id %s not terminated with '%c'",
                 gt_str_get(md5_seqid), GT_MD5_SEQID_SEPARATOR);
    /* stop here: a lookup would neither have a valid key nor be allowed to
       report a second error */
    return -1;
  }
  /* Always fill the key buffer before the lookup. For IDs without separator
     and description part, strncpy() stops at the terminator of the ID, so at
     most GT_MD5_SEQID_HASH_LEN characters following the prefix are used. */
  strncpy(seqid, cstrseqid + GT_MD5_SEQID_PREFIX_LEN, GT_MD5_SEQID_HASH_LEN);
  seqid[GT_MD5_SEQID_HASH_LEN] = '\0';
  seqnum = gt_md5_tab_map(esc->md5_tab, seqid);
  if (seqnum == GT_UNDEF_UWORD) {
    gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
    return -1;
  }
  gt_assert(seqnum < gt_encseq_num_of_sequences(esc->encseq));
  /* the description comes with an explicit length, hence the _nt append */
  cdesc = gt_encseq_description(esc->encseq, &desc_len, seqnum);
  gt_str_append_cstr_nt(desc, cdesc, desc_len);
  return 0;
}
Example #9
0
/* Return the MD5 fingerprint of the sequence denoted by <seqid>, or NULL if
   it could not be determined.
   Raw sequences and MD5 sequence IDs are not implemented (asserted).
   In usedesc mode, <range> and <offset> are forwarded to the seqid-to-seqnum
   mapping; in all other modes <offset> (if given) is set to 1.
   In matchdesc mode the status of the description search is deliberately
   discarded: the fingerprint pointer is returned as the search left it (it is
   NULL before the search). */
const char* gt_region_mapping_get_md5_fingerprint(GtRegionMapping *rm,
                                                  GtStr *seqid,
                                                  const GtRange *range,
                                                  unsigned long *offset,
                                                  GtError *err)
{
  const char *md5 = NULL;
  int had_err;
  unsigned long filenum, seqnum;
  gt_error_check(err);
  gt_assert(rm && seqid);
  gt_assert(!rm->userawseq); /* not implemented */
  gt_assert(!gt_md5_seqid_has_prefix(gt_str_get(seqid))); /* not implemented */
  had_err = update_seq_col_if_necessary(rm, seqid, err);
  if (!had_err) {
    if (rm->usedesc) {
      gt_assert(rm->seqid2seqnum_mapping);
      had_err = gt_seqid2seqnum_mapping_map(rm->seqid2seqnum_mapping,
                                            gt_str_get(seqid), range, &seqnum,
                                            &filenum, offset, err);
      if (!had_err)
        md5 = gt_seq_col_get_md5_fingerprint(rm->seq_col, filenum, seqnum);
    }
    else if (rm->matchdesc) {
      if (!rm->seq_col) {
        if (rm->encseq) {
          if (!(rm->seq_col = gt_encseq_col_new(rm->encseq, err)))
            had_err = -1;
        } else {
          if (!(rm->seq_col = gt_bioseq_col_new(rm->sequence_filenames, err)))
            had_err = -1;
        }
      }
      if (!had_err)
        (void) gt_seq_col_grep_desc_md5(rm->seq_col, &md5, seqid, err);
      /* other functions in this file pass a NULL offset to the mapping
         lookup, so do not dereference it blindly */
      if (offset)
        *offset = 1;
    }
    else if (rm->useseqno) {
      GtMD5Tab *tab = NULL;
      unsigned long seqno = GT_UNDEF_ULONG;
      gt_assert(rm->encseq);
      if (1 != sscanf(gt_str_get(seqid), "seq%lu", &seqno)) {
        gt_error_set(err, "seqid '%s' does not have the form 'seqX' "
                          "where X is a sequence number in the encoded "
                          "sequence", gt_str_get(seqid));
        had_err = -1;
      }
      gt_assert(had_err || seqno != GT_UNDEF_ULONG);
      if (!had_err && seqno >= gt_encseq_num_of_sequences(rm->encseq)) {
          /* the blank after "encoded" was missing, gluing two words */
          gt_error_set(err, "trying to access sequence %lu, but encoded "
                            "sequence contains only %lu sequences",
                            seqno, gt_encseq_num_of_sequences(rm->encseq));
          had_err = -1;
      }
      if (!had_err) {
        tab = gt_encseq_get_md5_tab(rm->encseq, err);
        if (!tab)
          had_err = -1;
      }
      if (offset)
        *offset = 1;
      if (!had_err)
        return gt_md5_tab_get(tab, seqno);
      else
        return NULL;
    }
    else {
      /* no special mode: use the first sequence of the first file */
      if (!had_err)
        md5 = gt_seq_col_get_md5_fingerprint(rm->seq_col, 0, 0);
      if (offset)
        *offset = 1;
    }
  }
  return md5;
}
Example #10
0
/* Append the description of the sequence denoted by <seqid> to <desc>.
   The lookup strategy depends on how <rm> is configured:
   - raw sequence: the fixed text "<rawseq>" is appended,
   - MD5 sequence ID: lookup via the sequence collection's MD5 mapping,
   - usedesc: <seqid> is mapped to a file/sequence number first,
   - useseqno: <seqid> must have the form 'seqX', X being a sequence number
     in the encoded sequence,
   - matchdesc: the MD5 of the matching sequence is searched and then used
     for the description lookup,
   - otherwise: the first sequence of the first file is used.
   Returns 0 on success; on failure a non-zero value is returned and <err> is
   set by this function or the failing callee. */
int gt_region_mapping_get_description(GtRegionMapping *rm, GtStr *desc,
                                      GtStr *seqid, GtError *err)
{
  int had_err = 0;
  gt_error_check(err);
  gt_assert(rm && desc && seqid);
  if (rm->userawseq) {
    gt_str_append_cstr(desc, "<rawseq>");
    return 0;
  }
  had_err = update_seq_col_if_necessary(rm, seqid, err);
  if (had_err)
    return had_err;

  /* MD5 sequence id: only this case is finished here; returning for every
     sequence id at this point would leave all other modes without result */
  if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
    had_err = gt_seq_col_md5_to_description(rm->seq_col, desc, seqid,
                                            err);
    return had_err;
  }

  /* ``regular'' sequence ID */
  if (rm->usedesc) {
    unsigned long filenum, seqnum;
    gt_assert(rm->seqid2seqnum_mapping);
    had_err = gt_seqid2seqnum_mapping_map(rm->seqid2seqnum_mapping,
                                          gt_str_get(seqid), NULL, &seqnum,
                                          &filenum, NULL, err);
    if (!had_err) {
      char *cdesc;
      cdesc = gt_seq_col_get_description(rm->seq_col, filenum, seqnum);
      gt_assert(cdesc);
      gt_str_append_cstr(desc, cdesc);
      gt_free(cdesc);
    }
  }
  else if (rm->useseqno) {
    unsigned long seqno = GT_UNDEF_ULONG;
    gt_assert(rm->encseq);
    if (1 != sscanf(gt_str_get(seqid), "seq%lu", &seqno)) {
      gt_error_set(err, "seqid '%s' does not have the form 'seqX' "
                        "where X is a sequence number in the encoded "
                        "sequence", gt_str_get(seqid));
      had_err = -1;
    }
    gt_assert(had_err || seqno != GT_UNDEF_ULONG);
    if (!had_err && seqno >= gt_encseq_num_of_sequences(rm->encseq)) {
        /* the blank after "encoded" was missing, gluing two words */
        gt_error_set(err, "trying to access sequence %lu, but encoded "
                          "sequence contains only %lu sequences",
                          seqno, gt_encseq_num_of_sequences(rm->encseq));
        had_err = -1;
    }
    if (!had_err) {
      unsigned long desclen;
      const char *edesc;
      edesc = gt_encseq_description(rm->encseq, &desclen, seqno);
      gt_str_append_cstr_nt(desc, edesc, desclen);
    }
  } else if (rm->matchdesc) {
    const char *md5;
    /* XXX: not beautiful, but works -- this may be LOTS faster */
    had_err = gt_seq_col_grep_desc_md5(rm->seq_col, &md5, seqid, err);
    if (!had_err) {
      /* NOTE(review): the MD5 lookups in the sequence collections assert the
         MD5 seqid prefix -- confirm that the string delivered by
         gt_seq_col_grep_desc_md5() satisfies that */
      GtStr *md5_seqid = gt_str_new_cstr(md5);
      had_err = gt_seq_col_md5_to_description(rm->seq_col, desc, md5_seqid,
                                              err);
      gt_str_delete(md5_seqid);
    }
  } else {
    /* no special mode: use the first sequence of the first file */
    char *cdesc;
    cdesc = gt_seq_col_get_description(rm->seq_col, 0, 0);
    gt_assert(cdesc);
    gt_str_append_cstr(desc, cdesc);
    gt_free(cdesc);
  }
  return had_err;
}
Example #11
0
/* Extract the part <start>..<end> of the sequence denoted by <seqid> and
   store it in a newly allocated buffer returned via <seq>.
   <start> and <end> are used as inclusive positions counted from 1: 1 (or the
   offset delivered by the seqid mapping) is subtracted before the underlying
   sequence is accessed, and <end> - <start> + 1 characters are requested.
   The lookup strategy depends on how <rm> is configured (raw sequence, MD5
   sequence ID, usedesc, matchdesc, useseqno, or -- if none applies -- the
   first sequence of the first file).
   Returns 0 on success; on failure a non-zero value is returned and <err> is
   set by this function or the failing callee. */
int gt_region_mapping_get_sequence(GtRegionMapping *rm, char **seq,
                                   GtStr *seqid, unsigned long start,
                                   unsigned long end, GtError *err)
{
  int had_err = 0;
  unsigned long offset = 1;
  GtRange range = {GT_UNDEF_ULONG, GT_UNDEF_ULONG};
  gt_error_check(err);
  gt_assert(rm && seq && seqid && gt_str_length(seqid) > 0);

  /* handle rawseq access first; as this returns, no further rawseq handling
     is needed below */
  if (rm->userawseq) {
    gt_assert(!rm->seqid2seqnum_mapping);
    /* NOTE(review): the range is not checked against rm->rawlength here --
       confirm that callers validate it */
    *seq = gt_calloc(end - start + 1, sizeof (char));
    strncpy(*seq, rm->rawseq + start - 1, (end - start + 1) * sizeof (char));
    return 0;
  }

  /* make sure that correct sequence is loaded */
  had_err = update_seq_col_if_necessary(rm, seqid, err);

  /* MD5 sequence id */
  if (!had_err) {
    if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
      had_err = gt_seq_col_md5_to_seq(rm->seq_col, seq, start - offset,
                                      end - offset, seqid, err);
      return had_err;
    }
  }

  /* ``regular'' sequence ID */
  if (!had_err) {
    gt_assert(!rm->usedesc || rm->seqid2seqnum_mapping);
    gt_assert(rm->mapping || rm->seq_col);
    if (rm->usedesc) {
      unsigned long seqnum, filenum;
      gt_assert(rm->seqid2seqnum_mapping);
      range.start = start;
      range.end = end;
      /* the mapping also delivers the offset of the matched sequence */
      had_err = gt_seqid2seqnum_mapping_map(rm->seqid2seqnum_mapping,
                                            gt_str_get(seqid), &range, &seqnum,
                                            &filenum, &offset, err);

      if (!had_err) {
        /* the last valid position is sequence length + offset - 1 */
        if (range.end != GT_UNDEF_ULONG && range.start != GT_UNDEF_ULONG &&
              range.end >= gt_seq_col_get_sequence_length(rm->seq_col, filenum,
                                                          seqnum)
              + offset) {
          gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                       "``%s'' which is not covered by that sequence (with "
                       "boundaries %lu-%lu). Has the sequence-region "
                       "to sequence mapping been defined correctly?",
                       start, end, gt_str_get(seqid),
                       range.start, range.end);
          had_err = -1;
        }
      }
      if (!had_err) {
        *seq = gt_seq_col_get_sequence(rm->seq_col, filenum, seqnum,
                                       start - offset, end - offset);
      }
    } else if (rm->matchdesc) {
      gt_assert(!rm->seqid2seqnum_mapping);
      gt_assert(rm->seq_col);
      if (!had_err) {
        had_err = gt_seq_col_grep_desc(rm->seq_col, seq, start - 1, end - 1,
                                       seqid, err);
      }
    } else if (rm->useseqno) {
      unsigned long seqno = GT_UNDEF_ULONG;
      gt_assert(rm->encseq);
      if (1 != sscanf(gt_str_get(seqid), "seq%lu", &seqno)) {
        gt_error_set(err, "seqid '%s' does not have the form 'seqX' "
                          "where X is a sequence number in the encoded "
                          "sequence", gt_str_get(seqid));
        had_err = -1;
      }
      gt_assert(had_err || seqno != GT_UNDEF_ULONG);
      if (!had_err && seqno >= gt_encseq_num_of_sequences(rm->encseq)) {
          gt_error_set(err, "trying to access sequence %lu, but encoded "
                            "sequence contains only %lu sequences",
                            seqno, gt_encseq_num_of_sequences(rm->encseq));
          had_err = -1;
      }
      if (!had_err) {
        unsigned long seqlength = gt_encseq_seqlength(rm->encseq, seqno);
        if (start > seqlength || end > seqlength) {
          gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                       "``%s'' which is not covered by that sequence (only "
                       "%lu characters in size). Has the sequence-region "
                       "to sequence mapping been defined correctly?",
                       start, end, gt_str_get(seqid), seqlength);
          had_err = -1;
        }
      }
      if (!had_err) {
        unsigned long seqstartpos;
        *seq = gt_calloc(end - start + 1, sizeof (char));
        /* positions in the encoded sequence are relative to the start
           position of the selected sequence */
        seqstartpos = gt_encseq_seqstartpos(rm->encseq, seqno);
        gt_encseq_extract_decoded(rm->encseq, *seq, seqstartpos + start - 1,
                                  seqstartpos + end - 1);
      }
    } else {
      /* no special mode: use the first sequence of the first file */
      gt_assert(rm->seq_col);
      if (!had_err) {
        unsigned long seqlength = gt_seq_col_get_sequence_length(rm->seq_col,
                                                                 0, 0);
        if (start > seqlength || end > seqlength) {
          had_err = -1;
          gt_error_set(err, "trying to extract range %lu-%lu on sequence "
                       "``%s'' which is not covered by that sequence (only "
                       "%lu characters in size). Has the sequence-region "
                       "to sequence mapping been defined correctly?",
                       start, end, gt_str_get(seqid), seqlength);
        }
        if (!had_err) {
          *seq = gt_seq_col_get_sequence(rm->seq_col, 0, 0, start - offset,
                                         end - offset);
        }
      }
    }
  }
  return had_err;
}
Example #12
0
/* Make sure that <rm> holds a sequence collection suitable for <seqid>.
   With a region mapping, the sequence file belonging to <seqid> is looked up
   and loaded into a fresh bioseq collection, unless a file is loaded and the
   remembered sequence name equals <seqid>. Without a region mapping, the
   collection is created once (from the encoded sequence if there is one,
   from the sequence file names otherwise) and, in usedesc mode, the
   seqid-to-seqnum mapping is rebuilt from it on every call.
   Returns 0 on success and -1 on failure. */
static int update_seq_col_if_necessary(GtRegionMapping *rm, GtStr *seqid,
                                       GtError *err)
{
  gt_error_check(err);
  gt_assert(rm && seqid);

  if (rm->mapping) {
    /* the file for this very sequence is loaded already -> done */
    if (rm->sequence_file && !gt_str_cmp(rm->sequence_name, seqid))
      return 0;

    gt_str_delete(rm->sequence_file);
    if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
      /* ignore MD5 hashes when using region mappings: the lookup uses what
         follows the MD5 part of the sequence ID */
      rm->sequence_file = region_mapping_map(rm,
                                             gt_str_get(seqid)
                                               +GT_MD5_SEQID_TOTAL_LEN,
                                             err);
    }
    else {
      rm->sequence_file = region_mapping_map(rm, gt_str_get(seqid), err);
    }
    if (!rm->sequence_file)
      return -1;

    /* the file name list contains exactly the new sequence file */
    if (rm->sequence_filenames)
      gt_str_array_reset(rm->sequence_filenames);
    else
      rm->sequence_filenames = gt_str_array_new();
    gt_str_array_add(rm->sequence_filenames, rm->sequence_file);

    /* remember for which sequence ID the file has been loaded */
    if (rm->sequence_name)
      gt_str_reset(rm->sequence_name);
    else
      rm->sequence_name = gt_str_new();
    gt_str_append_str(rm->sequence_name, seqid);

    /* replace the old sequence collection by one for the new file */
    gt_seq_col_delete(rm->seq_col);
    rm->seq_col = gt_bioseq_col_new(rm->sequence_filenames, err);
    return rm->seq_col ? 0 : -1;
  }

  /* no region mapping: just make sure the sequence collection is loaded */
  if (!rm->seq_col) {
    if (rm->encseq)
      rm->seq_col = gt_encseq_col_new(rm->encseq, err);
    else {
      gt_assert(rm->sequence_filenames);
      rm->seq_col = gt_bioseq_col_new(rm->sequence_filenames, err);
    }
    if (!rm->seq_col)
      return -1;
  }

  if (rm->usedesc) {
    /* drop a mapping from an earlier call before building the new one */
    if (rm->seqid2seqnum_mapping)
      gt_seqid2seqnum_mapping_delete(rm->seqid2seqnum_mapping);
    rm->seqid2seqnum_mapping =
                         gt_seqid2seqnum_mapping_new_seqcol(rm->seq_col, err);
    if (!rm->seqid2seqnum_mapping)
      return -1;
  }
  return 0;
}