Beispiel #1
0
/*
  Finds the first sequence in <esc> whose description is matched by <seqid>
  and stores its file number in <filenum> and its sequence number relative to
  the first sequence of that file in <seqnum>.

  <seqid> is handed to gt_grep() as the pattern without any escaping.
  Successful lookups are remembered in esc->grep_cache (created on first use),
  so a repeated query for the same <seqid> is answered without rescanning.

  Returns 0 on success. Returns -1 and sets <err> if no description matched.
  If gt_grep() fails, its return value is passed on.
*/
static int gt_encseq_col_do_grep_desc(GtEncseqCol *esc, GtUword *filenum,
                                      GtUword *seqnum, GtStr *seqid,
                                      GtError *err)
{
  const GtSeqInfo *cached;
  GtSeqInfo hit;
  GtUword idx = 0;
  bool found = false;
  int rval = 0;
  gt_error_check(err);

  gt_assert(esc && filenum && seqnum && seqid);

  /* the cache is allocated lazily on the first lookup */
  if (esc->grep_cache == NULL)
    esc->grep_cache = gt_seq_info_cache_new();

  /* fast path: this seqid has been resolved before */
  cached = gt_seq_info_cache_get(esc->grep_cache, gt_str_get(seqid));
  if (cached != NULL) {
    *filenum = cached->filenum;
    *seqnum = cached->seqnum;
    return 0;
  }

  /* slow path: test the descriptions one by one until the first hit */
  while (rval == 0 && idx < gt_encseq_num_of_sequences(esc->encseq)) {
    GtUword len;
    const char *raw = gt_encseq_description(esc->encseq, &len, idx);
    /* Work on a zero-terminated copy of the <len> description bytes
       (presumably the encseq description is not NUL-terminated). */
    char *terminated = gt_calloc(len + 1, sizeof (char));
    memcpy(terminated, raw, len * sizeof (char));
    rval = gt_grep(&found, gt_str_get(seqid), terminated, err);
    gt_free(terminated);

    if (rval == 0 && found) {
      /* file number is derived from the start position of sequence <idx> */
      hit.filenum = gt_encseq_filenum(esc->encseq,
                                      gt_encseq_seqstartpos(esc->encseq, idx));
      *filenum = hit.filenum;
      /* sequence number is counted from the first sequence of that file */
      hit.seqnum = idx - gt_encseq_filenum_first_seqnum(esc->encseq, *filenum);
      *seqnum = hit.seqnum;
      gt_seq_info_cache_add(esc->grep_cache, gt_str_get(seqid), &hit);
      break;
    }
    idx++;
  }

  if (rval == 0 && !found) {
    gt_error_set(err, "no description matched sequence ID '%s'",
                 gt_str_get(seqid));
    rval = -1;
  }
  return rval;
}
/*
  Finds the one sequence in <bsc> whose description matches <seqid> and
  stores the index of its sequence file in <filenum> and its index within
  that file in <seqnum>.

  <seqid> is first passed through gt_grep_escape_extended() (presumably so
  that it is matched literally rather than as a regular expression). If
  bsc->matchdescstart is set, the pattern is anchored: the description must
  start with <seqid>, followed by whitespace or the end of the description.
  Otherwise the escaped <seqid> is used as the pattern on its own.

  All sequence files are scanned so that an ambiguous <seqid> is reported no
  matter where the matching descriptions are located. A result is added to
  bsc->grep_cache (created on first use) only after the scan has confirmed
  that exactly one description matched, so a failed lookup never leaves an
  entry behind.

  Returns 0 on success. Returns -1 and sets <err> if no description or more
  than one description matched. If gt_grep() fails, its return value is
  passed on. When more than one description matched, <filenum> and <seqnum>
  hold the position of the first match, but the call still fails.
*/
static int grep_desc(GtBioseqCol *bsc, GtUword *filenum,
                     GtUword *seqnum, GtStr *seqid, GtError *err)
{
  GtUword i, j, num_matches = 0;
  const GtSeqInfo *seq_info_ptr;
  GtSeqInfo seq_info;
  GtStr *pattern, *escaped;
  bool match = false;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(bsc && filenum && seqnum && seqid);
  /* create cache */
  if (!bsc->grep_cache)
    bsc->grep_cache = gt_seq_info_cache_new();
  /* try to read from cache */
  seq_info_ptr = gt_seq_info_cache_get(bsc->grep_cache, gt_str_get(seqid));
  if (seq_info_ptr) {
    *filenum = seq_info_ptr->filenum;
    *seqnum = seq_info_ptr->seqnum;
    return 0;
  }
  /* build the search pattern from the escaped sequence ID */
  pattern = gt_str_new();
  escaped = gt_str_new();
  gt_grep_escape_extended(escaped, gt_str_get(seqid), gt_str_length(seqid));
  if (bsc->matchdescstart)
    gt_str_append_cstr(pattern, "^");
  gt_str_append_str(pattern, escaped);
  if (bsc->matchdescstart)
    gt_str_append_cstr(pattern, "([[:space:]]|$)");
  /* Scan every description of every file. The scan deliberately continues
     after the first match so that a second match anywhere is detected; it
     stops early only on error. */
  for (i = 0; !had_err && i < bsc->num_of_seqfiles; i++) {
    GtBioseq *bioseq = bsc->bioseqs[i];
    for (j = 0; !had_err && j < gt_bioseq_number_of_sequences(bioseq); j++) {
      const char *desc = gt_bioseq_get_description(bioseq, j);
      had_err = gt_grep(&match, gt_str_get(pattern), desc, err);
      if (!had_err && match) {
        num_matches++;
        if (num_matches > 1) {
          gt_error_set(err, "query seqid '%s' could match more than one "
                            "sequence description", gt_str_get(seqid));
          had_err = -1;
          break;
        }
        *filenum = i;
        *seqnum = j;
        /* remember the hit; it is cached only once it is known to be unique */
        seq_info.filenum = i;
        seq_info.seqnum = j;
      }
    }
  }
  gt_str_delete(pattern);
  gt_str_delete(escaped);
  if (!had_err && num_matches == 0) {
    gt_error_set(err, "no description matched sequence ID '%s'",
                 gt_str_get(seqid));
    had_err = -1;
  }
  /* cache results: reached without error only if exactly one match exists */
  if (!had_err)
    gt_seq_info_cache_add(bsc->grep_cache, gt_str_get(seqid), &seq_info);
  return had_err;
}