Example #1
0
/* Write every sequence of <bs> whose description matches the regular
   expression <pattern> to <outfp> via gt_fasta_show_entry_generic(), passing
   <width> through. Iteration stops at the first gt_grep() failure; its return
   value is handed back to the caller (0 if all calls succeeded). */
static int extractseq_match(GtFile *outfp, GtBioseq *bs,
                            const char *pattern, unsigned long width,
                            GtError *err)
{
  unsigned long seqnum;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(bs && pattern);

  for (seqnum = 0; seqnum < gt_bioseq_number_of_sequences(bs); seqnum++) {
    bool is_hit;
    const char *description = gt_bioseq_get_description(bs, seqnum);
    gt_assert(description);

    had_err = gt_grep(&is_hit, pattern, description, err);
    if (had_err)
      break; /* <is_hit> is not inspected after a failed gt_grep() */

    if (is_hit) {
      gt_fasta_show_entry_generic(description,
                                  gt_bioseq_get_sequence(bs, seqnum),
                                  gt_bioseq_get_sequence_length(bs, seqnum),
                                  width, outfp);
    }
  }

  return had_err;
}
/* Return true if <seq> contains at least one character that is not part of
   GT_DNABASES, i.e. if it matches the negated bracket expression built from
   GT_DNABASES. The return code of gt_grep() is only checked via gt_assert(). */
bool gth_seq_contains_wildcard(GtStr *seq)
{
  /* Initialized so that a defined value is returned even when gt_grep()
     reports an error without writing <match> and gt_assert() is compiled
     out. */
  bool match = false;
  int had_err = gt_grep(&match, "[^"GT_DNABASES"]", gt_str_get(seq), NULL);
  gt_assert(!had_err);
  (void) had_err; /* only read by gt_assert(); avoids an unused warning */
  return match;
}
Example #3
0
/* Unit test for gt_grep(): runs a fixed set of pattern/subject pairs and
   checks, for each, that gt_grep() returns 0 and that <match> has the
   expected value. Returns <had_err> as updated by the ensure() checks.
   NOTE(review): ensure() is a macro defined elsewhere; presumably it records
   a failed check in <had_err> and reports it through <err> -- confirm. */
int gt_grep_unit_test(GtError *err)
{
  bool match;
  int grep_err, had_err = 0;
  gt_error_check(err);

  /* identical single character: expected to match */
  grep_err = gt_grep(&match, "a", "a", NULL);
  ensure(had_err, !grep_err);
  ensure(had_err, match);

  /* different single character: expected not to match */
  grep_err = gt_grep(&match, "b", "a", NULL);
  ensure(had_err, !grep_err);
  ensure(had_err, !match);

  /* pattern occurs in the middle of the subject: expected to match */
  grep_err =  gt_grep(&match, "aba", "wenbapzbpqSayhzzabaZZqyghaAAahhaA", NULL);
  ensure(had_err, !grep_err);
  ensure(had_err, match);

  /* subject only contains "aBa" (upper-case B): expected not to match */
  grep_err = gt_grep(&match, "aba", "wenbapzbpqSayhzzaBaZZqyghaAAahhaA", NULL);
  ensure(had_err, !grep_err);
  ensure(had_err, !match);

  /* anchored pattern, subject starts with "aba": expected to match */
  grep_err = gt_grep(&match, "^aba", "abawenbapzbpqSayhzzZZqyghaAAahhaA", NULL);
  ensure(had_err, !grep_err);
  ensure(had_err, match);

  /* anchored pattern, "aba" occurs only later in the subject: expected not
     to match */
  grep_err = gt_grep(&match, "^aba", "wenbapzbpqSayhzzabaZZqyghaAAahhaA", NULL);
  ensure(had_err, !grep_err);
  ensure(had_err, !match);

  return had_err;
}
Example #4
0
/* Determine the file number and (file-relative) sequence number of the first
   sequence in <esc> whose description contains <seqid>, and store them in
   <filenum> and <seqnum>.

   Results are memoized in <esc>->grep_cache (created on first use), keyed by
   the sequence ID string; a cache hit returns immediately.

   <seqid> is treated as a literal string: it is escaped with
   gt_grep_escape_extended() before being handed to gt_grep(), so that
   characters such as '|', '.' or '+' in an ID are not interpreted as regular
   expression operators.

   Returns 0 on success. Returns the gt_grep() error code if matching fails,
   or -1 with <err> set if no description matches. */
static int gt_encseq_col_do_grep_desc(GtEncseqCol *esc, GtUword *filenum,
                                      GtUword *seqnum, GtStr *seqid,
                                      GtError *err)
{
  GtUword j;
  const GtSeqInfo *seq_info_ptr;
  GtSeqInfo seq_info;
  GtStr *escaped;
  bool match = false;
  int had_err = 0;
  gt_error_check(err);

  gt_assert(esc && filenum && seqnum && seqid);
  /* create cache */
  if (!esc->grep_cache)
    esc->grep_cache = gt_seq_info_cache_new();
  /* try to read from cache */
  seq_info_ptr = gt_seq_info_cache_get(esc->grep_cache, gt_str_get(seqid));
  if (seq_info_ptr) {
    *filenum = seq_info_ptr->filenum;
    *seqnum = seq_info_ptr->seqnum;
    return 0;
  }
  /* build the pattern once: the sequence ID with all regular expression
     metacharacters escaped */
  escaped = gt_str_new();
  gt_grep_escape_extended(escaped, gt_str_get(seqid), gt_str_length(seqid));
  for (j = 0; !had_err && j < gt_encseq_num_of_sequences(esc->encseq); j++) {
    const char *desc;
    char *buf;
    GtUword desc_len;
    desc = gt_encseq_description(esc->encseq, &desc_len, j);
    /* the description comes with an explicit length, so copy it into a
       zero-filled buffer to obtain a NUL-terminated string for gt_grep() */
    buf = gt_calloc(desc_len + 1, sizeof (char));
    memcpy(buf, desc, desc_len * sizeof (char));
    had_err = gt_grep(&match, gt_str_get(escaped), buf, err);
    gt_free(buf);
    if (!had_err && match) {
      *filenum = seq_info.filenum =
                       gt_encseq_filenum(esc->encseq,
                                         gt_encseq_seqstartpos(esc->encseq, j));
      /* convert the global sequence index into one relative to its file */
      *seqnum = seq_info.seqnum =
                      j - gt_encseq_filenum_first_seqnum(esc->encseq, *filenum);
      gt_seq_info_cache_add(esc->grep_cache, gt_str_get(seqid), &seq_info);
      break;
    }
  }
  gt_str_delete(escaped);
  if (!had_err && !match) {
    gt_error_set(err, "no description matched sequence ID '%s'",
                 gt_str_get(seqid));
    had_err = -1;
  }
  return had_err;
}
Example #5
0
/* Find the one sequence in <bsc> whose description matches <seqid> and store
   the index of its sequence file in <filenum> and its index within that file
   in <seqnum>.

   Results are memoized in <bsc>->grep_cache (created on first use), keyed by
   the sequence ID string; a cache hit returns immediately.

   <seqid> is escaped with gt_grep_escape_extended() and therefore matched
   literally. If <bsc>->matchdescstart is set, the ID must additionally start
   the description and be followed by whitespace or the end of the
   description.

   All sequence files are scanned so that an ambiguous ID is detected no
   matter where the matching descriptions are located, and a result is only
   cached once it is known to be unique. Otherwise a failed (ambiguous) lookup
   would leave an entry behind that makes the next lookup of the same ID
   succeed.

   Returns 0 on success. Returns the gt_grep() error code if matching fails,
   or -1 with <err> set if no description or more than one description
   matches. */
static int grep_desc(GtBioseqCol *bsc, GtUword *filenum,
                     GtUword *seqnum, GtStr *seqid, GtError *err)
{
  GtUword i, j, num_matches = 0;
  const GtSeqInfo *seq_info_ptr;
  GtSeqInfo seq_info;
  GtStr *pattern, *escaped;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(bsc && filenum && seqnum && seqid);
  /* create cache */
  if (!bsc->grep_cache)
    bsc->grep_cache = gt_seq_info_cache_new();
  /* try to read from cache */
  seq_info_ptr = gt_seq_info_cache_get(bsc->grep_cache, gt_str_get(seqid));
  if (seq_info_ptr) {
    *filenum = seq_info_ptr->filenum;
    *seqnum = seq_info_ptr->seqnum;
    return 0;
  }
  /* assemble the pattern: [^]<escaped seqid>[([[:space:]]|$)] */
  pattern = gt_str_new();
  escaped = gt_str_new();
  gt_grep_escape_extended(escaped, gt_str_get(seqid), gt_str_length(seqid));
  if (bsc->matchdescstart)
    gt_str_append_cstr(pattern, "^");
  gt_str_append_str(pattern, escaped);
  if (bsc->matchdescstart)
    gt_str_append_cstr(pattern, "([[:space:]]|$)");
  for (i = 0; !had_err && i < bsc->num_of_seqfiles; i++) {
    GtBioseq *bioseq = bsc->bioseqs[i];
    for (j = 0; !had_err && j < gt_bioseq_number_of_sequences(bioseq); j++) {
      bool match = false;
      const char *desc = gt_bioseq_get_description(bioseq, j);
      had_err = gt_grep(&match, gt_str_get(pattern), desc, err);
      if (had_err || !match)
        continue;
      num_matches++;
      if (num_matches > 1) {
        gt_error_set(err, "query seqid '%s' could match more than one "
                          "sequence description", gt_str_get(seqid));
        had_err = -1;
        break;
      }
      /* first match: report it and remember it for caching below */
      *filenum = seq_info.filenum = i;
      *seqnum = seq_info.seqnum = j;
    }
  }
  gt_str_delete(pattern);
  gt_str_delete(escaped);
  if (!had_err) {
    if (num_matches == 0) {
      gt_error_set(err, "no description matched sequence ID '%s'",
                   gt_str_get(seqid));
      had_err = -1;
    }
    else {
      /* exactly one match in the whole collection: safe to cache */
      gt_seq_info_cache_add(bsc->grep_cache, gt_str_get(seqid), &seq_info);
    }
  }
  return had_err;
}
/* Check whether <value>, a comma-separated list of cross references of the
   form "<dbid>:<localid>", is valid with respect to <xrc>.

   Each token must contain a colon followed by a non-empty local ID, its
   database abbreviation must be present in <xrc>->abbrvs, and, if the
   abbreviation entry has a "local_id_syntax" value, the local ID must match
   that regular expression. Checking stops at the first offending token, for
   which <err> is set and false is returned; otherwise true is returned.

   The function works on a private copy of <value>, which is split in place
   using <xrc>->splitter. */
bool gt_xrf_checker_is_valid(GtXRFChecker *xrc, const char *value, GtError *err)
{
  bool valid = true;
  char *myvalue = gt_cstr_dup(value);
  GtUword nof_tokens, i;
  gt_assert(xrc && value);
  gt_error_check(err);

  /* XXX: Thread safety! */
  gt_splitter_reset(xrc->splitter);
  gt_splitter_split(xrc->splitter, myvalue, strlen(myvalue), ',');
  nof_tokens = gt_splitter_size(xrc->splitter);

  for (i = 0; i < nof_tokens; i++) {
    /* TODO: use #defines here. */
    const char *regex;
    GtXRFAbbrEntry *e;
    char *dbid = gt_splitter_get_token(xrc->splitter, i),
         *localid = strchr(dbid, ':');

    if (!localid) {
      gt_error_set(err, "xref \"%s\": separator colon missing", value);
      valid = false;
      break;
    }

    /* cut the token at the colon: <dbid> now ends there and <localid> points
       to the part behind it */
    *localid++ = '\0';
    if (*localid == '\0') {
      gt_error_set(err, "xref \"%s\": local ID (part after colon) missing",
                   value);
      valid = false;
      break;
    }

    gt_assert(dbid && localid);
    e = gt_hashmap_get(xrc->abbrvs, dbid);
    if (!e) {
      gt_error_set(err, "xref \"%s\": unknown database abbreviation \"%s\"",
                   value, dbid);
      valid = false;
      break;
    }

    regex = gt_xrf_abbr_entry_get_value(e, "local_id_syntax");
    if (regex) {
      bool match = false;
      GT_UNUSED int rval;
      rval = gt_grep(&match, regex, localid, NULL);
      gt_assert(rval == 0); /* regex format has been checked before */
      if (!match) {
        gt_error_set(err, "xref \"%s\": local ID \"%s\" does not "
                          "conform to syntax \"%s\" for the "
                          "%s database",
                     value, localid, regex, dbid);
        valid = false;
        break;
      }
    }
  }

  gt_free(myvalue);
  return valid;
}
/* Validate the entries of <xrf_abbr_parse_tree> in order, stopping at the
   first violation. For every entry:
   - the XRF_LABEL_ABBREVIATION value must be present and must not have been
     seen in an earlier entry,
   - the XRF_LABEL_SHORTHAND_NAME value, if present, must be shorter than 10
     characters,
   - the XRF_LABEL_LOCAL_ID_SYNTAX value, if present, must be accepted by
     gt_grep() as a regular expression (probed against the empty string).
   Returns 0 if all entries pass, otherwise -1 with <err> set to a message
   naming the file and line of the offending entry. */
static int gt_xrf_abbr_parse_tree_validate_entries(const GtXRFAbbrParseTree
                                                           *xrf_abbr_parse_tree,
                                                   GtError *err)
{
  GtHashmap *seen_abbrvs;
  GtUword idx;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(xrf_abbr_parse_tree);

  /* set of abbreviations encountered so far (key and value are the same
     string, which is not owned by the map) */
  seen_abbrvs = gt_hashmap_new(GT_HASH_STRING, NULL, NULL);
  for (idx = 0;
       idx < gt_xrf_abbr_parse_tree_num_of_entries(xrf_abbr_parse_tree);
       idx++) {
    const char *abbreviation, *shorthand, *id_syntax;
    GtXRFAbbrEntry *entry = *(GtXRFAbbrEntry**)
                                gt_array_get(xrf_abbr_parse_tree->entries, idx);

    /* mandatory, unique abbreviation */
    abbreviation = gt_xrf_abbr_entry_get_value(entry, XRF_LABEL_ABBREVIATION);
    if (!abbreviation) {
      gt_error_set(err, "file \"%s\": line "GT_WU": required "
                        "label \"" XRF_LABEL_ABBREVIATION "\" missing",
                   gt_xrf_abbr_entry_filename(entry),
                   gt_xrf_abbr_entry_line(entry));
      had_err = -1;
      break;
    }
    if (gt_hashmap_get(seen_abbrvs, abbreviation)) {
      gt_error_set(err, "file \"%s\": line "GT_WU": duplicate abbreviation "
                        "\"%s\", must be unique",
                   gt_xrf_abbr_entry_filename(entry),
                   gt_xrf_abbr_entry_line(entry),
                   abbreviation);
      had_err = -1;
      break;
    }
    gt_hashmap_add(seen_abbrvs, (void*) abbreviation, (void*) abbreviation);

    /* optional shorthand name with a length limit */
    shorthand = gt_xrf_abbr_entry_get_value(entry, XRF_LABEL_SHORTHAND_NAME);
    if (shorthand && strlen(shorthand) >= 10) {
      gt_error_set(err, "file \"%s\": line "GT_WU": length of "
                        "shorthand name \"%s\" "
                        "is not less than 10 characters",
                   gt_xrf_abbr_entry_filename(entry),
                   gt_xrf_abbr_entry_line(entry), shorthand);
      had_err = -1;
      break;
    }

    /* optional local ID syntax: must be a usable regular expression */
    id_syntax = gt_xrf_abbr_entry_get_value(entry, XRF_LABEL_LOCAL_ID_SYNTAX);
    if (id_syntax) {
      /* separate error object so that the gt_grep() message can be embedded
         into the one reported through <err> */
      GtError *regex_error = gt_error_new();
      bool match;
      if (gt_grep(&match, id_syntax, "", regex_error)) {
        gt_error_set(err, "file \"%s\": line "GT_WU": invalid "
                          "regular expression \"%s\" (%s)",
                     gt_xrf_abbr_entry_filename(entry),
                     gt_xrf_abbr_entry_line(entry), id_syntax,
                     gt_error_get(regex_error));
        had_err = -1;
      }
      gt_error_delete(regex_error);
      if (had_err)
        break;
    }
  }
  gt_hashmap_delete(seen_abbrvs);
  return had_err;
}