/* Walk over all sequences of <bs> and pass every sequence whose description
   is matched by <pattern> (as reported by gt_grep()) on to
   gt_fasta_show_entry_generic(), together with <width> and <outfp>.
   The walk ends at the first failing gt_grep() call; its return value is
   handed back to the caller (<err> is forwarded to gt_grep()).
   Returns 0 if no gt_grep() call failed. */
static int extractseq_match(GtFile *outfp, GtBioseq *bs, const char *pattern,
                            unsigned long width, GtError *err)
{
  unsigned long seqidx = 0;
  bool match;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(bs && pattern);

  while (!had_err && seqidx < gt_bioseq_number_of_sequences(bs)) {
    const char *desc = gt_bioseq_get_description(bs, seqidx);
    gt_assert(desc);

    had_err = gt_grep(&match, pattern, desc, err);
    if (!had_err && match) {
      /* description matched: emit this entry */
      gt_fasta_show_entry_generic(desc,
                                  gt_bioseq_get_sequence(bs, seqidx),
                                  gt_bioseq_get_sequence_length(bs, seqidx),
                                  width, outfp);
    }
    seqidx++;
  }

  return had_err;
}
/* Returns the match result gt_grep() reports when the negated bracket
   expression built from GT_DNABASES is applied to the content of <seq>,
   i.e. whether <seq> holds a character that is not listed in GT_DNABASES
   (presumably the plain DNA base characters -- confirm against its
   definition).
   The return code of gt_grep() is only captured and asserted to be zero when
   NDEBUG is not defined; with NDEBUG defined it is discarded. */
bool gth_seq_contains_wildcard(GtStr *seq)
{
  bool match;

#ifndef NDEBUG
  int had_err =
#endif
    gt_grep(&match, "[^" GT_DNABASES "]", gt_str_get(seq), NULL);

  gt_assert(!had_err);

  return match;
}
/* Unit test for gt_grep(). Every case calls gt_grep() without an error
   object (NULL) and then checks, via ensure(), that the call itself did not
   fail and that the reported match result is the expected one.
   Returns the value of <had_err> as maintained by the ensure() macro
   (0 at the start). */
int gt_grep_unit_test(GtError *err)
{
  bool match;
  int grep_err,
      had_err = 0;

  gt_error_check(err);

  /* identical single characters: match expected */
  grep_err = gt_grep(&match, "a", "a", NULL);
  ensure(had_err, !grep_err);
  ensure(had_err, match);

  /* different single characters: no match expected */
  grep_err = gt_grep(&match, "b", "a", NULL);
  ensure(had_err, !grep_err);
  ensure(had_err, !match);

  /* pattern occurs in the middle of the line: match expected */
  grep_err = gt_grep(&match, "aba", "wenbapzbpqSayhzzabaZZqyghaAAahhaA", NULL);
  ensure(had_err, !grep_err);
  ensure(had_err, match);

  /* line only contains "aBa" (upper case 'B'): no match expected */
  grep_err = gt_grep(&match, "aba", "wenbapzbpqSayhzzaBaZZqyghaAAahhaA", NULL);
  ensure(had_err, !grep_err);
  ensure(had_err, !match);

  /* anchored pattern, line starts with "aba": match expected */
  grep_err = gt_grep(&match, "^aba", "abawenbapzbpqSayhzzZZqyghaAAahhaA", NULL);
  ensure(had_err, !grep_err);
  ensure(had_err, match);

  /* anchored pattern, "aba" occurs only inside the line: no match expected */
  grep_err = gt_grep(&match, "^aba", "wenbapzbpqSayhzzabaZZqyghaAAahhaA", NULL);
  ensure(had_err, !grep_err);
  ensure(had_err, !match);

  return had_err;
}
/* Look up the sequence of <esc> whose description is matched by <seqid> and
   store its file number in <filenum> and its sequence number relative to the
   first sequence of that file in <seqnum>.
   Results are remembered in esc->grep_cache (created on first use), so a
   repeated query for the same <seqid> is answered without scanning.
   The scan stops at the first matching description.
   Returns 0 on success. If gt_grep() fails its return value is passed on;
   if no description matched, <err> is set and -1 is returned.
   NOTE(review): <seqid> is handed to gt_grep() as the pattern without any
   escaping, so regex metacharacters in it are presumably interpreted as
   such -- confirm that this is intended. */
static int gt_encseq_col_do_grep_desc(GtEncseqCol *esc, GtUword *filenum,
                                      GtUword *seqnum, GtStr *seqid,
                                      GtError *err)
{
  const GtSeqInfo *cached;
  GtSeqInfo seq_info;
  GtUword seqidx = 0;
  bool matched = false;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(esc && filenum && seqnum && seqid);

  /* the cache is allocated lazily */
  if (!esc->grep_cache)
    esc->grep_cache = gt_seq_info_cache_new();

  /* a cached answer short-circuits the scan */
  cached = gt_seq_info_cache_get(esc->grep_cache, gt_str_get(seqid));
  if (cached) {
    *filenum = cached->filenum;
    *seqnum = cached->seqnum;
    return 0;
  }

  while (!had_err && seqidx < gt_encseq_num_of_sequences(esc->encseq)) {
    GtUword desc_len;
    const char *desc = gt_encseq_description(esc->encseq, &desc_len, seqidx);
    /* copy the description into a buffer with one extra byte, presumably
       zero-initialised by gt_calloc(), so that gt_grep() gets a terminated
       string */
    char *desc_cstr = gt_calloc(desc_len + 1, sizeof (char));

    memcpy(desc_cstr, desc, desc_len * sizeof (char));
    had_err = gt_grep(&matched, gt_str_get(seqid), desc_cstr, err);
    gt_free(desc_cstr);

    if (!had_err && matched) {
      seq_info.filenum = gt_encseq_filenum(esc->encseq,
                                           gt_encseq_seqstartpos(esc->encseq,
                                                                 seqidx));
      *filenum = seq_info.filenum;
      seq_info.seqnum = seqidx - gt_encseq_filenum_first_seqnum(esc->encseq,
                                                                *filenum);
      *seqnum = seq_info.seqnum;
      gt_seq_info_cache_add(esc->grep_cache, gt_str_get(seqid), &seq_info);
      break;
    }
    seqidx++;
  }

  if (!had_err && !matched) {
    gt_error_set(err, "no description matched sequence ID '%s'",
                 gt_str_get(seqid));
    had_err = -1;
  }

  return had_err;
}
/* Find the one sequence in <bsc> whose description is matched by <seqid> and
   store the index of its sequence file in <filenum> and its index within
   that file in <seqnum>.

   <seqid> is escaped with gt_grep_escape_extended() before it is used as the
   pattern; if bsc->matchdescstart is set, the pattern is additionally
   anchored with "^" in front and "([[:space:]]|$)" behind the escaped ID.

   Successful lookups are stored in bsc->grep_cache (created on first use)
   and answered from there on later calls.

   Returns 0 on success. Returns non-zero with <err> set if gt_grep() fails,
   if more than one description matches, or if no description matches. In
   the error case <filenum> and <seqnum> are left untouched and nothing is
   added to the cache. */
static int grep_desc(GtBioseqCol *bsc, GtUword *filenum, GtUword *seqnum,
                     GtStr *seqid, GtError *err)
{
  GtUword i, j, num_matches = 0;
  const GtSeqInfo *seq_info_ptr;
  GtSeqInfo seq_info;
  GtStr *pattern, *escaped;
  bool match = false;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(bsc && filenum && seqnum && seqid);

  /* create cache */
  if (!bsc->grep_cache)
    bsc->grep_cache = gt_seq_info_cache_new();

  /* try to read from cache */
  seq_info_ptr = gt_seq_info_cache_get(bsc->grep_cache, gt_str_get(seqid));
  if (seq_info_ptr) {
    *filenum = seq_info_ptr->filenum;
    *seqnum = seq_info_ptr->seqnum;
    return 0;
  }

  /* build the pattern from the escaped sequence ID */
  pattern = gt_str_new();
  escaped = gt_str_new();
  gt_grep_escape_extended(escaped, gt_str_get(seqid), gt_str_length(seqid));
  if (bsc->matchdescstart)
    gt_str_append_cstr(pattern, "^");
  gt_str_append_str(pattern, escaped);
  if (bsc->matchdescstart)
    gt_str_append_cstr(pattern, "([[:space:]]|$)");

  /* Scan every description in every file. The scan must not stop at the
     first hit: a second hit, in the same or in a later file, makes the query
     ambiguous. (Leaving the outer loop based on the result of the most
     recent gt_grep() call would skip that check whenever the hit happens to
     be the last sequence of its file.) */
  for (i = 0; !had_err && i < bsc->num_of_seqfiles; i++) {
    GtBioseq *bioseq = bsc->bioseqs[i];
    for (j = 0; !had_err && j < gt_bioseq_number_of_sequences(bioseq); j++) {
      const char *desc = gt_bioseq_get_description(bioseq, j);
      had_err = gt_grep(&match, gt_str_get(pattern), desc, err);
      if (!had_err && match) {
        num_matches++;
        if (num_matches > 1) {
          gt_error_set(err, "query seqid '%s' could match more than one "
                            "sequence description", gt_str_get(seqid));
          had_err = -1;
          break;
        }
        /* remember the hit, but do not publish or cache it yet: it is only
           valid if it turns out to be the only one */
        seq_info.filenum = i;
        seq_info.seqnum = j;
      }
    }
  }
  gt_str_delete(pattern);
  gt_str_delete(escaped);

  if (!had_err && num_matches == 0) {
    gt_error_set(err, "no description matched sequence ID '%s'",
                 gt_str_get(seqid));
    had_err = -1;
  }

  if (!had_err) {
    /* exactly one match: report and cache it */
    *filenum = seq_info.filenum;
    *seqnum = seq_info.seqnum;
    gt_seq_info_cache_add(bsc->grep_cache, gt_str_get(seqid), &seq_info);
  }

  return had_err;
}
/* Validate the cross reference string <value>.
   A private copy of <value> is split at ',' using xrc->splitter; each
   resulting token must
   - contain a ':' separating a database ID from a local ID,
   - have a non-empty local ID after the ':',
   - use a database ID that is a key in xrc->abbrvs, and
   - if the entry found there has a "local_id_syntax" value, have a local ID
     for which gt_grep() reports a match against that value.
   Checking stops at the first offending token. Returns true if all tokens
   passed; otherwise <err> is set and false is returned. */
bool gt_xrf_checker_is_valid(GtXRFChecker *xrc, const char *value,
                             GtError *err)
{
  char *myvalue = gt_cstr_dup(value);
  bool valid = true;
  GtUword nof_tokens, i;

  gt_assert(xrc && value);
  gt_error_check(err);

  /* XXX: Thread safety! (the splitter is stored in <xrc> and reset here) */
  gt_splitter_reset(xrc->splitter);
  gt_splitter_split(xrc->splitter, myvalue, strlen(myvalue), ',');
  nof_tokens = gt_splitter_size(xrc->splitter);

  for (i = 0; i < nof_tokens; i++) {
    GtXRFAbbrEntry *e;
    const char *regex;
    char *dbid = gt_splitter_get_token(xrc->splitter, i),
         *localid = strchr(dbid, ':');

    if (!localid) {
      gt_error_set(err, "xref \"%s\": separator colon missing", value);
      valid = false;
      break;
    }

    /* cut the token at the colon: <dbid> ends there, <localid> starts
       right behind it */
    *localid = '\0';
    localid++;
    if (*localid == '\0') {
      gt_error_set(err, "xref \"%s\": local ID (part after colon) missing",
                   value);
      valid = false;
      break;
    }

    gt_assert(dbid && localid);
    e = gt_hashmap_get(xrc->abbrvs, dbid);
    if (!e) {
      gt_error_set(err, "xref \"%s\": unknown database abbreviation \"%s\"",
                   value, dbid);
      valid = false;
      break;
    }

    /* TODO: use #defines here. */
    gt_assert(e);
    regex = gt_xrf_abbr_entry_get_value(e, "local_id_syntax");
    if (regex) {
      bool match = false;
      GT_UNUSED int rval;

      rval = gt_grep(&match, regex, localid, NULL);
      gt_assert(rval == 0); /* regex format has been checked before */
      if (!match) {
        gt_error_set(err, "xref \"%s\": local ID \"%s\" does not "
                          "conform to syntax \"%s\" for the "
                          "%s database", value, localid, regex, dbid);
        valid = false;
        break;
      }
    }
  }

  gt_free(myvalue);
  return valid;
}
/* Check all entries of <xrf_abbr_parse_tree>, in order, stopping at the
   first problem. An entry is rejected if
   - it has no value for XRF_LABEL_ABBREVIATION,
   - its abbreviation was already used by an earlier entry,
   - it has a value for XRF_LABEL_SHORTHAND_NAME whose length is 10 or more,
   - it has a value for XRF_LABEL_LOCAL_ID_SYNTAX for which gt_grep()
     (applied to the empty string) reports a failure.
   Returns 0 if all entries passed, otherwise -1 with <err> set. */
static int gt_xrf_abbr_parse_tree_validate_entries(const GtXRFAbbrParseTree
                                                            *xrf_abbr_parse_tree,
                                                   GtError *err)
{
  GtHashmap *abbrvs;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(xrf_abbr_parse_tree);

  /* abbreviations seen so far, used for the uniqueness check */
  abbrvs = gt_hashmap_new(GT_HASH_STRING, NULL, NULL);

  for (i = 0;
       i < gt_xrf_abbr_parse_tree_num_of_entries(xrf_abbr_parse_tree);
       i++) {
    const char *value;
    GtXRFAbbrEntry *entry =
      *(GtXRFAbbrEntry**) gt_array_get(xrf_abbr_parse_tree->entries, i);

    /* the abbreviation is mandatory ... */
    value = gt_xrf_abbr_entry_get_value(entry, XRF_LABEL_ABBREVIATION);
    if (!value) {
      gt_error_set(err, "file \"%s\": line " GT_WU ": required "
                        "label \"" XRF_LABEL_ABBREVIATION "\" missing",
                   gt_xrf_abbr_entry_filename(entry),
                   gt_xrf_abbr_entry_line(entry));
      had_err = -1;
      break;
    }

    /* ... and must not have been seen before */
    gt_assert(value);
    if (gt_hashmap_get(abbrvs, value)) {
      gt_error_set(err, "file \"%s\": line " GT_WU ": duplicate abbreviation "
                        "\"%s\", must be unique",
                   gt_xrf_abbr_entry_filename(entry),
                   gt_xrf_abbr_entry_line(entry),
                   value);
      had_err = -1;
      break;
    }
    gt_hashmap_add(abbrvs, (void*) value, (void*) value);

    /* optional shorthand name: limited in length */
    value = gt_xrf_abbr_entry_get_value(entry, XRF_LABEL_SHORTHAND_NAME);
    if (value && strlen(value) >= 10) {
      gt_error_set(err, "file \"%s\": line " GT_WU ": length of "
                        "shorthand name \"%s\" "
                        "is not less than 10 characters",
                   gt_xrf_abbr_entry_filename(entry),
                   gt_xrf_abbr_entry_line(entry),
                   value);
      had_err = -1;
      break;
    }

    /* optional local ID syntax: gt_grep() must accept it as a pattern; the
       reason for a rejection is collected in a separate error object and
       embedded into the message reported via <err> */
    value = gt_xrf_abbr_entry_get_value(entry, XRF_LABEL_LOCAL_ID_SYNTAX);
    if (value) {
      GtError *regex_error = gt_error_new();
      bool match;

      if (gt_grep(&match, value, "", regex_error)) {
        gt_error_set(err, "file \"%s\": line " GT_WU ": invalid "
                          "regular expression \"%s\" (%s)",
                     gt_xrf_abbr_entry_filename(entry),
                     gt_xrf_abbr_entry_line(entry),
                     value,
                     gt_error_get(regex_error));
        had_err = -1;
      }
      gt_error_delete(regex_error);
      if (had_err)
        break;
    }
  }

  gt_hashmap_delete(abbrvs);
  return had_err;
}