/*
 * Look up the sequence whose description matches <seqid> and report its
 * position as a (file number, sequence number within that file) pair.
 *
 * The per-collection grep cache is created on first use and consulted before
 * any description is scanned; a successful scan stores its result there.
 * The scan stops at the first matching description.
 *
 * NOTE(review): <seqid> is handed to gt_grep() as the pattern without any
 * escaping -- confirm that callers expect pattern semantics here.
 *
 * Returns 0 on success with <*filenum> and <*seqnum> set. Returns the non-zero
 * gt_grep() result if matching fails, or -1 with <err> set if no description
 * matched.
 */
static int gt_encseq_col_do_grep_desc(GtEncseqCol *esc, GtUword *filenum,
                                      GtUword *seqnum, GtStr *seqid,
                                      GtError *err)
{
  const GtSeqInfo *cached;
  GtUword idx = 0;
  bool found = false;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(esc && filenum && seqnum && seqid);

  /* lazily set up the cache, then try to answer from it */
  if (!esc->grep_cache)
    esc->grep_cache = gt_seq_info_cache_new();
  cached = gt_seq_info_cache_get(esc->grep_cache, gt_str_get(seqid));
  if (cached) {
    *filenum = cached->filenum;
    *seqnum = cached->seqnum;
    return 0;
  }

  while (!had_err && idx < gt_encseq_num_of_sequences(esc->encseq)) {
    GtUword desc_len;
    const char *desc = gt_encseq_description(esc->encseq, &desc_len, idx);
    /* copy exactly desc_len bytes into a buffer one byte longer; presumably
       the description is not guaranteed to be NUL-terminated */
    char *desc_copy = gt_calloc(desc_len + 1, sizeof (char));

    memcpy(desc_copy, desc, desc_len * sizeof (char));
    had_err = gt_grep(&found, gt_str_get(seqid), desc_copy, err);
    gt_free(desc_copy);

    if (!had_err && found) {
      GtSeqInfo hit;

      hit.filenum = gt_encseq_filenum(esc->encseq,
                                      gt_encseq_seqstartpos(esc->encseq, idx));
      *filenum = hit.filenum;
      /* sequence number relative to the first sequence of its file */
      hit.seqnum = idx - gt_encseq_filenum_first_seqnum(esc->encseq, *filenum);
      *seqnum = hit.seqnum;
      gt_seq_info_cache_add(esc->grep_cache, gt_str_get(seqid), &hit);
      break;
    }
    idx++;
  }

  if (!had_err && !found) {
    gt_error_set(err, "no description matched sequence ID '%s'",
                 gt_str_get(seqid));
    had_err = -1;
  }
  return had_err;
}
/*
 * Find the single sequence in <bsc> whose description matches <seqid> and
 * report it as a (sequence file index, sequence index within that file) pair.
 *
 * <seqid> is escaped with gt_grep_escape_extended() before being used as a
 * pattern. If bsc->matchdescstart is set, the pattern is anchored at the
 * start of the description and must be followed by whitespace or the end of
 * the description.
 *
 * The per-collection grep cache is created on first use and consulted before
 * any description is scanned.
 *
 * Returns 0 on success with <*filenum> and <*seqnum> set. Returns non-zero
 * with <err> set if gt_grep() fails, if no description matches, or if more
 * than one description matches. <*filenum> and <*seqnum> are written only on
 * success, and only an unambiguous result is added to the cache.
 */
static int grep_desc(GtBioseqCol *bsc, GtUword *filenum, GtUword *seqnum,
                     GtStr *seqid, GtError *err)
{
  GtUword i, j, num_matches = 0, match_filenum = 0, match_seqnum = 0;
  const GtSeqInfo *seq_info_ptr;
  GtSeqInfo seq_info;
  GtStr *pattern, *escaped;
  bool match = false;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(bsc && filenum && seqnum && seqid);

  /* create cache */
  if (!bsc->grep_cache)
    bsc->grep_cache = gt_seq_info_cache_new();

  /* try to read from cache */
  seq_info_ptr = gt_seq_info_cache_get(bsc->grep_cache, gt_str_get(seqid));
  if (seq_info_ptr) {
    *filenum = seq_info_ptr->filenum;
    *seqnum = seq_info_ptr->seqnum;
    return 0;
  }

  /* build the search pattern from the escaped sequence ID */
  pattern = gt_str_new();
  escaped = gt_str_new();
  gt_grep_escape_extended(escaped, gt_str_get(seqid), gt_str_length(seqid));
  if (bsc->matchdescstart)
    gt_str_append_cstr(pattern, "^");
  gt_str_append_str(pattern, escaped);
  if (bsc->matchdescstart)
    gt_str_append_cstr(pattern, "([[:space:]]|$)");

  /* Scan every description in every file. The scan must not stop at the first
     hit, otherwise whether an ambiguous ID is detected would depend on where
     in the collection the first hit happens to be. Both loops end as soon as
     had_err is set. */
  for (i = 0; !had_err && i < bsc->num_of_seqfiles; i++) {
    GtBioseq *bioseq = bsc->bioseqs[i];
    for (j = 0; !had_err && j < gt_bioseq_number_of_sequences(bioseq); j++) {
      const char *desc = gt_bioseq_get_description(bioseq, j);
      had_err = gt_grep(&match, gt_str_get(pattern), desc, err);
      if (had_err || !match)
        continue;
      num_matches++;
      if (num_matches > 1) {
        gt_error_set(err, "query seqid '%s' could match more than one "
                          "sequence description", gt_str_get(seqid));
        had_err = -1;
      }
      else {
        /* remember the hit; it is published only after the full scan */
        match_filenum = i;
        match_seqnum = j;
      }
    }
  }
  gt_str_delete(pattern);
  gt_str_delete(escaped);

  if (!had_err && num_matches == 0) {
    gt_error_set(err, "no description matched sequence ID '%s'",
                 gt_str_get(seqid));
    had_err = -1;
  }

  if (!had_err) {
    /* exactly one match: report it and cache it. Caching earlier would let a
       later lookup of an ambiguous ID succeed from the cache. */
    *filenum = match_filenum;
    *seqnum = match_seqnum;
    seq_info.filenum = match_filenum;
    seq_info.seqnum = match_seqnum;
    gt_seq_info_cache_add(bsc->grep_cache, gt_str_get(seqid), &seq_info);
  }
  return had_err;
}