static int seqorder_str_compare_num(const void *v1, const void *v2, void *data) { GtUword n1 = *(GtUword*) v1, n2 = *(GtUword*) v2, desclen1, desclen2, anum, bnum; int arval, brval, rval = 0; const char *desc1, *desc2; char buf[BUFSIZ]; desc1 = gt_encseq_description((GtEncseq*) data, &desclen1, n1); desc2 = gt_encseq_description((GtEncseq*) data, &desclen2, n2); (void) strncpy(buf, desc1, MIN(BUFSIZ, desclen1) * sizeof (char)); buf[desclen1] = '\0'; arval = gt_parse_uword(&anum, buf); (void) strncpy(buf, desc2, MIN(BUFSIZ, desclen2) * sizeof (char)); buf[desclen2] = '\0'; brval = gt_parse_uword(&bnum, buf); if (arval == 0 && brval == 0) rval = anum-bnum; else if (arval == 0) return -1; else if (brval == 0) return 1; else rval = 0; return rval; }
/*
 * Comparator ordering two sequence numbers lexicographically by their
 * descriptions.
 *
 * <v1> and <v2> point to GtUword sequence numbers, <data> is the GtEncseq the
 * descriptions are read from.  The common prefix is compared with strncmp();
 * if it is equal, the shorter description sorts first.
 *
 * Returns a negative value, zero or a positive value.
 */
static int seqorder_str_compare_lex(const void *v1, const void *v2, void *data)
{
  GtUword n1 = *(GtUword*) v1,
          n2 = *(GtUword*) v2,
          desclen1, desclen2;
  const char *desc1, *desc2;
  int rval;

  desc1 = gt_encseq_description((GtEncseq*) data, &desclen1, n1);
  desc2 = gt_encseq_description((GtEncseq*) data, &desclen2, n2);

  rval = strncmp(desc1, desc2, MIN(desclen1, desclen2) * sizeof (char));
  if (rval == 0 && desclen1 != desclen2) {
    /* Compare the lengths directly: the unsigned difference narrowed to int
       may not keep its sign. */
    rval = desclen1 < desclen2 ? -1 : 1;
  }
  return rval;
}
/*
 * Writes sequence <seqnum> of <encseq> to stdout as one FASTA record: the
 * separator character, the description (only if <encseq> has description
 * support), a newline, the decoded sequence on a single line, and a final
 * newline.
 */
static void gt_seqorder_output(unsigned long seqnum, GtEncseq *encseq)
{
  const unsigned long seq_start = gt_encseq_seqstartpos(encseq, seqnum),
                      seq_len = gt_encseq_seqlength(encseq, seqnum);
  unsigned long remaining;
  GtEncseqReader *reader;

  /* header line */
  gt_xfputc(GT_FASTA_SEPARATOR, stdout);
  if (gt_encseq_has_description_support(encseq)) {
    unsigned long header_len = 0;
    const char *header = gt_encseq_description(encseq, &header_len, seqnum);
    gt_xfwrite(header, (size_t)1, (size_t)header_len, stdout);
  }
  gt_xfputc('\n', stdout);

  /* sequence line, decoded character by character in forward direction */
  reader = gt_encseq_create_reader_with_readmode(encseq, GT_READMODE_FORWARD,
                                                 seq_start);
  for (remaining = seq_len; remaining > 0; remaining--)
    gt_xfputc(gt_encseq_reader_next_decoded_char(reader), stdout);
  gt_encseq_reader_delete(reader);
  gt_xfputc('\n', stdout);
}
/*
 * Lua binding: pushes the description of a sequence as a Lua string.
 *
 * Lua argument 1 is the encoded sequence, argument 2 the sequence number.
 * An argument error is raised if the sequence number is not smaller than the
 * number of sequences.  Returns 1 (one value pushed on the Lua stack).
 */
static int encseq_lua_description(lua_State *L)
{
  GtEncseq **encseq_ud = check_encseq(L, 1);
  GtUword seq_idx = luaL_checknumber(L, 2),
          text_len;
  const char *text;

  luaL_argcheck(L, seq_idx < gt_encseq_num_of_sequences(*encseq_ud), 2,
                "cannot exceed number of sequences");

  /* the description is pushed with its explicit length */
  text = gt_encseq_description(*encseq_ud, &text_len, seq_idx);
  lua_pushlstring(L, text, text_len);
  return 1;
}
/*
 * Returns the description of sequence <idx> in <bs> as a '\0'-terminated
 * string.  The string is created on first request and stored in
 * <bs>->descriptions[idx]; later calls return the stored pointer.
 */
const char* gt_bioseq_get_description(GtBioseq *bs, GtUword idx)
{
  char *cached;

  gt_assert(bs && bs->encseq);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));

  cached = bs->descriptions[idx];
  if (cached == NULL) {
    GtUword raw_len;
    const char *raw = gt_encseq_description(bs->encseq, &raw_len, idx);

    /* zero-filled buffer with one extra byte, so the copy is terminated */
    cached = gt_calloc(raw_len + 1, sizeof (char));
    strncpy(cached, raw, raw_len);
    bs->descriptions[idx] = cached;
  }
  return cached;
}
/*
 * Returns a copy (made with gt_cstr_dup_nt()) of the description of sequence
 * <seqnum> within file <filenum> of the encoded sequence behind <sc>.
 * <seqnum> is relative to the first sequence of that file.
 */
static char* gt_encseq_col_get_description(const GtSeqCol *sc, GtUword filenum,
                                           GtUword seqnum)
{
  GtEncseqCol *col = gt_encseq_col_cast(sc);
  GtUword global_seqnum, len;
  const char *raw;

  gt_assert(col && filenum < gt_encseq_num_of_files(col->encseq));

  /* translate the file-relative number into a global sequence number */
  global_seqnum = gt_encseq_filenum_first_seqnum(col->encseq, filenum)
                    + seqnum;
  gt_assert(global_seqnum < gt_encseq_num_of_sequences(col->encseq));

  raw = gt_encseq_description(col->encseq, &len, global_seqnum);
  gt_assert(raw && len > 0);
  return gt_cstr_dup_nt(raw, len);
}
/*
 * Finds the first sequence whose description matches the pattern <seqid>
 * (via gt_grep()) and stores its file number in <filenum> and its
 * file-relative sequence number in <seqnum>.
 *
 * Results are remembered in <esc>->grep_cache, keyed by <seqid>; a cached
 * entry is returned without scanning.  Returns 0 on success.  If gt_grep()
 * fails its error code is returned; if no description matches, <err> is set
 * and -1 is returned.
 */
static int gt_encseq_col_do_grep_desc(GtEncseqCol *esc, GtUword *filenum,
                                      GtUword *seqnum, GtStr *seqid,
                                      GtError *err)
{
  const GtSeqInfo *cached;
  GtSeqInfo hit;
  GtUword idx = 0;
  bool match = false;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(esc && filenum && seqnum && seqid);

  /* the cache is created lazily */
  if (esc->grep_cache == NULL)
    esc->grep_cache = gt_seq_info_cache_new();

  /* answer from the cache if this pattern was resolved before */
  cached = gt_seq_info_cache_get(esc->grep_cache, gt_str_get(seqid));
  if (cached != NULL) {
    *filenum = cached->filenum;
    *seqnum = cached->seqnum;
    return 0;
  }

  /* linear scan over all descriptions, stopping at the first match */
  while (!had_err && idx < gt_encseq_num_of_sequences(esc->encseq)) {
    GtUword desc_len;
    const char *desc = gt_encseq_description(esc->encseq, &desc_len, idx);
    /* gt_grep() gets a '\0'-terminated copy of the description */
    char *terminated = gt_calloc(desc_len + 1, sizeof (char));

    memcpy(terminated, desc, desc_len * sizeof (char));
    had_err = gt_grep(&match, gt_str_get(seqid), terminated, err);
    gt_free(terminated);

    if (!had_err && match) {
      hit.filenum = gt_encseq_filenum(esc->encseq,
                                      gt_encseq_seqstartpos(esc->encseq, idx));
      *filenum = hit.filenum;
      hit.seqnum = idx - gt_encseq_filenum_first_seqnum(esc->encseq, *filenum);
      *seqnum = hit.seqnum;
      gt_seq_info_cache_add(esc->grep_cache, gt_str_get(seqid), &hit);
      break;
    }
    idx++;
  }

  if (!had_err && !match) {
    gt_error_set(err, "no description matched sequence ID '%s'",
                 gt_str_get(seqid));
    had_err = -1;
  }
  return had_err;
}
/*
 * Writes sequence <seqnum> of <encseq> to <fpout> in FASTA format, with the
 * sequence wrapped according to <linewidth>.
 *
 * If <fastakeyquery> is given and not COMPLETE(), only the requested
 * substring is written and the header line is prefixed with the key and the
 * requested range.  <frompos> of the query is used as a 1-based position.
 *
 * The whole record, including the key/range prefix, goes to <fpout>.
 * Always returns 0; <err> is unused.
 */
static int giextract_encodedseq2fasta(FILE *fpout,
                                      const GtEncseq *encseq,
                                      unsigned long seqnum,
                                      const Fastakeyquery *fastakeyquery,
                                      unsigned long linewidth,
                                      GT_UNUSED GtError *err)
{
  const char *desc;
  unsigned long desclen, frompos, length, seqstartpos, seqlength;
  const bool partial = fastakeyquery != NULL && !COMPLETE(fastakeyquery);

  desc = gt_encseq_description(encseq, &desclen, seqnum);

  /* header line */
  gt_xfputc('>', fpout);
  if (partial) {
    /* must go to <fpout>, not stdout, or the header is split across
       two streams */
    (void) fprintf(fpout, "%s %lu %lu ", fastakeyquery->fastakey,
                   fastakeyquery->frompos, fastakeyquery->topos);
  }
  gt_xfwrite(desc, sizeof *desc, (size_t) desclen, fpout);
  gt_xfputc('\n', fpout);

  /* sequence lines */
  seqstartpos = gt_encseq_seqstartpos(encseq, seqnum);
  seqlength = gt_encseq_seqlength(encseq, seqnum);
  if (partial) {
    frompos = fastakeyquery->frompos - 1;
    length = fastakeyquery->topos - fastakeyquery->frompos + 1;
  } else {
    frompos = 0;
    length = seqlength;
  }
  gt_encseq2symbolstring(fpout, encseq, GT_READMODE_FORWARD,
                         seqstartpos + frompos, length, linewidth);
  return 0;
}
/*
 * Appends to <desc> the description of the sequence identified by the MD5
 * sequence id <md5_seqid>.
 *
 * The hash part of the id (up to GT_MD5_SEQID_HASH_LEN characters after the
 * prefix) is looked up in <esc>->md5_tab.  If the id is at least
 * GT_MD5_SEQID_TOTAL_LEN characters long, the character at index
 * GT_MD5_SEQID_TOTAL_LEN - 1 must be GT_MD5_SEQID_SEPARATOR.
 *
 * Returns 0 on success.  Returns -1 and sets <err> if the id is malformed or
 * no sequence with that hash is known.  The table lookup is skipped after a
 * format error, so <err> is set at most once.
 */
static int gt_encseq_col_md5_to_description(GtSeqCol *sc, GtStr *desc,
                                            GtStr *md5_seqid, GtError *err)
{
  GtUword seqnum = GT_UNDEF_UWORD;
  char seqid[GT_MD5_SEQID_HASH_LEN + 1];
  const char *cstrseqid;
  int had_err = 0;
  GtEncseqCol *esc;

  esc = gt_encseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(esc && desc && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  /* never hand an uninitialised buffer to the lookup */
  seqid[0] = '\0';
  cstrseqid = gt_str_get(md5_seqid);

  if (gt_str_length(md5_seqid) >= GT_MD5_SEQID_TOTAL_LEN
        && cstrseqid[GT_MD5_SEQID_TOTAL_LEN-1] != GT_MD5_SEQID_SEPARATOR) {
    gt_error_set(err, "MD5 sequence id %s not terminated with '%c'",
                 gt_str_get(md5_seqid), GT_MD5_SEQID_SEPARATOR);
    had_err = -1;
  }

  if (!had_err) {
    /* Extract the hash for ids of any length.  strncpy() stops at the
       string terminator, so shorter ids are copied safely. */
    if (gt_str_length(md5_seqid) > GT_MD5_SEQID_PREFIX_LEN) {
      strncpy(seqid, cstrseqid + GT_MD5_SEQID_PREFIX_LEN,
              GT_MD5_SEQID_HASH_LEN);
      seqid[GT_MD5_SEQID_HASH_LEN] = '\0';
    }
    seqnum = gt_md5_tab_map(esc->md5_tab, seqid);
    if (seqnum != GT_UNDEF_UWORD) {
      const char *cdesc;
      GtUword desc_len;
      gt_assert(seqnum < gt_encseq_num_of_sequences(esc->encseq));
      cdesc = gt_encseq_description(esc->encseq, &desc_len, seqnum);
      gt_str_append_cstr_nt(desc, cdesc, desc_len);
    }
    else {
      gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
      had_err = -1;
    }
  }
  return had_err;
}
/*
 * Decodes <encseq> to stdout according to <args>.
 *
 * Mode "fasta": writes one FASTA record per selected sequence -- a single
 * sequence (<args>->seq), an inclusive range (<args>->seqrng) or all
 * sequences.  If <encseq> has no description support, a warning naming
 * <filename> is issued and "sequence <n>" is used as header.
 *
 * Mode "concat": writes the concatenated sequences, or the inclusive position
 * range <args>->rng, on one line; separator symbols are replaced by the first
 * character of <args>->sepchar.
 *
 * Returns 0 on success; on an invalid sequence number or range <err> is set
 * and -1 is returned.
 */
static int output_sequence(GtEncseq *encseq, GtEncseqDecodeArguments *args,
                           const char *filename, GtError *err)
{
  GtUword seqidx, pos, first_seq, end_seq;
  int had_err = 0;
  bool has_desc;
  GtEncseqReader *esr;

  gt_assert(encseq);
  has_desc = gt_encseq_has_description_support(encseq);
  if (!has_desc)
    gt_warning("Missing description support for file %s", filename);

  if (strcmp(gt_str_get(args->mode), "fasta") == 0) {
    const GtUword numseqs = gt_encseq_num_of_sequences(encseq);

    if (args->seq != GT_UNDEF_UWORD) {
      /* a single sequence was requested */
      if (args->seq >= numseqs) {
        gt_error_set(err,
                     "requested sequence "GT_WU" exceeds number of sequences "
                     "("GT_WU")", args->seq, numseqs);
        return -1;
      }
      first_seq = args->seq;
      end_seq = args->seq + 1;
    }
    else if (args->seqrng.start != GT_UNDEF_UWORD
               && args->seqrng.end != GT_UNDEF_UWORD) {
      /* an inclusive range of sequences was requested */
      if (args->seqrng.start >= numseqs || args->seqrng.end >= numseqs) {
        gt_error_set(err,
                     "range "GT_WU"-"GT_WU" includes a sequence number "
                     "exceeding the total number of sequences ("GT_WU")",
                     args->seqrng.start, args->seqrng.end, numseqs);
        return -1;
      }
      first_seq = args->seqrng.start;
      end_seq = args->seqrng.end + 1;
    }
    else {
      /* no selection: all sequences */
      first_seq = 0;
      end_seq = numseqs;
    }

    for (seqidx = first_seq; seqidx < end_seq; seqidx++) {
      GtUword desclen, startpos, len, srcidx;
      char buf[BUFSIZ];
      const char *desc = NULL;

      /* XXX: maybe make this distinction in the functions via readmode? */
      if (!GT_ISDIRREVERSE(args->rm)) {
        srcidx = seqidx;
        startpos = gt_encseq_seqstartpos(encseq, srcidx);
        len = gt_encseq_seqlength(encseq, srcidx);
      }
      else {
        /* reverse read modes: use the mirrored sequence and measure its
           start position from the end of the encoded sequence */
        srcidx = numseqs - 1 - seqidx;
        len = gt_encseq_seqlength(encseq, srcidx);
        startpos = gt_encseq_total_length(encseq)
                     - (gt_encseq_seqstartpos(encseq, srcidx) + len);
      }

      if (has_desc) {
        desc = gt_encseq_description(encseq, &desclen, srcidx);
      }
      else {
        /* synthetic header, numbered by output order */
        (void) snprintf(buf, BUFSIZ, "sequence "GT_WU"", seqidx);
        desclen = strlen(buf);
        desc = buf;
      }
      gt_assert(desc);

      /* header line */
      gt_xfputc(GT_FASTA_SEPARATOR, stdout);
      gt_xfwrite(desc, 1, desclen, stdout);
      gt_xfputc('\n', stdout);

      /* XXX: make this more efficient by writing in a buffer first and then
         showing the result */
      if (args->singlechars) {
        for (pos = 0; pos < len; pos++) {
          gt_xfputc(gt_encseq_get_decoded_char(encseq, startpos + pos,
                                               args->rm),
                    stdout);
        }
      }
      else {
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm,
                                                    startpos);
        for (pos = 0; pos < len; pos++)
          gt_xfputc(gt_encseq_reader_next_decoded_char(esr), stdout);
        gt_encseq_reader_delete(esr);
      }
      gt_xfputc('\n', stdout);
    }
  }

  if (strcmp(gt_str_get(args->mode), "concat") == 0) {
    GtUword from = 0,
            to = gt_encseq_total_length(encseq) - 1;

    if (args->rng.start != GT_UNDEF_UWORD
          && args->rng.end != GT_UNDEF_UWORD) {
      if (args->rng.end > to) {
        gt_error_set(err,
                     "end of range ("GT_WU") exceeds encoded sequence length "
                     "("GT_WU")", args->rng.end, to);
        had_err = -1;
      }
      else {
        from = args->rng.start;
        to = args->rng.end;
      }
    }

    if (!had_err) {
      if (args->singlechars) {
        for (pos = from; pos <= to; pos++) {
          char cc = gt_encseq_get_decoded_char(encseq, pos, args->rm);
          if (cc == (char) SEPARATOR)
            cc = gt_str_get(args->sepchar)[0];
          gt_xfputc(cc, stdout);
        }
      }
      else {
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm, from);
        if (esr) {
          for (pos = from; pos <= to; pos++) {
            char cc = gt_encseq_reader_next_decoded_char(esr);
            if (cc == (char) SEPARATOR)
              cc = gt_str_get(args->sepchar)[0];
            gt_xfputc(cc, stdout);
          }
          gt_encseq_reader_delete(esr);
        }
      }
      gt_xfputc('\n', stdout);
    }
  }
  return had_err;
}
/*
 * Copies the ids of all original sequences (the first condenseq_idlen()
 * bytes of each description in <orig_es>) into <condenseq>->orig_ids and
 * chooses how their boundaries are stored.
 *
 * Pass 1 builds a histogram of id lengths.  From it the function derives the
 * bytes that would be wasted by padding every id to the longest length, and
 * compares that with the size reported by gt_intset_best_memory_size() for
 * an end-position table.  If padding wastes less, ids are stored with
 * constant length (<condenseq>->id_len); otherwise <condenseq>->sdstab is
 * created and receives the cumulative end offset of every id.
 *
 * Pass 2 copies the ids.  The chosen layout is reported through <logger>.
 */
static void condenseq_process_descriptions(GtCondenseq *condenseq,
                                           const GtEncseq *orig_es,
                                           GtLogger *logger)
{
  GtUword *len_hist;
  const char *desc;
  char *write_ptr;
  GtUword desclen, idlen, seq, len, step,
          hist_size = (GtUword) 128,
          sum_of_lens = 0,
          longest = 0,
          shortest = GT_UWORD_MAX,
          padding_cost = 0,
          sds_cost,
          written = 0;
  bool fixed_width;

  condenseq->ids_total_len = 0;
  len_hist = gt_calloc((size_t) hist_size, sizeof (*len_hist));

  /* pass 1: histogram of id lengths, extremes and total length */
  for (seq = 0; seq < condenseq->orig_num_seq; ++seq) {
    desc = gt_encseq_description(orig_es, &desclen, seq);
    idlen = condenseq_idlen(desc, desclen);
    if (idlen >= hist_size) {
      /* grow the histogram to index <idlen> and clear the new slots */
      len_hist = gt_realloc(len_hist,
                            (size_t) (idlen + 1) * sizeof (*len_hist));
      for (len = hist_size; len <= idlen; len++)
        len_hist[len] = 0;
      hist_size = idlen + 1;
    }
    len_hist[idlen]++;
    if (idlen > longest)
      longest = idlen;
    if (idlen < shortest)
      shortest = idlen;
    sum_of_lens += idlen;
  }

  /* memory wasted by padding to the longest id, and the total size when the
     ids are stored with their actual lengths */
  len = shortest;
  while (len < longest) {
    padding_cost += len_hist[len] * (longest - len);
    condenseq->ids_total_len += len_hist[len] * len;
    len++;
  }
  /* the last bucket contributes to the total but wastes nothing */
  condenseq->ids_total_len += len * len_hist[len];

  sds_cost = (GtUword) gt_intset_best_memory_size(sum_of_lens,
                                                  condenseq->orig_num_seq);
  fixed_width = padding_cost < sds_cost;
  if (fixed_width) {
    gt_logger_log(logger, "Condenseq descriptions will use const len, "
                  GT_WU ", \"wasting\" " GT_WU " bytes. SDS would use "
                  GT_WU " bytes", longest, padding_cost, sds_cost);
    condenseq->id_len = longest;
    condenseq->ids_total_len = longest * condenseq->orig_num_seq;
  }
  else {
    gt_logger_log(logger, "Condenseq descriptions will use sdstab with size "
                  GT_WU ". Const length would have wasted " GT_WU " bytes.",
                  sds_cost, padding_cost);
    condenseq->sdstab = gt_intset_best_new(sum_of_lens,
                                           condenseq->orig_num_seq);
  }

  /* pass 2: copy the ids into one zero-initialised block */
  condenseq->orig_ids = gt_calloc((size_t) condenseq->ids_total_len,
                                  sizeof (*condenseq->orig_ids));
  write_ptr = condenseq->orig_ids;
  for (seq = 0; seq < condenseq->orig_num_seq; ++seq) {
    desc = gt_encseq_description(orig_es, &desclen, seq);
    idlen = condenseq_idlen(desc, desclen);
    gt_assert(idlen <= longest);
    (void) memcpy(write_ptr, desc, (size_t) idlen);

    /* constant-length slots advance by the longest id, otherwise by the
       actual id length */
    step = fixed_width ? longest : idlen;
    write_ptr += step;
    written += step;
    if (!fixed_width)
      gt_intset_add(condenseq->sdstab, written);
  }
  gt_assert(written == condenseq->ids_total_len);
  gt_free(len_hist);
}
/*
 * Advances the Smith-Waterman match iterator <mi> to the next pair of
 * sequences (one from es1, one from es2) whose local alignment has at least
 * <min_len> columns and an evaluation of at most <max_edist>.
 *
 * On success a new match, labelled with the descriptions of both sequences,
 * is stored in <match> and GT_MATCHER_STATUS_OK is returned.  When all pairs
 * have been tried, GT_MATCHER_STATUS_END is returned and <match> is left
 * untouched.  <err> is unused.
 */
static GtMatchIteratorStatus gt_match_iterator_sw_next(GtMatchIterator *mi,
                                                GT_UNUSED GtMatch **match,
                                                GT_UNUSED GtError *err)
{
  GtMatchIteratorSW *mis;
  GtSeq *seq_a, *seq_b;
  char *a, *b;
  const char *adesc, *bdesc;
  GtAlignment *ali = NULL;
  unsigned long seqlen_a, seqlen_b, desclen_a, desclen_b, seqpos;
  GtRange arng, brng;

  gt_assert(mi && match);
  mis = gt_match_iterator_sw_cast(mi);

  while (true) {
    /* move on to the next pair, except on the very first call */
    if (!mis->pvt->firstali)
      mis->pvt->seqno_es2++;
    if (mis->pvt->seqno_es2 == gt_encseq_num_of_sequences(mis->pvt->es2)) {
      mis->pvt->seqno_es1++;
      if (mis->pvt->seqno_es1 == gt_encseq_num_of_sequences(mis->pvt->es1))
        return GT_MATCHER_STATUS_END;
      mis->pvt->seqno_es2 = 0;
    }
    seqlen_a = gt_encseq_seqlength(mis->pvt->es1, mis->pvt->seqno_es1);
    seqlen_b = gt_encseq_seqlength(mis->pvt->es2, mis->pvt->seqno_es2);

    /* XXX: reuse buffers for performance improvement */
    a = gt_malloc(seqlen_a * sizeof (char));
    seqpos = gt_encseq_seqstartpos(mis->pvt->es1, mis->pvt->seqno_es1);
    gt_encseq_extract_decoded(mis->pvt->es1, a, seqpos,
                              seqpos + seqlen_a - 1);
    b = gt_malloc(seqlen_b * sizeof (char));
    seqpos = gt_encseq_seqstartpos(mis->pvt->es2, mis->pvt->seqno_es2);
    /* sequence b lives in es2, so it must be extracted from es2 */
    gt_encseq_extract_decoded(mis->pvt->es2, b, seqpos,
                              seqpos + seqlen_b - 1);

    seq_a = gt_seq_new(a, seqlen_a, gt_encseq_alphabet(mis->pvt->es1));
    seq_b = gt_seq_new(b, seqlen_b, gt_encseq_alphabet(mis->pvt->es2));
    ali = gt_swalign(seq_a, seq_b, mis->pvt->sf);
    mis->pvt->firstali = false;
    if (ali && gt_alignment_get_length(ali) >= mis->pvt->min_len
          && gt_alignment_eval(ali) <= mis->pvt->max_edist) {
      break;
    }

    /* pair rejected: free everything and try the next pair */
    gt_alignment_delete(ali);
    gt_seq_delete(seq_a);
    gt_seq_delete(seq_b);
    gt_free(a);
    gt_free(b);
  }

  arng = gt_alignment_get_urange(ali);
  brng = gt_alignment_get_vrange(ali);
  adesc = gt_encseq_description(mis->pvt->es1, &desclen_a,
                                mis->pvt->seqno_es1);
  bdesc = gt_encseq_description(mis->pvt->es2, &desclen_b,
                                mis->pvt->seqno_es2);
  /* created with empty ids; the real ids are set from the
     length-delimited descriptions below */
  *match = gt_match_sw_new("", "",
                           mis->pvt->seqno_es1,
                           mis->pvt->seqno_es2,
                           gt_alignment_get_length(ali),
                           gt_alignment_eval(ali),
                           arng.start, brng.start,
                           arng.end, brng.end,
                           GT_MATCH_DIRECT);
  gt_match_set_seqid1_nt(*match, adesc, desclen_a);
  gt_match_set_seqid2_nt(*match, bdesc, desclen_b);

  gt_alignment_delete(ali);
  gt_seq_delete(seq_a);
  gt_seq_delete(seq_b);
  gt_free(a);
  gt_free(b);
  return GT_MATCHER_STATUS_OK;
}
/*
 * Appends to <desc> the description of the sequence that <seqid> maps to in
 * the region mapping <rm>.
 *
 * Strategy, in order:
 *  - raw sequence mappings yield the fixed text "<rawseq>";
 *  - ids with an MD5 prefix are resolved via
 *    gt_seq_col_md5_to_description();
 *  - <usedesc>: the id is mapped to (file, sequence) numbers through the
 *    seqid-to-seqnum mapping;
 *  - <useseqno>: the id must have the form "seqX" and X indexes the encoded
 *    sequence directly;
 *  - <matchdesc>: the id is grepped in the descriptions to obtain an MD5 id,
 *    which is then resolved as above;
 *  - otherwise the description of the first sequence of the first file is
 *    used.
 *
 * Returns 0 on success, a non-zero value with <err> set otherwise.
 */
int gt_region_mapping_get_description(GtRegionMapping *rm, GtStr *desc,
                                      GtStr *seqid, GtError *err)
{
  int had_err = 0;
  gt_error_check(err);
  gt_assert(rm && desc && seqid);

  if (rm->userawseq) {
    gt_str_append_cstr(desc, "<rawseq>");
    return 0;
  }

  had_err = update_seq_col_if_necessary(rm, seqid, err);
  if (had_err)
    return had_err;

  /* MD5 ids are handled by the sequence collection.  Only this case returns
     early; every other id falls through to the strategies below. */
  if (gt_md5_seqid_has_prefix(gt_str_get(seqid)))
    return gt_seq_col_md5_to_description(rm->seq_col, desc, seqid, err);

  if (rm->usedesc) {
    unsigned long filenum, seqnum;
    gt_assert(rm->seqid2seqnum_mapping);
    had_err = gt_seqid2seqnum_mapping_map(rm->seqid2seqnum_mapping,
                                          gt_str_get(seqid), NULL, &seqnum,
                                          &filenum, NULL, err);
    if (!had_err) {
      char *cdesc;
      cdesc = gt_seq_col_get_description(rm->seq_col, filenum, seqnum);
      gt_assert(cdesc);
      gt_str_append_cstr(desc, cdesc);
      gt_free(cdesc);
    }
  }
  else if (rm->useseqno) {
    unsigned long seqno = GT_UNDEF_ULONG;
    gt_assert(rm->encseq);
    if (1 != sscanf(gt_str_get(seqid), "seq%lu", &seqno)) {
      gt_error_set(err, "seqid '%s' does not have the form 'seqX' "
                        "where X is a sequence number in the encoded "
                        "sequence", gt_str_get(seqid));
      had_err = -1;
    }
    gt_assert(had_err || seqno != GT_UNDEF_ULONG);
    if (!had_err && seqno >= gt_encseq_num_of_sequences(rm->encseq)) {
      gt_error_set(err, "trying to access sequence %lu, but encoded "
                        "sequence contains only %lu sequences",
                        seqno, gt_encseq_num_of_sequences(rm->encseq));
      had_err = -1;
    }
    if (!had_err) {
      unsigned long desclen;
      const char *edesc;
      edesc = gt_encseq_description(rm->encseq, &desclen, seqno);
      gt_str_append_cstr_nt(desc, edesc, desclen);
    }
  }
  else if (rm->matchdesc) {
    const char *md5;
    /* XXX: not beautiful, but works -- this may be LOTS faster */
    had_err = gt_seq_col_grep_desc_md5(rm->seq_col, &md5, seqid, err);
    if (!had_err) {
      GtStr *md5_seqid = gt_str_new_cstr(md5);
      had_err = gt_seq_col_md5_to_description(rm->seq_col, desc, md5_seqid,
                                              err);
      gt_str_delete(md5_seqid);
    }
  }
  else {
    char *cdesc;
    cdesc = gt_seq_col_get_description(rm->seq_col, 0, 0);
    gt_assert(cdesc);
    gt_str_append_cstr(desc, cdesc);
    gt_free(cdesc);
  }
  return had_err;
}
/*
 * Clusters the sequences stored for <feature> and annotates the nodes kept in
 * <lcs> with the resulting cluster numbers.
 *
 * Steps: look up the encoded sequence for <feature>, collect all matches of
 * it against itself, build a description -> (sequence number + 1) map, run
 * cluster_sequences() on a union-find clustered set and hand the result to
 * cluster_annotate_nodes().  If <lcs>->current_state is set, it is replaced
 * by a status text naming <feature>.
 *
 * Returns 0 on success, -1 on error.  The collected matches are released on
 * every path, including a failed match iteration.
 */
static int process_feature(GtLTRClusterStream *lcs,
                           const char *feature,
                           GtError *err)
{
  GtArray *matches;
  GtMatchIterator *mi = NULL;
  GtMatch *match = NULL;
  GtMatchIteratorStatus status;
  GtEncseq *encseq;
  unsigned long i;
  int had_err = 0;

  if (lcs->current_state != NULL) {
    char tmp[BUFSIZ];
    gt_free(*lcs->current_state);
    (void) snprintf(tmp, BUFSIZ, "Clustering feature: %s", feature);
    *lcs->current_state = gt_cstr_dup(tmp);
  }

  matches = gt_array_new(sizeof (GtMatch*));
  encseq = (GtEncseq*) gt_hashmap_get(lcs->feat_to_encseq, feature);
  gt_log_log("found encseq %p for feature %s", encseq, feature);

  /* collect all matches of the feature sequences against themselves */
  mi = gt_match_iterator_last_new(encseq, encseq, lcs->match_score,
                                  lcs->mismatch_cost, lcs->gap_open_cost,
                                  lcs->gap_ext_cost, lcs->xdrop, lcs->ydrop,
                                  lcs->zdrop, lcs->k, lcs->mscoregapped,
                                  lcs->mscoregapless, err);
  if (mi != NULL) {
    while ((status = gt_match_iterator_next(mi, &match, err))
             != GT_MATCHER_STATUS_END) {
      if (status == GT_MATCHER_STATUS_OK) {
        gt_array_add(matches, match);
      } else {
        gt_assert(status == GT_MATCHER_STATUS_ERROR);
        had_err = -1;
        break;
      }
    }
  } else
    had_err = -1;

  if (!had_err) {
    GtClusteredSet *cs;
    GtHashmap *seqdesc2seqnum;
    const char *description;
    char *output;
    unsigned long desclen, num_of_seq;

    /* map every description to its sequence number + 1 */
    seqdesc2seqnum = gt_hashmap_new(GT_HASH_STRING, free_hash, NULL);
    num_of_seq = gt_encseq_num_of_sequences(encseq);
    for (i = 0; i < num_of_seq; i++) {
      description = gt_encseq_description(encseq, &desclen, i);
      output = gt_calloc((size_t) (desclen + 1), sizeof (char));
      strncpy(output, description, (size_t) desclen);
      output[desclen] = '\0';
      gt_hashmap_add(seqdesc2seqnum, (void*) gt_cstr_dup(output),
                     (void*) (i + 1));
      gt_free(output);
    }

    cs = gt_clustered_set_union_find_new(num_of_seq, err);
    if (cs != NULL) {
      if (cluster_sequences(matches, cs, seqdesc2seqnum,
                            (unsigned) lcs->psmall, (unsigned) lcs->plarge,
                            encseq, err) != 0) {
        had_err = -1;
      }
      if (!had_err) {
        (void) cluster_annotate_nodes(cs, encseq, feature, lcs->nodes, err);
      }
    } else
      had_err = -1;

    gt_hashmap_delete(seqdesc2seqnum);
    gt_clustered_set_delete(cs, err);
  }

  /* Release the matches unconditionally: a failed iteration may already
     have stored matches in the array. */
  for (i = 0; i < gt_array_size(matches); i++) {
    GtMatch *tmp_match = *(GtMatch**) gt_array_get(matches, i);
    gt_match_delete(tmp_match);
  }
  gt_array_delete(matches);
  matches = NULL;
  gt_match_iterator_delete(mi);
  return had_err;
}
/*
 * Sets the attribute "clid" (the cluster number) on every feature node whose
 * generated header equals the description of a sequence contained in a
 * cluster of <cs>.
 *
 * First a map from header strings to feature nodes is built from <nodes>:
 * a repeat_region node defines the prefix "<seqid>_<n>", where <n> is parsed
 * from its "ID" attribute.  Protein matches named <feature>, and nodes of
 * type <feature>, get the header "<prefix>_<start>_<end>" if they span at
 * least 10 positions.  The features "lLTR" and "rLTR" both stand for the
 * long terminal repeat type.  Then every element of every cluster is looked
 * up by the description of its sequence in <encseq>.
 *
 * Elements without a matching node are skipped.  The returned value is
 * always 0 in the current implementation.
 */
static int cluster_annotate_nodes(GtClusteredSet *cs, GtEncseq *encseq,
                                  const char *feature, GtArray *nodes,
                                  GtError *err)
{
  GtFeatureNodeIterator *fni;
  GtFeatureNode *curnode = NULL, *tmp;
  GtClusteredSetIterator *csi = NULL;
  GtGenomeNode *gn;
  GtHashmap *desc2node;
  GtStr *seqid = NULL;
  int had_err = 0;
  unsigned long num_of_clusters, i, elm;
  const char *fnt = NULL;
  char buffer[BUFSIZ], *real_feature;

  gt_error_check(err);

  /* <buffer> is only filled once a repeat_region has been seen; start with
     an empty prefix so that it is never read uninitialised */
  buffer[0] = '\0';

  if ((strcmp(feature, "lLTR") == 0) || (strcmp(feature, "rLTR") == 0))
    real_feature = gt_cstr_dup(gt_ft_long_terminal_repeat);
  else
    real_feature = gt_cstr_dup(feature);

  /* pass 1: header string -> feature node */
  desc2node = gt_hashmap_new(GT_HASH_STRING, free_hash, NULL);
  for (i = 0; i < gt_array_size(nodes); i++) {
    gn = *(GtGenomeNode**) gt_array_get(nodes, i);
    if (gt_feature_node_try_cast(gn) == NULL)
      continue;
    fni = gt_feature_node_iterator_new((GtFeatureNode*) gn);
    while ((curnode = gt_feature_node_iterator_next(fni)) != NULL) {
      char header[BUFSIZ];
      fnt = gt_feature_node_get_type(curnode);
      if (strcmp(fnt, gt_ft_repeat_region) == 0) {
        const char *rid;
        unsigned long id = 0;
        seqid = gt_genome_node_get_seqid((GtGenomeNode*) curnode);
        rid = gt_feature_node_get_attribute(curnode, "ID");
        /* without a parsable "ID" attribute <id> stays 0 */
        if (rid != NULL)
          (void) sscanf(rid, "repeat_region%lu", &id);
        (void) snprintf(buffer, BUFSIZ, "%s_%lu", gt_str_get(seqid), id);
      } else if (strcmp(fnt, gt_ft_protein_match) == 0) {
        GtRange range;
        const char *attr;
        attr = gt_feature_node_get_attribute(curnode, "name");
        if (!attr)
          continue;
        if (strcmp(feature, attr) != 0)
          continue;
        range = gt_genome_node_get_range((GtGenomeNode*) curnode);
        if ((range.end - range.start + 1) < 10UL)
          continue;
        (void) snprintf(header, BUFSIZ, "%s_%lu_%lu", buffer, range.start,
                        range.end);
        gt_hashmap_add(desc2node, (void*) gt_cstr_dup(header),
                       (void*) curnode);
      } else if (strcmp(fnt, real_feature) == 0) {
        GtRange range;
        range = gt_genome_node_get_range((GtGenomeNode*) curnode);
        if ((range.end - range.start + 1) < 10UL)
          continue;
        (void) snprintf(header, BUFSIZ, "%s_%lu_%lu", buffer, range.start,
                        range.end);
        gt_hashmap_add(desc2node, (void*) gt_cstr_dup(header),
                       (void*) curnode);
      }
    }
    gt_feature_node_iterator_delete(fni);
  }
  gt_free(real_feature);

  /* pass 2: tag the node of every cluster element with its cluster number */
  num_of_clusters = gt_clustered_set_num_of_clusters(cs, err);
  for (i = 0; i < num_of_clusters; i++) {
    csi = gt_clustered_set_get_iterator(cs, i ,err);
    if (csi != NULL) {
      while (!had_err && (gt_clustered_set_iterator_next(csi, &elm, err)
               != GT_CLUSTERED_SET_ITERATOR_STATUS_END)) {
        char clid[BUFSIZ];
        const char *encseqdesc;
        char *encseqid;
        unsigned long desclen;
        encseqdesc = gt_encseq_description(encseq, &desclen, elm);
        /* the map needs a '\0'-terminated key */
        encseqid = gt_calloc((size_t) (desclen + 1), sizeof (char));
        (void) strncpy(encseqid, encseqdesc, (size_t) desclen);
        encseqid[desclen] = '\0';
        tmp = (GtFeatureNode*) gt_hashmap_get(desc2node, (void*) encseqid);
        /* never pass a failed lookup on to the attribute setter */
        if (tmp != NULL) {
          (void) snprintf(clid, BUFSIZ, "%lu", i);
          gt_feature_node_set_attribute(tmp, "clid", clid);
        }
        gt_free(encseqid);
      }
    }
    gt_clustered_set_iterator_delete(csi, err);
    csi = NULL;
  }
  gt_hashmap_delete(desc2node);
  return had_err;
}