示例#1
0
static int seqorder_str_compare_num(const void *v1, const void *v2, void *data)
{
  GtUword n1 = *(GtUword*) v1,
          n2 = *(GtUword*) v2,
          desclen1, desclen2,
          anum, bnum;
  int arval, brval, rval = 0;
  const char *desc1, *desc2;
  char buf[BUFSIZ];
  desc1 = gt_encseq_description((GtEncseq*) data, &desclen1, n1);
  desc2 = gt_encseq_description((GtEncseq*) data, &desclen2, n2);
  (void) strncpy(buf, desc1, MIN(BUFSIZ, desclen1) * sizeof (char));
  buf[desclen1] = '\0';
  arval = gt_parse_uword(&anum, buf);
  (void) strncpy(buf, desc2, MIN(BUFSIZ, desclen2) * sizeof (char));
  buf[desclen2] = '\0';
  brval = gt_parse_uword(&bnum, buf);
  if (arval == 0 && brval == 0)
    rval = anum-bnum;
  else if (arval == 0)
    return -1;
  else if (brval == 0)
    return 1;
  else
    rval = 0;
  return rval;
}
示例#2
0
static int seqorder_str_compare_lex(const void *v1, const void *v2, void *data)
{
  GtUword n1 = *(GtUword*) v1,
          n2 = *(GtUword*) v2,
          desclen1, desclen2;
  const char *desc1, *desc2;
  int rval = 0;
  desc1 = gt_encseq_description((GtEncseq*) data, &desclen1, n1);
  desc2 = gt_encseq_description((GtEncseq*) data, &desclen2, n2);
  rval = strncmp(desc1, desc2, MIN(desclen1, desclen2) * sizeof (char));
  if (rval == 0)
    rval = desclen1-desclen2;
  return rval;
}
示例#3
0
static void gt_seqorder_output(unsigned long seqnum, GtEncseq *encseq)
{
  GtEncseqReader *esr;
  unsigned long startpos, len, desclen = 0;
  const char *desc = NULL;
  unsigned long i;

  startpos = gt_encseq_seqstartpos(encseq, seqnum);
  len = gt_encseq_seqlength(encseq, seqnum);
  gt_xfputc(GT_FASTA_SEPARATOR, stdout);
  if (gt_encseq_has_description_support(encseq))
  {
    desc = gt_encseq_description(encseq, &desclen, seqnum);
    gt_xfwrite(desc, (size_t)1, (size_t)desclen, stdout);
  }
  gt_xfputc('\n', stdout);
  esr = gt_encseq_create_reader_with_readmode(encseq, GT_READMODE_FORWARD,
      startpos);
  for (i = 0; i < len; i++)
  {
    gt_xfputc(gt_encseq_reader_next_decoded_char(esr), stdout);
  }
  gt_encseq_reader_delete(esr);
  gt_xfputc('\n', stdout);
}
示例#4
0
static int encseq_lua_description(lua_State *L)
{
  GtEncseq **encseq;
  GtUword seqno, desclen;
  const char *string;
  encseq = check_encseq(L, 1);
  seqno = luaL_checknumber(L, 2);
  luaL_argcheck(L, seqno < gt_encseq_num_of_sequences(*encseq), 2,
                "cannot exceed number of sequences");
  string = gt_encseq_description(*encseq, &desclen, seqno);
  lua_pushlstring(L, string, desclen);
  return 1;
}
示例#5
0
const char* gt_bioseq_get_description(GtBioseq *bs, GtUword idx)
{
  const char *desc;
  char *mydesc;
  GtUword desclen;
  gt_assert(bs && bs->encseq);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));
  if (!(mydesc = bs->descriptions[idx])) {
    desc = gt_encseq_description(bs->encseq, &desclen, idx);
    mydesc = gt_calloc(desclen + 1, sizeof (char));
    strncpy(mydesc, desc, desclen);
    bs->descriptions[idx] = mydesc;
  }
  return (const char*) mydesc;
}
示例#6
0
static char* gt_encseq_col_get_description(const GtSeqCol *sc,
                                           GtUword filenum,
                                           GtUword seqnum)
{
  GtEncseqCol *esc;
  const char *desc;
  GtUword encseq_seqnum, desclen;
  esc = gt_encseq_col_cast(sc);
  gt_assert(esc && filenum < gt_encseq_num_of_files(esc->encseq));
  encseq_seqnum = gt_encseq_filenum_first_seqnum(esc->encseq, filenum) + seqnum;
  gt_assert(encseq_seqnum < gt_encseq_num_of_sequences(esc->encseq));
  desc = gt_encseq_description(esc->encseq, &desclen, encseq_seqnum);
  gt_assert(desc && desclen > 0);
  return gt_cstr_dup_nt(desc, desclen);;
}
示例#7
0
static int gt_encseq_col_do_grep_desc(GtEncseqCol *esc, GtUword *filenum,
                                      GtUword *seqnum, GtStr *seqid,
                                      GtError *err)
{
  GtUword j;
  const GtSeqInfo *seq_info_ptr;
  GtSeqInfo seq_info;
  bool match = false;
  int had_err = 0;
  gt_error_check(err);

  gt_assert(esc && filenum && seqnum && seqid);
  /* create cache */
  if (!esc->grep_cache)
    esc->grep_cache = gt_seq_info_cache_new();
  /* try to read from cache */
  seq_info_ptr = gt_seq_info_cache_get(esc->grep_cache, gt_str_get(seqid));
  if (seq_info_ptr) {
    *filenum = seq_info_ptr->filenum;
    *seqnum = seq_info_ptr->seqnum;
    return 0;
  }
  for (j = 0; !had_err && j < gt_encseq_num_of_sequences(esc->encseq); j++) {
    const char *desc;
    char *buf;
    GtUword desc_len;
    desc = gt_encseq_description(esc->encseq, &desc_len, j);
    buf = gt_calloc(desc_len + 1, sizeof (char));
    memcpy(buf, desc, desc_len * sizeof (char));
    had_err = gt_grep(&match, gt_str_get(seqid), buf, err);
    gt_free(buf);
    if (!had_err && match) {
      *filenum = seq_info.filenum =
                       gt_encseq_filenum(esc->encseq,
                                         gt_encseq_seqstartpos(esc->encseq, j));
      *seqnum = seq_info.seqnum =
                      j - gt_encseq_filenum_first_seqnum(esc->encseq, *filenum);
      gt_seq_info_cache_add(esc->grep_cache, gt_str_get(seqid), &seq_info);
      break;
    }
  }
  if (!had_err && !match) {
    gt_error_set(err, "no description matched sequence ID '%s'",
                 gt_str_get(seqid));
    had_err = -1;
  }
  return had_err;
}
示例#8
0
文件: giextract.c 项目: 9beckert/TIR
static int giextract_encodedseq2fasta(FILE *fpout,
                                      const GtEncseq *encseq,
                                      unsigned long seqnum,
                                      const Fastakeyquery *fastakeyquery,
                                      unsigned long linewidth,
                                      GT_UNUSED GtError *err)
{
  const char *desc;
  unsigned long desclen;
  bool haserr = false;

  desc = gt_encseq_description(encseq, &desclen, seqnum);
  gt_xfputc('>',fpout);
  if (fastakeyquery != NULL && !COMPLETE(fastakeyquery))
  {
    printf("%s %lu %lu ",fastakeyquery->fastakey,
                         fastakeyquery->frompos,
                         fastakeyquery->topos);
  }
  gt_xfwrite(desc,sizeof *desc,(size_t) desclen,fpout);
  if (!haserr)
  {
    unsigned long frompos, topos, seqstartpos, seqlength ;

    gt_xfputc('\n',fpout);
    seqstartpos = gt_encseq_seqstartpos(encseq, seqnum);
    seqlength = gt_encseq_seqlength(encseq, seqnum);
    if (fastakeyquery != NULL && !COMPLETE(fastakeyquery))
    {
      frompos = fastakeyquery->frompos-1;
      topos = fastakeyquery->topos - fastakeyquery->frompos + 1;
    } else
    {
      frompos = 0;
      topos = seqlength;
    }
    gt_encseq2symbolstring(fpout,
                           encseq,
                           GT_READMODE_FORWARD,
                           seqstartpos + frompos,
                           topos,
                           linewidth);
  }
  return haserr ? -1 : 0;
}
示例#9
0
static int gt_encseq_col_md5_to_description(GtSeqCol *sc, GtStr *desc,
                                            GtStr *md5_seqid, GtError *err)
{
  GtUword seqnum = GT_UNDEF_UWORD;
  char seqid[GT_MD5_SEQID_HASH_LEN + 1];
  int had_err = 0;
  GtEncseqCol *esc;
  esc = gt_encseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(esc && desc && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));
  if (gt_str_length(md5_seqid) >= GT_MD5_SEQID_TOTAL_LEN) {
    const char *cstrseqid = gt_str_get(md5_seqid);
    if (cstrseqid[GT_MD5_SEQID_TOTAL_LEN-1] != GT_MD5_SEQID_SEPARATOR) {
      gt_error_set(err, "MD5 sequence id %s not terminated with '%c'",
                   gt_str_get(md5_seqid), GT_MD5_SEQID_SEPARATOR);
      had_err = -1;
    }
    if (!had_err) {
      strncpy(seqid, cstrseqid + GT_MD5_SEQID_PREFIX_LEN,
              GT_MD5_SEQID_HASH_LEN);
      seqid[GT_MD5_SEQID_HASH_LEN] = '\0';
    }
  }
  seqnum = gt_md5_tab_map(esc->md5_tab, seqid);
  if (seqnum != GT_UNDEF_UWORD) {
    const char *cdesc;
    GtUword desc_len;
    gt_assert(seqnum < gt_encseq_num_of_sequences(esc->encseq));
    cdesc = gt_encseq_description(esc->encseq, &desc_len, seqnum);
    gt_str_append_cstr_nt(desc, cdesc, desc_len);
  } else {
    gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
    had_err = -1;
  }
  return had_err;
}
示例#10
0
static int output_sequence(GtEncseq *encseq, GtEncseqDecodeArguments *args,
                           const char *filename, GtError *err)
{
  GtUword i, j, sfrom, sto;
  int had_err = 0;
  bool has_desc;
  GtEncseqReader *esr;
  gt_assert(encseq);

  if (!(has_desc = gt_encseq_has_description_support(encseq)))
    gt_warning("Missing description support for file %s", filename);

  if (strcmp(gt_str_get(args->mode), "fasta") == 0) {
    /* specify a single sequence to extract */
    if (args->seq != GT_UNDEF_UWORD) {
      if (args->seq >= gt_encseq_num_of_sequences(encseq)) {
        gt_error_set(err,
                     "requested sequence "GT_WU" exceeds number of sequences "
                     "("GT_WU")", args->seq,
                     gt_encseq_num_of_sequences(encseq));
        return -1;
      }
      sfrom = args->seq;
      sto = args->seq + 1;
    } else if (args->seqrng.start != GT_UNDEF_UWORD
                 && args->seqrng.end != GT_UNDEF_UWORD) {
      /* specify a sequence range to extract */
      if (args->seqrng.start >= gt_encseq_num_of_sequences(encseq)
            || args->seqrng.end >= gt_encseq_num_of_sequences(encseq)) {
        gt_error_set(err,
                     "range "GT_WU"-"GT_WU" includes a sequence number "
                     "exceeding the total number of sequences ("GT_WU")",
                     args->seqrng.start,
                     args->seqrng.end,
                     gt_encseq_num_of_sequences(encseq));
        return -1;
      }
      sfrom = args->seqrng.start;
      sto = args->seqrng.end + 1;
    } else {
      /* extract all sequences */
      sfrom = 0;
      sto = gt_encseq_num_of_sequences(encseq);
    }
    for (i = sfrom; i < sto; i++) {
      GtUword desclen, startpos, len;
      char buf[BUFSIZ];
      const char *desc = NULL;
      /* XXX: maybe make this distinction in the functions via readmode? */
      if (!GT_ISDIRREVERSE(args->rm)) {
        startpos = gt_encseq_seqstartpos(encseq, i);
        len = gt_encseq_seqlength(encseq, i);
        if (has_desc) {
          desc = gt_encseq_description(encseq, &desclen, i);
        } else {
          (void) snprintf(buf, BUFSIZ, "sequence "GT_WU"", i);
          desclen = strlen(buf);
          desc = buf;
        }
      } else {
        startpos = gt_encseq_seqstartpos(encseq, i);
        len = gt_encseq_seqlength(encseq,
                                  gt_encseq_num_of_sequences(encseq)-1-i);
        startpos = gt_encseq_total_length(encseq)
                     - (gt_encseq_seqstartpos(encseq,
                                              gt_encseq_num_of_sequences(
                                                encseq)-1-i) + len);
        if (has_desc) {
          desc = gt_encseq_description(encseq,
                                       &desclen,
                                       gt_encseq_num_of_sequences(encseq)-1-i);
        } else {
          (void) snprintf(buf, BUFSIZ, "sequence "GT_WU"", i);
          desclen = strlen(buf);
          desc = buf;
        }
      }
      gt_assert(desc);
      /* output description */
      gt_xfputc(GT_FASTA_SEPARATOR, stdout);
      gt_xfwrite(desc, 1, desclen, stdout);
      gt_xfputc('\n', stdout);
      /* XXX: make this more efficient by writing in a buffer first and then
         showing the result */
      if (args->singlechars) {
        for (j = 0; j < len; j++) {
           gt_xfputc(gt_encseq_get_decoded_char(encseq,
                                                startpos + j,
                                                args->rm),
                     stdout);
        }
      } else {
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm, startpos);
        for (j = 0; j < len; j++) {
           gt_xfputc(gt_encseq_reader_next_decoded_char(esr), stdout);
        }
        gt_encseq_reader_delete(esr);
      }
      gt_xfputc('\n', stdout);
    }
  }

  if (strcmp(gt_str_get(args->mode), "concat") == 0) {
    GtUword from = 0,
                  to = gt_encseq_total_length(encseq) - 1;
    if (args->rng.start != GT_UNDEF_UWORD && args->rng.end != GT_UNDEF_UWORD) {
      if (args->rng.end > to) {
        had_err = -1;
        gt_error_set(err,
                     "end of range ("GT_WU") exceeds encoded sequence length "
                     "("GT_WU")", args->rng.end, to);
      }
      if (!had_err) {
        from = args->rng.start;
        to = args->rng.end;
      }
    }
    if (!had_err) {
      if (args->singlechars) {
        for (j = from; j <= to; j++) {
          char cc = gt_encseq_get_decoded_char(encseq, j, args->rm);
          if (cc == (char) SEPARATOR)
            cc = gt_str_get(args->sepchar)[0];
          gt_xfputc(cc, stdout);
        }
      } else {
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm, from);
        if (esr) {
          for (j = from; j <= to; j++) {
            char cc = gt_encseq_reader_next_decoded_char(esr);
            if (cc == (char) SEPARATOR)
              cc = gt_str_get(args->sepchar)[0];
            gt_xfputc(cc, stdout);
          }
          gt_encseq_reader_delete(esr);
        }
      }
      gt_xfputc('\n', stdout);
    }
  }
  return had_err;
}
static void condenseq_process_descriptions(GtCondenseq *condenseq,
                                           const GtEncseq *orig_es,
                                           GtLogger *logger)
{
  GtUword    *dist;
  const char *desc;
  char       *cur_id_startptr;
  GtUword     desclen,
              dist_idx,
              distsize = (GtUword) 128,
              idlen,
              idx,
              maxendidx = 0,
              maxlen = 0,
              minlen = GT_UWORD_MAX,
              wastedmem = 0,
              sdssize,
              cur_total_id_len = 0;
  bool        use_const_len;

  condenseq->ids_total_len = 0;
  dist = gt_calloc((size_t) distsize, sizeof (*dist));

  for (idx = 0; idx < condenseq->orig_num_seq; ++idx) {
    desc = gt_encseq_description(orig_es, &desclen, idx);
    idlen = condenseq_idlen(desc, desclen);
    if (distsize <= idlen) {
      dist = gt_realloc(dist, (size_t) (idlen + 1) * sizeof (*dist));
      for (dist_idx = distsize; dist_idx <= idlen; dist_idx++)
        dist[dist_idx] = 0;
      distsize = idlen + 1;
    }
    dist[idlen]++;
    if (idlen > maxlen)
      maxlen = idlen;
    if (idlen < minlen)
      minlen = idlen;
    maxendidx += idlen;
  }

  /* calculate memory we would waste if we assume equal length, and size if we
     store actual descriptions */
  for (dist_idx = minlen; dist_idx < maxlen; dist_idx++) {
    wastedmem += dist[dist_idx] * (maxlen - dist_idx);
    condenseq->ids_total_len += dist[dist_idx] * dist_idx;
  }
  condenseq->ids_total_len += dist_idx * dist[dist_idx];

  sdssize = (GtUword) gt_intset_best_memory_size(maxendidx,
                                                 condenseq->orig_num_seq);
  use_const_len = wastedmem < sdssize;

  if (use_const_len) {
    gt_logger_log(logger, "Condenseq descriptions will use const len, " GT_WU
                  ", \"wasting\" " GT_WU " bytes. SDS would use "
                  GT_WU " bytes",
                  maxlen, wastedmem, sdssize);
    condenseq->id_len = maxlen;
    condenseq->ids_total_len = maxlen * condenseq->orig_num_seq;
  }
  else {
    gt_logger_log(logger, "Condenseq descriptions will use sdstab with size "
                  GT_WU ". Const length would have wasted " GT_WU " bytes.",
                  sdssize, wastedmem);
    condenseq->sdstab = gt_intset_best_new(maxendidx, condenseq->orig_num_seq);
  }
  condenseq->orig_ids = gt_calloc((size_t) condenseq->ids_total_len,
                                  sizeof (*condenseq->orig_ids));

  cur_id_startptr = condenseq->orig_ids;
  for (idx = 0; idx < condenseq->orig_num_seq; ++idx) {
    desc = gt_encseq_description(orig_es, &desclen, idx);
    idlen = condenseq_idlen(desc, desclen);
    gt_assert(idlen <= maxlen);
    (void) memcpy(cur_id_startptr, desc, (size_t) idlen);
    if (use_const_len) {
      cur_id_startptr += maxlen;
      cur_total_id_len += maxlen;
    }
    else {
      cur_id_startptr += idlen;
      cur_total_id_len += idlen;
      gt_intset_add(condenseq->sdstab, cur_total_id_len);
    }
  }
  gt_assert(cur_total_id_len == condenseq->ids_total_len);
  gt_free(dist);
}
示例#12
0
static GtMatchIteratorStatus gt_match_iterator_sw_next(GtMatchIterator *mi,
                                                      GT_UNUSED GtMatch **match,
                                                      GT_UNUSED GtError *err)
{
  GtMatchIteratorSW *mis;
  GtSeq *seq_a, *seq_b;
  char *a, *b;
  const char *adesc, *bdesc;
  GtAlignment *ali = NULL;
  unsigned long seqlen_a, seqlen_b, seqpos;
  GtRange arng, brng;
  gt_assert(mi && match);

  mis = gt_match_iterator_sw_cast(mi);
  while (true) {
    if (!mis->pvt->firstali)
      mis->pvt->seqno_es2++;
    if (mis->pvt->seqno_es2 == gt_encseq_num_of_sequences(mis->pvt->es2)) {
      mis->pvt->seqno_es1++;
      if (mis->pvt->seqno_es1 == gt_encseq_num_of_sequences(mis->pvt->es1))
        return GT_MATCHER_STATUS_END;
      mis->pvt->seqno_es2 = 0;
    }
    seqlen_a = gt_encseq_seqlength(mis->pvt->es1, mis->pvt->seqno_es1);
    seqlen_b = gt_encseq_seqlength(mis->pvt->es2, mis->pvt->seqno_es2);
    /* XXX: reuse buffers for performance improvement */
    a = gt_malloc(seqlen_a * sizeof (char));
    seqpos = gt_encseq_seqstartpos(mis->pvt->es1, mis->pvt->seqno_es1);
    gt_encseq_extract_decoded(mis->pvt->es1, a, seqpos, seqpos + seqlen_a - 1);
    b = gt_malloc(seqlen_b * sizeof (char));
    seqpos = gt_encseq_seqstartpos(mis->pvt->es2, mis->pvt->seqno_es2);
    gt_encseq_extract_decoded(mis->pvt->es1, b, seqpos, seqpos + seqlen_b - 1);
    seq_a = gt_seq_new(a, seqlen_a, gt_encseq_alphabet(mis->pvt->es1));
    seq_b = gt_seq_new(b, seqlen_b, gt_encseq_alphabet(mis->pvt->es2));
    ali = gt_swalign(seq_a, seq_b, mis->pvt->sf);
    mis->pvt->firstali = false;
    if (ali && gt_alignment_get_length(ali) >= mis->pvt->min_len
          && gt_alignment_eval(ali) <= mis->pvt->max_edist) {
      break;
    }
    gt_alignment_delete(ali);
    gt_seq_delete(seq_a);
    gt_seq_delete(seq_b);
    gt_free(a);
    gt_free(b);
  }
  arng = gt_alignment_get_urange(ali);
  brng = gt_alignment_get_vrange(ali);
  adesc = gt_encseq_description(mis->pvt->es1, &seqlen_a, mis->pvt->seqno_es1);
  bdesc = gt_encseq_description(mis->pvt->es2, &seqlen_b, mis->pvt->seqno_es2);
  *match = gt_match_sw_new("", "",
                           mis->pvt->seqno_es1,
                           mis->pvt->seqno_es2,
                           gt_alignment_get_length(ali),
                           gt_alignment_eval(ali),
                           arng.start, brng.start,
                           arng.end, brng.end,
                           GT_MATCH_DIRECT);
  gt_match_set_seqid1_nt(*match, adesc, seqlen_a);
  gt_match_set_seqid2_nt(*match, bdesc, seqlen_b);
  gt_alignment_delete(ali);
  gt_seq_delete(seq_a);
  gt_seq_delete(seq_b);
  gt_free(a);
  gt_free(b);
  return GT_MATCHER_STATUS_OK;
}
示例#13
0
int gt_region_mapping_get_description(GtRegionMapping *rm, GtStr *desc,
                                      GtStr *seqid, GtError *err)
{
  int had_err = 0;
  gt_error_check(err);
  gt_assert(rm && desc && seqid);
  if (rm->userawseq) {
    gt_str_append_cstr(desc, "<rawseq>");
    return 0;
  }
  had_err = update_seq_col_if_necessary(rm, seqid, err);
  if (!had_err) {
    if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
      had_err = gt_seq_col_md5_to_description(rm->seq_col, desc, seqid,
                                              err);
    }
    return had_err;
  }
  if (!had_err) {
    if (rm->usedesc) {
      unsigned long filenum, seqnum;
      gt_assert(rm->seqid2seqnum_mapping);
      had_err = gt_seqid2seqnum_mapping_map(rm->seqid2seqnum_mapping,
                                            gt_str_get(seqid), NULL, &seqnum,
                                            &filenum, NULL, err);
      if (!had_err) {
        char *cdesc;
        cdesc = gt_seq_col_get_description(rm->seq_col, filenum, seqnum);
        gt_assert(cdesc);
        gt_str_append_cstr(desc, cdesc);
        gt_free(cdesc);
      }
    }
    else if (rm->useseqno) {
      unsigned long seqno = GT_UNDEF_ULONG;
      gt_assert(rm->encseq);
      if (1 != sscanf(gt_str_get(seqid), "seq%lu", &seqno)) {
        gt_error_set(err, "seqid '%s' does not have the form 'seqX' "
                          "where X is a sequence number in the encoded "
                          "sequence", gt_str_get(seqid));
        had_err = -1;
      }
      gt_assert(had_err || seqno != GT_UNDEF_ULONG);
      if (!had_err && seqno >= gt_encseq_num_of_sequences(rm->encseq)) {
          gt_error_set(err, "trying to access sequence %lu, but encoded"
                            "sequence contains only %lu sequences",
                            seqno, gt_encseq_num_of_sequences(rm->encseq));
          had_err = -1;
      }
      if (!had_err) {
        unsigned long desclen;
        const char *edesc;
        edesc = gt_encseq_description(rm->encseq, &desclen, seqno);
        gt_str_append_cstr_nt(desc, edesc, desclen);
      }
    } else if (rm->matchdesc) {
      const char *md5;
      /* XXX: not beautiful, but works -- this may be LOTS faster */
      had_err = gt_seq_col_grep_desc_md5(rm->seq_col, &md5, seqid, err);
      if (!had_err) {
        GtStr *md5_seqid = gt_str_new_cstr(md5);
        had_err = gt_seq_col_md5_to_description(rm->seq_col, desc, md5_seqid,
                                                err);
        gt_str_delete(md5_seqid);
      }
    } else {
      if (!had_err) {
        char *cdesc;
        cdesc = gt_seq_col_get_description(rm->seq_col, 0, 0);
        gt_assert(cdesc);
        gt_str_append_cstr(desc, cdesc);
        gt_free(cdesc);
      }
    }
  }
  return had_err;
}
示例#14
0
static int process_feature(GtLTRClusterStream *lcs,
                           const char *feature,
                           GtError *err)
{
  GtArray *matches;
  GtMatchIterator *mi = NULL;
  GtMatch *match = NULL;
  GtMatchIteratorStatus status;
  GtEncseq *encseq;
  unsigned long i;
  int had_err = 0;

  if (lcs->current_state != NULL) {
    char tmp[BUFSIZ];
    gt_free(*lcs->current_state);
    (void) snprintf(tmp, BUFSIZ, "Clustering feature: %s", feature);
    *lcs->current_state = gt_cstr_dup(tmp);
  }
  matches = gt_array_new(sizeof(GtMatch*));
  encseq = (GtEncseq*) gt_hashmap_get(lcs->feat_to_encseq, feature);
  gt_log_log("found encseq %p for feature %s", encseq, feature);
  if (!had_err) {
    mi = gt_match_iterator_last_new(encseq, encseq, lcs->match_score,
                                        lcs->mismatch_cost,
                                        lcs->gap_open_cost,
                                        lcs->gap_ext_cost,
                                        lcs->xdrop,
                                        lcs->ydrop,
                                        lcs->zdrop,
                                        lcs->k,
                                        lcs->mscoregapped,
                                        lcs->mscoregapless, err);
    if (mi != NULL) {
      while ((status = gt_match_iterator_next(mi, &match, err))
             != GT_MATCHER_STATUS_END) {
        if (status == GT_MATCHER_STATUS_OK) {
          gt_array_add(matches, match);
        } else {
          gt_assert(status == GT_MATCHER_STATUS_ERROR);
          had_err = -1;
          break;
        }
      }
    } else
      had_err = -1;
  }
  if (!had_err) {
    GtClusteredSet *cs;
    GtHashmap *seqdesc2seqnum;
    GtMatch *tmp_match;
    const char *description;
    char *output;
    unsigned long desclen,
                  num_of_seq;

    seqdesc2seqnum = gt_hashmap_new(GT_HASH_STRING, free_hash, NULL);
    num_of_seq = gt_encseq_num_of_sequences(encseq);
    for (i = 0; i < num_of_seq; i++) {
      description = gt_encseq_description(encseq, &desclen, i);
      output = gt_calloc((size_t) (desclen + 1), sizeof (char));
      strncpy(output, description, (size_t) desclen);
      output[desclen] = '\0';
      gt_hashmap_add(seqdesc2seqnum, (void*) gt_cstr_dup(output),
                     (void*) (i + 1));
      gt_free(output);
    }
    cs = gt_clustered_set_union_find_new(num_of_seq, err);
    if (cs != NULL) {
      if (cluster_sequences(matches, cs, seqdesc2seqnum, (unsigned) lcs->psmall,
                            (unsigned) lcs->plarge, encseq, err) != 0) {
        had_err = -1;
      }
      if (!had_err) {
        (void) cluster_annotate_nodes(cs, encseq, feature, lcs->nodes, err);
      }
    } else
      had_err = -1;

    for (i = 0; i < gt_array_size(matches); i++) {
      tmp_match = *(GtMatch**) gt_array_get(matches, i);
      gt_match_delete(tmp_match);
    }
    gt_array_delete(matches);
    matches = NULL;
    gt_hashmap_delete(seqdesc2seqnum);
    gt_clustered_set_delete(cs, err);
  }
  gt_match_iterator_delete(mi);

  return had_err;
}
示例#15
0
static int cluster_annotate_nodes(GtClusteredSet *cs, GtEncseq *encseq,
                                  const char *feature, GtArray *nodes,
                                  GtError *err)
{
  GtFeatureNodeIterator *fni;
  GtFeatureNode *curnode = NULL, *tmp;
  GtClusteredSetIterator *csi = NULL;
  GtGenomeNode *gn;
  GtHashmap *desc2node;
  GtStr *seqid = NULL;
  int had_err = 0;
  unsigned long num_of_clusters, i, elm;
  const char *fnt = NULL;
  char buffer[BUFSIZ], *real_feature;
  gt_error_check(err);

  if ((strcmp(feature, "lLTR") == 0) || (strcmp(feature, "rLTR") == 0))
    real_feature = gt_cstr_dup(gt_ft_long_terminal_repeat);
  else
    real_feature = gt_cstr_dup(feature);

  desc2node = gt_hashmap_new(GT_HASH_STRING, free_hash, NULL);
  for (i = 0; i < gt_array_size(nodes); i++) {
    gn = *(GtGenomeNode**) gt_array_get(nodes, i);
    if (gt_feature_node_try_cast(gn) == NULL)
      continue;
    fni = gt_feature_node_iterator_new((GtFeatureNode*) gn);
    while ((curnode = gt_feature_node_iterator_next(fni)) != NULL) {
      char header[BUFSIZ];
      fnt = gt_feature_node_get_type(curnode);
      if (strcmp(fnt, gt_ft_repeat_region) == 0) {
        const char *rid;
        unsigned long id;
        seqid = gt_genome_node_get_seqid((GtGenomeNode*) curnode);
        rid = gt_feature_node_get_attribute(curnode, "ID");
        (void) sscanf(rid, "repeat_region%lu", &id);
        (void) snprintf(buffer, BUFSIZ, "%s_%lu", gt_str_get(seqid), id);
      } else if (strcmp(fnt, gt_ft_protein_match) == 0) {
        GtRange range;
        const char *attr;
        attr = gt_feature_node_get_attribute(curnode, "name");
        if (!attr)
          continue;
        if (strcmp(feature, attr) != 0)
          continue;
        range = gt_genome_node_get_range((GtGenomeNode*) curnode);
        if ((range.end - range.start + 1) < 10UL)
          continue;
        (void) snprintf(header, BUFSIZ, "%s_%lu_%lu", buffer, range.start,
                        range.end);
        gt_hashmap_add(desc2node, (void*) gt_cstr_dup(header), (void*) curnode);
      } else if (strcmp(fnt, real_feature) == 0) {
        GtRange range;
        range = gt_genome_node_get_range((GtGenomeNode*) curnode);
        if ((range.end - range.start + 1) < 10UL)
          continue;
        (void) snprintf(header, BUFSIZ, "%s_%lu_%lu", buffer, range.start,
                        range.end);
        gt_hashmap_add(desc2node, (void*) gt_cstr_dup(header), (void*) curnode);
      }
    }
    gt_feature_node_iterator_delete(fni);
  }
  gt_free(real_feature);

  num_of_clusters = gt_clustered_set_num_of_clusters(cs, err);
  for (i = 0; i < num_of_clusters; i++) {
    csi = gt_clustered_set_get_iterator(cs, i ,err);
    if (csi != NULL) {
      while (!had_err && (gt_clustered_set_iterator_next(csi, &elm, err)
             != GT_CLUSTERED_SET_ITERATOR_STATUS_END)) {
        char clid[BUFSIZ];
        const char *encseqdesc;
        char *encseqid;
        unsigned long desclen;
        encseqdesc = gt_encseq_description(encseq, &desclen, elm);
        encseqid = gt_calloc((size_t) (desclen + 1), sizeof (char));
        (void) strncpy(encseqid, encseqdesc, (size_t) desclen);
        encseqid[desclen] = '\0';
        tmp = (GtFeatureNode*) gt_hashmap_get(desc2node, (void*) encseqid);
        (void) snprintf(clid, BUFSIZ, "%lu", i);
        gt_feature_node_set_attribute(tmp, "clid", clid);
        gt_free(encseqid);
      }
    }
    gt_clustered_set_iterator_delete(csi, err);
    csi = NULL;
  }
  gt_hashmap_delete(desc2node);
  return had_err;
}