예제 #1
0
/*
 * Node visitor callback: runs the CDS phase check on every node of the
 * feature graph rooted at <fn>. As long as the visitor reports that splitting
 * is necessary, the recorded CDS features are split and the whole graph is
 * checked again. Returns 0 on success or the first non-zero code reported by
 * check_cds_phases_if_necessary(); once an error occurred no further nodes
 * are checked, but pending splits are still carried out.
 *
 * NOTE(review): the repeated passes also hand `false` as the second_pass
 * argument -- confirm that this is intended.
 */
static int cds_check_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                          GtError *err)
{
  GtCDSCheckVisitor *visitor = cds_check_visitor_cast(nv);
  GtFeatureNodeIterator *walker;
  GtFeatureNode *current;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(visitor && fn);

  for (;;) {
    /* one full pass over the feature graph */
    walker = gt_feature_node_iterator_new(fn);
    while (!had_err) {
      current = gt_feature_node_iterator_next(walker);
      if (!current)
        break;
      had_err = check_cds_phases_if_necessary(current, visitor, false, err);
    }
    gt_feature_node_iterator_delete(walker);
    gt_hashmap_reset(visitor->cds_features);

    if (!visitor->splitting_is_necessary)
      break;

    /* split the recorded features, then run another pass to correct phases */
    split_cds_features(visitor->cds_features_to_split, fn);
    gt_hashmap_reset(visitor->cds_features_to_split);
    visitor->splitting_is_necessary = false;
  }
  return had_err;
}
예제 #2
0
파일: AgnUtils.c 프로젝트: jfdenton/AEGeAn
/*
 * Determine the range spanned by the CDS features that are direct children of
 * <transcript>.
 *
 * The result starts out as {0, 0}; a coordinate of 0 is treated as "not yet
 * set", so {0, 0} is returned when no direct child is a CDS feature. For
 * transcripts on the reverse strand, start and end are swapped before
 * returning (i.e. start holds the larger coordinate).
 */
GtRange agn_transcript_cds_range(GtFeatureNode *transcript)
{
  gt_assert(transcript);
  GtRange trange;
  trange.start = 0;
  trange.end = 0;

  GtFeatureNodeIterator *iter = gt_feature_node_iterator_new_direct(transcript);
  GtFeatureNode *current;
  for
  (
    current = gt_feature_node_iterator_next(iter);
    current != NULL;
    current = gt_feature_node_iterator_next(iter)
  )
  {
    if(agn_gt_feature_node_is_cds_feature(current))
    {
      GtRange crange = gt_genome_node_get_range((GtGenomeNode *)current);
      if(trange.start == 0 || crange.start < trange.start)
        trange.start = crange.start;
      if(trange.end == 0 || crange.end > trange.end)
        trange.end = crange.end;
    }
  }
  /* the iterator was previously leaked on every call */
  gt_feature_node_iterator_delete(iter);

  if(gt_feature_node_get_strand(transcript) == GT_STRAND_REVERSE)
  {
    GtUword temp = trange.start;
    trange.start = trange.end;
    trange.end = temp;
  }
  return trange;
}
예제 #3
0
/*
 * Locus map visitor callback. <fn> must be a feature of type "locus". For
 * every gene and every mRNA found in the locus' feature graph one line
 * "<feature label>\t<locus label>" is written to the visitor's gene or mRNA
 * file handle, provided that handle is not NULL. Always returns 0.
 */
static int
visit_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn, GtError *error)
{
  AgnLocusMapVisitor *visitor = locus_map_visitor_cast(nv);
  gt_error_check(error);
  agn_assert(gt_feature_node_has_type(fn, "locus"));
  const char *locusname = agn_feature_node_get_label(fn);

  GtFeatureNodeIterator *walker = gt_feature_node_iterator_new(fn);
  GtFeatureNode *feature;
  while((feature = gt_feature_node_iterator_next(walker)) != NULL)
  {
    if(agn_typecheck_gene(feature) && visitor->genefh != NULL)
    {
      fprintf(visitor->genefh, "%s\t%s\n",
              agn_feature_node_get_label(feature), locusname);
    }

    if(agn_typecheck_mrna(feature) && visitor->mrnafh != NULL)
    {
      fprintf(visitor->mrnafh, "%s\t%s\n",
              agn_feature_node_get_label(feature), locusname);
    }
  }
  gt_feature_node_iterator_delete(walker);
  return 0;
}
예제 #4
0
/*
 * Deliver the next node of the filter stream in <*gn>.
 *
 * Cached nodes are handed out first. Otherwise nodes are pulled from the
 * input stream: errors, the end of the stream (<*gn> == NULL) and nodes that
 * are not feature nodes are passed through unchanged. For a feature node,
 * every node of its feature graph whose type is a key in
 * stream->typestokeep gets a new reference and is appended to the cache; the
 * pulled node itself is then released. Pulling repeats until the cache holds
 * at least one node.
 */
static int filter_stream_next(GtNodeStream *ns, GtGenomeNode **gn,
                              GtError *error)
{
  AgnFilterStream *stream;
  gt_error_check(error);
  stream = filter_stream_cast(ns);

  for(;;)
  {
    /* serve from the cache whenever it is non-empty */
    if(gt_queue_size(stream->cache) > 0)
    {
      *gn = gt_queue_get(stream->cache);
      return 0;
    }

    int status = gt_node_stream_next(stream->in_stream, gn, error);
    if(status)
      return status;
    if(*gn == NULL)
      return 0;

    GtFeatureNode *root = gt_feature_node_try_cast(*gn);
    if(root == NULL)
      return 0;

    GtFeatureNodeIterator *walker = gt_feature_node_iterator_new(root);
    GtFeatureNode *feature;
    while((feature = gt_feature_node_iterator_next(walker)) != NULL)
    {
      const char *type = gt_feature_node_get_type(feature);
      if(gt_hashmap_get(stream->typestokeep, type) == NULL)
        continue;

      gt_genome_node_ref((GtGenomeNode *)feature);
      gt_queue_add(stream->cache, feature);
    }
    gt_feature_node_iterator_delete(walker);
    gt_genome_node_delete((GtGenomeNode *)root);
  }
}
예제 #5
0
/*
 * Collect the parts of the exons of <genemodel> (which must be an mRNA) that
 * intersect with the nodes of the feature graph of <alignment>, ignoring
 * nodes of type "match_gap".
 *
 * Returns NULL if gene model and alignment lie on different strands.
 * Otherwise returns a new array of GtRange values holding every intersection
 * that differs from the {0, 0} range. The collected ranges are asserted to be
 * pairwise non-overlapping.
 */
static GtArray*
gaeval_visitor_intersect(GtGenomeNode *genemodel, GtGenomeNode *alignment)
{
  agn_assert(genemodel && alignment);

  GtFeatureNode *mrna = gt_feature_node_cast(genemodel);
  GtFeatureNode *aln = gt_feature_node_cast(alignment);
  agn_assert(gt_feature_node_has_type(mrna, "mRNA"));
  GtStrand mrnastrand = gt_feature_node_get_strand(mrna);
  GtStrand alnstrand = gt_feature_node_get_strand(aln);
  if(mrnastrand != alnstrand)
    return NULL;

  GtArray *overlaps = gt_array_new( sizeof(GtRange) );
  GtArray *exons = agn_typecheck_select(mrna, agn_typecheck_exon);
  GtWord e;
  for(e = 0; e < gt_array_size(exons); e++)
  {
    GtGenomeNode *exon = *(GtGenomeNode **)gt_array_get(exons, e);
    GtRange exonrange = gt_genome_node_get_range(exon);
    GtRange emptyrange = {0, 0};

    GtFeatureNodeIterator *walker = gt_feature_node_iterator_new(aln);
    GtFeatureNode *part;
    while((part = gt_feature_node_iterator_next(walker)) != NULL)
    {
      if(!gt_feature_node_has_type(part, "match_gap"))
      {
        GtRange partrange = gt_genome_node_get_range((GtGenomeNode *) part);
        GtRange shared = gaeval_visitor_range_intersect(&exonrange, &partrange);
        if(gt_range_compare(&shared, &emptyrange) != 0)
          gt_array_add(overlaps, shared);
      }
    }
    gt_feature_node_iterator_delete(walker);
  }
  gt_array_delete(exons);

  /* sanity check: no two collected ranges may overlap */
  GtWord a;
  for(a = 0; a < gt_array_size(overlaps); a++)
  {
    GtRange *first = gt_array_get(overlaps, a);
    GtUword b;
    for(b = a+1; b < gt_array_size(overlaps); b++)
    {
      GtRange *second = gt_array_get(overlaps, b);
      agn_assert(gt_range_overlap(first, second) == false);
    }
  }

  return overlaps;
}
예제 #6
0
파일: AgnUtils.c 프로젝트: jfdenton/AEGeAn
/*
 * Write the exon structure of <transcript> to <outstream> as a
 * GenBank-style location string.
 *
 * The direct children of <transcript> that are exon features are collected
 * and sorted; at least one exon is required (asserted). A single exon is
 * printed as "<start..>end", several exons as
 * "join(<s1..e1,s2..e2,...,sN..>eN)". For transcripts on the reverse strand
 * the whole expression is wrapped in "complement(...)".
 */
void agn_transcript_structure_gbk(GtFeatureNode *transcript, FILE *outstream)
{
  gt_assert(transcript && outstream);

  GtArray *exons = gt_array_new( sizeof(GtFeatureNode *) );
  GtFeatureNodeIterator *iter = gt_feature_node_iterator_new_direct(transcript);
  GtFeatureNode *child;
  for
  (
    child = gt_feature_node_iterator_next(iter);
    child != NULL;
    child = gt_feature_node_iterator_next(iter)
  )
  {
    if(agn_gt_feature_node_is_exon_feature(child))
      gt_array_add(exons, child);
  }
  gt_feature_node_iterator_delete(iter);

  gt_assert(gt_array_size(exons) > 0);
  gt_array_sort(exons, (GtCompare)agn_gt_genome_node_compare);

  if(gt_feature_node_get_strand(transcript) == GT_STRAND_REVERSE)
    fputs("complement(", outstream);

  if(gt_array_size(exons) == 1)
  {
    GtGenomeNode *exon = *(GtGenomeNode **)gt_array_get(exons, 0);
    GtRange exonrange = gt_genome_node_get_range(exon);
    fprintf(outstream, "<%lu..>%lu", exonrange.start, exonrange.end);
  }
  else
  {
    fputs("join(", outstream);
    GtUword i;
    for(i = 0; i < gt_array_size(exons); i++)
    {
      GtGenomeNode *exon = *(GtGenomeNode **)gt_array_get(exons, i);
      GtRange exonrange = gt_genome_node_get_range(exon);

      if(i == 0)
        fprintf(outstream, "<%lu..%lu", exonrange.start, exonrange.end);
      else if(i+1 == gt_array_size(exons))
        fprintf(outstream, ",%lu..>%lu", exonrange.start, exonrange.end);
      else
        fprintf(outstream, ",%lu..%lu", exonrange.start, exonrange.end);
    }
    fputs(")", outstream);
  }

  if(gt_feature_node_get_strand(transcript) == GT_STRAND_REVERSE)
    fputs(")", outstream);

  /* the array only stores borrowed node pointers, but the array itself was
     previously leaked on every call */
  gt_array_delete(exons);
}
예제 #7
0
/*
 * ORF finder visitor callback: walks the feature graph rooted at <gf> and
 * runs the ORF finder on the range of every node whose type is a key in
 * lv->types. If the key "all" is mapped to the value 1, the first visited
 * node is searched regardless of its type and the traversal stops after it.
 *
 * The 1-based node range is converted to 0-based coordinates before it is
 * handed to run_orffinder(). Returns 0 on success or the non-zero code of
 * the first failing run_orffinder() call.
 *
 * An earlier version additionally walked the children of each searched node
 * without doing anything with them; that no-op traversal was removed.
 */
static int gt_orf_finder_visitor_feature_node(GtNodeVisitor *gv,
                                              GtFeatureNode *gf,
                                              GtError *err)
{
  GtORFFinderVisitor *lv;
  const char *gft = NULL;
  GtFeatureNodeIterator *gfi;
  GtFeatureNode *curnode = NULL;
  int had_err = 0;
  GtRange rng;

  lv = gt_orf_finder_visitor_cast(gv);
  gt_assert(lv);
  gt_error_check(err);

  gfi = gt_feature_node_iterator_new(gf);

  while (!had_err && (curnode = gt_feature_node_iterator_next(gfi))) {
    int search_all;

    gft = gt_feature_node_get_type(curnode);
    search_all = gt_hashmap_get(lv->types, (void*) "all") == (void*) 1;

    if (!search_all && gt_hashmap_get(lv->types, (void*) gft) == NULL)
      continue;

    rng = gt_genome_node_get_range((GtGenomeNode*) curnode);
    had_err = run_orffinder(lv->rmap, curnode, rng.start - 1, rng.end - 1,
                            lv->min, lv->max, lv->all, err);
    /* with "all" only the first visited node is searched */
    if (search_all)
      break;
  }

  gt_feature_node_iterator_delete(gfi);

  return had_err;
}
/*
 * Recursively verify that the range of <child> lies within the range of
 * <parent>; a warning naming both features (type, range, file, line) is
 * issued if it does not. The check then descends into the direct children of
 * <child>, with <child> acting as their parent. The return value is the
 * result of the recursive call for the last direct child of <child>, or 0 if
 * there are none.
 */
static int check_boundaries_visitor_check_rec(GtFeatureNode *parent,
                                              GtFeatureNode *child,
                                              GtError *err)
{
  GtGenomeNode *child_gn = (GtGenomeNode*) child,
               *parent_gn = (GtGenomeNode*) parent;
  GtRange child_rng = gt_genome_node_get_range(child_gn),
          parent_rng = gt_genome_node_get_range(parent_gn);
  GtFeatureNodeIterator *grandchildren;
  GtFeatureNode *grandchild;
  int had_err = 0;

  if (!(child_rng.start >= parent_rng.start &&
        child_rng.end <= parent_rng.end)) {
    gt_warning("%s child range " GT_WU "-" GT_WU " (file %s, line %u) not "
               "contained in %s parent range " GT_WU "-" GT_WU " (file %s, "
               "line %u)",
               gt_feature_node_get_type(child),
               child_rng.start, child_rng.end,
               gt_genome_node_get_filename(child_gn),
               gt_genome_node_get_line_number(child_gn),
               gt_feature_node_get_type(parent),
               parent_rng.start, parent_rng.end,
               gt_genome_node_get_filename(parent_gn),
               gt_genome_node_get_line_number(parent_gn));
  }

  /* all direct children are visited, regardless of earlier results */
  grandchildren = gt_feature_node_iterator_new_direct(child);
  for (grandchild = gt_feature_node_iterator_next(grandchildren);
       grandchild != NULL;
       grandchild = gt_feature_node_iterator_next(grandchildren)) {
    had_err = check_boundaries_visitor_check_rec(child, grandchild, err);
  }
  gt_feature_node_iterator_delete(grandchildren);

  return had_err;
}
/*
 * LTR input check visitor callback. Walks the feature graph rooted at <fn>
 * and remembers the last LTR_retrotransposon node as well as the first and
 * the last-seen subsequent long_terminal_repeat node.
 *
 * Fails with -1 (and sets <err>) if
 *  - the visitor's only_ltrs flag is set and no LTR_retrotransposon node was
 *    found, or
 *  - an LTR_retrotransposon node was found but fewer than two
 *    long_terminal_repeat nodes were seen.
 * Returns 0 otherwise.
 */
static int gt_ltr_input_check_visitor_feature_node(GtNodeVisitor *nv,
                                                   GtFeatureNode *fn,
                                                   GtError *err)
{
  GT_UNUSED GtLTRInputCheckVisitor *lv;
  GtFeatureNodeIterator *walker;
  GtFeatureNode *current = NULL,
                *element = NULL,
                *left_ltr = NULL,
                *right_ltr = NULL;
  int had_err = 0;
  lv = gt_ltr_input_check_visitor_cast(nv);
  gt_assert(lv);
  gt_error_check(err);

  /* traverse annotation subgraph and find LTR components */
  walker = gt_feature_node_iterator_new(fn);
  while (!had_err && (current = gt_feature_node_iterator_next(walker))) {
    const char *type = gt_feature_node_get_type(current);
    if (strcmp(type, gt_ft_LTR_retrotransposon) == 0)
      element = current;
    if (strcmp(type, gt_ft_long_terminal_repeat) == 0) {
      /* the first LTR seen is the left one, every later one the right one */
      if (left_ltr == NULL)
        left_ltr = current;
      else
        right_ltr = current;
    }
  }
  gt_feature_node_iterator_delete(walker);

  if (lv->only_ltrs && !had_err && !element) {
    gt_error_set(err, "connected component with %s entry node (%s, line %u) "
                      "does not contain a '%s' node, which is required",
                 gt_feature_node_get_type(fn),
                 gt_genome_node_get_filename((GtGenomeNode*) fn),
                 gt_genome_node_get_line_number((GtGenomeNode*) fn),
                 gt_ft_LTR_retrotransposon);
    had_err = -1;
  }

  if (!had_err && element && !(left_ltr && right_ltr)) {
    gt_error_set(err, "LTR_retrotransposon feature (%s, line %u) "
                      "does not contain two %s child features, both of which "
                      "are required",
                 gt_genome_node_get_filename((GtGenomeNode*) element),
                 gt_genome_node_get_line_number((GtGenomeNode*) element),
                 gt_ft_long_terminal_repeat);
    had_err = -1;
  }

  return had_err;
}
예제 #10
0
/*
 * GAEVAL visitor callback. For every mRNA in the feature graph rooted at
 * <fn>, coverage and integrity are calculated and attached to the mRNA as
 * the attributes "gaeval_coverage" and "gaeval_integrity" (three decimal
 * places each). If the visitor has a TSV output handle, one tab-separated
 * line with ID, label, integrity, coverage, intron count and the first four
 * integrity components is written per mRNA. Always returns 0.
 */
static int
gaeval_visitor_visit_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                  GtError *error)
{
  AgnGaevalVisitor *v = gaeval_visitor_cast(nv);
  gt_error_check(error);

  GtFeatureNodeIterator *feats = gt_feature_node_iterator_new(fn);
  GtFeatureNode *tempfeat;
  for(tempfeat  = gt_feature_node_iterator_next(feats);
      tempfeat != NULL;
      tempfeat  = gt_feature_node_iterator_next(feats))
  {
    if(agn_typecheck_mrna(tempfeat) == false)
      continue;

    double coverage = gaeval_visitor_calculate_coverage(v, tempfeat, error);
    char covstr[16];
    /* bounded formatting: an unexpectedly large value is truncated instead
       of overrunning the stack buffer */
    snprintf(covstr, sizeof covstr, "%.3lf", coverage);
    gt_feature_node_add_attribute(tempfeat, "gaeval_coverage", covstr);

    double integrity_components[5];
    double integrity = gaeval_visitor_calculate_integrity(
        v, tempfeat, coverage, integrity_components, error
    );
    char intstr[16];
    snprintf(intstr, sizeof intstr, "%.3lf", integrity);
    gt_feature_node_add_attribute(tempfeat, "gaeval_integrity", intstr);

    if(v->tsvout)
    {
      const char *mrnaid = gt_feature_node_get_attribute(tempfeat, "ID");
      const char *mrnalabel = agn_feature_node_get_label(tempfeat);
      GtUword num_introns = agn_typecheck_count(tempfeat, agn_typecheck_intron);
      fprintf(v->tsvout, "%s\t%s\t%s\t%s\t%lu\t%.3lf\t%.3lf\t%.3lf\t%.3lf\n",
              mrnaid, mrnalabel, intstr, covstr, num_introns,
              integrity_components[0], integrity_components[1],
              integrity_components[2], integrity_components[3]);
    }
  }
  gt_feature_node_iterator_delete(feats);

  return 0;
}
예제 #11
0
/*
 * Return a new array (of GtFeatureNode pointers) containing every node in
 * the feature graph rooted at <fn> that has <cds_feature> as a direct child.
 * A node is added once per matching direct child. The caller owns the
 * returned array.
 */
static GtArray* find_cds_parents(GtFeatureNode *cds_feature, GtFeatureNode *fn)
{
  GtFeatureNodeIterator *all_nodes, *direct_children;
  GtFeatureNode *candidate, *kid;
  GtArray *result;
  gt_assert(cds_feature && fn);
  result = gt_array_new(sizeof (GtFeatureNode*));
  all_nodes = gt_feature_node_iterator_new(fn);
  for (candidate = gt_feature_node_iterator_next(all_nodes);
       candidate != NULL;
       candidate = gt_feature_node_iterator_next(all_nodes)) {
    direct_children = gt_feature_node_iterator_new_direct(candidate);
    for (kid = gt_feature_node_iterator_next(direct_children);
         kid != NULL;
         kid = gt_feature_node_iterator_next(direct_children)) {
      if (kid != cds_feature)
        continue;
      gt_array_add(result, candidate);
    }
    gt_feature_node_iterator_delete(direct_children);
  }
  gt_feature_node_iterator_delete(all_nodes);
  return result;
}
예제 #12
0
/*
 * Infer-CDS visitor callback. For each mRNA in the feature graph rooted at
 * <fn>, the mRNA's CDS, UTR, exon, start codon and stop codon features are
 * selected into the visitor's working arrays, the inference and check steps
 * are run in a fixed order, and the working arrays are released again.
 * Always returns 0.
 */
static int infer_cds_visitor_visit_feature_node(GtNodeVisitor *nv,
                                                GtFeatureNode *fn,
                                                GtError *error)
{
  AgnInferCDSVisitor *visitor = infer_cds_visitor_cast(nv);
  gt_error_check(error);

  GtFeatureNodeIterator *walker = gt_feature_node_iterator_new(fn);
  GtFeatureNode *feature;
  while((feature = gt_feature_node_iterator_next(walker)) != NULL)
  {
    if(agn_typecheck_mrna(feature))
    {
      /* set up the per-mRNA working state */
      visitor->cds    = agn_typecheck_select(feature, agn_typecheck_cds);
      visitor->utrs   = agn_typecheck_select(feature, agn_typecheck_utr);
      visitor->exons  = agn_typecheck_select(feature, agn_typecheck_exon);
      visitor->starts = agn_typecheck_select(feature,
                                             agn_typecheck_start_codon);
      visitor->stops  = agn_typecheck_select(feature,
                                             agn_typecheck_stop_codon);
      visitor->mrna   = feature;

      infer_cds_visitor_infer_cds(visitor);
      infer_cds_visitor_check_start(visitor);
      infer_cds_visitor_check_stop(visitor);
      infer_cds_visitor_infer_utrs(visitor);
      infer_cds_visitor_check_cds_multi(visitor);
      infer_cds_visitor_check_cds_phase(visitor);
      infer_cds_visitor_set_utrs(visitor);

      /* tear the working state down again */
      visitor->mrna = NULL;
      gt_array_delete(visitor->cds);
      gt_array_delete(visitor->utrs);
      gt_array_delete(visitor->exons);
      gt_array_delete(visitor->starts);
      gt_array_delete(visitor->stops);
    }
  }
  gt_feature_node_iterator_delete(walker);

  return 0;
}
예제 #13
0
/*
 * Lua binding: advance the feature node iterator given as first argument.
 * Pushes a new reference to the next feature node, or nil if the iterator
 * yields no further node. Always returns one result.
 */
static int feature_node_iterator_lua_next(lua_State *L)
{
  GtFeatureNodeIterator **iter = check_gt_feature_node_iterator(L, 1);
  GtFeatureNode *following = gt_feature_node_iterator_next(*iter);
  if (following == NULL) {
    lua_pushnil(L);
    return 1;
  }
  gt_lua_genome_node_push(L, gt_genome_node_ref((GtGenomeNode*) following));
  return 1;
}
예제 #14
0
/*
 * Extract-feature visitor callback. For every node in the feature graph
 * rooted at <fn>, the sequence of the configured feature type is extracted;
 * each non-empty sequence is numbered with the visitor's running counter,
 * given a description and shown via show_entry().
 *
 * Sequence IDs and target IDs are only collected if the visitor's seqid and
 * target options are set. Returns 0 on success, -1 if the extraction fails,
 * or the non-zero result of show_entry(); processing stops at the first
 * error.
 */
static int extract_feature_visitor_feature_node(GtNodeVisitor *nv,
                                                GtFeatureNode *fn, GtError *err)
{
  GtExtractFeatureVisitor *efv;
  GtFeatureNodeIterator *walker;
  GtFeatureNode *node;
  GtStrArray *targets = NULL;
  GtStr *seqid = NULL,
        *desc,
        *seq;
  int had_err = 0;
  gt_error_check(err);
  efv = gt_extract_feature_visitor_cast(nv);
  gt_assert(efv->region_mapping);
  walker = gt_feature_node_iterator_new(fn);
  if (efv->target)
    targets = gt_str_array_new();
  if (efv->seqid)
    seqid = gt_str_new();
  desc = gt_str_new();
  seq = gt_str_new();

  while (!had_err) {
    node = gt_feature_node_iterator_next(walker);
    if (!node)
      break;

    /* clear what the previous node left in the optional collectors */
    if (seqid)
      gt_str_reset(seqid);
    if (targets)
      gt_str_array_reset(targets);

    if (gt_extract_feature_sequence(seq, (GtGenomeNode*) node, efv->type,
                                    efv->join, seqid, targets,
                                    efv->region_mapping, err)) {
      had_err = -1;
      continue;
    }
    if (!gt_str_length(seq))
      continue;

    efv->fastaseq_counter++;
    construct_description(desc, efv->type, efv->fastaseq_counter,
                          efv->join, efv->translate, seqid, targets);
    had_err = show_entry(desc, seq, efv->translate, efv->width,
                         efv->outfp);
    gt_str_reset(desc);
    gt_str_reset(seq);
  }

  gt_str_delete(seq);
  gt_str_delete(desc);
  gt_str_delete(seqid);
  gt_str_array_delete(targets);
  gt_feature_node_iterator_delete(walker);
  return had_err;
}
예제 #15
0
/*
 * Check the phases of the CDS features that are direct children of <fn>.
 *
 * Plain CDS children are gathered in one array and checked together via
 * check_cds_phases(). Multi-feature CDS children are grouped by their
 * multi-feature representative (a group is created holding the
 * representative, later members of the same group are appended) and each
 * group is checked via check_cds_phases_hm(). Both containers are only
 * allocated when needed. Returns 0 on success or the first non-zero code.
 */
static int check_cds_phases_if_necessary(GtFeatureNode *fn,
                                         GtCDSCheckVisitor *v,
                                         bool second_pass, GtError *err)
{
  GtFeatureNodeIterator *children;
  GtFeatureNode *kid;
  GtArray *plain_cds = NULL;
  GtHashmap *multi_cds = NULL;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(fn);

  children = gt_feature_node_iterator_new_direct(fn);
  for (kid = gt_feature_node_iterator_next(children);
       kid != NULL;
       kid = gt_feature_node_iterator_next(children)) {
    if (!gt_feature_node_has_type(kid, gt_ft_CDS))
      continue;

    if (!gt_feature_node_is_multi(kid)) {
      if (!plain_cds)
        plain_cds = gt_array_new(sizeof (GtFeatureNode*));
      gt_array_add(plain_cds, kid);
    }
    else {
      GtFeatureNode *rep;
      GtArray *group;
      if (!multi_cds) {
        multi_cds = gt_hashmap_new(GT_HASH_DIRECT, NULL,
                                   (GtFree) gt_array_delete);
      }
      rep = gt_feature_node_get_multi_representative(kid);
      group = gt_hashmap_get(multi_cds, rep);
      if (group) {
        gt_array_add(group, kid);
      }
      else {
        /* new group: it starts out with the representative itself */
        group = gt_array_new(sizeof (GtFeatureNode*));
        gt_array_add(group, rep);
        gt_hashmap_add(multi_cds, rep, group);
      }
    }
  }

  if (plain_cds)
    had_err = check_cds_phases(plain_cds, v, false, second_pass, err);
  if (!had_err && multi_cds)
    had_err = gt_hashmap_foreach(multi_cds, check_cds_phases_hm, v, err);

  gt_array_delete(plain_cds);
  gt_hashmap_delete(multi_cds);
  gt_feature_node_iterator_delete(children);
  return had_err;
}
예제 #16
0
/*
 * Usage example for the feature node iterator: creates a standard gene,
 * iterates over all nodes of its feature graph and cleans up afterwards.
 * Always returns 0.
 */
int gt_feature_node_iterator_example(GT_UNUSED GtError *err)
{
  GtFeatureNode *gene = (GtFeatureNode*) gt_feature_node_new_standard_gene();
  GtFeatureNodeIterator *walker = gt_feature_node_iterator_new(gene);
  GtFeatureNode *current;

  /* an example genome node iterator use case */
  for (current = gt_feature_node_iterator_next(walker);
       current != NULL;
       current = gt_feature_node_iterator_next(walker)) {
    /* do something with <current> */
  }

  gt_feature_node_iterator_delete(walker);
  gt_genome_node_delete((GtGenomeNode*) gene);
  return 0;
}
/*
 * Check-boundaries visitor callback: runs the recursive boundary check for
 * each direct child of <fn>, stopping early if a check reports a non-zero
 * result. The result of the checks is not propagated; this callback always
 * returns 0.
 */
static int check_boundaries_visitor_feature_node(GT_UNUSED GtNodeVisitor *nv,
                                                 GtFeatureNode *fn,
                                                 GT_UNUSED GtError *err)
{
  GtFeatureNodeIterator *children = gt_feature_node_iterator_new_direct(fn);
  GtFeatureNode *kid;
  int status = 0;

  while (status == 0) {
    kid = gt_feature_node_iterator_next(children);
    if (kid == NULL)
      break;
    status = check_boundaries_visitor_check_rec(fn, kid, err);
  }
  gt_feature_node_iterator_delete(children);

  return 0;
}
예제 #18
0
/*
 * Node visitor callback: runs the CDS phase check on every node of the
 * feature graph rooted at <fn>, stopping at the first error, and resets the
 * visitor's CDS feature map afterwards. Returns 0 on success or the first
 * non-zero code reported by the check.
 */
static int cds_check_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                          GtError *err)
{
  GtCDSCheckVisitor *visitor = cds_check_visitor_cast(nv);
  GtFeatureNodeIterator *walker;
  GtFeatureNode *current;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(visitor && fn);

  walker = gt_feature_node_iterator_new(fn);
  while (!had_err) {
    current = gt_feature_node_iterator_next(walker);
    if (!current)
      break;
    had_err = check_cds_phases_if_necessary(current, visitor, err);
  }
  gt_feature_node_iterator_delete(walker);

  gt_hashmap_reset(visitor->cds_features);
  return had_err;
}
예제 #19
0
/*
 * If <gn> is a feature node, look at every node of its feature graph and,
 * for each one carrying a "Target" attribute, extract that target from
 * <seqfiles>. Nodes of other classes are ignored. Returns 0 on success or
 * the first non-zero code of extracttarget_from_seqfiles().
 */
static int extracttarget_from_node(GtGenomeNode *gn, GtStrArray *seqfiles,
                                   GtError *err)
{
  GtFeatureNodeIterator *walker;
  GtFeatureNode *node;
  const char *target_attr;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(gn && seqfiles);

  if (!gt_genome_node_cast(gt_feature_node_class(), gn))
    return had_err;

  walker = gt_feature_node_iterator_new(gt_feature_node_cast(gn));
  while (!had_err) {
    /* XXX remove cast */
    node = (GtFeatureNode*) gt_feature_node_iterator_next(walker);
    if (!node)
      break;
    target_attr = gt_feature_node_get_attribute(node, "Target");
    if (target_attr)
      had_err = extracttarget_from_seqfiles(target_attr, seqfiles, err);
  }
  gt_feature_node_iterator_delete(walker);
  return had_err;
}
/*
 * Advance to the next feature whose type equals
 * seqpos_classifier->specified_ft and store its range, converted from
 * 1-based to 0-based coordinates, in <*range>.
 *
 * Whenever the current feature node iterator is missing or exhausted, the
 * next feature node is fetched via gt_seqpos_classifier_next_fn(). If that
 * fails or no further feature node exists, <*end_of_annotation> is set to
 * true and the error code (0 at the regular end) is returned. When a
 * matching feature is found, the classifier's hit counter is incremented,
 * <*end_of_annotation> is set to false and 0 is returned.
 */
static int gt_seqpos_classifier_next_specified_ft(
    GtSeqposClassifier *seqpos_classifier, GtRange *range,
    bool *end_of_annotation, GtError *err)
{
  int had_err = 0;
  GtFeatureNode *cfn;
  bool fni_exhausted;
  /* validate the arguments before the classifier is dereferenced */
  gt_assert(seqpos_classifier != NULL);
  gt_assert(range != NULL);
  fni_exhausted = (seqpos_classifier->fni == NULL) ? true : false;
  while (true)
  {
    if (fni_exhausted)
    {
      had_err = gt_seqpos_classifier_next_fn(seqpos_classifier, err);
      if (had_err != 0 || seqpos_classifier->fn == NULL)
      {
        *end_of_annotation = true;
        return had_err;
      }
      fni_exhausted = false;
    }
    gt_assert(seqpos_classifier->fni != NULL);
    cfn = gt_feature_node_iterator_next(seqpos_classifier->fni);
    if (cfn == NULL)
    {
      fni_exhausted = true;
    }
    else if (strcmp(gt_feature_node_get_type(cfn),
          seqpos_classifier->specified_ft) == 0)
    {
      seqpos_classifier->nof_specified_ft_found++;
      *range = gt_genome_node_get_range((GtGenomeNode*)cfn);
      gt_assert(range->start > 0);
      gt_assert(range->end > 0);
      /* convert to 0-based coordinates */
      range->start--;
      range->end--;
      *end_of_annotation = false;
      return had_err;
    }
  }
}
예제 #21
0
/*
 * Attach an ORF to the feature graph rooted at <gf>.
 *
 * <orf_rng> is shifted by one on both ends (0-based to 1-based). The graph
 * is searched for nodes that are not of type GT_ORF_TYPE and whose range
 * contains the ORF; the last such node in iteration order becomes the
 * parent. If a parent is found, a new GT_ORF_TYPE feature with the given
 * strand, the ORF finder source tag and a "frame" attribute is added as its
 * child. Without a suitable parent nothing is attached.
 */
static void orf_attach_results_to_gff3(GtFeatureNode *gf,
                                       GtRange orf_rng, unsigned int orf_frame,
                                       GtStrand strand, GT_UNUSED GtError *err)
{
  GtGenomeNode *child;
  GtStr *tag;
  GtFeatureNodeIterator *gfi;
  GtFeatureNode *curnode = NULL, *parent_node = NULL;
  GtRange gfi_range;
  char frame_buf[16];

  orf_rng.start++; orf_rng.end++;

  /* bounded formatting with the conversion matching the unsigned argument */
  (void) snprintf(frame_buf, sizeof frame_buf, "%u", orf_frame);

  gfi = gt_feature_node_iterator_new(gf);
  while ((curnode = gt_feature_node_iterator_next(gfi))) {
    if (strcmp(gt_feature_node_get_type(curnode),
                                              (const char*) GT_ORF_TYPE) != 0) {
      gfi_range = gt_genome_node_get_range((GtGenomeNode*) curnode);
      if (gt_range_contains(&gfi_range, &orf_rng)) {
        parent_node = curnode;
      }
    }
  }
  if (parent_node) {
    tag = gt_str_new_cstr(GT_ORF_FINDER_TAG);
    child = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) gf),
                                GT_ORF_TYPE,
                                orf_rng.start,
                                orf_rng.end,
                                strand);
    gt_feature_node_set_source((GtFeatureNode*) child, tag);
    gt_feature_node_set_attribute((GtFeatureNode*) child, "frame", frame_buf);
    gt_feature_node_add_child(parent_node,(GtFeatureNode*) child);
    gt_str_delete(tag);
  }
  gt_feature_node_iterator_delete(gfi);
}
/*
 * Deliver the next node of the input stream in <*gn>, annotating gene
 * features on the way.
 *
 * Every node pulled from the input stream is passed on unchanged in <*gn>.
 * If the node is a feature node of the gene type (or a pseudo node that
 * contains one), has a "Name" attribute and a sequence ID of the form
 * "Chr<number>", the transcription start site is taken to be the start
 * (forward strand) or the end (otherwise) of the gene; the name returned by
 * CpGIOverlap_stream_find_gene_overlap() for that position is stored in the
 * gene's "cpgi_at_tss" attribute.
 *
 * Returns 0 on success and at the end of the stream (<*gn> == NULL), or the
 * error code reported by the input stream.
 */
static int CpGIOverlap_stream_next(GtNodeStream * ns,
                                   GtGenomeNode ** gn,
                                   GtError * err)
{
    CpGIOverlap_stream * context = CpGIOverlap_stream_cast(ns);
    GtGenomeNode * cur_node = NULL;
    GtFeatureNode * gene;
    const char * gene_name;
    const char * overlap_name;
    int chr_num;
    unsigned int TSS;
    int err_num;

    *gn = NULL;

    // propagate input stream failures instead of reporting end of stream
    err_num = gt_node_stream_next(context->in_stream, &cur_node, err);
    if (err_num != 0)
        return err_num;
    if (cur_node == NULL)
        return 0;

    *gn = cur_node;

    // only feature nodes can be genes
    if (!gt_genome_node_try_cast(gt_feature_node_class(), cur_node))
        return 0;
    gene = (GtFeatureNode *) cur_node;

    // for a pseudo node, use the first gene found in its feature graph
    if (gt_feature_node_is_pseudo(gene))
    {
        GtFeatureNodeIterator * iter = gt_feature_node_iterator_new(gene);
        GtFeatureNode * candidate;

        if (iter == NULL)
            return 0;
        while ((candidate = gt_feature_node_iterator_next(iter)) != NULL
               && !gt_feature_node_has_type(candidate, feature_type_gene))
            ;
        gt_feature_node_iterator_delete(iter);
        if (candidate == NULL)
            return 0;
        gene = candidate;
    }

    if (!gt_feature_node_has_type(gene, feature_type_gene))
        return 0;

    // find name of gene
    gene_name = gt_feature_node_get_attribute(gene, "Name");
    if (gene_name == NULL)
        return 0;

    if (1 != sscanf(gt_str_get(gt_genome_node_get_seqid((GtGenomeNode *) gene)),
                    "Chr%d", &chr_num))
        return 0;

    TSS = (gt_feature_node_get_strand(gene) == GT_STRAND_FORWARD)
              ? gt_genome_node_get_start((GtGenomeNode *) gene)
              : gt_genome_node_get_end((GtGenomeNode *) gene);

    // now figure out the overlapping gene
    overlap_name = CpGIOverlap_stream_find_gene_overlap(context, TSS, chr_num);
    if (!overlap_name)
        return 0;

    // save the overlap name into the node
    gt_feature_node_set_attribute(gene, "cpgi_at_tss", overlap_name);

    return 0;
}
예제 #23
0
/*
 * Annotate feature nodes with the number of the cluster they belong to.
 *
 * Phase 1 walks the feature graphs of all feature nodes in <nodes> and
 * builds a map from description strings to feature nodes:
 *  - a repeat_region node defines the current prefix "<seqid>_<id>", where
 *    <id> is parsed from an ID attribute of the form "repeat_region<id>";
 *  - protein_match nodes whose "name" attribute equals <feature>, and nodes
 *    of type <feature> itself ("lLTR"/"rLTR" stand for long_terminal_repeat),
 *    are registered under "<prefix>_<start>_<end>" if their range covers at
 *    least 10 positions.
 * Phase 2 iterates over all clusters of <cs>; for each element, the
 * description of the corresponding sequence in <encseq> is looked up in the
 * map and the node found gets the attribute "clid" set to the cluster
 * number. Elements without a registered node are skipped.
 *
 * Returns had_err, which no code path in this function sets to a non-zero
 * value.
 */
static int cluster_annotate_nodes(GtClusteredSet *cs, GtEncseq *encseq,
                                  const char *feature, GtArray *nodes,
                                  GtError *err)
{
  GtFeatureNodeIterator *fni;
  GtFeatureNode *curnode = NULL, *tmp;
  GtClusteredSetIterator *csi = NULL;
  GtGenomeNode *gn;
  GtHashmap *desc2node;
  GtStr *seqid = NULL;
  int had_err = 0;
  unsigned long num_of_clusters, i, elm;
  const char *fnt = NULL;
  char buffer[BUFSIZ], *real_feature;
  gt_error_check(err);

  /* the prefix is read by later branches; make sure it is a valid (empty)
     string even if no repeat_region node has been seen yet */
  buffer[0] = '\0';

  if ((strcmp(feature, "lLTR") == 0) || (strcmp(feature, "rLTR") == 0))
    real_feature = gt_cstr_dup(gt_ft_long_terminal_repeat);
  else
    real_feature = gt_cstr_dup(feature);

  /* NOTE(review): free_hash presumably releases the duplicated key strings
     -- confirm against its definition */
  desc2node = gt_hashmap_new(GT_HASH_STRING, free_hash, NULL);
  for (i = 0; i < gt_array_size(nodes); i++) {
    gn = *(GtGenomeNode**) gt_array_get(nodes, i);
    if (gt_feature_node_try_cast(gn) == NULL)
      continue;
    fni = gt_feature_node_iterator_new((GtFeatureNode*) gn);
    while ((curnode = gt_feature_node_iterator_next(fni)) != NULL) {
      char header[BUFSIZ];
      fnt = gt_feature_node_get_type(curnode);
      if (strcmp(fnt, gt_ft_repeat_region) == 0) {
        const char *rid;
        /* stays 0 if the ID attribute is missing or has another format */
        unsigned long id = 0;
        seqid = gt_genome_node_get_seqid((GtGenomeNode*) curnode);
        rid = gt_feature_node_get_attribute(curnode, "ID");
        if (rid != NULL)
          (void) sscanf(rid, "repeat_region%lu", &id);
        (void) snprintf(buffer, BUFSIZ, "%s_%lu", gt_str_get(seqid), id);
      } else if (strcmp(fnt, gt_ft_protein_match) == 0) {
        GtRange range;
        const char *attr;
        attr = gt_feature_node_get_attribute(curnode, "name");
        if (!attr)
          continue;
        if (strcmp(feature, attr) != 0)
          continue;
        range = gt_genome_node_get_range((GtGenomeNode*) curnode);
        if ((range.end - range.start + 1) < 10UL)
          continue;
        (void) snprintf(header, BUFSIZ, "%s_%lu_%lu", buffer, range.start,
                        range.end);
        gt_hashmap_add(desc2node, (void*) gt_cstr_dup(header), (void*) curnode);
      } else if (strcmp(fnt, real_feature) == 0) {
        GtRange range;
        range = gt_genome_node_get_range((GtGenomeNode*) curnode);
        if ((range.end - range.start + 1) < 10UL)
          continue;
        (void) snprintf(header, BUFSIZ, "%s_%lu_%lu", buffer, range.start,
                        range.end);
        gt_hashmap_add(desc2node, (void*) gt_cstr_dup(header), (void*) curnode);
      }
    }
    gt_feature_node_iterator_delete(fni);
  }
  gt_free(real_feature);

  num_of_clusters = gt_clustered_set_num_of_clusters(cs, err);
  for (i = 0; i < num_of_clusters; i++) {
    csi = gt_clustered_set_get_iterator(cs, i ,err);
    if (csi != NULL) {
      while (!had_err && (gt_clustered_set_iterator_next(csi, &elm, err)
             != GT_CLUSTERED_SET_ITERATOR_STATUS_END)) {
        char clid[BUFSIZ];
        const char *encseqdesc;
        char *encseqid;
        unsigned long desclen;
        /* make a NUL-terminated copy of the description to use as key */
        encseqdesc = gt_encseq_description(encseq, &desclen, elm);
        encseqid = gt_calloc((size_t) (desclen + 1), sizeof (char));
        (void) strncpy(encseqid, encseqdesc, (size_t) desclen);
        encseqid[desclen] = '\0';
        tmp = (GtFeatureNode*) gt_hashmap_get(desc2node, (void*) encseqid);
        /* do not hand a failed lookup on to the attribute setter */
        if (tmp != NULL) {
          (void) snprintf(clid, BUFSIZ, "%lu", i);
          gt_feature_node_set_attribute(tmp, "clid", clid);
        }
        gt_free(encseqid);
      }
    }
    gt_clustered_set_iterator_delete(csi, err);
    csi = NULL;
  }
  gt_hashmap_delete(desc2node);
  return had_err;
}
예제 #24
0
/* Node visitor callback for feature nodes.
   Walks the subgraph rooted at <fn> and remembers the last node whose type
   string equals <lv->root_type> in <lv->ltr_retrotrans>. If such an element is
   known, its sequence is extracted, translated in three forward frames and
   three frames of the reverse complement, and the six translations are piped
   as FASTA into an external `hmmscan` process (non-Windows only). The tool's
   output is passed to gt_ltrdigest_pdom_visitor_parse_output() and
   gt_ltrdigest_pdom_visitor_process_hits(); finally
   gt_ltrdigest_pdom_visitor_choose_strand() is run on the element.
   Returns 0 on success and a non-zero value on failure; translation errors
   are reported through <err>. */
static int gt_ltrdigest_pdom_visitor_feature_node(GtNodeVisitor *nv,
                                                  GtFeatureNode *fn,
                                                  GtError *err)
{
  GtLTRdigestPdomVisitor *lv;
  GtFeatureNodeIterator *fni;
  GtFeatureNode *curnode = NULL;
  int had_err = 0;
  GtRange rng;
  GtUword i;
  lv = gt_ltrdigest_pdom_visitor_cast(nv);
  gt_assert(lv);
  gt_error_check(err);

  /* traverse annotation subgraph and find LTR element */
  /* NOTE(review): lv->ltr_retrotrans is only assigned on a match and never
     cleared here, so a value left over from an earlier call is reused when
     this subgraph contains no matching node -- confirm this is intended. */
  fni = gt_feature_node_iterator_new(fn);
  while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) {
    if (strcmp(gt_feature_node_get_type(curnode), lv->root_type) == 0) {
      lv->ltr_retrotrans = curnode;
    }
  }
  gt_feature_node_iterator_delete(fni);

  if (!had_err && lv->ltr_retrotrans != NULL) {
    GtCodonIterator *ci;
    GtTranslator *tr;
    GtTranslatorStatus status;
    GtUword seqlen;
    char translated, *rev_seq;
#ifndef _WIN32
    FILE *instream;
    GtHMMERParseStatus *pstatus;
#endif
    unsigned int frame;
    GtStr *seq;

    seq = gt_str_new();
    rng = gt_genome_node_get_range((GtGenomeNode*) lv->ltr_retrotrans);
    /* element boundaries, shifted by one relative to the node's range */
    lv->leftLTR_5 = rng.start - 1;
    lv->rightLTR_3 = rng.end - 1;
    seqlen = gt_range_length(&rng);

    had_err = gt_extract_feature_sequence(seq,
                                          (GtGenomeNode*) lv->ltr_retrotrans,
                                          lv->root_type,
                                          false, NULL, NULL, lv->rmap, err);

    if (!had_err) {
      for (i = 0UL; i < 3UL; i++) {
        gt_str_reset(lv->fwd[i]);
        gt_str_reset(lv->rev[i]);
      }

      /* create translations */
      ci = gt_codon_iterator_simple_new(gt_str_get(seq), seqlen, NULL);
      gt_assert(ci);
      tr = gt_translator_new(ci);
      status = gt_translator_next(tr, &translated, &frame, err);
      while (status == GT_TRANSLATOR_OK && translated) {
        gt_str_append_char(lv->fwd[frame], translated);
        /* pass <err> on every step so that a failure in the middle of the
           sequence leaves an error message behind instead of a bare -1 */
        status = gt_translator_next(tr, &translated, &frame, err);
      }
      if (status == GT_TRANSLATOR_ERROR) had_err = -1;
      if (!had_err) {
        /* rev_seq is a raw, non-terminated copy; all consumers below are
           given its length explicitly */
        rev_seq = gt_malloc((size_t) seqlen * sizeof (char));
        strncpy(rev_seq, gt_str_get(seq), (size_t) seqlen * sizeof (char));
        (void) gt_reverse_complement(rev_seq, seqlen, NULL);
        gt_codon_iterator_delete(ci);
        ci = gt_codon_iterator_simple_new(rev_seq, seqlen, NULL);
        gt_translator_set_codon_iterator(tr, ci);
        status = gt_translator_next(tr, &translated, &frame, err);
        while (status == GT_TRANSLATOR_OK && translated) {
          gt_str_append_char(lv->rev[frame], translated);
          status = gt_translator_next(tr, &translated, &frame, err);
        }
        if (status == GT_TRANSLATOR_ERROR) had_err = -1;
        gt_free(rev_seq);
      }
      gt_codon_iterator_delete(ci);
      gt_translator_delete(tr);
    }

    /* run HMMER and handle results */
    if (!had_err) {
#ifndef _WIN32
      /* pc: parent -> child (hmmscan stdin), cp: child -> parent (stdout) */
      int pid, pc[2], cp[2];
      GT_UNUSED int rval;

      (void) signal(SIGCHLD, SIG_IGN); /* XXX: for now, ignore child's
                                               exit status */
      rval = pipe(pc);
      gt_assert(rval == 0);
      rval = pipe(cp);
      gt_assert(rval == 0);

      switch ((pid = (int) fork())) {
        case -1:
          perror("Can't fork");
          exit(1);   /* XXX: error handling */
        case 0:    /* child */
          (void) close(1);    /* close current stdout. */
          rval = dup(cp[1]);  /* make stdout go to write end of pipe. */
          (void) close(0);    /* close current stdin. */
          rval = dup(pc[0]);  /* make stdin come from read end of pipe. */
          (void) close(pc[0]);
          (void) close(pc[1]);
          (void) close(cp[0]);
          (void) close(cp[1]);
          (void) execvp("hmmscan", lv->args); /* XXX: read path from env */
          perror("couldn't execute hmmscan!");
          exit(1);
        default:    /* parent */
          /* write one FASTA record per frame and strand; the header is
             ">" + frame number + '+' or '-' */
          for (i = 0UL; i < 3UL; i++) {
            char buf[5];
            GT_UNUSED ssize_t written;
            (void) sprintf(buf, ">"GT_WU"%c\n", i, '+');
            written = write(pc[1], buf, 4 * sizeof (char));
            written = write(pc[1], gt_str_get(lv->fwd[i]),
                            (size_t) gt_str_length(lv->fwd[i]) * sizeof (char));
            written = write(pc[1], "\n", 1 * sizeof (char));
            (void) sprintf(buf, ">"GT_WU"%c\n", i, '-');
            written = write(pc[1], buf, 4 * sizeof (char));
            written = write(pc[1], gt_str_get(lv->rev[i]),
                            (size_t) gt_str_length(lv->rev[i]) * sizeof (char));
            written = write(pc[1], "\n", 1 * sizeof (char));
          }
          /* closing the write end signals end of input to the child */
          (void) close(pc[0]);
          (void) close(pc[1]);
          (void) close(cp[1]);
          instream = fdopen(cp[0], "r");
          pstatus = gt_hmmer_parse_status_new();
          had_err = gt_ltrdigest_pdom_visitor_parse_output(lv, pstatus,
                                                           instream, err);
          (void) fclose(instream);
          if (!had_err)
            had_err = gt_ltrdigest_pdom_visitor_process_hits(lv, pstatus, err);
          gt_hmmer_parse_status_delete(pstatus);
      }
#else
      /* XXX */
      gt_error_set(err, "HMMER call not implemented on Windows\n");
      had_err = -1;
#endif
    }
    gt_str_delete(seq);
  }
  /* the strand selection iterates over lv->ltr_retrotrans, so it must only
     run when an element is actually known */
  if (!had_err && lv->ltr_retrotrans != NULL)
    had_err = gt_ltrdigest_pdom_visitor_choose_strand(lv);
  return had_err;
}
예제 #25
0
/* Decides the strand of <lv->ltr_retrotrans> from the protein match features
   in its subgraph.
   For every protein match on the forward or reverse strand the logarithm of
   its score is added to a per-strand sum. If matches exist on one strand
   only, the element gets that strand. If they exist on both, the forward
   strand is chosen when gt_double_compare() ranks the forward sum below the
   reverse sum (presumably: when it is smaller), otherwise the reverse strand;
   all protein matches whose strand differs from the chosen one are then
   removed from the element. Without any such matches the element is left
   untouched. Always returns 0. */
static int gt_ltrdigest_pdom_visitor_choose_strand(GtLTRdigestPdomVisitor *lv)
{
  GtFeatureNodeIterator *it;
  GtFeatureNode *node = NULL;
  GtArray *losers;
  GtStrand node_strand,
           winner;
  double node_score,
         fwd_sum = 0.0,
         rev_sum = 0.0;
  bool have_fwd = false,
       have_rev = false;
  GtUword idx;

  /* pass 1: accumulate log scores per strand */
  it = gt_feature_node_iterator_new(lv->ltr_retrotrans);
  for (node = gt_feature_node_iterator_next(it);
       node != NULL;
       node = gt_feature_node_iterator_next(it)) {
    if (strcmp(gt_feature_node_get_type(node), gt_ft_protein_match) != 0)
      continue;
    node_strand = gt_feature_node_get_strand(node);
    node_score = (double) gt_feature_node_get_score(node);
    switch (node_strand) {
      case GT_STRAND_FORWARD:
        fwd_sum += log(node_score);
        have_fwd = true;
        break;
      case GT_STRAND_REVERSE:
        rev_sum += log(node_score);
        have_rev = true;
        break;
      default:
        /* matches on any other strand do not take part in the decision */
        break;
    }
  }
  gt_feature_node_iterator_delete(it);

  /* nothing to decide on */
  if (!have_fwd && !have_rev)
    return 0;

  /* unambiguous: matches on exactly one strand */
  if (have_fwd != have_rev) {
    gt_feature_node_set_strand(lv->ltr_retrotrans,
                               have_fwd ? GT_STRAND_FORWARD
                                        : GT_STRAND_REVERSE);
    return 0;
  }

  /* ambiguous: both strands have matches, compare the sums */
  gt_assert(have_rev && have_fwd);
  winner = (gt_double_compare(fwd_sum, rev_sum) < 0) ? GT_STRAND_FORWARD
                                                     : GT_STRAND_REVERSE;
  gt_feature_node_set_strand(lv->ltr_retrotrans, winner);

  /* pass 2: collect the matches that disagree with the chosen strand; they
     are removed only after the iteration has finished */
  losers = gt_array_new(sizeof (GtFeatureNode*));
  it = gt_feature_node_iterator_new(lv->ltr_retrotrans);
  for (node = gt_feature_node_iterator_next(it);
       node != NULL;
       node = gt_feature_node_iterator_next(it)) {
    if (strcmp(gt_feature_node_get_type(node), gt_ft_protein_match) == 0
          && winner != gt_feature_node_get_strand(node)) {
      gt_array_add(losers, node);
    }
  }
  gt_feature_node_iterator_delete(it);
  gt_assert(gt_array_size(losers) > 0);

  for (idx = 0; idx < gt_array_size(losers); idx++) {
    GtFeatureNode *victim = *(GtFeatureNode**) gt_array_get(losers, idx);
    gt_feature_node_remove_leaf(lv->ltr_retrotrans, victim);
  }
  gt_array_delete(losers);
  return 0;
}
/* Appends the sequence belonging to feature node <gn> to <sequence>.
   - <type>: feature type to extract; compared by pointer with the node's
     type in the non-join case.
   - <join>: if true, the direct children of <gn> are handed one by one to
     extract_join_feature(); otherwise only <gn> itself is looked at.
   - <seqid>: if not NULL, the sequence ID of <gn> is appended to it.
   - <target_ids>: if not NULL, filled from the Target attribute of <gn>; in
     the join case it is reset and refilled from the first direct child if
     that child carries a Target attribute.
   - <out_phase_offset>: if not NULL, receives the phase found during
     extraction, unless that value equals GT_PHASE_UNDEFINED.
   - <region_mapping>: source of the raw sequence.
   Sequence on the reverse strand is reverse-complemented in place.
   Returns 0 on success, a non-zero value otherwise (<err> is passed on to
   the failing callee). */
static int gt_extract_feature_sequence_generic(GtStr *sequence,
                                GtGenomeNode *gn,
                                const char *type, bool join, GtStr *seqid,
                                GtStrArray *target_ids,
                                unsigned int *out_phase_offset,
                                GtRegionMapping *region_mapping, GtError *err)
{
  GtFeatureNode *feature;
  unsigned int offset = 0;
  const char *target_attr;
  int had_err = 0;

  gt_error_check(err);
  feature = gt_genome_node_cast(gt_feature_node_class(), gn);
  gt_assert(feature);

  if (seqid)
    gt_str_append_str(seqid, gt_genome_node_get_seqid(gn));

  /* target IDs of the node itself */
  if (target_ids) {
    target_attr = gt_feature_node_get_attribute(feature, GT_GFF_TARGET);
    if (target_attr) {
      had_err = gt_gff3_parser_parse_all_target_attributes(target_attr, false,
                                                           target_ids, NULL,
                                                           NULL, "", 0, err);
    }
  }

  if (!had_err && join) {
    /* joined extraction: walk over the direct children */
    GtFeatureNodeIterator *children;
    GtFeatureNode *kid;
    bool on_reverse = false,
         is_first = true,
         type_seen = false;
    GtPhase kid_phase = GT_PHASE_UNDEFINED;

    children = gt_feature_node_iterator_new_direct(gt_feature_node_cast(gn));
    while (!had_err && (kid = gt_feature_node_iterator_next(children))) {
      if (is_first) {
        is_first = false;
        /* the first child's Target attribute overrides the parent's */
        if (target_ids) {
          target_attr = gt_feature_node_get_attribute(kid, GT_GFF_TARGET);
          if (target_attr) {
            gt_str_array_reset(target_ids);
            had_err = gt_gff3_parser_parse_all_target_attributes(target_attr,
                                                                 false,
                                                                 target_ids,
                                                                 NULL,
                                                                 NULL, "", 0,
                                                                 err);
          }
        }
      }
      if (had_err)
        break;
      if (extract_join_feature((GtGenomeNode*) kid, type, region_mapping,
                               sequence, &on_reverse, &type_seen,
                               &kid_phase, err)) {
        had_err = -1;
      }
      /* the phase is taken over even if the extraction just failed */
      if (kid_phase != GT_PHASE_UNDEFINED)
        offset = (unsigned int) kid_phase;
    }
    gt_feature_node_iterator_delete(children);
    gt_assert(offset <= (unsigned int) GT_PHASE_UNDEFINED);

    if (!had_err && gt_str_length(sequence) && on_reverse) {
      had_err = gt_reverse_complement(gt_str_get(sequence),
                                      gt_str_length(sequence), err);
    }
  }
  else if (!had_err && gt_feature_node_get_type(feature) == type) {
    /* single extraction: only this feature is of interest */
    GtRange span;
    char *raw;
    GtPhase own_phase = gt_feature_node_get_phase(feature);

    gt_assert(!had_err);
    if (own_phase != GT_PHASE_UNDEFINED)
      offset = (unsigned int) own_phase;

    span = gt_genome_node_get_range(gn);
    gt_assert(span.start); /* 1-based coordinates */
    had_err = gt_region_mapping_get_sequence(region_mapping, &raw,
                                             gt_genome_node_get_seqid(gn),
                                             span.start, span.end, err);
    if (!had_err) {
      gt_str_append_cstr_nt(sequence, raw, gt_range_length(&span));
      gt_free(raw);
      if (gt_feature_node_get_strand(feature) == GT_STRAND_REVERSE) {
        had_err = gt_reverse_complement(gt_str_get(sequence),
                                        gt_str_length(sequence), err);
      }
    }
  }

  if (out_phase_offset && offset != GT_PHASE_UNDEFINED)
    *out_phase_offset = offset;
  return had_err;
}
예제 #27
0
/* Node visitor callback for feature nodes.
   A feature whose sequence ID is already contained in <defined_seqids> is
   simply appended to the node buffer. Otherwise, if sorting is enforced, an
   error is set and -1 is returned. If it is not enforced, the feature is
   attached to an automatically created sequence region for its sequence ID:
   the region is created on first use (with a warning) and its range is kept
   wide enough to cover the feature and, for non-circular features, all nodes
   of its subgraph. Returns 0 unless the sorting check fails. */
static int add_ids_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                        GtError *err)
{
  GtAddIDsVisitor *visitor = add_ids_visitor_cast(nv);
  GtGenomeNode *gnode = (GtGenomeNode*) fn;
  const char *seqid = gt_str_get(gt_genome_node_get_seqid(gnode));
  AutomaticSequenceRegion *region;
  GtRange span;
  bool circular;

  /* known sequence ID -> nothing special to do */
  if (gt_cstr_table_get(visitor->defined_seqids, seqid)) {
    gt_queue_add(visitor->node_buffer, fn);
    return 0;
  }

  /* unknown sequence ID while sorted input is required -> give up */
  if (visitor->ensure_sorting) {
    gt_error_set(err, "the file %s is not sorted (seqid \"%s\" on line %u has "
                 "not been previously introduced with a \"%s\" line)",
                 gt_genome_node_get_filename(gnode), seqid,
                 gt_genome_node_get_line_number(gnode),
                 GT_GFF_SEQUENCE_REGION);
    return -1;
  }

  /* determine the range the sequence region has to cover */
  span = gt_genome_node_get_range(gnode);
  circular = gt_feature_node_get_attribute(fn, GT_GFF_IS_CIRCULAR) != NULL;
  if (!circular) {
    GtFeatureNodeIterator *it = gt_feature_node_iterator_new(fn);
    GtFeatureNode *sub;
    while ((sub = gt_feature_node_iterator_next(it)) != NULL) {
      GtRange sub_span = gt_genome_node_get_range((GtGenomeNode*) sub);
      span = gt_range_join(&span, &sub_span);
    }
    gt_feature_node_iterator_delete(it);
  }

  /* has a sequence region for this ID been created automatically before? */
  region = gt_hashmap_get(visitor->undefined_sequence_regions, seqid);
  if (region == NULL) {
    GtStr *seqid_str;
    /* no -> create it now */
    gt_warning("seqid \"%s\" on line %u in file \"%s\" has not been "
               "previously introduced with a \"%s\" line, create such a line "
               "automatically", seqid,
               gt_genome_node_get_line_number(gnode),
               gt_genome_node_get_filename(gnode),
               GT_GFF_SEQUENCE_REGION);
    region = automatic_sequence_region_new(circular);
    seqid_str = gt_genome_node_get_seqid(gnode);
    region->sequence_region = gt_region_node_new(seqid_str, span.start,
                                                 span.end);
    gt_hashmap_add(visitor->undefined_sequence_regions, gt_str_get(seqid_str),
                   region);
  }
  else if (region->is_circular) {
    gt_assert(!circular); /* XXX */
  }
  else if (circular) {
    /* first circular feature for this region: its range replaces the
       region's range */
    gt_assert(!region->is_circular); /* XXX */
    region->is_circular = true;
    gt_genome_node_set_range(region->sequence_region, &span);
  }
  else {
    /* widen the existing region so that it also covers this feature */
    GtRange known = gt_genome_node_get_range(region->sequence_region),
            merged = gt_range_join(&span, &known);
    gt_genome_node_set_range(region->sequence_region, &merged);
  }

  gt_array_add(region->feature_nodes, fn);
  return 0;
}
/* Fetches the next node from the input stream of <ns> and stores it in <gn>.
   For feature nodes, every node of the subgraph is visited with <ls->lv> to
   fill <ls->element>. If that yields a main node, one line is written to the
   tabular output (element and LTR coordinates, TSDs, PPT, PBS, protein domain
   order) and FASTA entries are written to the PPT, PBS, 5' LTR, 3' LTR and
   whole-element output files. The per-element data in <ls->element> is
   released before returning.
   Returns 0 on success (also for non-feature nodes and at the end of the
   stream), otherwise the error code of the failing callee. */
int gt_ltrfileout_stream_next(GtNodeStream *ns, GtGenomeNode **gn, GtError *err)
{
  GtLTRdigestFileOutStream *ls;
  GtFeatureNode *fn;
  GtRange lltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          rltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          ppt_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          pbs_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD};
  int had_err;
  GtUword i=0;

  gt_error_check(err);
  ls = gt_ltrdigest_file_out_stream_cast(ns);

  /* initialize this element */
  memset(&ls->element, 0, sizeof (GtLTRElement));

  /* get annotations from parser */
  had_err = gt_node_stream_next(ls->in_stream, gn, err);
  if (!had_err && *gn)
  {
    GtFeatureNodeIterator* gni;
    GtFeatureNode *mygn;

    /* only process feature nodes */
    if (!(fn = gt_feature_node_try_cast(*gn)))
      return 0;

    ls->element.pdomorder = gt_array_new(sizeof (const char*));

    /* fill LTRElement structure from GFF3 subgraph */
    gni = gt_feature_node_iterator_new(fn);
    for (mygn = fn; mygn != NULL; mygn = gt_feature_node_iterator_next(gni))
      (void) gt_genome_node_accept((GtGenomeNode*) mygn,
                                   (GtNodeVisitor*) ls->lv,
                                   err);
    gt_feature_node_iterator_delete(gni);
  }

  if (!had_err && ls->element.mainnode != NULL)
  {
    char desc[GT_MAXFASTAHEADER];
    GtFeatureNode *ltr3, *ltr5;
    GtStr *sdesc, *sreg, *seq;

    /* find sequence in GtEncseq */
    sreg = gt_genome_node_get_seqid((GtGenomeNode*) ls->element.mainnode);

    sdesc = gt_str_new();
    had_err = gt_region_mapping_get_description(ls->rmap, sdesc, sreg, err);

    if (!had_err) {
      GtRange rng;
      /* keep at most <seqnamelen> characters of the description as the
         sequence name, with blanks replaced by underscores */
      ls->element.seqid = gt_calloc((size_t) ls->seqnamelen+1, sizeof (char));
      (void) snprintf(ls->element.seqid,
                      MIN((size_t) gt_str_length(sdesc),
                          (size_t) ls->seqnamelen)+1,
                      "%s", gt_str_get(sdesc));
      gt_cstr_rep(ls->element.seqid, ' ', '_');
      if (gt_str_length(sdesc) > (GtUword) ls->seqnamelen)
        ls->element.seqid[ls->seqnamelen] = '\0';

      (void) gt_ltrelement_format_description(&ls->element,
                                              ls->seqnamelen,
                                              desc,
                                              (size_t) (GT_MAXFASTAHEADER-1));

      /* output basic retrotransposon data */
      lltr_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.leftLTR);
      rltr_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.rightLTR);
      rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.mainnode);
      gt_file_xprintf(ls->tabout_file,
                      GT_WU"\t"GT_WU"\t"GT_WU"\t%s\t"GT_WU"\t"GT_WU"\t"GT_WU"\t"
                      GT_WU"\t"GT_WU"\t"GT_WU"\t",
                      rng.start, rng.end, gt_ltrelement_length(&ls->element),
                      ls->element.seqid, lltr_rng.start, lltr_rng.end,
                      gt_ltrelement_leftltrlen(&ls->element), rltr_rng.start,
                      rltr_rng.end, gt_ltrelement_rightltrlen(&ls->element));
    }
    /* release the description on the error path as well; it is not needed
       beyond this point */
    gt_str_delete(sdesc);
    seq = gt_str_new();

    /* output TSDs */
    if (!had_err && ls->element.leftTSD != NULL)
    {
      GtRange tsd_rng;
      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.leftTSD);
      had_err = gt_extract_feature_sequence(seq,
                                       (GtGenomeNode*) ls->element.leftTSD,
                                       gt_symbol(gt_ft_target_site_duplication),
                                       false,
                                       NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file,
                         ""GT_WU"\t"GT_WU"\t%s\t",
                         tsd_rng.start,
                         tsd_rng.end,
                         gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else gt_file_xprintf(ls->tabout_file, "\t\t\t");

    if (!had_err && ls->element.rightTSD != NULL)
    {
      GtRange tsd_rng;

      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.rightTSD);
      had_err = gt_extract_feature_sequence(seq,
                                       (GtGenomeNode*) ls->element.rightTSD,
                                       gt_symbol(gt_ft_target_site_duplication),
                                       false,
                                       NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file,
                           ""GT_WU"\t"GT_WU"\t%s\t",
                           tsd_rng.start,
                           tsd_rng.end,
                           gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else gt_file_xprintf(ls->tabout_file, "\t\t\t");

    /* output PPT */
    if (!had_err && ls->element.ppt != NULL)
    {
      GtStrand ppt_strand = gt_feature_node_get_strand(ls->element.ppt);

      ppt_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.ppt);
      had_err = gt_extract_feature_sequence(seq,
                                            (GtGenomeNode*) ls->element.ppt,
                                            gt_symbol(gt_ft_RR_tract), false,
                                            NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&ppt_rng),
                            GT_FSWIDTH, ls->pptout_file);
        /* the last column is the distance between the PPT and the adjacent
           LTR boundary, which depends on the PPT's strand */
        gt_file_xprintf(ls->tabout_file,
                           ""GT_WU"\t"GT_WU"\t%s\t%c\t%d\t",
                           ppt_rng.start,
                           ppt_rng.end,
                           gt_str_get(seq),
                           GT_STRAND_CHARS[ppt_strand],
                           (ppt_strand == GT_STRAND_FORWARD ?
                               abs((int) (rltr_rng.start - ppt_rng.end)) :
                               abs((int) (lltr_rng.end - ppt_rng.start))));
      }
      gt_str_reset(seq);
    } else gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t");

    /* output PBS */
    if (!had_err && ls->element.pbs != NULL)
    {
      GtStrand pbs_strand;

      pbs_strand = gt_feature_node_get_strand(ls->element.pbs);
      pbs_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.pbs);
      had_err = gt_extract_feature_sequence(seq,
                                           (GtGenomeNode*) ls->element.pbs,
                                           gt_symbol(gt_ft_primer_binding_site),
                                           false, NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&pbs_rng),
                            GT_FSWIDTH, ls->pbsout_file);
        /* NOTE(review): the attribute values are printed with %s as they
           are returned -- confirm that all four attributes are always
           present on a PBS feature */
        gt_file_xprintf(ls->tabout_file,
                         ""GT_WU"\t"GT_WU"\t%c\t%s\t%s\t%s\t%s\t%s\t",
                         pbs_rng.start,
                         pbs_rng.end,
                         GT_STRAND_CHARS[pbs_strand],
                         gt_feature_node_get_attribute(ls->element.pbs, "trna"),
                         gt_str_get(seq),
                         gt_feature_node_get_attribute(ls->element.pbs,
                                                       "pbsoffset"),
                         gt_feature_node_get_attribute(ls->element.pbs,
                                                       "trnaoffset"),
                         gt_feature_node_get_attribute(ls->element.pbs,
                                                       "edist"));
      }
      gt_str_reset(seq);
    } else gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t\t\t\t");

    /* output protein domains */
    if (!had_err && ls->element.pdoms != NULL)
    {
      GtStr *pdomorderstr = gt_str_new();
      for (i=0; !had_err && i<gt_array_size(ls->element.pdomorder); i++)
      {
        const char* key = *(const char**) gt_array_get(ls->element.pdomorder,
                                                       i);
        GtArray *entry = (GtArray*) gt_hashmap_get(ls->element.pdoms, key);
        had_err = write_pdom(ls, entry, key, ls->rmap, desc, err);
      }

      /* the domain order is reported relative to the element's strand */
      if (GT_STRAND_REVERSE == gt_feature_node_get_strand(ls->element.mainnode))
        gt_array_reverse(ls->element.pdomorder);

      for (i=0 ;!had_err && i<gt_array_size(ls->element.pdomorder); i++)
      {
        const char* name = *(const char**) gt_array_get(ls->element.pdomorder,
                                                        i);
        gt_str_append_cstr(pdomorderstr, name);
        if (i != gt_array_size(ls->element.pdomorder)-1)
          gt_str_append_cstr(pdomorderstr, "/");
      }
      gt_file_xprintf(ls->tabout_file, "%s", gt_str_get(pdomorderstr));
      gt_str_delete(pdomorderstr);
    }

    /* output LTRs (we just expect them to exist) */
    switch (gt_feature_node_get_strand(ls->element.mainnode))
    {
      case GT_STRAND_REVERSE:
        ltr5 = ls->element.rightLTR;
        ltr3 = ls->element.leftLTR;
        break;
      case GT_STRAND_FORWARD:
      default:
        ltr5 = ls->element.leftLTR;
        ltr3 = ls->element.rightLTR;
        break;
    }

    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr5,
                                          gt_symbol(gt_ft_long_terminal_repeat),
                                          false,
                                          NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr5out_file);
      gt_str_reset(seq);
    }
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr3,
                                          gt_symbol(gt_ft_long_terminal_repeat),
                                          false,
                                          NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr3out_file);
      gt_str_reset(seq);
    }

    /* output complete oriented element */
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq,
                                           (GtGenomeNode*) ls->element.mainnode,
                                           gt_symbol(gt_ft_LTR_retrotransposon),
                                           false,
                                           NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc,gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->elemout_file);
      gt_str_reset(seq);
    }
    gt_file_xprintf(ls->tabout_file, "\n");
    gt_str_delete(seq);
  }
  /* per-element data; the members are still zero from the memset above if
     they were never filled */
  gt_hashmap_delete(ls->element.pdoms);
  gt_array_delete(ls->element.pdomorder);
  gt_free(ls->element.seqid);
  return had_err;
}
static int gt_snp_annotator_visitor_prepare_gene(GtSNPAnnotatorVisitor *sav,
                                                 GtError *err)
{
  GtFeatureNodeIterator *fni,
                        *mrnafni;
  GtFeatureNode *curnode,
                *last_mRNA = NULL;
  GtStr *mrnaseq,
        *seqid;
  int had_err = 0;

  mrnaseq = gt_str_new();
  seqid = gt_genome_node_get_seqid((GtGenomeNode*) sav->gene);
  fni = gt_feature_node_iterator_new(sav->gene);
  while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) {
    if (gt_feature_node_get_type(curnode) == sav->mRNA_type) {
      GtFeatureNode *curnode2;
      if (last_mRNA) {
        char *mrna_charseq = gt_calloc(gt_str_length(mrnaseq)+1, sizeof (char));
        (void) strncpy(mrna_charseq, gt_str_get(mrnaseq),
                       gt_str_length(mrnaseq));
        if (gt_feature_node_get_strand(sav->gene) == GT_STRAND_REVERSE) {
          had_err = gt_reverse_complement(mrna_charseq, gt_str_length(mrnaseq),
                                          err);
        }
        if (!had_err) {
          gt_hashmap_add(sav->rnaseqs, last_mRNA, mrna_charseq);
          last_mRNA = curnode;
          gt_str_reset(mrnaseq);
        }
      } else last_mRNA = curnode;
      if (!had_err) {
        mrnafni = gt_feature_node_iterator_new(curnode);
        while (!had_err && (curnode2 =
                                      gt_feature_node_iterator_next(mrnafni))) {
          if (gt_feature_node_get_type(curnode2) == sav->CDS_type) {
            char *tmp;
            GtRange rng = gt_genome_node_get_range((GtGenomeNode*) curnode2);
            had_err = gt_region_mapping_get_sequence(sav->rmap, &tmp, seqid,
                                                     rng.start, rng.end, err);
            if (!had_err) {
              gt_str_append_cstr_nt(mrnaseq, tmp, gt_range_length(&rng));
              gt_free(tmp);
            }
          }
        }
        gt_feature_node_iterator_delete(mrnafni);
      }
    }
  }
  if (!had_err && last_mRNA) {
    char *mrna_charseq = gt_calloc(gt_str_length(mrnaseq)+1, sizeof (char));
    (void) strncpy(mrna_charseq, gt_str_get(mrnaseq), gt_str_length(mrnaseq));
    if (gt_feature_node_get_strand(sav->gene) == GT_STRAND_REVERSE) {
      had_err = gt_reverse_complement(mrna_charseq, gt_str_length(mrnaseq),
                                      err);
    }
    if (!had_err) {
      gt_hashmap_add(sav->rnaseqs, last_mRNA, mrna_charseq);
    }
  }
  gt_feature_node_iterator_delete(fni);
  gt_str_delete(mrnaseq);
  return had_err;
}
/* Node visitor callback for feature nodes.
   Features that are neither of the SNV nor of the SNP type are ignored. For
   a SNP/SNV, every direct child of <sav->gene> of the mRNA type is examined:
   its CDS-type nodes are walked in iteration order, and when the SNP range
   overlaps a CDS, the position of the SNP inside the mRNA sequence stored in
   <sav->rnaseqs> is computed. Each allele listed in the variant sequence
   attribute that differs from the base at that position is then handed to
   snp_annotator_classify_snp().
   Returns 0 on success or if the node was ignored, otherwise the error code
   of the failing callee. */
static int snp_annotator_visitor_feature_node(GtNodeVisitor *nv,
                                              GtFeatureNode *fn,
                                              GtError *err)
{
  int had_err = 0;
  GtSNPAnnotatorVisitor *sav;
  GtFeatureNodeIterator *fni,
                        *mrnafni;
  GtFeatureNode *curnode,
                *curnode2;
  GtRange snp_rng;
  gt_error_check(err);
  sav = snp_annotator_visitor_cast(nv);

  /* ignore non-nodes */
  if (!fn) return 0;

  /* only process SNPs */
  if (!(gt_feature_node_get_type(fn) == sav->SNV_type ||
        gt_feature_node_get_type(fn) == sav->SNP_type)) {
    return 0;
  }

  /* only the direct children of the gene are candidates for mRNAs */
  fni = gt_feature_node_iterator_new_direct(sav->gene);
  snp_rng = gt_genome_node_get_range((GtGenomeNode*) fn);
  while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) {
    if (gt_feature_node_get_type(curnode) == sav->mRNA_type) {
      GtStrand mrna_strand = gt_feature_node_get_strand(curnode);
#ifndef NDEBUG
      const char *refstr;
#endif
      /* running offset into the mRNA sequence: the summed length of all CDS
         nodes seen so far that do not overlap the SNP */
      GtUword mrnasnppos = 0;
      mrnafni = gt_feature_node_iterator_new(curnode);
      while (!had_err && (curnode2 = gt_feature_node_iterator_next(mrnafni))) {
        if (gt_feature_node_get_type(curnode2) == sav->CDS_type) {
          GtRange cds_rng = gt_genome_node_get_range((GtGenomeNode*) curnode2);
          if (gt_range_overlap(&snp_rng, &cds_rng)) {
            char *mRNA,
                 origchar;
            char *variantchars, *variantptr = NULL;
            /* NOTE(review): refchars is declared but never used */
            GT_UNUSED char *refchars, *refptr = NULL;
            mRNA = (char*) gt_hashmap_get(sav->rnaseqs, curnode);
            gt_assert(mRNA);
            gt_assert(snp_rng.start >= cds_rng.start);
            /* add the offset of the SNP inside this CDS */
            mrnasnppos += (snp_rng.start - cds_rng.start);
            /* on the reverse strand the position is counted from the other
               end of the stored sequence */
            if (mrna_strand == GT_STRAND_REVERSE)
              mrnasnppos = strlen(mRNA) - mrnasnppos - 1;
            gt_assert(mrnasnppos < strlen(mRNA));
            origchar = mRNA[mrnasnppos];
#ifndef NDEBUG
            /* debug builds only: check the base found in the mRNA against the
               reference sequence attribute of the SNP.
               NOTE(review): on the reverse strand this complements origchar
               in place, so the comparisons against origchar further down see
               a different character in debug and in NDEBUG builds --
               confirm which of the two is intended. */
            refstr = refptr = gt_cstr_dup(gt_feature_node_get_attribute(fn,
                                                         GT_GVF_REFERENCE_SEQ));
            if (!had_err && refstr) {
              if (gt_feature_node_get_strand(curnode) == GT_STRAND_REVERSE) {
                int rval = gt_complement(&origchar, origchar, err);
                gt_assert(rval == 0);
              }
              gt_assert(toupper(origchar) == toupper(refstr[0]));
            }
#endif
            /* work on a private copy of the variant attribute; variantptr
               keeps the start of the copy for the gt_free() below while
               variantchars is advanced through it */
            variantchars = variantptr = gt_cstr_dup(
                         gt_feature_node_get_attribute(fn, GT_GVF_VARIANT_SEQ));
            if (!had_err && variantchars) {
              /* index of the current allele; incremented for every character
                 that is not a ',' */
              GtUword i = 0;

              /* scan the variant characters up to the first ';' or the end
                 of the string */
              while (!had_err &&
                              (*variantchars != ';' && *variantchars != '\0')) {
                if (*variantchars != ',' && *variantchars != origchar) {
                  char variantchar = *variantchars;
#ifndef NDEBUG
                  char refchar = refstr ? refstr[0] : '-';  /* XXX */
                  if (!had_err && mrna_strand == GT_STRAND_REVERSE)
                    had_err = gt_complement(&refchar, refchar, err);
#endif
                  /* on the reverse strand the allele is complemented before
                     it is classified */
                  if (!had_err && mrna_strand == GT_STRAND_REVERSE)
                    had_err = gt_complement(&variantchar, variantchar, err);
                  if (!had_err) {
                    had_err = snp_annotator_classify_snp(sav, curnode, fn,
                                                         mrnasnppos,
                                                         i++,
                                                         variantchar,
#ifndef NDEBUG
                                                         refchar,
#endif
                                                         err);
                  }
                } else if (*variantchars == origchar) {
                  /* allele identical to the base in the mRNA: count it, but
                     do not classify it */
                  i++;
                }
                variantchars++;
              }
              /* refptr is NULL in NDEBUG builds */
              gt_free(variantptr);
              gt_free(refptr);
            }
          } else {
            /* CDS does not contain the SNP: skip over its full length */
            mrnasnppos += gt_range_length(&cds_rng);
          }
        }
      }
      gt_feature_node_iterator_delete(mrnafni);
    }
  }
  gt_feature_node_iterator_delete(fni);

  return had_err;
}