Example #1
0
/* Return the Name attribute of <gn> if it has one, otherwise its ID
   attribute. Returns NULL if <gn> is NULL or carries neither attribute. */
static const char* get_node_name_or_id(GtFeatureNode *gn)
{
  const char *label;
  if (gn == NULL)
    return NULL;
  label = gt_feature_node_get_attribute(gn, GT_GFF_NAME);
  if (label != NULL)
    return label;
  /* no Name -> fall back to the ID (which may be missing as well) */
  return gt_feature_node_get_attribute(gn, GT_GFF_ID);
}
Example #2
0
// If the visitor holds more than one CDS segment, tie the segments together
// as one multi-feature: the first segment becomes the representative (and is
// given a generated "CDS<counter>" ID if it has no ID yet), and every segment
// that is not already part of a multi-feature is attached to it.
static void infer_cds_visitor_check_cds_multi(AgnInferCDSVisitor *v)
{
  GtFeatureNode *representative;
  GtUword idx;

  // zero or one segment: nothing to link together
  if(gt_array_size(v->cds) < 2)
    return;

  representative = *(GtFeatureNode **)gt_array_get(v->cds, 0);
  if(gt_feature_node_get_attribute(representative, "ID") == NULL)
  {
    char generated_id[64];
    sprintf(generated_id, "CDS%lu", v->cdscounter++);
    gt_feature_node_add_attribute(representative, "ID", generated_id);
  }
  gt_feature_node_make_multi_representative(representative);

  for(idx = 0; idx < gt_array_size(v->cds); idx++)
  {
    GtFeatureNode *segment = *(GtFeatureNode **)gt_array_get(v->cds, idx);
    if(gt_feature_node_is_multi(segment))
      continue;
    gt_feature_node_set_multi_representative(segment, representative);
  }
}
/* Follow the Parent attributes upwards, starting at the feature registered
   under <id>, until a feature without Parent attribute is reached. If a pseudo
   parent is registered for that feature's ID the pseudo parent is returned,
   otherwise the feature itself.
   If an ID string contains a ';' only the part before the first ';' is used
   for the lookups. Every looked-up feature must exist (asserted). */
static GtFeatureNode* find_root(const GtFeatureInfo *fi, const char *id)
{
  gt_assert(fi && id);
  for (;;) {
    GtFeatureNode *node, *pseudo_parent;
    const char *parent_ids, *sep = strchr(id, ';');
    if (!sep) {
      node = gt_hashmap_get(fi->id_to_genome_node, id);
      pseudo_parent = gt_hashmap_get(fi->id_to_pseudo_parent, id);
    }
    else {
      /* look up only the first ID of the list */
      char *first_id = gt_cstr_dup_nt(id, sep - id);
      node = gt_hashmap_get(fi->id_to_genome_node, first_id);
      pseudo_parent = gt_hashmap_get(fi->id_to_pseudo_parent, first_id);
      gt_free(first_id);
    }
    gt_assert(node);
    parent_ids = gt_feature_node_get_attribute(node, GT_GFF_PARENT);
    if (!parent_ids)
      return pseudo_parent ? pseudo_parent : node;
    /* climb one level */
    id = parent_ids;
  }
}
/* Unlink <cds_feature> from the tree rooted at <fn> and re-attach it below
   each parent returned by find_cds_parents(): the first parent receives
   <cds_feature> itself, every further parent receives a clone of it. In both
   cases the Parent attribute is set to the ID of the receiving parent. */
static void split_cds_feature(GtFeatureNode *cds_feature, GtFeatureNode *fn)
{
  GtArray *parents;
  unsigned long i;
  gt_assert(cds_feature && fn);

  /* collect the parents first, then unlink the CDS from the tree */
  parents = find_cds_parents(cds_feature, fn);
  gt_feature_node_remove_leaf(fn, cds_feature);

  for (i = 0; i < gt_array_size(parents); i++) {
    GtFeatureNode *copy,
                  *parent = *(GtFeatureNode**) gt_array_get(parents, i);
    const char *parent_id = gt_feature_node_get_attribute(parent, GT_GFF_ID);
    if (i == 0) {
      /* the first parent keeps the original node */
      gt_feature_node_set_attribute(cds_feature, GT_GFF_PARENT, parent_id);
      gt_feature_node_add_child(parent, cds_feature);
      continue;
    }
    /* all further parents get their own copy */
    copy = gt_feature_node_clone(cds_feature);
    gt_feature_node_set_attribute(copy, GT_GFF_PARENT, parent_id);
    gt_feature_node_add_child(parent, copy);
    /* NOTE(review): this releases <cds_feature> once per additional parent;
       whether that matches the references actually held depends on the
       ownership rules of remove_leaf/add_child/clone -- confirm, especially
       for CDS features with more than two parents */
    gt_genome_node_delete((GtGenomeNode*) cds_feature);
  }

  gt_array_delete(parents);
}
/* Traversal callback: set the seqid of <fn> to info->new_seqid and, if <fn>
   has a Target attribute, hand it to m2i_change_target_seqids().
   <data> must point to an M2IChangeSeqidInfo. Returns 0 if there is no Target
   attribute, otherwise the result of m2i_change_target_seqids(). */
static int m2i_change_seqid(GtFeatureNode *fn, void *data, GtError *err)
{
  M2IChangeSeqidInfo *info = (M2IChangeSeqidInfo*) data;
  const char *target_attr;
  gt_error_check(err);
  gt_assert(fn && info);
  gt_genome_node_change_seqid((GtGenomeNode*) fn, info->new_seqid);
  target_attr = gt_feature_node_get_attribute(fn, GT_GFF_TARGET);
  if (!target_attr)
    return 0;
  return m2i_change_target_seqids(fn, target_attr, info->region_mapping, err);
}
/* Replace the pseudo parent registered in <fi> under the ID of <child> with
   <new_pseudo_parent>. <child> must have an ID attribute and
   <new_pseudo_parent> must be a pseudo feature (both asserted). */
void gt_feature_info_replace_pseudo_parent(GtFeatureInfo *fi,
                                           GtFeatureNode *child,
                                           GtFeatureNode *new_pseudo_parent)
{
  const char *child_id;
  gt_assert(fi && child && new_pseudo_parent);
  gt_assert(gt_feature_node_is_pseudo(new_pseudo_parent));
  child_id = gt_feature_node_get_attribute(child, GT_GFF_ID);
  gt_assert(child_id);
  /* drop the old mapping, then register the replacement under the same ID */
  gt_hashmap_remove(fi->id_to_pseudo_parent, child_id);
  gt_feature_info_add_pseudo_parent(fi, child_id, new_pseudo_parent);
}
Example #7
0
// Check the explicitly annotated stop codon(s) of the visitor's current mRNA
// against its CDS, or infer a stop codon if none was annotated.
//
// The expected stop codon range is the last three bases of the last CDS
// segment or, for a reverse-strand mRNA, the first three bases of the first
// CDS segment. Then:
//   - more than one explicit stop codon: the number of stop codons is logged;
//   - exactly one: a message is logged if its range differs from the expected
//     range;
//   - none: a "stop_codon" feature covering the expected range is created,
//     added as a child of the mRNA and stored in v->stops.
// Nothing is done if the mRNA has no CDS segments.
// NOTE(review): assumes v->cds is ordered by genomic position -- confirm.
static void infer_cds_visitor_check_stop(AgnInferCDSVisitor *v)
{
  if(gt_array_size(v->cds) == 0)
    return;

  const char *mrnaid = gt_feature_node_get_attribute(v->mrna, "ID");
  unsigned int ln = gt_genome_node_get_line_number((GtGenomeNode *)v->mrna);
  GtStrand strand = gt_feature_node_get_strand(v->mrna);

  // expected stop codon: 3' end of the last segment (forward strand) ...
  GtRange stoprange;
  GtUword threeprimeindex = gt_array_size(v->cds) - 1;
  GtGenomeNode **threeprimesegment = gt_array_get(v->cds, threeprimeindex);
  stoprange = gt_genome_node_get_range(*threeprimesegment);
  stoprange.start = stoprange.end - 2;
  if(strand == GT_STRAND_REVERSE)
  {
    // ... or the start of the first segment (reverse strand)
    threeprimesegment = gt_array_get(v->cds, 0);
    stoprange = gt_genome_node_get_range(*threeprimesegment);
    stoprange.end = stoprange.start + 2;
  }

  if(gt_array_size(v->stops) > 1)
  {
    // report the number of *stop* codons (this used to print the number of
    // start codons by mistake)
    gt_logger_log(v->logger, "mRNA '%s' (line %u) has %lu stop codons", mrnaid,
                  ln, gt_array_size(v->stops));
  }
  else if(gt_array_size(v->stops) == 1)
  {
    GtGenomeNode **codon = gt_array_get(v->stops, 0);
    GtRange testrange = gt_genome_node_get_range(*codon);
    if(gt_range_compare(&stoprange, &testrange) != 0)
    {
      gt_logger_log(v->logger, "stop codon inferred from CDS [%lu, %lu] does "
                    "not match explicitly provided stop codon [%lu, %lu] for "
                    "mRNA '%s'", stoprange.start, stoprange.end,
                    testrange.start, testrange.end, mrnaid);
    }
  }
  else // agn_assert(gt_array_size(v->stops) == 0)
  {
    // no explicit stop codon: create one from the expected range
    GtStr *seqid = gt_genome_node_get_seqid((GtGenomeNode *)v->mrna);
    GtGenomeNode *codonfeature = gt_feature_node_new(seqid, "stop_codon",
                                                     stoprange.start,
                                                     stoprange.end,
                                                     strand);
    if(v->source)
      gt_feature_node_set_source((GtFeatureNode *)codonfeature, v->source);
    GtFeatureNode *cf = (GtFeatureNode *)codonfeature;
    gt_feature_node_add_child(v->mrna, cf);
    gt_array_add(v->stops, cf);
  }
}
Example #8
0
/* Create one child feature of <fn> per BED block. Block <i> gets its size
   from token <i> of <size_splitter> and its start (relative to the start of
   <fn>) from token <i> of <start_splitter>. The block type is
   bed_parser->block_type if set, BED_BLOCK_TYPE otherwise; Name attribute (if
   any), score and strand are copied from <fn>.
   Returns 0 on success. If a token cannot be parsed, <err> is set (with file
   name and line number of <bed_file>), -1 is returned and no further blocks
   are created. */
static int create_block_features(GtBEDParser *bed_parser, GtFeatureNode *fn,
                                 GtUword block_count,
                                 GtSplitter *size_splitter,
                                 GtSplitter *start_splitter, GtIO *bed_file,
                                 GtError *err)
{
  GtUword idx;
  gt_assert(fn && block_count && size_splitter && start_splitter);
  gt_assert(gt_splitter_size(size_splitter) == block_count);
  gt_assert(gt_splitter_size(start_splitter) == block_count);
  for (idx = 0; idx < block_count; idx++) {
    GtUword size, offset, first, last;
    GtFeatureNode *block;
    const char *name;
    if (gt_parse_uword(&size, gt_splitter_get_token(size_splitter, idx))) {
      gt_error_set(err,
                   "file \"%s\": line "GT_WU": could not parse blockSize '%s'",
                   gt_io_get_filename(bed_file),
                   gt_io_get_line_number(bed_file),
                   gt_splitter_get_token(size_splitter, idx));
      return -1;
    }
    if (gt_parse_uword(&offset, gt_splitter_get_token(start_splitter, idx))) {
      gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockStart "
                   "'%s'", gt_io_get_filename(bed_file),
                   gt_io_get_line_number(bed_file),
                   gt_splitter_get_token(start_splitter, idx));
      return -1;
    }
    /* block coordinates are relative to the start of the parent feature */
    first = gt_genome_node_get_start((GtGenomeNode*) fn) + offset;
    last = first + size - 1;
    block = (GtFeatureNode*)
            gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) fn),
                                bed_parser->block_type
                                ? bed_parser->block_type
                                : BED_BLOCK_TYPE,
                                first, last, gt_feature_node_get_strand(fn));
    name = gt_feature_node_get_attribute(fn, GT_GFF_NAME);
    if (name)
      gt_feature_node_add_attribute(block, GT_GFF_NAME, name);
    gt_feature_node_set_score(block, gt_feature_node_get_score(fn));
    gt_feature_node_set_strand(block, gt_feature_node_get_strand(fn));
    gt_feature_node_add_child(fn, block);
  }
  return 0;
}
Example #9
0
/* Attach the best usable PBS hit from <results> as a child feature of
   element->mainnode.
   If <*canonical_strand> is GT_STRAND_UNKNOWN it is set to the strand of the
   top-ranked hit and that hit is used. Otherwise the hits are scanned in rank
   order for the first one on <*canonical_strand>; if there is none, nothing
   is attached. The new feature has type GT_PBS_TYPE, source <tag>, 1-based
   coordinates, the hit's score, and the attributes "trna" (if the hit has
   one), "trnaoffset", "pbsoffset" and "edist". The strand of the main node is
   set to the strand of the chosen hit. */
static void pbs_attach_results_to_gff3(GtPBSResults *results,
                                       GtLTRElement *element,
                                       GtStrand *canonical_strand,
                                       GtStr *tag)
{
  GtFeatureNode *pbs_node;
  GtRange coords;
  unsigned long rank = 0;
  char numbuf[BUFSIZ];
  GtPBSHit *best = gt_pbs_results_get_ranked_hit(results, rank++);

  if (*canonical_strand != GT_STRAND_UNKNOWN) {
    /* strand constraint given -> take the best-ranked hit on that strand */
    while (gt_pbs_hit_get_strand(best) != *canonical_strand
             && rank < gt_pbs_results_get_number_of_hits(results)) {
      gt_log_log("dropping PBS because of nonconsistent strand: %s\n",
                 gt_feature_node_get_attribute(element->mainnode, "ID"));
      best = gt_pbs_results_get_ranked_hit(results, rank++);
    }
    /* no hit on the canonical strand -> report nothing */
    if (gt_pbs_hit_get_strand(best) != *canonical_strand)
      return;
  }
  else {
    /* no constraint yet -> the top hit defines the canonical strand */
    *canonical_strand = gt_pbs_hit_get_strand(best);
  }

  coords = gt_pbs_hit_get_coords(best);
  /* GFF3 is 1-based */
  coords.start++;
  coords.end++;
  pbs_node = (GtFeatureNode*)
             gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                          element->mainnode),
                                 GT_PBS_TYPE,
                                 coords.start,
                                 coords.end,
                                 gt_pbs_hit_get_strand(best));
  gt_feature_node_set_source(pbs_node, tag);
  gt_feature_node_set_score(pbs_node, (float) gt_pbs_hit_get_score(best));
  if (gt_pbs_hit_get_trna(best) != NULL)
    gt_feature_node_add_attribute(pbs_node, "trna", gt_pbs_hit_get_trna(best));
  gt_feature_node_set_strand(element->mainnode, gt_pbs_hit_get_strand(best));

  /* numeric hit properties are stored as string attributes */
  (void) snprintf(numbuf, BUFSIZ-1, "%lu", gt_pbs_hit_get_tstart(best));
  gt_feature_node_add_attribute(pbs_node, "trnaoffset", numbuf);
  (void) snprintf(numbuf, BUFSIZ-1, "%lu", gt_pbs_hit_get_offset(best));
  gt_feature_node_add_attribute(pbs_node, "pbsoffset", numbuf);
  (void) snprintf(numbuf, BUFSIZ-1, "%lu", gt_pbs_hit_get_edist(best));
  gt_feature_node_add_attribute(pbs_node, "edist", numbuf);

  gt_feature_node_add_child(element->mainnode, pbs_node);
}
Example #10
0
/* Lua binding: argument 1 is a genome node which must be a feature node,
   argument 2 the attribute name. Pushes the attribute value as a string, or
   nil if the feature node has no such attribute. Always returns one result. */
static int feature_node_lua_get_attribute(lua_State *L)
{
  GtGenomeNode **node = check_genome_node(L, 1);
  const char *key = luaL_checkstring(L, 2), *value;
  /* make sure we get a feature node */
  GtFeatureNode *feature = gt_feature_node_try_cast(*node);
  luaL_argcheck(L, feature, 1, "not a feature node");
  value = gt_feature_node_get_attribute(feature, key);
  if (!value)
    lua_pushnil(L);
  else
    lua_pushstring(L, value);
  return 1;
}
Example #11
0
/* Traversal callback of the GFF3 visitor (<data> is the GtGFF3Visitor):
   determine the unique ID of <fn> and record it for each direct child of
   <fn> via add_id().
   Only features which have children, are multi-features, or (with retain_ids
   set) carry an ID attribute are processed; for all others 0 is returned
   right away.
   All parts of a multi-feature share the ID of their representative: it is
   looked up in feature_node_to_unique_id_str, created on demand, and stored
   for <fn> as well unless <fn> is the representative itself.
   Returns the result of the traversal of the direct children. */
static int store_ids(GtFeatureNode *fn, void *data, GtError *err)
{
  GtGFF3Visitor *gff3_visitor = (GtGFF3Visitor*) data;
  AddIDInfo add_id_info;
  GtStr *id;

  gt_error_check(err);
  gt_assert(fn && gff3_visitor);

  /* nothing to store for features which do not need an ID */
  if (!gt_feature_node_has_children(fn) && !gt_feature_node_is_multi(fn) &&
      !(gff3_visitor->retain_ids && gt_feature_node_get_attribute(fn, "ID"))) {
    return 0;
  }

  if (gt_feature_node_is_multi(fn)) {
    GtFeatureNode *rep = gt_feature_node_get_multi_representative(fn);
    id = gt_hashmap_get(gff3_visitor->feature_node_to_unique_id_str, rep);
    if (!id) {
      /* the representative does not have its own id yet -> create it */
      id = gff3_visitor->retain_ids ? make_id_unique(gff3_visitor, rep)
                                    : create_unique_id(gff3_visitor, rep);
    }
    /* store id for feature, if the feature was not the representative */
    if (rep != fn) {
      gt_hashmap_add(gff3_visitor->feature_node_to_unique_id_str, fn,
                     gt_str_ref(id));
    }
  }
  else {
    id = gff3_visitor->retain_ids ? make_id_unique(gff3_visitor, fn)
                                  : create_unique_id(gff3_visitor, fn);
  }

  /* for each child -> store the parent feature in the hash map
     (the two assignments used to be joined by a stray comma operator) */
  add_id_info.gt_feature_node_to_id_array =
    gff3_visitor->feature_node_to_id_array;
  add_id_info.id = gt_str_get(id);
  return gt_feature_node_traverse_direct_children(fn, &add_id_info, add_id,
                                                  err);
}
Example #12
0
/* Visitor callback for feature nodes: after gff3_version_string() has been
   called, the IDs of all features below <fn> are stored (store_ids) and, if
   that succeeded, the features are shown (gff3_show_feature_node). The
   visitor's hashmaps are reset afterwards and a terminator line is written
   if <fn> has children or (with retain_ids set) an ID attribute -- both
   regardless of whether an error occurred.
   Returns the error status of the traversals. */
static int gff3_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                     GtError *err)
{
  GtGFF3Visitor *visitor;
  int had_err;
  gt_error_check(err);
  visitor = gff3_visitor_cast(nv);

  gff3_version_string(nv);

  /* first pass: store IDs, second pass: show the features */
  had_err = gt_feature_node_traverse_children(fn, visitor, store_ids, true,
                                              err);
  if (!had_err) {
    /* a DAG is traversed in topologically sorted depth first fashion to make
       sure that the 'Parent' attributes are shown in correct order */
    had_err = gt_feature_node_is_tree(fn)
              ? gt_feature_node_traverse_children(fn, visitor,
                                                  gff3_show_feature_node, true,
                                                  err)
              : gt_feature_node_traverse_children_top(fn, visitor,
                                                      gff3_show_feature_node,
                                                      err);
  }

  /* reset hashmaps */
  gt_hashmap_reset(visitor->feature_node_to_id_array);
  gt_hashmap_reset(visitor->feature_node_to_unique_id_str);

  /* show terminator, if the feature has children (otherwise it is clear that
     the feature is complete, because no ID attribute has been shown) */
  if (gt_feature_node_has_children(fn) ||
      (visitor->retain_ids && gt_feature_node_get_attribute(fn, "ID"))) {
    if (visitor->outstr) {
      gt_str_append_cstr(visitor->outstr, GT_GFF_TERMINATOR);
      gt_str_append_char(visitor->outstr, '\n');
    }
    else
      gt_file_xprintf(visitor->outfp, "%s\n", GT_GFF_TERMINATOR);
  }

  return had_err;
}
Example #13
0
// Visitor callback: for every mRNA feature in the tree rooted at <fn>,
// compute GAEVAL coverage and integrity, store both (formatted with three
// decimals) as the attributes "gaeval_coverage" and "gaeval_integrity", and,
// if v->tsvout is set, write one tab-separated line with ID, label,
// integrity, coverage, intron count and the first four integrity components.
// Always returns 0.
static int
gaeval_visitor_visit_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                  GtError *error)
{
  AgnGaevalVisitor *v = gaeval_visitor_cast(nv);
  gt_error_check(error);

  GtFeatureNodeIterator *feats = gt_feature_node_iterator_new(fn);
  GtFeatureNode *tempfeat;
  for(tempfeat  = gt_feature_node_iterator_next(feats);
      tempfeat != NULL;
      tempfeat  = gt_feature_node_iterator_next(feats))
  {
    if(agn_typecheck_mrna(tempfeat) == false)
      continue;

    // snprintf instead of sprintf: an unexpectedly large or non-finite value
    // is truncated instead of overflowing the 16-byte buffer
    double coverage = gaeval_visitor_calculate_coverage(v, tempfeat, error);
    char covstr[16];
    snprintf(covstr, sizeof(covstr), "%.3lf", coverage);
    gt_feature_node_add_attribute(tempfeat, "gaeval_coverage", covstr);

    double integrity_components[5];
    double integrity = gaeval_visitor_calculate_integrity(
        v, tempfeat, coverage, integrity_components, error
    );
    char intstr[16];
    snprintf(intstr, sizeof(intstr), "%.3lf", integrity);
    gt_feature_node_add_attribute(tempfeat, "gaeval_integrity", intstr);

    if(v->tsvout)
    {
      // NOTE(review): mrnaid is NULL for an mRNA without ID attribute and is
      // passed to "%s" as is -- confirm that every mRNA has an ID here
      const char *mrnaid = gt_feature_node_get_attribute(tempfeat, "ID");
      const char *mrnalabel = agn_feature_node_get_label(tempfeat);
      GtUword num_introns = agn_typecheck_count(tempfeat, agn_typecheck_intron);
      fprintf(v->tsvout, "%s\t%s\t%s\t%s\t%lu\t%.3lf\t%.3lf\t%.3lf\t%.3lf\n",
              mrnaid, mrnalabel, intstr, covstr, num_introns,
              integrity_components[0], integrity_components[1],
              integrity_components[2], integrity_components[3]);
    }
  }
  gt_feature_node_iterator_delete(feats);

  return 0;
}
Example #14
0
/* Add <orphan> (which must have a Parent attribute, asserted) to the queue
   of orphans of <o>. If <orphan_id> is given it is recorded in the table of
   orphan IDs, and every entry of <missing_parents> (if given) is recorded in
   the table of missing parents; IDs already present are not added again. */
void gt_orphanage_add(GtOrphanage *o, GtGenomeNode *orphan,
                      const char *orphan_id, GtStrArray *missing_parents)
{
  GtUword idx;
  gt_assert(o && orphan);
  gt_assert(gt_feature_node_get_attribute((GtFeatureNode*) orphan,
                                          GT_GFF_PARENT));
  gt_queue_add(o->orphans, orphan);
  if (orphan_id && !gt_cstr_table_get(o->orphan_ids, orphan_id))
    gt_cstr_table_add(o->orphan_ids, orphan_id);
  if (!missing_parents)
    return;
  for (idx = 0; idx < gt_str_array_size(missing_parents); idx++) {
    const char *parent_id = gt_str_array_get(missing_parents, idx);
    /* already known as missing -> skip */
    if (gt_cstr_table_get(o->missing_parents, parent_id))
      continue;
    gt_cstr_table_add(o->missing_parents, parent_id);
  }
}
Example #15
0
/* Return true if <fn> should be filtered out because of its target strand:
   <targetstrand> is set (i.e. differs from GT_NUM_OF_STRAND_TYPES), <fn> has
   a Target attribute with exactly one target, and that target has a strand
   which differs from <targetstrand>. Returns false in all other cases. */
static bool filter_targetstrand(GtFeatureNode *fn, GtStrand targetstrand)
{
  unsigned long target_count;
  GtStrand strand_of_target;
  const char *target_attr;
  GT_UNUSED int had_err;
  gt_assert(fn);
  /* no strand filter requested */
  if (targetstrand == GT_NUM_OF_STRAND_TYPES)
    return false;
  target_attr = gt_feature_node_get_attribute(fn, GT_GFF_TARGET);
  if (!target_attr)
    return false;
  had_err = gt_gff3_parser_parse_target_attributes(target_attr, &target_count,
                                                   NULL, NULL,
                                                   &strand_of_target,
                                                   "", 0, NULL);
  gt_assert(!had_err);
  return target_count == 1 &&
         strand_of_target != GT_NUM_OF_STRAND_TYPES &&
         strand_of_target != targetstrand;
}
Example #16
0
/* If <gn> is a feature node, iterate over the features delivered by a feature
   node iterator on it and call extracttarget_from_seqfiles() for every one
   which has a "Target" attribute. The iteration stops at the first error.
   Returns 0 for non-feature nodes and on success, otherwise the error code of
   extracttarget_from_seqfiles(). */
static int extracttarget_from_node(GtGenomeNode *gn, GtStrArray *seqfiles,
                                   GtError *err)
{
  GtFeatureNodeIterator *iter;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(gn && seqfiles);
  /* only feature nodes can carry targets */
  if (!gt_genome_node_cast(gt_feature_node_class(), gn))
    return 0;
  iter = gt_feature_node_iterator_new(gt_feature_node_cast(gn));
  while (!had_err) {
    const char *target;
    /* XXX remove cast */
    GtFeatureNode *node = (GtFeatureNode*) gt_feature_node_iterator_next(iter);
    if (!node)
      break;
    target = gt_feature_node_get_attribute(node, "Target");
    if (target)
      had_err = extracttarget_from_seqfiles(target, seqfiles, err);
  }
  gt_feature_node_iterator_delete(iter);
  return had_err;
}
/* Build an ID for <fn> from its "ID" attribute which is not yet contained in
   gff3_visitor->used_ids. If the attribute value is already in use,
   id_string_is_unique() is called with the counters 1, 2, ... until it
   reports a unique ID, and a warning about the change is issued. The
   resulting ID is added to used_ids, stored for <fn> in
   feature_node_to_unique_id_str, and returned. */
static GtStr* make_id_unique(GtGFF3Visitor *gff3_visitor, GtFeatureNode *fn)
{
  GtStr *unique_id = gt_str_new_cstr(gt_feature_node_get_attribute(fn, "ID"));

  if (gt_cstr_table_get(gff3_visitor->used_ids, gt_str_get(unique_id))) {
    GtStr *candidate = gt_str_new();
    GtUword counter = 1;
    /* try successive counters until a free ID has been found */
    for (;;) {
      if (id_string_is_unique(unique_id, candidate, gff3_visitor->used_ids,
                              counter++)) {
        break;
      }
    }
    gt_warning("feature ID \"%s\" not unique: changing to %s",
               gt_str_get(unique_id), gt_str_get(candidate));
    gt_str_set(unique_id, gt_str_get(candidate));
    gt_str_delete(candidate);
  }
  /* update table with the new id */
  gt_cstr_table_add(gff3_visitor->used_ids, gt_str_get(unique_id));
  /* store (unique) id */
  gt_hashmap_add(gff3_visitor->feature_node_to_unique_id_str, fn, unique_id);

  return unique_id;
}
/* Decide what happens to <current_feature>, which must have a target
   attribute (asserted).
   Features with a number of targets other than one are appended to <trees>
   unconditionally. For features with exactly one target a key is built
   (build_key) and looked up in <target_to_elem>:
   - no entry yet: the feature is included;
   - entry with a lower score: the previous feature is replaced;
   - otherwise: <current_feature> is deleted. */
static void filter_targetbest(GtFeatureNode *current_feature,
                              GtDlist *trees, GtHashmap *target_to_elem)
{
  unsigned long target_count;
  GtStr *target_id;
  const char *target_attr;
  int had_err;
  gt_assert(current_feature && trees);
  target_attr = gt_feature_node_get_attribute(current_feature, TARGET_STRING);
  gt_assert(target_attr);
  target_id = gt_str_new();
  had_err = gt_gff3_parser_parse_target_attributes(target_attr, &target_count,
                                                   target_id, NULL, NULL,
                                                   "", 0, NULL);
  gt_assert(!had_err);
  if (target_count != 1) {
    /* not exactly one target -> keep the feature as it is */
    gt_dlist_add(trees, current_feature);
  }
  else {
    GtStr *key = gt_str_new();
    GtDlistelem *existing;
    build_key(key, current_feature, target_id);
    existing = gt_hashmap_get(target_to_elem, gt_str_get(key));
    if (existing) {
      /* element with this target_id included already -> compare them */
      GtFeatureNode *previous_feature = gt_dlistelem_get_data(existing);
      if (gt_feature_node_get_score(current_feature) >
          gt_feature_node_get_score(previous_feature)) {
        /* current feature is better -> replace previous feature */
        replace_previous_elem(existing, current_feature, trees,
                              target_to_elem, key);
      }
      else {
        /* current feature is not better -> remove it */
        gt_genome_node_delete((GtGenomeNode*) current_feature);
      }
    }
    else {
      /* element with this target_id not included yet -> include it */
      include_feature(trees, target_to_elem, current_feature, key);
    }
    gt_str_delete(key);
  }
  gt_str_delete(target_id);
}
Example #19
0
/* Create a feature covering <range> on the sequence and strand of <fn> and
   add it as a child of <fn>. Its type is bed_parser->thick_feature_type if
   set, BED_THICK_FEATURE_TYPE otherwise; "Name" attribute (if any), score and
   strand are copied from <fn>. */
static void construct_thick_feature(GtBEDParser *bed_parser, GtFeatureNode *fn,
                                    GtRange range)
{
  GtFeatureNode *thick;
  GtStr *seqid;
  const char *name;
  gt_assert(fn);
  seqid = gt_genome_node_get_seqid((GtGenomeNode*) fn);
  thick = (GtFeatureNode*)
          gt_feature_node_new(seqid,
                              bed_parser->thick_feature_type
                              ? bed_parser->thick_feature_type
                              : BED_THICK_FEATURE_TYPE,
                              range.start, range.end,
                              gt_feature_node_get_strand(fn));
  name = gt_feature_node_get_attribute(fn, "Name");
  if (name)
    gt_feature_node_add_attribute(thick, "Name", name);
  gt_feature_node_set_score(thick, gt_feature_node_get_score(fn));
  gt_feature_node_set_strand(thick, gt_feature_node_get_strand(fn));
  gt_feature_node_add_child(fn, thick);
}
/* Deliver the next node of the target-best filter stream in <*gn>.
   On the first call the complete input stream is consumed: feature nodes with
   a "Target" attribute go through filter_targetbest(), all other nodes are
   appended to the list of trees unchanged. Afterwards (and on every later
   call) the list is handed out element by element; <*gn> is set to NULL when
   it is exhausted.
   Returns 0 on success, or the error code of the input stream if reading it
   failed. */
static int targetbest_filter_stream_next(GtNodeStream *gs, GtGenomeNode **gn,
                                         GtError *err)
{
  GtTargetbestFilterStream *tfs;
  int had_err = 0;
  gt_error_check(err);
  tfs = targetbest_filter_stream_cast(gs);

  if (!tfs->in_stream_processed) {
    /* first call: read and filter the whole input */
    for (;;) {
      GtGenomeNode *node;
      had_err = gt_node_stream_next(tfs->in_stream, &node, err);
      if (had_err || !node)
        break;
      if (!gt_feature_node_try_cast(node) ||
          !gt_feature_node_get_attribute((GtFeatureNode*) node, "Target")) {
        gt_dlist_add(tfs->trees, node);
      }
      else {
        filter_targetbest((GtFeatureNode*) node, tfs->trees,
                          tfs->target_to_elem);
      }
    }
    tfs->next = gt_dlist_first(tfs->trees);
    tfs->in_stream_processed = true;
  }

  if (had_err)
    return had_err;

  gt_assert(tfs->in_stream_processed);
  if (!tfs->next) {
    *gn = NULL;
    return 0;
  }
  *gn = gt_dlistelem_get_data(tfs->next);
  tfs->next = gt_dlistelem_next(tfs->next);
  return 0;
}
Example #21
0
/* Attach the best usable PPT hit from <results> as a child feature of
   element->mainnode.
   If <*canonical_strand> is GT_STRAND_UNKNOWN it is set to the strand of the
   top-ranked hit and that hit is used. Otherwise the hits are scanned in rank
   order for the first one on <*canonical_strand>; if there is none, nothing
   is attached. The new feature has type GT_PPT_TYPE, source <tag> and 1-based
   coordinates. The strand of the main node is set to the strand of the
   chosen hit. */
static void ppt_attach_results_to_gff3(GtPPTResults *results,
                                       GtLTRElement *element,
                                       GtStrand *canonical_strand,
                                       GtStr *tag)
{
  GtFeatureNode *ppt_node;
  GtRange coords;
  unsigned long rank = 0;
  GtPPTHit *best = gt_ppt_results_get_ranked_hit(results, rank++);

  if (*canonical_strand != GT_STRAND_UNKNOWN) {
    /* find best-scoring PPT on the given canonical strand */
    while (gt_ppt_hit_get_strand(best) != *canonical_strand
             && rank < gt_ppt_results_get_number_of_hits(results)) {
      gt_log_log("dropping PPT because of nonconsistent strand: %s\n",
                 gt_feature_node_get_attribute(element->mainnode, "ID"));
      best = gt_ppt_results_get_ranked_hit(results, rank++);
    }
    /* if there is none, do not report a PPT */
    if (gt_ppt_hit_get_strand(best) != *canonical_strand)
      return;
  }
  else {
    /* no constraint yet -> the top hit defines the canonical strand */
    *canonical_strand = gt_ppt_hit_get_strand(best);
  }

  coords = gt_ppt_hit_get_coords(best);
  /* GFF3 is 1-based */
  coords.start++;
  coords.end++;
  ppt_node = (GtFeatureNode*)
             gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                          element->mainnode),
                                 GT_PPT_TYPE,
                                 coords.start,
                                 coords.end,
                                 gt_ppt_hit_get_strand(best));
  gt_feature_node_set_source(ppt_node, tag);
  gt_feature_node_set_strand(element->mainnode, gt_ppt_hit_get_strand(best));
  gt_feature_node_add_child(element->mainnode, ppt_node);
}
/* Extract the sequence belonging to the feature node <gn> and append it to
   <sequence>, using <region_mapping> to obtain the sequence data.

   - If <seqid> is given, the sequence ID of <gn> is appended to it.
   - If <target_ids> is given and <gn> has a Target attribute, the attribute
     is parsed into <target_ids>.
   - If <join> is true, the direct children of <gn> are passed one after the
     other to extract_join_feature() together with <type>; if the first child
     has a Target attribute (and <target_ids> is given), <target_ids> is reset
     and filled from it. Afterwards <sequence> is reverse complemented if
     extract_join_feature() has set the reverse strand flag.
   - If <join> is false and the type of <gn> equals <type>, the range of <gn>
     itself is fetched and appended, reverse complemented for features on the
     reverse strand. If the type does not match, nothing is appended.
   - If <out_phase_offset> is given, the phase offset determined on the way
     (0 if no defined phase was encountered) is stored in it.

   Returns 0 on success and a non-zero value (with <err> set by the failing
   callee) otherwise. */
static int gt_extract_feature_sequence_generic(GtStr *sequence,
                                GtGenomeNode *gn,
                                const char *type, bool join, GtStr *seqid,
                                GtStrArray *target_ids,
                                unsigned int *out_phase_offset,
                                GtRegionMapping *region_mapping, GtError *err)
{
  GtFeatureNode *fn;
  GtRange range;
  unsigned int phase_offset = 0;
  char *outsequence;
  const char *target;
  int had_err = 0;

  gt_error_check(err);
  /* <gn> must be a feature node */
  fn = gt_genome_node_cast(gt_feature_node_class(), gn);
  gt_assert(fn);

  if (seqid)
    gt_str_append_str(seqid, gt_genome_node_get_seqid(gn));
  /* collect the target IDs of the feature itself */
  if (target_ids &&
      (target = gt_feature_node_get_attribute(fn, GT_GFF_TARGET))) {
    had_err = gt_gff3_parser_parse_all_target_attributes(target, false,
                                                         target_ids, NULL,
                                                         NULL, "", 0, err);
  }
  if (!had_err) {
    if (join) {
      GtFeatureNodeIterator *fni;
      GtFeatureNode *child;
      bool reverse_strand = false,
           first_child = true,
           first_child_of_type_seen = false;
      GtPhase phase = GT_PHASE_UNDEFINED;
      /* in this case we have to traverse the children */
      fni = gt_feature_node_iterator_new_direct(gt_feature_node_cast(gn));
      while (!had_err && (child = gt_feature_node_iterator_next(fni))) {
        if (first_child) {
          /* target IDs of the first child replace those collected from the
             parent above */
          if (target_ids &&
               (target = gt_feature_node_get_attribute(child, GT_GFF_TARGET))) {
            gt_str_array_reset(target_ids);
            had_err = gt_gff3_parser_parse_all_target_attributes(target, false,
                                                                 target_ids,
                                                                 NULL,
                                                                 NULL, "", 0,
                                                                 err);
          }
          first_child = false;
        }
        if (!had_err) {
          if (extract_join_feature((GtGenomeNode*) child, type, region_mapping,
                                   sequence, &reverse_strand,
                                   &first_child_of_type_seen,
                                   &phase, err)) {
            had_err = -1;
          }
          /* <phase> is only written by extract_join_feature(); take it over
             as soon as it is defined */
          if (phase != GT_PHASE_UNDEFINED) {
            phase_offset = (int) phase;
          }
        }
      }
      gt_feature_node_iterator_delete(fni);
      gt_assert(phase_offset <= (unsigned int) GT_PHASE_UNDEFINED);
      /* the joined sequence is reverse complemented as a whole */
      if (!had_err && gt_str_length(sequence)) {
        if (reverse_strand) {
          had_err = gt_reverse_complement(gt_str_get(sequence),
                                          gt_str_length(sequence), err);
        }
      }
    }
    /* NOTE(review): the types are compared by pointer, not by content --
       presumably both are interned strings; confirm */
    else if (gt_feature_node_get_type(fn) == type) {
      GtPhase phase = gt_feature_node_get_phase(fn);
      gt_assert(!had_err);
      if (phase != GT_PHASE_UNDEFINED)
        phase_offset = (unsigned int) phase;
      /* otherwise we only have to look at this feature */
      range = gt_genome_node_get_range(gn);
      gt_assert(range.start); /* 1-based coordinates */
      had_err = gt_region_mapping_get_sequence(region_mapping, &outsequence,
                                               gt_genome_node_get_seqid(gn),
                                               range.start, range.end, err);
      if (!had_err) {
        gt_str_append_cstr_nt(sequence, outsequence, gt_range_length(&range));
        gt_free(outsequence);
        if (gt_feature_node_get_strand(fn) == GT_STRAND_REVERSE) {
          had_err = gt_reverse_complement(gt_str_get(sequence),
                                          gt_str_length(sequence), err);
        }
      }
    }
  }
  /* NOTE(review): <phase_offset> starts at 0 and is only ever assigned a
     defined phase, so the second condition appears to hold always (unless
     GT_PHASE_UNDEFINED is 0) -- confirm; it is also written if an error
     occurred */
  if (out_phase_offset && phase_offset != GT_PHASE_UNDEFINED) {
    *out_phase_offset = phase_offset;
  }
  return had_err;
}
/* Visitor callback for feature nodes.
   - If the seqid of <fn> has been defined already, <fn> is put into the node
     buffer.
   - If it has not been defined and ensure_sorting is set, <err> is set and
     -1 is returned.
   - Otherwise <fn> is collected under an automatically created sequence
     region for its seqid. The region spans the range of <fn> joined with the
     ranges of all features delivered by a feature node iterator on it; for a
     feature with the is-circular attribute only the range of <fn> itself is
     used. An already existing automatic region is enlarged (or, if <fn> is
     the first circular feature for it, marked circular and set to the range
     of <fn>).
   Returns 0 unless the sorting check failed. */
static int add_ids_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                        GtError *err)
{
  AutomaticSequenceRegion *auto_sr;
  GtAddIDsVisitor *aiv;
  GtRange range;
  const char *seqid;
  bool is_circular;

  aiv = add_ids_visitor_cast(nv);
  seqid = gt_str_get(gt_genome_node_get_seqid((GtGenomeNode*) fn));
  if (aiv->ensure_sorting && !gt_cstr_table_get(aiv->defined_seqids, seqid)) {
    gt_error_set(err, "the file %s is not sorted (seqid \"%s\" on line %u has "
                 "not been previously introduced with a \"%s\" line)",
                 gt_genome_node_get_filename((GtGenomeNode*) fn), seqid,
                 gt_genome_node_get_line_number((GtGenomeNode*) fn),
                 GT_GFF_SEQUENCE_REGION);
    return -1;
  }

  /* sequence region is known -> the node only has to be buffered */
  if (gt_cstr_table_get(aiv->defined_seqids, seqid)) {
    gt_queue_add(aiv->node_buffer, fn);
    return 0;
  }

  /* determine the range the sequence region has to cover */
  range = gt_genome_node_get_range((GtGenomeNode*) fn);
  is_circular = gt_feature_node_get_attribute(fn, GT_GFF_IS_CIRCULAR) != NULL;
  if (!is_circular) {
    GtFeatureNodeIterator *fni = gt_feature_node_iterator_new(fn);
    GtFeatureNode *node;
    while ((node = gt_feature_node_iterator_next(fni))) {
      GtRange node_range = gt_genome_node_get_range((GtGenomeNode*) node);
      range = gt_range_join(&range, &node_range);
    }
    gt_feature_node_iterator_delete(fni);
  }

  /* sequence region has not been previously introduced -> check if one has
     already been created automatically */
  auto_sr = gt_hashmap_get(aiv->undefined_sequence_regions, seqid);
  if (auto_sr) {
    if (auto_sr->is_circular) {
      gt_assert(!is_circular); /* XXX */
    }
    else if (is_circular) {
      gt_assert(!auto_sr->is_circular); /* XXX */
      auto_sr->is_circular = true;
      gt_genome_node_set_range(auto_sr->sequence_region, &range);
    }
    else {
      /* update the range of the sequence region */
      GtRange sr_range = gt_genome_node_get_range(auto_sr->sequence_region),
              joined_range = gt_range_join(&range, &sr_range);
      gt_genome_node_set_range(auto_sr->sequence_region, &joined_range);
    }
  }
  else {
    GtStr *seqid_str;
    /* sequence region has not been created automatically -> do it now */
    gt_warning("seqid \"%s\" on line %u in file \"%s\" has not been "
               "previously introduced with a \"%s\" line, create such a line "
               "automatically", seqid,
               gt_genome_node_get_line_number((GtGenomeNode*) fn),
               gt_genome_node_get_filename((GtGenomeNode*) fn),
               GT_GFF_SEQUENCE_REGION);
    auto_sr = automatic_sequence_region_new(is_circular);
    seqid_str = gt_genome_node_get_seqid((GtGenomeNode*) fn);
    auto_sr->sequence_region = gt_region_node_new(seqid_str, range.start,
                                                             range.end);
    gt_hashmap_add(aiv->undefined_sequence_regions, gt_str_get(seqid_str),
                   auto_sr);
  }
  gt_array_add(auto_sr->feature_nodes, fn);
  return 0;
}
/* Classifies the effect of one variant allele of <snp> on the codon of <mRNA>
   it falls into and records the result in the GT_GVF_VARIANT_EFFECT attribute
   of <snp>.

   <variant_pos> is the 0-based offset of the variant within the sequence
   stored for <mRNA> in <sav>->rnaseqs, <variant_idx> is the number written
   into the effect string for this allele and <variant_char> is the allele as
   it is substituted into that sequence. In builds with assertions enabled,
   <reference_char> is cross-checked against the base found at <variant_pos>.

   The effect is one of the synonymous-stop, stop-lost, nonsense,
   synonymous-amino or missense symbols, chosen by comparing the original and
   the substituted codon using the translation table <sav>->tt. The effect
   string "<effect> <variant_idx> <mRNA type> <mRNA ID>" is appended
   (comma-separated) to an already existing GT_GVF_VARIANT_EFFECT value.

   Returns 0 on success (also if no sequence is stored for <mRNA>). Returns -1
   and sets <err> if the codon containing <variant_pos> is not completely
   covered by the sequence; otherwise returns the error code of
   gt_trans_table_translate_codon(). */
static int snp_annotator_classify_snp(GtSNPAnnotatorVisitor *sav,
                                      GtFeatureNode *mRNA,
                                      GtFeatureNode *snp,
                                      GtUword variant_pos,
                                      GtUword variant_idx,
                                      char variant_char,
#ifndef NDEBUG
                                      GT_UNUSED char reference_char,
#endif
                                      GT_UNUSED GtError *err)
{
  int had_err = 0;
  char *mrnaseq,
       codon[3],
       variant_codon[3],
       oldamino,
       newamino;
  const char *variant_effect = NULL,
             *dbxref,
             *var_attrib;
  GtStr *effect_string;
  GtUword mrnalen,
          codonstart,
          variantoffset;
  gt_assert(mRNA && snp && sav);

  /* the attribute is optional, do not hand a NULL pointer to "%s" */
  dbxref = gt_feature_node_get_attribute(snp, "Dbxref");
  gt_log_log("processing variant char %c for SNP %s\n",
             variant_char, dbxref ? dbxref : "(null)");

  mrnaseq = gt_hashmap_get(sav->rnaseqs, mRNA);
  gt_assert(mrnaseq);
  if (!mrnaseq)
    return had_err;

  mrnalen = strlen(mrnaseq);
  gt_assert(variant_pos < mrnalen);
  codonstart = (variant_pos / GT_CODON_LENGTH) * GT_CODON_LENGTH;
  variantoffset = variant_pos % GT_CODON_LENGTH;

  /* A sequence whose length is not a multiple of the codon length ends in a
     partial codon. Reading three bases from it would pick up the terminating
     NUL byte or even memory behind it, so refuse to classify such a variant
     (this also covers an out-of-range <variant_pos> when assertions are
     compiled out). */
  if (mrnalen < GT_CODON_LENGTH || codonstart > mrnalen - GT_CODON_LENGTH) {
    gt_error_set(err, "cannot classify variant at mRNA position " GT_WU
                 ": the affected codon extends beyond the end of the mRNA "
                 "sequence (length " GT_WU ")", variant_pos, mrnalen);
    return -1;
  }

  variant_codon[0] = codon[0] = mrnaseq[codonstart];
  variant_codon[1] = codon[1] = mrnaseq[codonstart+1];
  variant_codon[2] = codon[2] = mrnaseq[codonstart+2];
  variant_codon[variantoffset] = variant_char;
#ifndef NDEBUG
  /* <ctype.h> functions require values representable as unsigned char */
  gt_assert(toupper((unsigned char) codon[variantoffset])
              == toupper((unsigned char) reference_char));
#endif

  if (gt_trans_table_is_stop_codon(sav->tt, codon[0], codon[1], codon[2])) {
    /* original codon is a stop codon */
    if (gt_trans_table_is_stop_codon(sav->tt, variant_codon[0],
                                     variant_codon[1], variant_codon[2])) {
      variant_effect = gt_symbol(GT_SNP_SYNONYMOUS_STOP_EFFECT);
    } else {
      variant_effect = gt_symbol(GT_SNP_STOP_LOST_EFFECT);
    }
  } else {
    if (gt_trans_table_is_stop_codon(sav->tt, variant_codon[0],
                                     variant_codon[1], variant_codon[2])) {
      /* the variant introduces a stop codon */
      variant_effect = gt_symbol(GT_SNP_NONSENSE_EFFECT);
    } else {
      /* neither codon is a stop codon -> compare the encoded amino acids */
      had_err = gt_trans_table_translate_codon(sav->tt, codon[0], codon[1],
                                               codon[2], &oldamino, err);
      if (!had_err) {
        had_err = gt_trans_table_translate_codon(sav->tt, variant_codon[0],
                                                 variant_codon[1],
                                                 variant_codon[2],
                                                 &newamino, err);
      }
      if (!had_err) {
        if (newamino == oldamino) {
          variant_effect = gt_symbol(GT_SNP_SYNONYMOUS_AMINO_EFFECT);
        } else {
          variant_effect = gt_symbol(GT_SNP_MISSENSE_EFFECT);
        }
      }
    }
  }

  if (!had_err) {
    gt_assert(variant_effect != NULL);
    /* extend an existing effect list instead of overwriting it */
    if ((var_attrib = gt_feature_node_get_attribute(snp,
                                                    GT_GVF_VARIANT_EFFECT))) {
      effect_string = gt_str_new_cstr(var_attrib);
      gt_str_append_cstr(effect_string, ",");
      gt_str_append_cstr(effect_string, variant_effect);
    } else {
      effect_string = gt_str_new_cstr(variant_effect);
    }
    gt_str_append_cstr(effect_string, " ");
    gt_str_append_ulong(effect_string, variant_idx);
    gt_str_append_cstr(effect_string, " ");
    gt_str_append_cstr(effect_string, gt_feature_node_get_type(mRNA));
    gt_str_append_cstr(effect_string, " ");
    gt_str_append_cstr(effect_string,
                       gt_feature_node_get_attribute(mRNA, GT_GFF_ID));
    gt_feature_node_set_attribute(snp, GT_GVF_VARIANT_EFFECT,
                                  gt_str_get(effect_string));
    gt_str_delete(effect_string);
  }

  return had_err;
}
/* Feature node callback of the SNP annotator visitor.

   Nodes which are NULL or whose type is neither <sav>->SNV_type nor
   <sav>->SNP_type are ignored. For every other node <fn>, all direct
   children of <sav>->gene with type <sav>->mRNA_type are inspected: the CDS
   features below each mRNA are traversed while summing up their lengths in
   order to translate the start of <fn> into an offset in the sequence stored
   for that mRNA in <sav>->rnaseqs. Each allele listed in the
   GT_GVF_VARIANT_SEQ attribute of <fn> (comma-separated, terminated by ';' or
   the end of the string) which differs from the base at that position is
   then handed to snp_annotator_classify_snp().

   The alleles in the attributes are compared with the sequence base in the
   orientation of the attributes; for mRNAs on the reverse strand the sequence
   base is therefore complemented before the comparison and each variant
   allele is complemented before it is classified.

   Returns 0 on success, otherwise the error code of gt_complement() or
   snp_annotator_classify_snp() (which receive <err>). */
static int snp_annotator_visitor_feature_node(GtNodeVisitor *nv,
                                              GtFeatureNode *fn,
                                              GtError *err)
{
  int had_err = 0;
  GtSNPAnnotatorVisitor *sav;
  GtFeatureNodeIterator *fni,
                        *mrnafni;
  GtFeatureNode *curnode,
                *curnode2;
  GtRange snp_rng;
  gt_error_check(err);
  sav = snp_annotator_visitor_cast(nv);

  /* ignore non-nodes */
  if (!fn) return 0;

  /* only process SNPs */
  if (!(gt_feature_node_get_type(fn) == sav->SNV_type ||
        gt_feature_node_get_type(fn) == sav->SNP_type)) {
    return 0;
  }

  fni = gt_feature_node_iterator_new_direct(sav->gene);
  snp_rng = gt_genome_node_get_range((GtGenomeNode*) fn);
  while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) {
    if (gt_feature_node_get_type(curnode) == sav->mRNA_type) {
      GtStrand mrna_strand = gt_feature_node_get_strand(curnode);
      GtUword mrnasnppos = 0;
      mrnafni = gt_feature_node_iterator_new(curnode);
      while (!had_err && (curnode2 = gt_feature_node_iterator_next(mrnafni))) {
        if (gt_feature_node_get_type(curnode2) == sav->CDS_type) {
          GtRange cds_rng = gt_genome_node_get_range((GtGenomeNode*) curnode2);
          if (gt_range_overlap(&snp_rng, &cds_rng)) {
            char *mRNA,
                 origchar,
                 *variantptr = NULL;
            const char *attr;
#ifndef NDEBUG
            char refchar; /* reference allele, oriented like the sequence */
#endif
            mRNA = (char*) gt_hashmap_get(sav->rnaseqs, curnode);
            gt_assert(mRNA);
            gt_assert(snp_rng.start >= cds_rng.start);
            mrnasnppos += (snp_rng.start - cds_rng.start);
            if (mrna_strand == GT_STRAND_REVERSE)
              mrnasnppos = strlen(mRNA) - mrnasnppos - 1;
            gt_assert(mrnasnppos < strlen(mRNA));
            origchar = mRNA[mrnasnppos];
#ifndef NDEBUG
            /* Without a reference allele to cross-check there is nothing
               better to pass on than the sequence base itself. */
            refchar = origchar;
#endif
            /* Bring the sequence base into the orientation of the alleles
               before comparing against them. This used to happen only as a
               side effect of the debug-only consistency check below, which
               made builds with NDEBUG compare bases of opposite orientation
               for reverse strand mRNAs. */
            if (mrna_strand == GT_STRAND_REVERSE)
              had_err = gt_complement(&origchar, origchar, err);
#ifndef NDEBUG
            attr = gt_feature_node_get_attribute(fn, GT_GVF_REFERENCE_SEQ);
            if (!had_err && attr) {
              gt_assert(toupper((unsigned char) origchar)
                          == toupper((unsigned char) attr[0]));
              refchar = attr[0];
              if (mrna_strand == GT_STRAND_REVERSE)
                had_err = gt_complement(&refchar, refchar, err);
            }
#endif
            attr = gt_feature_node_get_attribute(fn, GT_GVF_VARIANT_SEQ);
            if (!had_err && attr) {
              char *variantchars;
              GtUword i = 0;

              /* Work on a private copy: classifying an allele sets an
                 attribute of <fn> while this list is still being read. */
              variantchars = variantptr = gt_cstr_dup(attr);
              while (!had_err &&
                              (*variantchars != ';' && *variantchars != '\0')) {
                if (*variantchars != ',' && *variantchars != origchar) {
                  char variantchar = *variantchars;
                  if (mrna_strand == GT_STRAND_REVERSE)
                    had_err = gt_complement(&variantchar, variantchar, err);
                  if (!had_err) {
                    had_err = snp_annotator_classify_snp(sav, curnode, fn,
                                                         mrnasnppos,
                                                         i++,
                                                         variantchar,
#ifndef NDEBUG
                                                         refchar,
#endif
                                                         err);
                  }
                } else if (*variantchars == origchar) {
                  /* the unchanged allele still occupies an index */
                  i++;
                }
                variantchars++;
              }
            }
            /* release the copy on every path, not only on success */
            gt_free(variantptr);
          } else {
            mrnasnppos += gt_range_length(&cds_rng);
          }
        }
      }
      gt_feature_node_iterator_delete(mrnafni);
    }
  }
  gt_feature_node_iterator_delete(fni);

  return had_err;
}
/* Fetches the next node from <ls>->in_stream into <*gn> and, if it is a
   feature node, lets the visitor <ls>->lv inspect the node and its whole
   subgraph (presumably this is what fills <ls>->element -- confirm against
   the visitor setup). If a main node was recorded for the element, one
   tab-separated line describing it is written to <ls>->tabout_file and the
   sequences of its PPT, PBS, protein domains, 5' and 3' LTRs and of the
   complete element are written in FASTA format to the respective output
   files.

   Nodes which are not feature nodes are passed through untouched.
   The per-element data (<pdoms>, <pdomorder>, <seqid>) is released before
   returning.

   Returns 0 on success, otherwise the error code of the failing call (the
   input stream, the region mapping, the sequence extraction or
   write_pdom()), all of which receive <err>. */
int gt_ltrfileout_stream_next(GtNodeStream *ns, GtGenomeNode **gn, GtError *err)
{
  GtLTRdigestFileOutStream *ls;
  GtFeatureNode *fn;
  GtRange lltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          rltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          ppt_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          pbs_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD};
  int had_err;
  GtUword i=0;

  gt_error_check(err);
  ls = gt_ltrdigest_file_out_stream_cast(ns);

  /* initialize this element */
  memset(&ls->element, 0, sizeof (GtLTRElement));

  /* get annotations from parser */
  had_err = gt_node_stream_next(ls->in_stream, gn, err);
  if (!had_err && *gn)
  {
    GtFeatureNodeIterator* gni;
    GtFeatureNode *mygn;

    /* only process feature nodes (nothing has been allocated for the element
       yet, so returning here does not leak) */
    if (!(fn = gt_feature_node_try_cast(*gn)))
      return 0;

    ls->element.pdomorder = gt_array_new(sizeof (const char*));

    /* fill LTRElement structure from GFF3 subgraph; the root node itself is
       visited first, followed by the nodes the iterator delivers */
    gni = gt_feature_node_iterator_new(fn);
    for (mygn = fn; mygn != NULL; mygn = gt_feature_node_iterator_next(gni))
      (void) gt_genome_node_accept((GtGenomeNode*) mygn,
                                   (GtNodeVisitor*) ls->lv,
                                   err);
    gt_feature_node_iterator_delete(gni);
  }

  if (!had_err && ls->element.mainnode != NULL)
  {
    char desc[GT_MAXFASTAHEADER];
    GtFeatureNode *ltr3, *ltr5;
    GtStr *sdesc, *sreg, *seq;

    /* find sequence in GtEncseq */
    sreg = gt_genome_node_get_seqid((GtGenomeNode*) ls->element.mainnode);

    sdesc = gt_str_new();
    had_err = gt_region_mapping_get_description(ls->rmap, sdesc, sreg, err);

    if (!had_err) {
      GtRange rng;
      /* keep at most <seqnamelen> characters of the sequence description,
         with blanks replaced by underscores */
      ls->element.seqid = gt_calloc((size_t) ls->seqnamelen+1, sizeof (char));
      (void) snprintf(ls->element.seqid,
                      MIN((size_t) gt_str_length(sdesc),
                          (size_t) ls->seqnamelen)+1,
                      "%s", gt_str_get(sdesc));
      gt_cstr_rep(ls->element.seqid, ' ', '_');
      if (gt_str_length(sdesc) > (GtUword) ls->seqnamelen)
        ls->element.seqid[ls->seqnamelen] = '\0';

      (void) gt_ltrelement_format_description(&ls->element,
                                              ls->seqnamelen,
                                              desc,
                                              (size_t) (GT_MAXFASTAHEADER-1));

      /* output basic retrotransposon data */
      lltr_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.leftLTR);
      rltr_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.rightLTR);
      rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.mainnode);
      gt_file_xprintf(ls->tabout_file,
                      GT_WU"\t"GT_WU"\t"GT_WU"\t%s\t"GT_WU"\t"GT_WU"\t"GT_WU"\t"
                      GT_WU"\t"GT_WU"\t"GT_WU"\t",
                      rng.start, rng.end, gt_ltrelement_length(&ls->element),
                      ls->element.seqid, lltr_rng.start, lltr_rng.end,
                      gt_ltrelement_leftltrlen(&ls->element), rltr_rng.start,
                      rltr_rng.end, gt_ltrelement_rightltrlen(&ls->element));
    }
    /* the description string is not needed any more; delete it here so that
       it is also released if the description lookup failed */
    gt_str_delete(sdesc);
    seq = gt_str_new();

    /* output TSDs */
    if (!had_err && ls->element.leftTSD != NULL)
    {
      GtRange tsd_rng;
      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.leftTSD);
      had_err = gt_extract_feature_sequence(seq,
                                       (GtGenomeNode*) ls->element.leftTSD,
                                       gt_symbol(gt_ft_target_site_duplication),
                                       false,
                                       NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file,
                         ""GT_WU"\t"GT_WU"\t%s\t",
                         tsd_rng.start,
                         tsd_rng.end,
                         gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else gt_file_xprintf(ls->tabout_file, "\t\t\t");

    if (!had_err && ls->element.rightTSD != NULL)
    {
      GtRange tsd_rng;

      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.rightTSD);
      had_err = gt_extract_feature_sequence(seq,
                                       (GtGenomeNode*) ls->element.rightTSD,
                                       gt_symbol(gt_ft_target_site_duplication),
                                       false,
                                       NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file,
                           ""GT_WU"\t"GT_WU"\t%s\t",
                           tsd_rng.start,
                           tsd_rng.end,
                           gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else gt_file_xprintf(ls->tabout_file, "\t\t\t");

    /* output PPT */
    if (!had_err && ls->element.ppt != NULL)
    {
      GtStrand ppt_strand = gt_feature_node_get_strand(ls->element.ppt);

      ppt_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.ppt);
      had_err = gt_extract_feature_sequence(seq,
                                            (GtGenomeNode*) ls->element.ppt,
                                            gt_symbol(gt_ft_RR_tract), false,
                                            NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&ppt_rng),
                            GT_FSWIDTH, ls->pptout_file);
        /* the last column is the distance between the PPT and the LTR
           boundary selected by the PPT strand */
        gt_file_xprintf(ls->tabout_file,
                           ""GT_WU"\t"GT_WU"\t%s\t%c\t%d\t",
                           ppt_rng.start,
                           ppt_rng.end,
                           gt_str_get(seq),
                           GT_STRAND_CHARS[ppt_strand],
                           (ppt_strand == GT_STRAND_FORWARD ?
                               abs((int) (rltr_rng.start - ppt_rng.end)) :
                               abs((int) (lltr_rng.end - ppt_rng.start))));
      }
      gt_str_reset(seq);
    } else gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t");

    /* output PBS */
    if (!had_err && ls->element.pbs != NULL)
    {
      GtStrand pbs_strand;

      pbs_strand = gt_feature_node_get_strand(ls->element.pbs);
      pbs_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.pbs);
      had_err = gt_extract_feature_sequence(seq,
                                           (GtGenomeNode*) ls->element.pbs,
                                           gt_symbol(gt_ft_primer_binding_site),
                                           false, NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&pbs_rng),
                            GT_FSWIDTH, ls->pbsout_file);
        gt_file_xprintf(ls->tabout_file,
                         ""GT_WU"\t"GT_WU"\t%c\t%s\t%s\t%s\t%s\t%s\t",
                         pbs_rng.start,
                         pbs_rng.end,
                         GT_STRAND_CHARS[pbs_strand],
                         gt_feature_node_get_attribute(ls->element.pbs, "trna"),
                         gt_str_get(seq),
                         gt_feature_node_get_attribute(ls->element.pbs,
                                                       "pbsoffset"),
                         gt_feature_node_get_attribute(ls->element.pbs,
                                                       "trnaoffset"),
                         gt_feature_node_get_attribute(ls->element.pbs,
                                                       "edist"));
      }
      gt_str_reset(seq);
    } else gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t\t\t\t");

    /* output protein domains */
    if (!had_err && ls->element.pdoms != NULL)
    {
      GtStr *pdomorderstr = gt_str_new();
      for (i=0; !had_err && i<gt_array_size(ls->element.pdomorder); i++)
      {
        const char* key = *(const char**) gt_array_get(ls->element.pdomorder,
                                                       i);
        GtArray *entry = (GtArray*) gt_hashmap_get(ls->element.pdoms, key);
        had_err = write_pdom(ls, entry, key, ls->rmap, desc, err);
      }

      /* list the domain names in the orientation of the element */
      if (GT_STRAND_REVERSE == gt_feature_node_get_strand(ls->element.mainnode))
        gt_array_reverse(ls->element.pdomorder);

      for (i=0 ;!had_err && i<gt_array_size(ls->element.pdomorder); i++)
      {
        const char* name = *(const char**) gt_array_get(ls->element.pdomorder,
                                                        i);
        gt_str_append_cstr(pdomorderstr, name);
        if (i != gt_array_size(ls->element.pdomorder)-1)
          gt_str_append_cstr(pdomorderstr, "/");
      }
      gt_file_xprintf(ls->tabout_file, "%s", gt_str_get(pdomorderstr));
      gt_str_delete(pdomorderstr);
    }

    /* output LTRs (we just expect them to exist) */
    switch (gt_feature_node_get_strand(ls->element.mainnode))
    {
      case GT_STRAND_REVERSE:
        ltr5 = ls->element.rightLTR;
        ltr3 = ls->element.leftLTR;
        break;
      case GT_STRAND_FORWARD:
      default:
        ltr5 = ls->element.leftLTR;
        ltr3 = ls->element.rightLTR;
        break;
    }

    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr5,
                                          gt_symbol(gt_ft_long_terminal_repeat),
                                          false,
                                          NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr5out_file);
      gt_str_reset(seq);
    }
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr3,
                                          gt_symbol(gt_ft_long_terminal_repeat),
                                          false,
                                          NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr3out_file);
      gt_str_reset(seq);
    }

    /* output complete oriented element */
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq,
                                           (GtGenomeNode*) ls->element.mainnode,
                                           gt_symbol(gt_ft_LTR_retrotransposon),
                                           false,
                                           NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc,gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->elemout_file);
      gt_str_reset(seq);
    }
    gt_file_xprintf(ls->tabout_file, "\n");
    gt_str_delete(seq);
  }
  /* release per-element data */
  gt_hashmap_delete(ls->element.pdoms);
  gt_array_delete(ls->element.pdomorder);
  gt_free(ls->element.seqid);
  return had_err;
}
/* Feature node callback of the LTR visitor: files <fn> into the matching slot
   of the visitor's element according to its feature type.

   - LTR_retrotransposon:     stored as main node (always overwritten)
   - long_terminal_repeat:    the first one becomes the left LTR, every later
                              one the right LTR; the 0-based boundaries are
                              stored alongside
   - target_site_duplication: the first one becomes the left TSD, every later
                              one the right TSD
   - RR_tract / primer_binding_site: only the first one is kept
   - protein_match:           appended to the array of hits stored under its
                              "name" attribute (with '/' replaced by '_')
   Nodes of any other type are ignored. Always returns 0. */
static int gt_ltr_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                       GT_UNUSED GtError *err)
{
  GtLTRVisitor *visitor = gt_ltr_visitor_cast(nv);
  const char *type;

  gt_assert(visitor);
  gt_error_check(err);

  type = gt_feature_node_get_type(fn);

  if (!strcmp(type, gt_ft_LTR_retrotransposon)) {
    visitor->element->mainnode = fn;
    return 0;
  }

  if (!strcmp(type, gt_ft_long_terminal_repeat)) {
    GtRange ltr_rng = gt_genome_node_get_range((GtGenomeNode*) fn);
    /* node coordinates are 1-based, the element stores 0-based ones */
    if (!visitor->element->leftLTR) {
      visitor->element->leftLTR = fn;
      visitor->element->leftLTR_5 = ltr_rng.start - 1;
      visitor->element->leftLTR_3 = ltr_rng.end - 1;
    } else {
      visitor->element->rightLTR = fn;
      visitor->element->rightLTR_5 = ltr_rng.start - 1;
      visitor->element->rightLTR_3 = ltr_rng.end - 1;
    }
    return 0;
  }

  if (!strcmp(type, gt_ft_target_site_duplication)) {
    if (!visitor->element->leftTSD)
      visitor->element->leftTSD = fn;
    else
      visitor->element->rightTSD = fn;
    return 0;
  }

  if (!strcmp(type, gt_ft_RR_tract)) {
    if (!visitor->element->ppt)
      visitor->element->ppt = fn;
    return 0;
  }

  if (!strcmp(type, gt_ft_primer_binding_site)) {
    if (!visitor->element->pbs)
      visitor->element->pbs = fn;
    return 0;
  }

  if (!strcmp(type, gt_ft_protein_match)) {
    char key[BUFSIZ];
    const char *domain_name;
    GtArray *hits;

    /* the domain table is created lazily with the first protein match; it
       is handed gt_free_func for its keys and gt_array_delete for its
       values */
    if (visitor->element->pdoms == NULL) {
      visitor->element->pdoms = gt_hashmap_new(GT_HASH_STRING, gt_free_func,
                                               (GtFree) gt_array_delete);
    }

    /* NOTE(review): the "name" attribute is not checked for NULL before it
       is formatted with "%s" -- confirm that every protein_match carries it */
    domain_name = gt_feature_node_get_attribute(fn, "name");
    (void) snprintf(key, BUFSIZ-1, "%s", domain_name);
    gt_cstr_rep(key, '/', '_');

    hits = (GtArray*) gt_hashmap_get(visitor->element->pdoms, key);
    if (hits == NULL) {
      /* first hit of this domain: register a new array under a copy of the
         key and remember the order of first appearance */
      char *stored_key = gt_cstr_dup(key);
      hits = gt_array_new(sizeof (GtFeatureNode*));
      gt_hashmap_add(visitor->element->pdoms, stored_key, hits);
      if (visitor->element->pdomorder != NULL)
        gt_array_add(visitor->element->pdomorder, stored_key);
    }
    gt_array_add(hits, fn);
  }

  return 0;
}
Example #28
0
int gt_gtf_parser_parse(GtGTFParser *parser, GtQueue *genome_nodes,
                        GtStr *filenamestr, GtFile *fpin, bool be_tolerant,
                        GtError *err)
{
  GtStr *seqid_str, *source_str, *line_buffer;
  char *line;
  size_t line_length;
  GtUword i, line_number = 0;
  GtGenomeNode *gn;
  GtRange range;
  GtPhase phase_value;
  GtStrand gt_strand_value;
  GtSplitter *splitter, *attribute_splitter;
  float score_value;
  char *seqname,
       *source,
       *feature,
       *start,
       *end,
       *score,
       *strand,
       *frame,
       *attributes,
       *token,
       *gene_id,
       *gene_name = NULL,
       *transcript_id,
       *transcript_name = NULL,
       **tokens;
  GtHashmap *transcript_id_hash; /* map from transcript id to array of genome
                                    nodes */
  GtArray *gt_genome_node_array;
  ConstructionInfo cinfo;
  GTF_feature_type gtf_feature_type;
  GT_UNUSED bool gff_type_is_valid = false;
  const char *type = NULL;
  const char *filename;
  bool score_is_defined;
  int had_err = 0;

  gt_assert(parser && genome_nodes);
  gt_error_check(err);

  filename = gt_str_get(filenamestr);

  /* alloc */
  line_buffer = gt_str_new();
  splitter = gt_splitter_new(),
  attribute_splitter = gt_splitter_new();

#define HANDLE_ERROR                                                   \
        if (had_err) {                                                 \
          if (be_tolerant) {                                           \
            fprintf(stderr, "skipping line: %s\n", gt_error_get(err)); \
            gt_error_unset(err);                                       \
            gt_str_reset(line_buffer);                                 \
            had_err = 0;                                               \
            continue;                                                  \
          }                                                            \
          else {                                                       \
            had_err = -1;                                              \
            break;                                                     \
          }                                                            \
        }

  while (gt_str_read_next_line_generic(line_buffer, fpin) != EOF) {
    line = gt_str_get(line_buffer);
    line_length = gt_str_length(line_buffer);
    line_number++;
    gene_name = gene_id = transcript_id = transcript_name = NULL;
    had_err = 0;

    if (line_length == 0) {
      gt_warning("skipping blank line " GT_WU " in file \"%s\"", line_number,
                 filename);
    }
    else if (line[0] == '#') {
      /* storing comment */
      if (line_length >= 2 && line[1] == '#')
        gn = gt_comment_node_new(line+2); /* store '##' line as '#' line */
      else
        gn = gt_comment_node_new(line+1);
      gt_genome_node_set_origin(gn, filenamestr, line_number);
      gt_queue_add(genome_nodes, gn);
    }
    else {
      bool stop_codon = false;
      char *tokendup, *attrkey;
      GtStrArray *attrkeys, *attrvals;

      /* process tab delimited GTF line */
      gt_splitter_reset(splitter);
      gt_splitter_split(splitter, line, line_length, '\t');
      if (gt_splitter_size(splitter) != 9UL) {
        gt_error_set(err, "line " GT_WU " in file \"%s\" contains " GT_WU
                     " tab (\\t) " "separated fields instead of 9", line_number,
                     filename,
                  gt_splitter_size(splitter));
        had_err = -1;
        break;
      }
      tokens = gt_splitter_get_tokens(splitter);
      seqname    = tokens[0];
      source     = tokens[1];
      feature    = tokens[2];
      start      = tokens[3];
      end        = tokens[4];
      score      = tokens[5];
      strand     = tokens[6];
      frame      = tokens[7];
      attributes = tokens[8];

      /* parse feature */
      if (GTF_feature_type_get(&gtf_feature_type, feature) == -1) {
        /* we skip unknown features */
        fprintf(stderr, "skipping line " GT_WU " in file \"%s\": unknown "
                "feature: \"%s\"\n", line_number, filename, feature);
        gt_str_reset(line_buffer);
        continue;
      }

      /* translate into GFF3 feature type */
      switch (gtf_feature_type) {
        case GTF_stop_codon:
          stop_codon = true;
        case GTF_CDS:
          gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker,
                                                       gt_ft_CDS);
          type = gt_ft_CDS;
          break;
        case GTF_exon:
          gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker,
                                                       gt_ft_exon);
          type = gt_ft_exon;
          break;
        case GTF_start_codon:
          /* we can skip the start codons, they are part of the CDS anyway */
          gt_str_reset(line_buffer);
          continue;
      }
      gt_assert(gff_type_is_valid);

      /* parse the range */
      had_err = gt_parse_range(&range, start, end, line_number, filename, err);
      HANDLE_ERROR;

      /* process seqname (we have to do it here because we need the range) */
      gt_region_node_builder_add_region(parser->region_node_builder, seqname,
                                        range);

      /* parse the score */
      had_err = gt_parse_score(&score_is_defined, &score_value, score,
                               line_number, filename, err);
      HANDLE_ERROR;

      /* parse the strand */
      had_err = gt_parse_strand(&gt_strand_value, strand, line_number, filename,
                               err);
      HANDLE_ERROR;

      /* parse the frame */
      had_err = gt_parse_phase(&phase_value, frame, line_number, filename, err);
      HANDLE_ERROR;

      /* parse the attributes */
      attrkeys = gt_str_array_new();
      attrvals = gt_str_array_new();
      gt_splitter_reset(attribute_splitter);
      gene_id = NULL;
      transcript_id = NULL;
      gt_splitter_split(attribute_splitter, attributes, strlen(attributes),
                        ';');
      for (i = 0; i < gt_splitter_size(attribute_splitter); i++) {
        token = gt_splitter_get_token(attribute_splitter, i);
        /* skip leading blanks */
        while (*token == ' ')
          token++;

        tokendup = gt_cstr_dup(token);
        attrkey = strtok(tokendup, " ");
        if (attrkey) {
          char *attrval = strtok(NULL, " ");
          if (attrval == NULL || strcmp(attrval, "") == 0 ||
              strcmp(attrval, "\"\"") == 0)
          {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU " in file \"%s\"", attrkey,line_number,filename);
            had_err = -1;
          }
          HANDLE_ERROR;

          if (*attrval == '"')
            attrval++;
          if (attrval[strlen(attrval)-1] == '"')
            attrval[strlen(attrval)-1] = '\0';
          gt_assert(attrkey && strlen(attrkey) > 0);
          gt_assert(attrval && strlen(attrval) > 0);
          gt_str_array_add_cstr(attrkeys, attrkey);
          gt_str_array_add_cstr(attrvals, attrval);
        }
        gt_free(tokendup);

        /* look for the two mandatory attributes */
        if (strncmp(token, GENE_ID_ATTRIBUTE, strlen(GENE_ID_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(GENE_ID_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", GENE_ID_ATTRIBUTE, line_number,
                         filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          gene_id = token + strlen(GENE_ID_ATTRIBUTE) + 1;
          if (*gene_id == '"')
            gene_id++;
          if (gene_id[strlen(gene_id)-1] == '"')
            gene_id[strlen(gene_id)-1] = '\0';
        }
        else if (strncmp(token, TRANSCRIPT_ID_ATTRIBUTE,
                         strlen(TRANSCRIPT_ID_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(TRANSCRIPT_ID_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          transcript_id = token + strlen(TRANSCRIPT_ID_ATTRIBUTE) + 1;
          if (*transcript_id == '"')
            transcript_id++;
          if (transcript_id[strlen(transcript_id)-1] == '"')
            transcript_id[strlen(transcript_id)-1] = '\0';
        }
        else if (strncmp(token, GENE_NAME_ATTRIBUTE,
                         strlen(GENE_NAME_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(GENE_NAME_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", GENE_NAME_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          gene_name = token + strlen(GENE_NAME_ATTRIBUTE) + 1;
          /* for output we want to strip quotes */
          if (*gene_name == '"')
            gene_name++;
          if (gene_name[strlen(gene_name)-1] == '"')
            gene_name[strlen(gene_name)-1] = '\0';
        }
        else if (strncmp(token, TRANSCRIPT_NAME_ATTRIBUTE,
                         strlen(TRANSCRIPT_NAME_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(TRANSCRIPT_NAME_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", TRANSCRIPT_NAME_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          transcript_name = token + strlen(TRANSCRIPT_NAME_ATTRIBUTE) + 1;
          /* for output we want to strip quotes */
          if (*transcript_name == '"')
            transcript_name++;
          if (transcript_name[strlen(transcript_name)-1] == '"')
            transcript_name[strlen(transcript_name)-1] = '\0';
        }
      }

      /* check for the mandatory attributes */
      if (!gene_id) {
        gt_error_set(err, "missing attribute \"%s\" on line " GT_WU
                     " in file \"%s\"", GENE_ID_ATTRIBUTE, line_number,
                     filename);
        had_err = -1;
      }
      HANDLE_ERROR;
      if (!transcript_id) {
        gt_error_set(err, "missing attribute \"%s\" on line " GT_WU
                     " in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number,
                     filename);
        had_err = -1;
      }
      HANDLE_ERROR;

      /* process the mandatory attributes */
      if (!(transcript_id_hash = gt_hashmap_get(parser->gene_id_hash,
                                             gene_id))) {
        transcript_id_hash = gt_hashmap_new(GT_HASH_STRING, gt_free_func,
                                            (GtFree) gt_array_delete);
        gt_hashmap_add(parser->gene_id_hash, gt_cstr_dup(gene_id),
                    transcript_id_hash);
      }
      gt_assert(transcript_id_hash);

      if (!(gt_genome_node_array = gt_hashmap_get(transcript_id_hash,
                                            transcript_id))) {
        gt_genome_node_array = gt_array_new(sizeof (GtGenomeNode*));
        gt_hashmap_add(transcript_id_hash, gt_cstr_dup(transcript_id),
                    gt_genome_node_array);
      }
      gt_assert(gt_genome_node_array);

      /* save optional gene_name and transcript_name attributes */
      if (transcript_name && strlen(transcript_name) > 0
            && !gt_hashmap_get(parser->transcript_id_to_name_mapping,
                             transcript_id)) {
        gt_hashmap_add(parser->transcript_id_to_name_mapping,
                    gt_cstr_dup(transcript_id),
                    gt_cstr_dup(transcript_name));
      }
      if (gene_name && strlen(gene_name) > 0
            && !gt_hashmap_get(parser->gene_id_to_name_mapping,
                                    gene_id)) {
        gt_hashmap_add(parser->gene_id_to_name_mapping,
                    gt_cstr_dup(gene_id),
                    gt_cstr_dup(gene_name));
      }

      /* get seqid */
      seqid_str = gt_hashmap_get(parser->seqid_to_str_mapping, seqname);
      if (!seqid_str) {
        seqid_str = gt_str_new_cstr(seqname);
        gt_hashmap_add(parser->seqid_to_str_mapping, gt_str_get(seqid_str),
                       seqid_str);
      }
      gt_assert(seqid_str);

      /* construct the new feature */
      gn = gt_feature_node_new(seqid_str, type, range.start, range.end,
                                 gt_strand_value);
      gt_genome_node_set_origin(gn, filenamestr, line_number);
      if (stop_codon) {
        gt_feature_node_add_attribute((GtFeatureNode*) gn,
                                      GTF_PARSER_STOP_CODON_FLAG, "true");
      }
      for (i = 0; i < gt_str_array_size(attrkeys); i++) {
        GtFeatureNode *fn = (GtFeatureNode *)gn;
        const char *key = gt_str_array_get(attrkeys, i);
        const char *val = gt_str_array_get(attrvals, i);

        /* Not a comprehensive solution to ensure correct encoding, just bare
           minimum required to get Cufflinks output parsed */
        if (strcmp(val, "=") == 0)
          val = "%26";

        if (gt_feature_node_get_attribute(fn, key) != NULL) {
          const char *oldval = gt_feature_node_get_attribute(fn, key);
          GtStr *newval = gt_str_new_cstr(oldval);
          gt_str_append_char(newval, ',');
          gt_str_append_cstr(newval, val);
          gt_feature_node_set_attribute(fn, key, gt_str_get(newval));
          gt_str_delete(newval);
        }
        else
          gt_feature_node_add_attribute(fn, key, val);
      }
      gt_str_array_delete(attrkeys);
      gt_str_array_delete(attrvals);

      /* set source */
      source_str = gt_hashmap_get(parser->source_to_str_mapping, source);
      if (!source_str) {
        source_str = gt_str_new_cstr(source);
        gt_hashmap_add(parser->source_to_str_mapping, gt_str_get(source_str),
                    source_str);
      }
      gt_assert(source_str);
      gt_feature_node_set_source((GtFeatureNode*) gn, source_str);

      if (score_is_defined)
        gt_feature_node_set_score((GtFeatureNode*) gn, score_value);
      if (phase_value != GT_PHASE_UNDEFINED)
        gt_feature_node_set_phase((GtFeatureNode*) gn, phase_value);
      gt_array_add(gt_genome_node_array, gn);
    }

    gt_str_reset(line_buffer);
  }

  /* process all region nodes */
  if (!had_err)
    gt_region_node_builder_build(parser->region_node_builder, genome_nodes);

  /* process all feature nodes */
  cinfo.genome_nodes = genome_nodes;
  cinfo.tidy = be_tolerant;
  cinfo.gene_id_to_name_mapping = parser->gene_id_to_name_mapping;
  cinfo.transcript_id_to_name_mapping = parser->transcript_id_to_name_mapping;
  if (!had_err) {
    had_err = gt_hashmap_foreach(parser->gene_id_hash, construct_genes,
                                 &cinfo, err);
  }
  gt_hashmap_foreach(parser->gene_id_hash, delete_genes, NULL, err);

  /* free */
  gt_splitter_delete(splitter);
  gt_splitter_delete(attribute_splitter);
  gt_str_delete(line_buffer);

  return had_err;
}
/*
 * Node stream callback: fetch the next node from the wrapped input stream
 * and, if it is a CpG island feature, store a score on it.
 *
 * The fetched node is always handed on through *gn, scored or not.  A node
 * is left unscored when it is not a feature node, is not of type
 * feature_type_CpGI, has a sequence id that does not match "Chr<number>",
 * or has a missing or unparseable "sumcg" attribute.
 *
 * ns   the score stream (cast with CpGI_score_stream_cast())
 * gn   receives the fetched node, or NULL when the input stream delivered
 *      no node
 * err  error object passed through to the input stream
 *
 * Returns 0 on success, otherwise the non-zero status reported by the
 * input stream.
 */
static int CpGI_score_stream_next(GtNodeStream * ns,
                                   GtGenomeNode ** gn,
                                   GtError * err)
{
    GtGenomeNode * cur_node = NULL;
    CpGI_score_stream * score_stream;
    int err_num;
    unsigned long island_start;
    unsigned long island_end;
    float island_score;
    int chromosome_num;
    const char * num_cg_str;
    unsigned long num_cg = 0;

    *gn = NULL;
    score_stream = CpGI_score_stream_cast(ns);

    /* propagate upstream failures instead of reporting success */
    err_num = gt_node_stream_next(score_stream->in_stream, &cur_node, err);
    if (err_num || cur_node == NULL)
        return err_num;

    *gn = cur_node;

    /* only feature nodes carry a type we can test */
    if (!gt_genome_node_try_cast(gt_feature_node_class(), cur_node))
        return 0;

    if (!gt_feature_node_has_type(cur_node, feature_type_CpGI))
        return 0;

    #if DEBUG_SCORE
    printf("found CpGI\n");
    #endif

    island_start = gt_genome_node_get_start(cur_node);
    island_end   = gt_genome_node_get_end(cur_node);

    /* the chromosome number is taken from a sequence id of the form ChrN;
       without it there is nothing sensible to pass to the scorer */
    if (1 != sscanf(gt_str_get(gt_genome_node_get_seqid(cur_node)),
                    "Chr%d", &chromosome_num))
        return 0;

    num_cg_str = gt_feature_node_get_attribute(cur_node, "sumcg");
    if (!num_cg_str)
        return 0;

    /* num_cg is unsigned long, so the conversion has to be %lu */
    if (1 != sscanf(num_cg_str, "%lu", &num_cg))
        return 0;

    /* now figure out the score */
    island_score = CpGI_score_stream_score_island(score_stream,
                                                  chromosome_num,
                                                  num_cg,
                                                  island_start,
                                                  island_end);

    /* save the score into the node */
    gt_feature_node_set_score(cur_node, island_score);

    return 0;
}
/*
 * Node stream callback: fetch the next node from the wrapped input stream
 * and, if it is (or, for a pseudo node, contains) a gene feature, annotate
 * that gene with the name of the overlap found at its TSS position.
 *
 * The fetched node is always handed on through *gn.  For a pseudo node the
 * first child of type feature_type_gene is annotated while the pseudo node
 * itself is what gets passed on.  The gene is left untouched when it has no
 * "Name" attribute, when its sequence id does not match "Chr<number>", or
 * when CpGIOverlap_stream_find_gene_overlap() finds nothing.
 *
 * ns   the overlap stream (cast with CpGIOverlap_stream_cast())
 * gn   receives the fetched node, or NULL when the input stream delivered
 *      no node
 * err  error object passed through to the input stream
 *
 * Returns 0 on success, otherwise the non-zero status reported by the
 * input stream.
 */
static int CpGIOverlap_stream_next(GtNodeStream * ns,
                                   GtGenomeNode ** gn,
                                   GtError * err)
{
    GtGenomeNode * cur_node = NULL, * next_node;
    GtFeatureNodeIterator * iter;
    CpGIOverlap_stream * context;
    const char * gene_name = NULL;
    const char * overlap_name = NULL;
    int err_num;
    int chr_num;
    unsigned int TSS;

    *gn = NULL;
    context = CpGIOverlap_stream_cast(ns);

    /* propagate upstream failures instead of reporting success */
    err_num = gt_node_stream_next(context->in_stream, &cur_node, err);
    if (err_num || cur_node == NULL)
        return err_num;

    *gn = cur_node;

    /* only feature nodes carry a type we can test */
    if (!gt_genome_node_try_cast(gt_feature_node_class(), cur_node))
        return 0;

    /* a pseudo node is only a container: look for the first gene inside */
    if (gt_feature_node_is_pseudo(cur_node))
    {
        iter = gt_feature_node_iterator_new(cur_node);
        if (iter == NULL)
            return 0;

        do
        {
            next_node = gt_feature_node_iterator_next(iter);
        } while (next_node != NULL
                 && !gt_feature_node_has_type(next_node, feature_type_gene));
        gt_feature_node_iterator_delete(iter);

        /* no gene below the pseudo node: nothing to annotate */
        if (next_node == NULL)
            return 0;
        cur_node = next_node;
    }

    if (!gt_feature_node_has_type(cur_node, feature_type_gene))
        return 0;

    /* genes without a name are skipped */
    gene_name = gt_feature_node_get_attribute(cur_node, "Name");
    if (gene_name == NULL)
        return 0;

    /* the chromosome number is taken from a sequence id of the form ChrN */
    if (1 != sscanf(gt_str_get(gt_genome_node_get_seqid(cur_node)),
                    "Chr%d", &chr_num))
        return 0;

    /* TSS: start coordinate on the forward strand, end coordinate otherwise */
    if (gt_feature_node_get_strand(cur_node) == GT_STRAND_FORWARD)
        TSS = gt_genome_node_get_start(cur_node);
    else
        TSS = gt_genome_node_get_end(cur_node);

    /* now figure out what overlaps that position */
    overlap_name = CpGIOverlap_stream_find_gene_overlap(context, TSS, chr_num);
    if (!overlap_name)
        return 0;

    /* record the overlap name on the gene */
    gt_feature_node_set_attribute(cur_node, "cpgi_at_tss", overlap_name);

    return 0;
}