static int check_boundaries_visitor_check_rec(GtFeatureNode *parent,
                                              GtFeatureNode *child,
                                              GtError *err)
{
  GtFeatureNodeIterator *fni;
  GtFeatureNode *node;
  GtRange range,
          p_range;
  int had_err = 0;

  range = gt_genome_node_get_range((GtGenomeNode*) child);
  p_range = gt_genome_node_get_range((GtGenomeNode*) parent);

  if (range.start < p_range.start || range.end > p_range.end) {
    gt_warning("%s child range " GT_WU "-" GT_WU " (file %s, line %u) not "
               "contained in %s parent range " GT_WU "-" GT_WU " (file %s, "
               "line %u)",
               gt_feature_node_get_type(child),
               range.start, range.end,
               gt_genome_node_get_filename((GtGenomeNode*) child),
               gt_genome_node_get_line_number((GtGenomeNode*) child),
               gt_feature_node_get_type(parent),
               p_range.start, p_range.end,
               gt_genome_node_get_filename((GtGenomeNode*) parent),
               gt_genome_node_get_line_number((GtGenomeNode*) parent));
  }

  fni = gt_feature_node_iterator_new_direct(child);
  while ((node = gt_feature_node_iterator_next(fni))) {
    had_err = check_boundaries_visitor_check_rec(child, node, err);
  }
  gt_feature_node_iterator_delete(fni);

  return had_err;
}
static int gt_ltr_input_check_visitor_feature_node(GtNodeVisitor *nv,
                                                   GtFeatureNode *fn,
                                                   GtError *err)
{
  GT_UNUSED GtLTRInputCheckVisitor *lv;
  GtFeatureNodeIterator *fni;
  bool seen_left = false;
  GtFeatureNode *curnode = NULL,
                *ltr_retrotrans = NULL,
                *lltr = NULL,
                *rltr = NULL;
  int had_err = 0;
  lv = gt_ltr_input_check_visitor_cast(nv);
  gt_assert(lv);
  gt_error_check(err);

  /* traverse annotation subgraph and find LTR components */
  fni = gt_feature_node_iterator_new(fn);
  while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) {
    if (strcmp(gt_feature_node_get_type(curnode),
               gt_ft_LTR_retrotransposon) == 0) {
      ltr_retrotrans = curnode;
    }
    if (strcmp(gt_feature_node_get_type(curnode),
               gt_ft_long_terminal_repeat) == 0) {
      if (seen_left)
        rltr = curnode;
      else {
        lltr = curnode;
        seen_left = true;
      }
    }
  }
  gt_feature_node_iterator_delete(fni);

  if (lv->only_ltrs) {
    if (!had_err && !ltr_retrotrans) {
      gt_error_set(err, "connected component with %s entry node (%s, line %u) "
                        "does not contain a '%s' node, which is required",
                   gt_feature_node_get_type(fn),
                   gt_genome_node_get_filename((GtGenomeNode*) fn),
                   gt_genome_node_get_line_number((GtGenomeNode*) fn),
                   gt_ft_LTR_retrotransposon);
      had_err = -1;
    }
  }

  if (!had_err && ltr_retrotrans && (!lltr || !rltr)) {
    gt_error_set(err, "LTR_retrotransposon feature (%s, line %u) "
                      "does not contain two %s child features, both of which "
                      "are required",
                 gt_genome_node_get_filename((GtGenomeNode*) ltr_retrotrans),
                 gt_genome_node_get_line_number((GtGenomeNode*) ltr_retrotrans),
                 gt_ft_long_terminal_repeat);
    had_err = -1;
  }

  return had_err;
}
Esempio n. 3
0
static void default_track_selector(GtBlock *block, GtStr *result,
                                   GT_UNUSED void *data)
{
  GtGenomeNode *top;
  char *basename;
  gt_assert(block && result);
  gt_str_reset(result);
  top = (GtGenomeNode*) gt_block_get_top_level_feature(block);
  /* we take the basename of the filename to have nicer output in the
     generated graphic. this might lead to ``collapsed'' tracks, if two files
     with different paths have the same basename. */
  basename = gt_basename(gt_genome_node_get_filename(top));
  gt_str_append_cstr(result, basename);
  gt_free(basename);
  gt_str_append_char(result, GT_FILENAME_TYPE_SEPARATOR);
  gt_str_append_cstr(result, gt_block_get_type(block));
}
Esempio n. 4
0
static int gtf_show_feature_node(GtFeatureNode *fn, void *data, GtError *err)
{
  GtGTFVisitor *gtf_visitor = (GtGTFVisitor*) data;
  int had_err = 0;
  if (gt_feature_node_has_type(fn, gt_ft_gene)) {
      gtf_visitor->gene_id++;
      gtf_visitor->transcript_id = 0;
      had_err = gtf_show_transcript(fn, gtf_visitor, err);
  }
  else if (gt_feature_node_has_type(fn, gt_ft_mRNA)) {
    had_err = gtf_show_transcript(fn, gtf_visitor, err);
  }
  else if (!(gt_feature_node_has_type(fn, gt_ft_CDS) ||
             gt_feature_node_has_type(fn, gt_ft_exon))) {
      gt_warning("skipping GFF3 feature of type \"%s\" (from line %u in file "
                 "\"%s\")",
                 gt_feature_node_get_type(fn),
                 gt_genome_node_get_line_number((GtGenomeNode*) fn),
                 gt_genome_node_get_filename((GtGenomeNode*) fn));
  }
  return had_err;
}
Esempio n. 5
0
static int buffer_is_sorted(void **elem, void *info, GtError *err)
{
  GtGenomeNode *current_node, **last_node;

  gt_error_check(err);
  gt_assert(elem && info);

  current_node = *(GtGenomeNode**) elem,
  last_node = info;

  if (*last_node && gt_genome_node_compare(last_node, &current_node) > 0) {
    gt_assert(*last_node);
    gt_error_set(err, "the file %s is not sorted (example: line %u and %u)",
              gt_genome_node_get_filename(*last_node),
              gt_genome_node_get_line_number(*last_node),
              gt_genome_node_get_line_number(current_node));
    return -1;
  }
  else
    *last_node = current_node;
  return 0;
}
static int add_ids_visitor_region_node(GtNodeVisitor *nv, GtRegionNode *rn,
                                       GT_UNUSED GtError *err)
{
  GtAddIDsVisitor *aiv;
  const char *seqid;
  int had_err = 0;
  gt_error_check(err);
  aiv = add_ids_visitor_cast(nv);
  seqid = gt_str_get(gt_genome_node_get_seqid((GtGenomeNode*) rn));
  if (gt_hashmap_get(aiv->undefined_sequence_regions, seqid)) {
    gt_error_set(err, "genome feature with id \"%s\" has been defined before "
                 "the corresponding \"%s\" definition on line %u in file "
                 "\"%s\"", seqid, GT_GFF_SEQUENCE_REGION,
                 gt_genome_node_get_line_number((GtGenomeNode*) rn),
                 gt_genome_node_get_filename((GtGenomeNode*) rn));
    had_err = -1;
  }
  if (!had_err) {
    if (!gt_cstr_table_get(aiv->defined_seqids, seqid))
      gt_cstr_table_add(aiv->defined_seqids, seqid);
    gt_queue_add(aiv->node_buffer, rn);
  }
  return had_err;
}
Esempio n. 7
0
static int check_cds_phases(GtArray *cds_features, GtCDSCheckVisitor *v,
                            bool is_multi, bool second_pass, GtError *err)
{
  GtPhase current_phase, correct_phase = GT_PHASE_ZERO;
  GtFeatureNode *fn;
  GtStrand strand;
  unsigned long i, current_length;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(cds_features);
  gt_assert(gt_array_size(cds_features));
  fn = *(GtFeatureNode**) gt_array_get_first(cds_features);
  strand = gt_feature_node_get_strand(fn);
  if (strand == GT_STRAND_REVERSE)
    gt_array_reverse(cds_features);
  for (i = 0; !had_err && i < gt_array_size(cds_features); i++) {
    fn = *(GtFeatureNode**) gt_array_get(cds_features, i);
    /* the first phase can be anything (except being undefined), because the
       GFF3 spec says:

       NOTE 4 - CDS features MUST have have a defined phase field. Otherwise it
       is not possible to infer the correct polypeptides corresponding to
       partially annotated genes. */
    if ((!i && gt_feature_node_get_phase(fn) == GT_PHASE_UNDEFINED) ||
        (i && gt_feature_node_get_phase(fn) != correct_phase)) {
      if (gt_hashmap_get(v->cds_features, fn)) {
        if (v->tidy && !is_multi && !gt_feature_node_has_children(fn)) {
          /* we can split the feature */
          gt_warning("%s feature on line %u in file \"%s\" has multiple "
                     "parents which require different phases; split feature",
                     gt_ft_CDS,
                     gt_genome_node_get_line_number((GtGenomeNode*) fn),
                     gt_genome_node_get_filename((GtGenomeNode*) fn));
          gt_hashmap_add(v->cds_features_to_split, fn, fn);
          v->splitting_is_necessary = true; /* split later */
        }
        else {
          gt_error_set(err, "%s feature on line %u in file \"%s\" has multiple "
                       "parents which require different phases",
                       gt_ft_CDS,
                       gt_genome_node_get_line_number((GtGenomeNode*) fn),
                       gt_genome_node_get_filename((GtGenomeNode*) fn));
          had_err = -1;
        }
      }
      else {
        if (v->tidy) {
          if (!second_pass) {
            gt_warning("%s feature on line %u in file \"%s\" has the wrong "
                       "phase %c -> correcting it to %c", gt_ft_CDS,
                       gt_genome_node_get_line_number((GtGenomeNode*) fn),
                       gt_genome_node_get_filename((GtGenomeNode*) fn),
                       GT_PHASE_CHARS[gt_feature_node_get_phase(fn)],
                       GT_PHASE_CHARS[correct_phase]);
          }
          gt_feature_node_set_phase(fn, correct_phase);
        }
        else {
          gt_error_set(err, "%s feature on line %u in file \"%s\" has the "
                       "wrong phase %c (should be %c)", gt_ft_CDS,
                       gt_genome_node_get_line_number((GtGenomeNode*) fn),
                       gt_genome_node_get_filename((GtGenomeNode*) fn),
                       GT_PHASE_CHARS[gt_feature_node_get_phase(fn)],
                       GT_PHASE_CHARS[correct_phase]);
          had_err = -1;
        }
      }
    }
    if (!had_err) {
      current_phase = gt_feature_node_get_phase(fn);
      current_length = gt_genome_node_get_length((GtGenomeNode*) fn);
      correct_phase = (3 - (current_length - current_phase) % 3) % 3;
      gt_hashmap_add(v->cds_features, fn, fn); /* record CDS feature */
    }
  }
  return had_err;
}
Esempio n. 8
0
static int genome_node_lua_get_filename(lua_State *L)
{
  GtGenomeNode **gn = check_genome_node(L, 1);
  lua_pushstring(L, gt_genome_node_get_filename(*gn));
  return 1;
}
static int add_ids_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                        GtError *err)
{
  AutomaticSequenceRegion *auto_sr;
  GtAddIDsVisitor *aiv;
  const char *seqid;
  bool is_circular;
  aiv = add_ids_visitor_cast(nv);
  seqid = gt_str_get(gt_genome_node_get_seqid((GtGenomeNode*) fn));
  if (aiv->ensure_sorting && !gt_cstr_table_get(aiv->defined_seqids, seqid)) {
    gt_error_set(err, "the file %s is not sorted (seqid \"%s\" on line %u has "
                 "not been previously introduced with a \"%s\" line)",
                 gt_genome_node_get_filename((GtGenomeNode*) fn), seqid,
                 gt_genome_node_get_line_number((GtGenomeNode*) fn),
                 GT_GFF_SEQUENCE_REGION);
    return -1;
  }
  if (!gt_cstr_table_get(aiv->defined_seqids, seqid)) {
    GtFeatureNodeIterator *fni;
    GtFeatureNode *node;
    GtRange range = gt_genome_node_get_range((GtGenomeNode*) fn);
    is_circular = gt_feature_node_get_attribute(fn, GT_GFF_IS_CIRCULAR)
                  ? true : false;
    if (!is_circular) {
      fni = gt_feature_node_iterator_new(fn);
      while ((node = gt_feature_node_iterator_next(fni))) {
        GtRange node_range = gt_genome_node_get_range((GtGenomeNode*) node);
        range = gt_range_join(&range, &node_range);
      }
      gt_feature_node_iterator_delete(fni);
    }
    /* sequence region has not been previously introduced -> check if one has
       already been created automatically */
    auto_sr = gt_hashmap_get(aiv->undefined_sequence_regions, seqid);
    if (!auto_sr) {
      GtStr *seqid_str;
      /* sequence region has not been createad automatically -> do it now */
      gt_warning("seqid \"%s\" on line %u in file \"%s\" has not been "
                 "previously introduced with a \"%s\" line, create such a line "
                 "automatically", seqid,
                 gt_genome_node_get_line_number((GtGenomeNode*) fn),
                 gt_genome_node_get_filename((GtGenomeNode*) fn),
                 GT_GFF_SEQUENCE_REGION);
      auto_sr = automatic_sequence_region_new(is_circular);
      seqid_str = gt_genome_node_get_seqid((GtGenomeNode*) fn);
      auto_sr->sequence_region = gt_region_node_new(seqid_str, range.start,
                                                               range.end);
      gt_hashmap_add(aiv->undefined_sequence_regions, gt_str_get(seqid_str),
                     auto_sr);
    }
    else {
      if (auto_sr->is_circular) {
        gt_assert(!is_circular); /* XXX */
      }
      else if (is_circular) {
        gt_assert(!auto_sr->is_circular); /* XXX */
        auto_sr->is_circular = true;
        gt_genome_node_set_range(auto_sr->sequence_region, &range);
      }
      else {
        GtRange joined_range,
                sr_range = gt_genome_node_get_range(auto_sr->sequence_region);
        /* update the range of the sequence region */
        joined_range = gt_range_join(&range, &sr_range);
        gt_genome_node_set_range(auto_sr->sequence_region, &joined_range);
      }
    }
    gt_array_add(auto_sr->feature_nodes, fn);
  }
  else
    gt_queue_add(aiv->node_buffer, fn);
  return 0;
}
Esempio n. 10
0
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
   /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* determine the range and the strand of the mRNA */
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);

  /* TODO: support discontinuous start/stop codons */
  for (i = 0; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    if (gt_feature_node_get_attribute((GtFeatureNode*) gn,
        GTF_PARSER_STOP_CODON_FLAG)) {
      GtUword j;
      GtRange stop_codon_rng = gt_genome_node_get_range(gn);
      bool found_cds = false;
      for (j = 0; !had_err && j < gt_array_size(gt_genome_node_array); j++) {
        GtGenomeNode* gn2;
        GtRange this_rng;
        const char *this_type;
        gn2 = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, j);
        if (gn == gn2) continue;
        this_rng = gt_genome_node_get_range(gn2);
        this_type = gt_feature_node_get_type((GtFeatureNode*) gn2);
        if (this_type == gt_symbol(gt_ft_CDS)) {
          if (gt_range_contains(&this_rng, &stop_codon_rng)) {
            if (cinfo->tidy) {
              gt_warning("stop codon on line %u in file %s is contained in "
                         "CDS in line %u",
                         gt_genome_node_get_line_number(gn),
                         gt_genome_node_get_filename(gn),
                         gt_genome_node_get_line_number(gn2));
              found_cds = true;
            } else {
              gt_error_set(err, "stop codon on line %u in file %s is "
                                "contained in CDS in line %u",
                           gt_genome_node_get_line_number(gn),
                           gt_genome_node_get_filename(gn),
                           gt_genome_node_get_line_number(gn2));
              had_err = -1;
            }
            break;
          }
          if (this_rng.end + 1 == stop_codon_rng.start) {
            this_rng.end = stop_codon_rng.end;
            gt_genome_node_set_range(gn2, &this_rng);
            found_cds = true;
            break;
          }
          if (this_rng.start == stop_codon_rng.end + 1) {
            this_rng.start = stop_codon_rng.start;
            gt_genome_node_set_range(gn2, &this_rng);
            found_cds = true;
            break;
          }
        }
      }
      if (!found_cds) {
        if (!had_err) {
          if (cinfo->tidy) {
            gt_warning("found stop codon on line %u in file %s with no "
                       "flanking CDS, ignoring it",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn));
          } else {
            gt_error_set(err, "found stop codon on line %u in file %s with no "
                              "flanking CDS",
                         gt_genome_node_get_line_number(gn),
                         gt_genome_node_get_filename(gn));
            had_err = -1;
            break;
          }
        }
      } else {
        gt_array_rem(gt_genome_node_array, i);
        gt_genome_node_delete(gn);
      }
    }
  }

  for (i = 1; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    GtStrand strand;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    if (strand != mRNA_strand) {
      gt_error_set(err, "feature %s on line %u has strand %c, but the "
                        "parent transcript has strand %c",
                   (const char*) key,
                   gt_genome_node_get_line_number(gn),
                   GT_STRAND_CHARS[strand],
                   GT_STRAND_CHARS[mRNA_strand]);
      had_err = -1;
      break;
    } else {
      mRNA_strand = gt_strand_join(mRNA_strand, strand);
    }
    if (!had_err && gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                "genomic sequences (``seqname''), although they have the same "
                "gene IDs (``gene_id'') which must be globally unique",
                gt_genome_node_get_line_number(first_node),
                gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "ID", key);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "transcript_id",
                                  key);

    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                              (const char*) key)) && strlen(tname) > 0) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                      tname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gt_genome_node_ref(gn));
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}