示例#1
0
static int gff3_visitor_region_node(GtNodeVisitor *nv, GtRegionNode *rn,
                                    GT_UNUSED GtError *err)
{
  GtGFF3Visitor *gff3_visitor;
  gt_error_check(err);
  gff3_visitor = gff3_visitor_cast(nv);
  gt_assert(nv && rn);
  gff3_version_string(nv);
  if (!gff3_visitor->outstr) {
    gt_file_xprintf(gff3_visitor->outfp, "%s   %s "GT_WU" "GT_WU"\n",
                    GT_GFF_SEQUENCE_REGION,
                    gt_str_get(gt_genome_node_get_seqid((GtGenomeNode*) rn)),
                    gt_genome_node_get_start((GtGenomeNode*) rn),
                    gt_genome_node_get_end((GtGenomeNode*) rn));
  } else {
    gt_str_append_cstr(gff3_visitor->outstr, GT_GFF_SEQUENCE_REGION);
    gt_str_append_cstr(gff3_visitor->outstr, "   ");
    gt_str_append_cstr(gff3_visitor->outstr,
                      gt_str_get(gt_genome_node_get_seqid((GtGenomeNode*) rn)));
    gt_str_append_char(gff3_visitor->outstr, ' ');
    gt_str_append_ulong(gff3_visitor->outstr,
                                  gt_genome_node_get_start((GtGenomeNode*) rn));
    gt_str_append_char(gff3_visitor->outstr, ' ');
    gt_str_append_ulong(gff3_visitor->outstr,
                                  gt_genome_node_get_end((GtGenomeNode*) rn));
    gt_str_append_char(gff3_visitor->outstr, '\n');
  }
  return 0;
}
示例#2
0
static int gt_sort_stream_next(GtNodeStream *ns, GtGenomeNode **gn,
                               GtError *err)
{
  GtSortStream *sort_stream;
  GtGenomeNode *node, *eofn;
  int had_err = 0;
  gt_error_check(err);
  sort_stream = gt_sort_stream_cast(ns);

  if (!sort_stream->sorted) {
    while (!(had_err = gt_node_stream_next(sort_stream->in_stream, &node,
                                           err)) && node) {
      if ((eofn = gt_eof_node_try_cast(node)))
        gt_genome_node_delete(eofn); /* get rid of EOF nodes */
      else
        gt_array_add(sort_stream->nodes, node);
    }
    if (!had_err) {
      gt_genome_nodes_sort_stable(sort_stream->nodes);
      sort_stream->sorted = true;
    }
  }

  if (!had_err) {
    gt_assert(sort_stream->sorted);
    if (sort_stream->idx < gt_array_size(sort_stream->nodes)) {
      *gn = *(GtGenomeNode**) gt_array_get(sort_stream->nodes,
                                           sort_stream->idx);
      sort_stream->idx++;
      /* join region nodes with the same sequence ID */
      if (gt_region_node_try_cast(*gn)) {
        GtRange range_a, range_b;
        while (sort_stream->idx < gt_array_size(sort_stream->nodes)) {
          node = *(GtGenomeNode**) gt_array_get(sort_stream->nodes,
                                                sort_stream->idx);
          if (!gt_region_node_try_cast(node) ||
              gt_str_cmp(gt_genome_node_get_seqid(*gn),
                         gt_genome_node_get_seqid(node))) {
            /* the next node is not a region node with the same ID */
            break;
          }
          range_a = gt_genome_node_get_range(*gn);
          range_b = gt_genome_node_get_range(node);
          range_a = gt_range_join(&range_a, &range_b);
          gt_genome_node_set_range(*gn, &range_a);
          gt_genome_node_delete(node);
          sort_stream->idx++;
        }
      }
      return 0;
    }
  }

  if (!had_err) {
    gt_array_reset(sort_stream->nodes);
    *gn = NULL;
  }

  return had_err;
}
示例#3
0
void gt_region_node_consolidate(GtRegionNode *rn_a, GtRegionNode *rn_b)
{
  GtRange range_a, range_b;
  gt_assert(rn_a);
  gt_assert(rn_b);
  gt_assert(!gt_str_cmp(gt_genome_node_get_seqid((GtGenomeNode*) rn_a),
                        gt_genome_node_get_seqid((GtGenomeNode*) rn_b)));
  range_a = gt_genome_node_get_range((GtGenomeNode*) rn_a);
  range_b = gt_genome_node_get_range((GtGenomeNode*) rn_b);
  range_a = gt_range_join(&range_a, &range_b);
  gt_genome_node_set_range((GtGenomeNode*) rn_a, &range_a);
}
示例#4
0
static int pdom_hit_attach_gff3(GtPdomModel *model, GtPdomModelHit *hit,
                                void *data, GT_UNUSED GtError *err)
{
  unsigned long i;
  GtRange rng;
  GtLTRdigestStream *ls = (GtLTRdigestStream *) data;
  GtStrand strand;
  gt_assert(model && hit);

  strand = gt_pdom_model_hit_get_best_strand(hit);
  /* do not use the hits on the non-predicted strand
      -- maybe identify nested elements ? */
  if (strand != gt_feature_node_get_strand(ls->element.mainnode))
    return 0;

  for (i=0;i<gt_pdom_model_hit_best_chain_length(hit);i++)
  {
    GtGenomeNode *gf;
    GtStr *alignmentstring,
          *aastring;
    GtPdomSingleHit *singlehit;
    GtPhase frame;

    singlehit = gt_pdom_model_hit_best_single_hit(hit, i);
    alignmentstring = gt_str_new();
    aastring = gt_str_new();
    frame = gt_pdom_single_hit_get_phase(singlehit);
    rng = gt_pdom_single_hit_get_range(singlehit);

    gt_pdom_single_hit_format_alignment(singlehit, GT_ALIWIDTH,
                                        alignmentstring);
    gt_pdom_single_hit_get_aaseq(singlehit, aastring);

    rng.start++; rng.end++;  /* GFF3 is 1-based */
    gf = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                      ls->element.mainnode),
                             GT_PDOM_TYPE,
                             rng.start,
                             rng.end,
                             strand);
    gt_genome_node_add_user_data((GtGenomeNode*) gf, "pdom_alignment",
                                 alignmentstring, (GtFree) gt_str_delete);
    gt_genome_node_add_user_data((GtGenomeNode*) gf, "pdom_aaseq",
                                 aastring, (GtFree) gt_str_delete);
    gt_feature_node_set_source((GtFeatureNode*) gf, ls->ltrdigest_tag);
    gt_feature_node_set_score((GtFeatureNode*) gf,
                              gt_pdom_single_hit_get_evalue(singlehit));
    gt_feature_node_set_phase((GtFeatureNode*) gf, frame);
    if (gt_pdom_model_get_name(model)) {
      gt_feature_node_add_attribute((GtFeatureNode*) gf, "name",
                                    gt_pdom_model_get_name(model));
    }
    if (gt_pdom_model_get_acc(model)) {
      gt_feature_node_add_attribute((GtFeatureNode*) gf, "id",
                                    gt_pdom_model_get_acc(model));
    }
    gt_feature_node_add_child(ls->element.mainnode, (GtFeatureNode*) gf);
  }
  return 0;
}
static int md5_to_seqid(GtGenomeNode *gn, GtRegionMapping *region_mapping,
                        GtError *err)
{
  GtStr *seqid;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(gn && region_mapping);
  seqid = gt_genome_node_get_seqid(gn);
  if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
    /* seqid is a MD5 seqid -> change id */
    GtStr *desc = gt_str_new();
    had_err = gt_region_mapping_get_description(region_mapping, desc, seqid,
                                                err);
    if (!had_err) {
      GtStr *new_seqid = gt_str_new();
      gt_regular_seqid_save(new_seqid, desc);
      if (gt_feature_node_try_cast(gn)) {
        M2IChangeSeqidInfo info;
        info.new_seqid = new_seqid;
        info.region_mapping = region_mapping;
        had_err = gt_feature_node_traverse_children((GtFeatureNode*) gn, &info,
                                                    m2i_change_seqid, true,
                                                    err);
      }
      else
        gt_genome_node_change_seqid(gn, new_seqid);
      gt_str_delete(new_seqid);
    }
    gt_str_delete(desc);
  }
  return had_err;
}
static int gt_regioncov_visitor_feature_node(GtNodeVisitor *nv,
                                             GtFeatureNode *fn,
                                             GT_UNUSED GtError *err)
{
  GtRange *old_range_ptr, old_range, new_range;
  GtArray *ranges;
  GtRegionCovVisitor *regioncov_visitor;
  gt_error_check(err);
  regioncov_visitor = gt_regioncov_visitor_cast(nv);
  ranges = gt_hashmap_get(regioncov_visitor->region2rangelist,
                       gt_str_get(gt_genome_node_get_seqid((GtGenomeNode*)
                                                           fn)));
  gt_assert(ranges);
  new_range = gt_genome_node_get_range((GtGenomeNode*) fn);
  if (!gt_array_size(ranges))
    gt_array_add(ranges, new_range);
  else {
    old_range_ptr = gt_array_get_last(ranges);
    old_range = *old_range_ptr;
    old_range.end += regioncov_visitor->max_feature_dist;
    if (gt_range_overlap(&old_range, &new_range)) {
      old_range_ptr->end = MAX(old_range_ptr->end, new_range.end);
    }
    else
      gt_array_add(ranges, new_range);
  }
  return 0;
}
示例#7
0
static int select_visitor_region_node(GtNodeVisitor *nv, GtRegionNode *rn,
                                      GT_UNUSED GtError *err)
{
  GtSelectVisitor *select_visitor;
  gt_error_check(err);
  select_visitor = select_visitor_cast(nv);
  if (!gt_str_length(select_visitor->seqid) || /* no seqid was specified */
      !gt_str_cmp(select_visitor->seqid,       /* or seqids are equal */
               gt_genome_node_get_seqid((GtGenomeNode*) rn))) {
    if (select_visitor->contain_range.start != GT_UNDEF_ULONG) {
      GtRange range = gt_genome_node_get_range((GtGenomeNode*) rn);
      if (gt_range_overlap(&range, &select_visitor->contain_range)) {
        /* an overlapping contain range was defined -> update range  */
        range.start = MAX(range.start, select_visitor->contain_range.start);
        range.end = MIN(range.end, select_visitor->contain_range.end);
        gt_genome_node_set_range((GtGenomeNode*) rn, &range);
        gt_queue_add(select_visitor->node_buffer, rn);
      }
      else /* contain range does not overlap with <rn> range -> delete <rn> */
        gt_genome_node_delete((GtGenomeNode*) rn);
    }
    else
      gt_queue_add(select_visitor->node_buffer, rn);
  }
  else
    gt_genome_node_delete((GtGenomeNode*) rn);
  return 0;
}
static int feature_index_lua_add_feature_node(lua_State *L)
{
  GtFeatureIndex **fi;
  GtGenomeNode **gn;
  GtFeatureNode *fn;
  GtStr *seqid;
  GtError *err;
  bool has_seqid;
  gt_assert(L);
  fi = check_feature_index(L, 1);
  gn = check_genome_node(L, 2);
  fn = gt_feature_node_cast(*gn);
  luaL_argcheck(L, fn, 2, "not a feature node");
  seqid = gt_genome_node_get_seqid(*gn);
  luaL_argcheck(L, seqid, 2, "feature does not have a sequence id");
  err = gt_error_new();
  if (gt_feature_index_has_seqid(*fi, &has_seqid, gt_str_get(seqid), err))
    return gt_lua_error(L, err);
  gt_error_delete(err);
  luaL_argcheck(L, has_seqid, 2,
                "feature index does not contain corresponding sequence region");
  err = gt_error_new();
  if (gt_feature_index_add_feature_node(*fi, fn, err))
    return gt_lua_error(L, err);
  gt_error_delete(err);
  return 0;
}
示例#9
0
double agn_calc_edit_distance(GtFeatureNode *t1, GtFeatureNode *t2)
{
  AgnTranscriptClique *clique1 = agn_transcript_clique_new();
  agn_transcript_clique_add(clique1, t1);
  AgnTranscriptClique *clique2 = agn_transcript_clique_new();
  agn_transcript_clique_add(clique2, t2);

  GtGenomeNode *gn1 = (GtGenomeNode *)t1;
  GtRange r1 = gt_genome_node_get_range(gn1);
  GtStr *seqid = gt_genome_node_get_seqid(gn1);
  GtGenomeNode *gn2 = (GtGenomeNode *)t2;
  GtRange r2 = gt_genome_node_get_range(gn2);
  GtRange local_range = r1;
  if(r2.start < r1.start)
    local_range.start = r2.start;
  if(r2.end > r1.end)
    local_range.end = r2.end;

  AgnCliquePair *pair = agn_clique_pair_new(gt_str_get(seqid), clique1, clique2,
                                            &local_range);
  agn_clique_pair_build_model_vectors(pair);
  agn_clique_pair_comparative_analysis(pair);

  double ed = agn_clique_pair_get_edit_distance(pair);

  agn_transcript_clique_delete(clique1);
  agn_transcript_clique_delete(clique2);
  agn_clique_pair_delete(pair);

  return ed;
}
示例#10
0
void gt_gff3_output_leading_str(GtFeatureNode *fn, GtStr *outstr)
{
  GtGenomeNode *gn;
  gt_assert(fn && outstr);
  gn = (GtGenomeNode*) fn;
  gt_str_append_str(outstr, gt_genome_node_get_seqid(gn));
  gt_str_append_char(outstr, '\t');
  gt_str_append_cstr(outstr, gt_feature_node_get_source(fn));
  gt_str_append_char(outstr, '\t');
  gt_str_append_cstr(outstr, gt_feature_node_get_type(fn));
  gt_str_append_char(outstr, '\t');
  gt_str_append_uword(outstr, gt_genome_node_get_start(gn));
  gt_str_append_char(outstr, '\t');
  gt_str_append_uword(outstr, gt_genome_node_get_end(gn));
  gt_str_append_char(outstr, '\t');
  if (gt_feature_node_score_is_defined(fn)) {
    char buf[BUFSIZ];
    (void) snprintf(buf, BUFSIZ, "%.3g", gt_feature_node_get_score(fn));
    gt_str_append_cstr(outstr, buf);
  } else
    gt_str_append_char(outstr, '.');
  gt_str_append_char(outstr, '\t');
  gt_str_append_char(outstr, GT_STRAND_CHARS[gt_feature_node_get_strand(fn)]);
  gt_str_append_char(outstr, '\t');
  gt_str_append_char(outstr, GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]);
  gt_str_append_char(outstr, '\t');
}
示例#11
0
static int gt_ltrdigest_pdom_visitor_attach_hit(GtLTRdigestPdomVisitor *lv,
                                                GtHMMERModelHit *modelhit,
                                                GtHMMERSingleHit *singlehit)
{
  GT_UNUSED GtUword i;
  GtGenomeNode *gf;
  int had_err = 0;
  GtRange rrng;
  gt_assert(lv && singlehit);

  rrng = gt_ltrdigest_pdom_visitor_coords(lv, singlehit);

  if (gt_array_size(singlehit->chains) > 0 || lv->output_all_chains) {
    char buf[32];
    gf = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                      lv->ltr_retrotrans),
                             gt_ft_protein_match,
                             rrng.start,
                             rrng.end,
                             singlehit->strand);
    gt_genome_node_add_user_data((GtGenomeNode*) gf, "pdom_alignment",
                                 gt_str_ref(singlehit->alignment),
                                 (GtFree) gt_str_delete);
    gt_genome_node_add_user_data((GtGenomeNode*) gf, "pdom_aaseq",
                                 gt_str_ref(singlehit->aastring),
                                 (GtFree) gt_str_delete);
    gt_feature_node_set_source((GtFeatureNode*) gf, lv->tag);
    gt_feature_node_set_score((GtFeatureNode*) gf, (float) singlehit->evalue);
    (void) snprintf(buf, (size_t) 32, "%d", (int) singlehit->frame);
    gt_feature_node_add_attribute((GtFeatureNode*) gf,
                                    "reading_frame", buf);
    if (modelhit->modelname != NULL) {
      gt_feature_node_add_attribute((GtFeatureNode*) gf, "name",
                                    modelhit->modelname);
    }
    if (gt_array_size(singlehit->chains) > 1UL && lv->output_all_chains) {
      GtStr *buffer;
      GtUword j;
      gt_assert(singlehit->chains != NULL);
      buffer = gt_str_new();
      for (j = 0UL; j < gt_array_size(singlehit->chains); j++) {
        gt_str_append_cstr(buffer, modelhit->modelname);
        gt_str_append_char(buffer, ':');
        gt_str_append_ulong(buffer,
                          *(GtUword*) gt_array_get(singlehit->chains, j));
        if (j != gt_array_size(singlehit->chains) - 1) {
          gt_str_append_char(buffer, ',');
        }
      }
      gt_feature_node_set_attribute((GtFeatureNode*) gf, "chains",
                                    gt_str_get(buffer));
      gt_str_delete(buffer);
    }
    gt_feature_node_add_child(lv->ltr_retrotrans, (GtFeatureNode*) gf);
  }
  gt_array_delete(singlehit->chains);
  singlehit->chains = NULL;
  return had_err;
}
static void build_key(GtStr *key, GtFeatureNode *feature, GtStr *target_id)
{
  gt_assert(key && feature && target_id);
  gt_str_reset(key);
  gt_str_append_str(key, gt_genome_node_get_seqid((GtGenomeNode*) feature));
  gt_str_append_char(key, '\t'); /* cannot occur in seqid or target_id */
  gt_str_append_str(key, target_id);
}
示例#13
0
static int genome_node_lua_get_seqid(lua_State *L)
{
  GtStr *seqid;
  GtGenomeNode **gn = check_genome_node(L, 1);
  if ((seqid = gt_genome_node_get_seqid(*gn)))
    lua_pushstring(L, gt_str_get(seqid));
  else
    lua_pushnil(L);
  return 1;
}
示例#14
0
static void infer_cds_visitor_check_stop(AgnInferCDSVisitor *v)
{
  if(gt_array_size(v->cds) == 0)
    return;

  const char *mrnaid = gt_feature_node_get_attribute(v->mrna, "ID");
  unsigned int ln = gt_genome_node_get_line_number((GtGenomeNode *)v->mrna);
  GtStrand strand = gt_feature_node_get_strand(v->mrna);

  GtRange stoprange;
  GtUword threeprimeindex = gt_array_size(v->cds) - 1;
  GtGenomeNode **threeprimesegment = gt_array_get(v->cds, threeprimeindex);
  stoprange = gt_genome_node_get_range(*threeprimesegment);
  stoprange.start = stoprange.end - 2;
  if(strand == GT_STRAND_REVERSE)
  {
    threeprimesegment = gt_array_get(v->cds, 0);
    stoprange = gt_genome_node_get_range(*threeprimesegment);
    stoprange.end = stoprange.start + 2;
  }

  if(gt_array_size(v->stops) > 1)
  {
    gt_logger_log(v->logger, "mRNA '%s' (line %u) has %lu stop codons", mrnaid,
                  ln, gt_array_size(v->starts));
  }
  else if(gt_array_size(v->stops) == 1)
  {
    GtGenomeNode **codon = gt_array_get(v->stops, 0);
    GtRange testrange = gt_genome_node_get_range(*codon);
    if(gt_range_compare(&stoprange, &testrange) != 0)
    {
      gt_logger_log(v->logger, "stop codon inferred from CDS [%lu, %lu] does "
                    "not match explicitly provided stop codon [%lu, %lu] for "
                    "mRNA '%s'", stoprange.start, stoprange.end,
                    testrange.start, testrange.end, mrnaid);
    }
  }
  else // agn_assert(gt_array_size(v->stops) == 0)
  {
    GtStr *seqid = gt_genome_node_get_seqid((GtGenomeNode *)v->mrna);
    GtGenomeNode *codonfeature = gt_feature_node_new(seqid, "stop_codon",
                                                     stoprange.start,
                                                     stoprange.end,
                                                     strand);
    if(v->source)
      gt_feature_node_set_source((GtFeatureNode *)codonfeature, v->source);
    GtFeatureNode *cf = (GtFeatureNode *)codonfeature;
    gt_feature_node_add_child(v->mrna, cf);
    gt_array_add(v->stops, cf);
  }
}
示例#15
0
static void infer_cds_visitor_infer_cds(AgnInferCDSVisitor *v)
{
  GtFeatureNode **start_codon = NULL, **stop_codon = NULL;

  bool exonsexplicit    = gt_array_size(v->exons) > 0;
  bool startcodon_check = gt_array_size(v->starts) == 1 &&
                          (start_codon = gt_array_get(v->starts, 0)) != NULL;
  bool stopcodon_check  = gt_array_size(v->stops)  == 1 &&
                          (stop_codon  = gt_array_get(v->stops,  0)) != NULL;

  if(gt_array_size(v->cds) > 0)
  {
    return;
  }
  else if(!exonsexplicit || !startcodon_check || !stopcodon_check)
  {
    return;
  }

  GtRange left_codon_range, right_codon_range;
  left_codon_range  = gt_genome_node_get_range(*(GtGenomeNode **)start_codon);
  right_codon_range = gt_genome_node_get_range(*(GtGenomeNode **)stop_codon);
  if(gt_feature_node_get_strand(v->mrna) == GT_STRAND_REVERSE)
  {
    left_codon_range  = gt_genome_node_get_range(*(GtGenomeNode **)stop_codon);
    right_codon_range = gt_genome_node_get_range(*(GtGenomeNode **)start_codon);
  }
  GtUword i;
  for(i = 0; i < gt_array_size(v->exons); i++)
  {
    GtFeatureNode *exon = *(GtFeatureNode **)gt_array_get(v->exons, i);
    GtGenomeNode *exon_gn = (GtGenomeNode *)exon;
    GtRange exon_range = gt_genome_node_get_range(exon_gn);
    GtStrand exon_strand = gt_feature_node_get_strand(exon);

    GtRange cdsrange;
    bool exon_includes_cds = infer_cds_visitor_infer_range(&exon_range,
                                                           &left_codon_range,
                                                           &right_codon_range,
                                                           &cdsrange);
    if(exon_includes_cds)
    {
      GtGenomeNode *cdsfeat;
      cdsfeat = gt_feature_node_new(gt_genome_node_get_seqid(exon_gn), "CDS",
                                    cdsrange.start, cdsrange.end, exon_strand);
      if(v->source)
        gt_feature_node_set_source((GtFeatureNode *)cdsfeat, v->source);
      gt_feature_node_add_child(v->mrna, (GtFeatureNode *)cdsfeat);
      gt_array_add(v->cds, cdsfeat);
    }
  }
}
示例#16
0
static int create_block_features(GtBEDParser *bed_parser, GtFeatureNode *fn,
                                 GtUword block_count,
                                 GtSplitter *size_splitter,
                                 GtSplitter *start_splitter, GtIO *bed_file,
                                 GtError *err)
{
  GtUword i;
  int had_err = 0;
  gt_assert(fn && block_count && size_splitter && start_splitter);
  gt_assert(gt_splitter_size(size_splitter) == block_count);
  gt_assert(gt_splitter_size(start_splitter) == block_count);
  for (i = 0; !had_err && i < block_count; i++) {
    GtUword block_size, block_start, start, end;
    GtGenomeNode *block;
    const char *name;
    if (gt_parse_uword(&block_size, gt_splitter_get_token(size_splitter, i))) {
      gt_error_set(err,
                   "file \"%s\": line "GT_WU": could not parse blockSize '%s'",
                   gt_io_get_filename(bed_file),
                   gt_io_get_line_number(bed_file),
                   gt_splitter_get_token(size_splitter, i));
      had_err = -1;
    }
    if (!had_err && gt_parse_uword(&block_start,
                                   gt_splitter_get_token(start_splitter, i))) {
      gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockStart "
                   "'%s'", gt_io_get_filename(bed_file),
                   gt_io_get_line_number(bed_file),
                   gt_splitter_get_token(start_splitter, i));
      had_err = -1;
    }
    if (!had_err) {
      start = gt_genome_node_get_start((GtGenomeNode*) fn) + block_start;
      end = start + block_size - 1;
      block = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) fn),
                                  bed_parser->block_type
                                  ? bed_parser->block_type
                                  : BED_BLOCK_TYPE,
                                  start, end, gt_feature_node_get_strand(fn));
      if ((name = gt_feature_node_get_attribute(fn, GT_GFF_NAME))) {
        gt_feature_node_add_attribute((GtFeatureNode*) block, GT_GFF_NAME,
                                      name);
      }
      gt_feature_node_set_score((GtFeatureNode*) block,
                                gt_feature_node_get_score(fn));
      gt_feature_node_set_strand((GtFeatureNode*) block,
                                 gt_feature_node_get_strand(fn));
      gt_feature_node_add_child(fn, (GtFeatureNode*) block);
    }
  }
  return had_err;
}
示例#17
0
static void pbs_attach_results_to_gff3(GtPBSResults *results,
                                       GtLTRElement *element,
                                       GtStrand *canonical_strand,
                                       GtStr *tag)
{
  GtRange pbs_range;
  GtGenomeNode *gf;
  unsigned long i = 0;
  char buffer[BUFSIZ];
  GtPBSHit* hit = gt_pbs_results_get_ranked_hit(results, i++);
  if (*canonical_strand == GT_STRAND_UNKNOWN)
    *canonical_strand = gt_pbs_hit_get_strand(hit);
  else
  {
    /* do we have to satisfy a strand constraint?
     * then find best-scoring PBS on the given canonical strand */
    while (gt_pbs_hit_get_strand(hit) != *canonical_strand
             && i < gt_pbs_results_get_number_of_hits(results))
    {
      gt_log_log("dropping PBS because of nonconsistent strand: %s\n",
                 gt_feature_node_get_attribute(element->mainnode, "ID"));
      hit = gt_pbs_results_get_ranked_hit(results, i++);
    }
    /* if there is none, do not report a PBS */
    if (gt_pbs_hit_get_strand(hit) != *canonical_strand)
      return;
  }
  pbs_range = gt_pbs_hit_get_coords(hit);
  pbs_range.start++; pbs_range.end++;  /* GFF3 is 1-based */
  gf = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                    element->mainnode),
                           GT_PBS_TYPE,
                           pbs_range.start,
                           pbs_range.end,
                           gt_pbs_hit_get_strand(hit));
  gt_feature_node_set_source((GtFeatureNode*) gf, tag);
  gt_feature_node_set_score((GtFeatureNode*) gf,
                            (float) gt_pbs_hit_get_score(hit));
  if (gt_pbs_hit_get_trna(hit) != NULL) {
    gt_feature_node_add_attribute((GtFeatureNode*) gf, "trna",
                                   gt_pbs_hit_get_trna(hit));
  }
  gt_feature_node_set_strand(element->mainnode, gt_pbs_hit_get_strand(hit));
  (void) snprintf(buffer, BUFSIZ-1, "%lu", gt_pbs_hit_get_tstart(hit));
  gt_feature_node_add_attribute((GtFeatureNode*) gf, "trnaoffset", buffer);
  (void) snprintf(buffer, BUFSIZ-1, "%lu", gt_pbs_hit_get_offset(hit));
  gt_feature_node_add_attribute((GtFeatureNode*) gf, "pbsoffset", buffer);
  (void) snprintf(buffer, BUFSIZ-1, "%lu", gt_pbs_hit_get_edist(hit));
  gt_feature_node_add_attribute((GtFeatureNode*) gf, "edist", buffer);
  gt_feature_node_add_child(element->mainnode, (GtFeatureNode*) gf);
}
static int gt_regioncov_visitor_region_node(GtNodeVisitor *nv, GtRegionNode *rn,
                                         GT_UNUSED GtError *err)
{
  GtRegionCovVisitor *regioncov_visitor;
  GtArray *rangelist;
  gt_error_check(err);
  regioncov_visitor = gt_regioncov_visitor_cast(nv);
  rangelist = gt_array_new(sizeof (GtRange));
  gt_hashmap_add(regioncov_visitor->region2rangelist,
              gt_cstr_dup(gt_str_get(gt_genome_node_get_seqid((GtGenomeNode*)
                                                              rn))),
              rangelist);
  return 0;
}
示例#19
0
static int select_visitor_sequence_node(GtNodeVisitor *nv, GtSequenceNode *sn,
                                        GT_UNUSED GtError *err)
{
  GtSelectVisitor *select_visitor;
  gt_error_check(err);
  select_visitor = select_visitor_cast(nv);
  if (!gt_str_length(select_visitor->seqid) || /* no seqid was specified */
      !gt_str_cmp(select_visitor->seqid,       /* or seqids are equal */
                  gt_genome_node_get_seqid((GtGenomeNode*) sn))) {
    gt_queue_add(select_visitor->node_buffer, sn);
  }
  else
    gt_genome_node_delete((GtGenomeNode*) sn);
  return 0;
}
示例#20
0
static int gff3_visitor_region_node(GtNodeVisitor *nv, GtRegionNode *rn,
                                    GT_UNUSED GtError *err)
{
  GtGFF3Visitor *gff3_visitor;
  gt_error_check(err);
  gff3_visitor = gff3_visitor_cast(nv);
  gt_assert(nv && rn);
  gff3_version_string(nv);
  gt_file_xprintf(gff3_visitor->outfp, "%s   %s %lu %lu\n",
                  GT_GFF_SEQUENCE_REGION,
                  gt_str_get(gt_genome_node_get_seqid((GtGenomeNode*) rn)),
                  gt_genome_node_get_start((GtGenomeNode*) rn),
                  gt_genome_node_get_end((GtGenomeNode*) rn));
  return 0;
}
示例#21
0
static int feature_index_lua_add_feature_node(lua_State *L)
{
  GtFeatureIndex **fi;
  GtGenomeNode **gn;
  GtFeatureNode *gf;
  GtStr *seqid;
  gt_assert(L);
  fi = check_feature_index(L, 1);
  gn = check_genome_node(L, 2);
  gf = gt_genome_node_cast(gt_feature_node_class(), *gn);
  luaL_argcheck(L, gf, 2, "not a feature node");
  seqid = gt_genome_node_get_seqid(*gn);
  luaL_argcheck(L, seqid, 2, "feature does not have a sequence id");
  luaL_argcheck(L, gt_feature_index_has_seqid(*fi, gt_str_get(seqid)), 2,
                "feature index does not contain corresponding sequence region");
  gt_feature_index_add_feature_node(*fi, gf);
  return 0;
}
示例#22
0
void gt_gff3_output_leading(GtFeatureNode *fn, GtFile *outfp)
{
  GtGenomeNode *gn;
  gt_assert(fn);
  gn = (GtGenomeNode*) fn;
  gt_file_xprintf(outfp, "%s\t%s\t%s\t"GT_WU"\t"GT_WU"\t",
                     gt_str_get(gt_genome_node_get_seqid(gn)),
                     gt_feature_node_get_source(fn),
                     gt_feature_node_get_type(fn),
                     gt_genome_node_get_start(gn),
                     gt_genome_node_get_end(gn));
  if (gt_feature_node_score_is_defined(fn))
    gt_file_xprintf(outfp, "%.3g", gt_feature_node_get_score(fn));
  else
    gt_file_xfputc('.', outfp);
  gt_file_xprintf(outfp, "\t%c\t%c\t",
                     GT_STRAND_CHARS[gt_feature_node_get_strand(fn)],
                     GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]);
}
示例#23
0
static void construct_thick_feature(GtBEDParser *bed_parser, GtFeatureNode *fn,
                                    GtRange range)
{
  GtGenomeNode *thick_feature;
  const char *name;
  gt_assert(fn);
  thick_feature = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                               fn),
                                      bed_parser->thick_feature_type
                                      ? bed_parser->thick_feature_type
                                      : BED_THICK_FEATURE_TYPE,
                                      range.start, range.end,
                                      gt_feature_node_get_strand(fn));
  if ((name = gt_feature_node_get_attribute(fn, "Name")))
    gt_feature_node_add_attribute((GtFeatureNode*) thick_feature, "Name", name);
  gt_feature_node_set_score((GtFeatureNode*) thick_feature,
                            gt_feature_node_get_score(fn));
  gt_feature_node_set_strand((GtFeatureNode*) thick_feature,
                             gt_feature_node_get_strand(fn));
  gt_feature_node_add_child(fn, (GtFeatureNode*) thick_feature);
}
示例#24
0
static void orf_attach_results_to_gff3(GtFeatureNode *gf,
                                       GtRange orf_rng, unsigned int orf_frame,
                                       GtStrand strand, GT_UNUSED GtError *err)
{
  GtGenomeNode *child;
  GtStr *tag;
  tag = gt_str_new_cstr(GT_ORF_FINDER_TAG);

  orf_rng.start++; orf_rng.end++;

  GtFeatureNodeIterator *gfi;
  GtFeatureNode *curnode = NULL, *parent_node = NULL;
  GtRange gfi_range;
  char frame_buf[3];
  sprintf(frame_buf, "%d", orf_frame);

  gfi = gt_feature_node_iterator_new(gf);

  while ((curnode = gt_feature_node_iterator_next(gfi))) {
    if (strcmp(gt_feature_node_get_type(curnode),
                                              (const char*) GT_ORF_TYPE) != 0) {
      gfi_range = gt_genome_node_get_range((GtGenomeNode*) curnode);
      if (gt_range_contains(&gfi_range, &orf_rng)) {
        parent_node = curnode;
      }
    }
  }
  if (parent_node) {
    child = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) gf),
                                GT_ORF_TYPE,
                                orf_rng.start,
                                orf_rng.end,
                                strand);
    gt_feature_node_set_source((GtFeatureNode*) child, tag);
    gt_feature_node_set_attribute((GtFeatureNode*) child, "frame", frame_buf);
    gt_feature_node_add_child(parent_node,(GtFeatureNode*) child);
  }
  gt_str_delete(tag);
  gt_feature_node_iterator_delete(gfi);
}
示例#25
0
static double gaeval_visitor_calculate_coverage(AgnGaevalVisitor *v,
                                                GtFeatureNode *genemodel,
                                                GtError *error)
{
  agn_assert(v && genemodel);

  GtStr *seqid = gt_genome_node_get_seqid((GtGenomeNode *)genemodel);
  GtRange mrna_range = gt_genome_node_get_range((GtGenomeNode *)genemodel);
  GtArray *overlapping = gt_array_new( sizeof(GtFeatureNode *) );
  bool hasseqid;
  gt_feature_index_has_seqid(v->alignments, &hasseqid, gt_str_get(seqid),error);
  if(hasseqid)
  {
    gt_feature_index_get_features_for_range(v->alignments, overlapping,
                                            gt_str_get(seqid), &mrna_range,
                                            error);
  }

  GtArray *exon_coverage = gt_array_new( sizeof(GtRange) );
  GtUword i;
  for(i = 0; i < gt_array_size(overlapping); i++)
  {
    GtFeatureNode *alignment = *(GtFeatureNode **)gt_array_get(overlapping, i);
    GtArray *covered_parts = gaeval_visitor_intersect((GtGenomeNode*)genemodel,
                                                      (GtGenomeNode*)alignment);
    if(covered_parts != NULL)
    {
      GtArray *temp = gaeval_visitor_union(exon_coverage, covered_parts);
      gt_array_delete(covered_parts);
      gt_array_delete(exon_coverage);
      exon_coverage = temp;
    }
  }
  double coverage = gaeval_visitor_coverage_resolve(genemodel, exon_coverage);
  gt_array_delete(exon_coverage);
  gt_array_delete(overlapping);

  return coverage;
}
示例#26
0
static void ppt_attach_results_to_gff3(GtPPTResults *results,
                                       GtLTRElement *element,
                                       GtStrand *canonical_strand,
                                       GtStr *tag)
{
  GtRange ppt_range;
  unsigned long i = 0;
  GtGenomeNode *gf;
  GtPPTHit* hit = gt_ppt_results_get_ranked_hit(results, i++);
  if (*canonical_strand == GT_STRAND_UNKNOWN)
    *canonical_strand = gt_ppt_hit_get_strand(hit);
  else
  {
    /* find best-scoring PPT on the given canonical strand */
    while (gt_ppt_hit_get_strand(hit) != *canonical_strand
             && i < gt_ppt_results_get_number_of_hits(results))
    {
      gt_log_log("dropping PPT because of nonconsistent strand: %s\n",
                 gt_feature_node_get_attribute(element->mainnode, "ID"));
      hit = gt_ppt_results_get_ranked_hit(results, i++);
    }
    /* if there is none, do not report a PPT */
    if (gt_ppt_hit_get_strand(hit) != *canonical_strand)
      return;
  }
  ppt_range = gt_ppt_hit_get_coords(hit);
  ppt_range.start++; ppt_range.end++;  /* GFF3 is 1-based */
  gf = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                    element->mainnode),
                           GT_PPT_TYPE,
                           ppt_range.start,
                           ppt_range.end,
                           gt_ppt_hit_get_strand(hit));
  gt_feature_node_set_source((GtFeatureNode*) gf, tag);
  gt_feature_node_set_strand(element->mainnode, gt_ppt_hit_get_strand(hit));
  gt_feature_node_add_child(element->mainnode, (GtFeatureNode*) gf);
}
static int extract_join_feature(GtGenomeNode *gn, const char *type,
                                GtRegionMapping *region_mapping,
                                GtStr *sequence, bool *reverse_strand,
                                bool *first_child_of_type_seen, GtPhase *phase,
                                GtError *err)
{
  char *outsequence;
  GtFeatureNode *fn;
  GtRange range;
  int had_err = 0;

  gt_error_check(err);
  fn = gt_feature_node_cast(gn);
  gt_assert(fn);

  if (gt_feature_node_has_type(fn, type)) {
    if (gt_feature_node_get_strand(fn) == GT_STRAND_REVERSE) {
      *reverse_strand = true;
      *phase = gt_feature_node_get_phase(fn);
    } else {
      if (!(*first_child_of_type_seen)) {
        *first_child_of_type_seen = true;
        *phase = gt_feature_node_get_phase(fn);
      } else *phase = GT_PHASE_UNDEFINED;
    }
    range = gt_genome_node_get_range(gn);
    had_err = gt_region_mapping_get_sequence(region_mapping, &outsequence,
                                             gt_genome_node_get_seqid(gn),
                                             range.start, range.end, err);
    if (!had_err) {
      gt_str_append_cstr_nt(sequence, outsequence, gt_range_length(&range));
      gt_free(outsequence);
    }
  }
  return had_err;
}
static int add_ids_visitor_region_node(GtNodeVisitor *nv, GtRegionNode *rn,
                                       GT_UNUSED GtError *err)
{
  GtAddIDsVisitor *aiv;
  const char *seqid;
  int had_err = 0;
  gt_error_check(err);
  aiv = add_ids_visitor_cast(nv);
  seqid = gt_str_get(gt_genome_node_get_seqid((GtGenomeNode*) rn));
  if (gt_hashmap_get(aiv->undefined_sequence_regions, seqid)) {
    gt_error_set(err, "genome feature with id \"%s\" has been defined before "
                 "the corresponding \"%s\" definition on line %u in file "
                 "\"%s\"", seqid, GT_GFF_SEQUENCE_REGION,
                 gt_genome_node_get_line_number((GtGenomeNode*) rn),
                 gt_genome_node_get_filename((GtGenomeNode*) rn));
    had_err = -1;
  }
  if (!had_err) {
    if (!gt_cstr_table_get(aiv->defined_seqids, seqid))
      gt_cstr_table_add(aiv->defined_seqids, seqid);
    gt_queue_add(aiv->node_buffer, rn);
  }
  return had_err;
}
static int inter_feature_in_children(GtFeatureNode *current_feature, void *data,
                                     GT_UNUSED GtError *err)
{
  GtInterFeatureVisitor *aiv = (GtInterFeatureVisitor*) data;
  GtFeatureNode *inter_node;
  GtRange previous_range, current_range, inter_range;
  GtStrand previous_strand, /*current_strand, */inter_strand;
  GtStr *parent_seqid;
  gt_error_check(err);
  gt_assert(current_feature);
  if (gt_feature_node_has_type(current_feature, aiv->outside_type)) {
    if (aiv->previous_feature) {
      /* determine inter range */
      previous_range = gt_genome_node_get_range((GtGenomeNode*)
                                                aiv->previous_feature);
      current_range = gt_genome_node_get_range((GtGenomeNode*) current_feature);
      if (previous_range.end >= current_range.start) {
        gt_warning("overlapping boundary features " GT_WU "-" GT_WU " and "
                   GT_WU "-" GT_WU ", " "not placing '%s' inter-feature",
                   previous_range.start,
                   previous_range.end,
                   current_range.start,
                   current_range.end,
                   aiv->inter_type);
        return 0;
      }
      if (current_range.start - previous_range.end < 2) {
        gt_warning("no space for inter-feature '%s' between " GT_WU " and "
                   GT_WU,
                   aiv->inter_type,
                   previous_range.end,
                   current_range.start);
        return 0;
      }
      inter_range.start = previous_range.end + 1;
      inter_range.end = current_range.start - 1;

      /* determine inter strand */
      previous_strand = gt_feature_node_get_strand(aiv->previous_feature);
      /*current_strand = gt_feature_node_get_strand(current_feature);*/
      gt_assert(previous_strand == gt_feature_node_get_strand(current_feature));
      inter_strand = previous_strand;

      /* determine sequence id */
      parent_seqid =
        gt_genome_node_get_seqid((GtGenomeNode*) aiv->parent_feature);
      gt_assert(!gt_str_cmp(parent_seqid,
                            gt_genome_node_get_seqid((GtGenomeNode*)
                                                     aiv->previous_feature)));
      gt_assert(!gt_str_cmp(parent_seqid,
                            gt_genome_node_get_seqid((GtGenomeNode*)
                                                     current_feature)));

      /* create inter feature */
      inter_node = (GtFeatureNode*)
                   gt_feature_node_new(parent_seqid, aiv->inter_type,
                                       inter_range.start, inter_range.end,
                                       inter_strand);
      gt_feature_node_add_child(aiv->parent_feature, inter_node);
    }
    aiv->previous_feature = current_feature;
  }
  return 0;
}
示例#30
0
static int select_visitor_feature_node(GtNodeVisitor *nv,
                                       GtFeatureNode *fn,
                                       GT_UNUSED GtError *err)
{
  GtSelectVisitor *fv;
  bool filter_node = false;
  gt_error_check(err);
  fv = select_visitor_cast(nv);
  fv->current_feature++;
  if ((!gt_str_length(fv->seqid) || /* no seqid was specified or seqids are
                                       equal */
       !gt_str_cmp(fv->seqid, gt_genome_node_get_seqid((GtGenomeNode*) fn))) &&
      (!gt_str_length(fv->source) || /* no source was specified or sources are
                                        equal */
       !strcmp(gt_str_get(fv->source), gt_feature_node_get_source(fn)))) {
    GtRange range = gt_genome_node_get_range((GtGenomeNode*) fn);
    /* enforce maximum gene length */
    /* XXX: we (spuriously) assume that genes are always root nodes */
    if (fn && gt_feature_node_has_type(fn, gt_ft_gene)) {
      if (fv->max_gene_length != GT_UNDEF_ULONG &&
          gt_range_length(&range) > fv->max_gene_length) {
        filter_node = true;
      }
      else if (fv->max_gene_num != GT_UNDEF_ULONG &&
               fv->gene_num >= fv->max_gene_num) {
        filter_node = true;
      }
      else if (fv->min_gene_score != GT_UNDEF_DOUBLE &&
               gt_feature_node_get_score(fn) < fv->min_gene_score) {
        filter_node = true;
      }
      else if (fv->max_gene_score != GT_UNDEF_DOUBLE &&
               gt_feature_node_get_score(fn) > fv->max_gene_score) {
        filter_node = true;
      }
      else if (fv->feature_num != GT_UNDEF_ULONG &&
               fv->feature_num != fv->current_feature) {
        filter_node = true;
      }
      if (!filter_node)
        fv->gene_num++; /* gene passed filter */
    }
  }
  else
    filter_node = true;

  if (!filter_node)
    filter_node = filter_contain_range(fn, fv->contain_range);

  if (!filter_node)
    filter_node = filter_overlap_range(fn, fv->overlap_range);

  if (!filter_node)
    filter_node = filter_strand(fn, fv->strand);

  if (!filter_node)
    filter_node = filter_targetstrand(fn, fv->targetstrand);

  if (!filter_node)
    filter_node = filter_has_CDS(fn, fv->has_CDS);

  if (!filter_node)
    filter_node = filter_min_average_ssp(fn, fv->min_average_splice_site_prob);

  if (filter_node)
    gt_genome_node_delete((GtGenomeNode*) fn);
  else
    gt_queue_add(fv->node_buffer, fn);

  return 0;
}