Exemplo n.º 1
0
/* Append the leading, tab-terminated columns of <fn> to <outstr>, in this
   order: seqid, source, type, start, end, score, strand, phase.
   A score which is not defined is written as '.'; a defined score is
   formatted with "%.3g". Both <fn> and <outstr> must be non-NULL. */
void gt_gff3_output_leading_str(GtFeatureNode *fn, GtStr *outstr)
{
  GtGenomeNode *node;
  gt_assert(fn && outstr);
  node = (GtGenomeNode*) fn;

  /* seqid */
  gt_str_append_str(outstr, gt_genome_node_get_seqid(node));
  gt_str_append_char(outstr, '\t');

  /* source */
  gt_str_append_cstr(outstr, gt_feature_node_get_source(fn));
  gt_str_append_char(outstr, '\t');

  /* type */
  gt_str_append_cstr(outstr, gt_feature_node_get_type(fn));
  gt_str_append_char(outstr, '\t');

  /* start and end */
  gt_str_append_uword(outstr, gt_genome_node_get_start(node));
  gt_str_append_char(outstr, '\t');
  gt_str_append_uword(outstr, gt_genome_node_get_end(node));
  gt_str_append_char(outstr, '\t');

  /* score (only read when it is defined) */
  if (!gt_feature_node_score_is_defined(fn))
    gt_str_append_char(outstr, '.');
  else {
    char score_buf[BUFSIZ];
    (void) snprintf(score_buf, sizeof score_buf, "%.3g",
                    gt_feature_node_get_score(fn));
    gt_str_append_cstr(outstr, score_buf);
  }
  gt_str_append_char(outstr, '\t');

  /* strand */
  gt_str_append_char(outstr, GT_STRAND_CHARS[gt_feature_node_get_strand(fn)]);
  gt_str_append_char(outstr, '\t');

  /* phase */
  gt_str_append_char(outstr, GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]);
  gt_str_append_char(outstr, '\t');
}
/* Decide what happens to <current_feature>, which must carry a
   TARGET_STRING attribute.
   - If the attribute does not describe exactly one target, the feature is
     appended to <trees> unchanged.
   - Otherwise a key is built from the feature and its target id and looked up
     in <target_to_elem>:
       * no entry yet: the feature is included via include_feature();
       * entry exists and the current score is strictly greater than the
         stored feature's score: the stored element is replaced via
         replace_previous_elem();
       * otherwise the current feature is deleted.
   NOTE(review): both scores are read without checking that they are defined;
   presumably callers guarantee this -- confirm. */
static void filter_targetbest(GtFeatureNode *current_feature,
                              GtDlist *trees, GtHashmap *target_to_elem)
{
  unsigned long target_count;
  GtDlistelem *known_elem;
  GtStr *target_id,
        *key;
  const char *target_attr;
  int had_err;
  gt_assert(current_feature && trees);

  target_attr = gt_feature_node_get_attribute(current_feature, TARGET_STRING);
  gt_assert(target_attr);

  target_id = gt_str_new();
  had_err = gt_gff3_parser_parse_target_attributes(target_attr, &target_count,
                                                   target_id, NULL, NULL,
                                                   "", 0, NULL);
  gt_assert(!had_err);

  if (target_count != 1) {
    /* zero or several targets -> keep the feature without any comparison */
    gt_dlist_add(trees, current_feature);
    gt_str_delete(target_id);
    return;
  }

  key = gt_str_new();
  build_key(key, current_feature, target_id);
  known_elem = gt_hashmap_get(target_to_elem, gt_str_get(key));

  if (!known_elem) {
    /* first feature seen for this key -> include it */
    include_feature(trees, target_to_elem, current_feature, key);
  }
  else {
    /* a feature for this key is stored already -> keep the better one */
    GtFeatureNode *known_feature = gt_dlistelem_get_data(known_elem);
    if (gt_feature_node_get_score(current_feature) >
        gt_feature_node_get_score(known_feature)) {
      replace_previous_elem(known_elem, current_feature, trees,
                            target_to_elem, key);
    }
    else {
      /* not strictly better -> discard the current feature */
      gt_genome_node_delete((GtGenomeNode*) current_feature);
    }
  }

  gt_str_delete(key);
  gt_str_delete(target_id);
}
Exemplo n.º 3
0
/* Create one child feature of <fn> for each of the <block_count> blocks
   described by the tokens in <size_splitter> (block sizes) and
   <start_splitter> (block starts, relative to the start of <fn>).
   Each block gets the seqid and strand of <fn>, the type
   <bed_parser>->block_type (or BED_BLOCK_TYPE if that is NULL), the
   GT_GFF_NAME attribute of <fn> if present, and the score of <fn> if it is
   defined.
   Returns 0 on success. If a token cannot be parsed, <err> is set (using the
   filename and line number of <bed_file>), -1 is returned and processing
   stops; blocks created before the failing token stay attached to <fn>. */
static int create_block_features(GtBEDParser *bed_parser, GtFeatureNode *fn,
                                 GtUword block_count,
                                 GtSplitter *size_splitter,
                                 GtSplitter *start_splitter, GtIO *bed_file,
                                 GtError *err)
{
  GtUword i;
  int had_err = 0;
  /* <bed_parser> is dereferenced below, so it belongs to the precondition */
  gt_assert(bed_parser && fn && block_count && size_splitter &&
            start_splitter);
  gt_assert(gt_splitter_size(size_splitter) == block_count);
  gt_assert(gt_splitter_size(start_splitter) == block_count);
  for (i = 0; !had_err && i < block_count; i++) {
    GtUword block_size, block_start, start, end;
    GtGenomeNode *block;
    const char *name;
    if (gt_parse_uword(&block_size, gt_splitter_get_token(size_splitter, i))) {
      gt_error_set(err,
                   "file \"%s\": line "GT_WU": could not parse blockSize '%s'",
                   gt_io_get_filename(bed_file),
                   gt_io_get_line_number(bed_file),
                   gt_splitter_get_token(size_splitter, i));
      had_err = -1;
    }
    if (!had_err && gt_parse_uword(&block_start,
                                   gt_splitter_get_token(start_splitter, i))) {
      gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockStart "
                   "'%s'", gt_io_get_filename(bed_file),
                   gt_io_get_line_number(bed_file),
                   gt_splitter_get_token(start_splitter, i));
      had_err = -1;
    }
    if (!had_err) {
      /* block coordinates are offsets from the start of the parent */
      start = gt_genome_node_get_start((GtGenomeNode*) fn) + block_start;
      /* NOTE(review): a blockSize of 0 yields end == start - 1; confirm that
         gt_feature_node_new() copes with such a range */
      end = start + block_size - 1;
      block = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) fn),
                                  bed_parser->block_type
                                  ? bed_parser->block_type
                                  : BED_BLOCK_TYPE,
                                  start, end, gt_feature_node_get_strand(fn));
      if ((name = gt_feature_node_get_attribute(fn, GT_GFF_NAME))) {
        gt_feature_node_add_attribute((GtFeatureNode*) block, GT_GFF_NAME,
                                      name);
      }
      /* the score may only be read if it is defined; a parent without score
         yields blocks without score */
      if (gt_feature_node_score_is_defined(fn)) {
        gt_feature_node_set_score((GtFeatureNode*) block,
                                  gt_feature_node_get_score(fn));
      }
      gt_feature_node_set_strand((GtFeatureNode*) block,
                                 gt_feature_node_get_strand(fn));
      gt_feature_node_add_child(fn, (GtFeatureNode*) block);
    }
  }
  return had_err;
}
Exemplo n.º 4
0
/* Lua binding: takes the genome node at stack index 1, which must be a
   feature node (otherwise an argument error "not a feature node" is raised),
   and pushes its score as a number, or nil if no score is defined.
   Returns 1, the number of values pushed. */
static int feature_node_lua_get_score(lua_State *L)
{
  GtGenomeNode **node = check_genome_node(L, 1);
  /* the cast yields NULL for anything which is not a feature node */
  GtFeatureNode *feature = gt_feature_node_try_cast(*node);
  luaL_argcheck(L, feature, 1, "not a feature node");

  if (!gt_feature_node_score_is_defined(feature)) {
    lua_pushnil(L);
    return 1;
  }
  lua_pushnumber(L, gt_feature_node_get_score(feature));
  return 1;
}
Exemplo n.º 5
0
/* Update the statistics collected in <sv> according to the type of <fn>:
   - gene: counts the gene (and separately whether it has a CDS); if enabled,
     adds its range length to the gene length distribution and its score
     times 100.0 to the gene score distribution;
   - mRNA: counts the mRNA (and separately whether it has a CDS);
   - exon: counts the exon and, if enabled, adds its range length to the exon
     length distribution;
   - CDS: counts the CDS;
   - intron: if enabled, adds its range length to the intron length
     distribution;
   - LTR_retrotransposon: counts it.
   Other types are ignored. A distribution is "enabled" if its pointer in
   <sv> is non-NULL. Genes without a defined score are counted but do not
   contribute to the score distribution. */
static void compute_type_statistics(GtFeatureNode *fn, GtStatVisitor *sv)
{
    GtRange range;
    gt_assert(fn && sv);
    if (gt_feature_node_has_type(fn, gt_ft_gene)) {
        sv->number_of_genes++;
        if (gt_feature_node_has_CDS(fn))
            sv->number_of_protein_coding_genes++;
        if (sv->gene_length_distribution) {
            range = gt_genome_node_get_range((GtGenomeNode*) fn);
            gt_disc_distri_add(sv->gene_length_distribution,
                               gt_range_length(&range));
        }
        /* the score may only be read if it is defined, so genes without a
           score are skipped here */
        if (sv->gene_score_distribution &&
            gt_feature_node_score_is_defined(fn)) {
            gt_disc_distri_add(sv->gene_score_distribution,
                               gt_feature_node_get_score(fn) * 100.0);
        }
    }
    else if (gt_feature_node_has_type(fn, gt_ft_mRNA)) {
        sv->number_of_mRNAs++;
        if (gt_feature_node_has_CDS(fn))
            sv->number_of_protein_coding_mRNAs++;
    }
    else if (gt_feature_node_has_type(fn, gt_ft_exon)) {
        sv->number_of_exons++;
        if (sv->exon_length_distribution) {
            range = gt_genome_node_get_range((GtGenomeNode*) fn);
            gt_disc_distri_add(sv->exon_length_distribution,
                               gt_range_length(&range));
        }
    }
    else if (gt_feature_node_has_type(fn, gt_ft_CDS)) {
        sv->number_of_CDSs++;
    }
    else if (gt_feature_node_has_type(fn, gt_ft_intron)) {
        /* introns are not counted, only their lengths are recorded */
        if (sv->intron_length_distribution) {
            range = gt_genome_node_get_range((GtGenomeNode*) fn);
            gt_disc_distri_add(sv->intron_length_distribution,
                               gt_range_length(&range));
        }
    }
    else if (gt_feature_node_has_type(fn, gt_ft_LTR_retrotransposon)) {
        sv->number_of_LTR_retrotransposons++;
    }
}
Exemplo n.º 6
0
/* Write the leading, tab-terminated columns of <fn> to <outfp>, in this
   order: seqid, source, type, start, end, score, strand, phase.
   A score which is not defined is written as '.'; a defined score is
   formatted with "%.3g". <fn> must be non-NULL. */
void gt_gff3_output_leading(GtFeatureNode *fn, GtFile *outfp)
{
  GtGenomeNode *node;
  const char *seqid,
             *source,
             *type;
  GtUword start,
          end;
  gt_assert(fn);
  node = (GtGenomeNode*) fn;

  /* collect the values of the first five columns */
  seqid = gt_str_get(gt_genome_node_get_seqid(node));
  source = gt_feature_node_get_source(fn);
  type = gt_feature_node_get_type(fn);
  start = gt_genome_node_get_start(node);
  end = gt_genome_node_get_end(node);
  gt_file_xprintf(outfp, "%s\t%s\t%s\t"GT_WU"\t"GT_WU"\t",
                  seqid, source, type, start, end);

  /* score column (only read when it is defined) */
  if (!gt_feature_node_score_is_defined(fn))
    gt_file_xfputc('.', outfp);
  else
    gt_file_xprintf(outfp, "%.3g", gt_feature_node_get_score(fn));

  /* strand and phase columns */
  gt_file_xprintf(outfp, "\t%c\t%c\t",
                  GT_STRAND_CHARS[gt_feature_node_get_strand(fn)],
                  GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]);
}
Exemplo n.º 7
0
/* Create a child feature of <fn> which spans <range>. The child gets the
   seqid and strand of <fn>, the type <bed_parser>->thick_feature_type (or
   BED_THICK_FEATURE_TYPE if that is NULL), the "Name" attribute of <fn> if
   present, and the score of <fn> if it is defined. */
static void construct_thick_feature(GtBEDParser *bed_parser, GtFeatureNode *fn,
                                    GtRange range)
{
  GtGenomeNode *thick_feature;
  const char *name;
  /* <bed_parser> is dereferenced below, so it belongs to the precondition */
  gt_assert(bed_parser && fn);
  thick_feature = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                               fn),
                                      bed_parser->thick_feature_type
                                      ? bed_parser->thick_feature_type
                                      : BED_THICK_FEATURE_TYPE,
                                      range.start, range.end,
                                      gt_feature_node_get_strand(fn));
  if ((name = gt_feature_node_get_attribute(fn, "Name")))
    gt_feature_node_add_attribute((GtFeatureNode*) thick_feature, "Name", name);
  /* the score may only be read if it is defined; a parent without score
     yields a thick feature without score */
  if (gt_feature_node_score_is_defined(fn)) {
    gt_feature_node_set_score((GtFeatureNode*) thick_feature,
                              gt_feature_node_get_score(fn));
  }
  gt_feature_node_set_strand((GtFeatureNode*) thick_feature,
                             gt_feature_node_get_strand(fn));
  gt_feature_node_add_child(fn, (GtFeatureNode*) thick_feature);
}
Exemplo n.º 8
0
/* Node visitor callback for feature nodes: decides whether <fn> passes the
   selection criteria stored in the select visitor <nv>.
   A node which is filtered out is deleted; a node which passes is added to
   the visitor's node buffer. The feature counter of the visitor is
   incremented for every node, the gene counter only for genes which pass the
   gene-specific criteria. Always returns 0; <err> is not set.
   NOTE(review): the gene score is read without checking that it is defined
   whenever a minimum or maximum gene score is configured -- confirm that
   genes without score cannot reach this point in that case. */
static int select_visitor_feature_node(GtNodeVisitor *nv,
                                       GtFeatureNode *fn,
                                       GT_UNUSED GtError *err)
{
  GtSelectVisitor *fv;
  GtGenomeNode *gn = (GtGenomeNode*) fn;
  bool seqid_ok,
       source_ok,
       drop = false;
  gt_error_check(err);
  fv = select_visitor_cast(nv);
  fv->current_feature++;

  /* no seqid was specified or seqids are equal */
  seqid_ok = !gt_str_length(fv->seqid) ||
             !gt_str_cmp(fv->seqid, gt_genome_node_get_seqid(gn));
  /* no source was specified or sources are equal (only evaluated if the
     seqid matched) */
  source_ok = seqid_ok &&
              (!gt_str_length(fv->source) ||
               !strcmp(gt_str_get(fv->source),
                       gt_feature_node_get_source(fn)));

  if (!source_ok)
    drop = true;
  else {
    GtRange range = gt_genome_node_get_range(gn);
    /* gene-specific criteria */
    /* XXX: we (spuriously) assume that genes are always root nodes */
    if (fn && gt_feature_node_has_type(fn, gt_ft_gene)) {
      /* the criteria are tested in this order and testing stops at the first
         one which filters the gene */
      drop = /* maximum gene length */
             (fv->max_gene_length != GT_UNDEF_ULONG &&
              gt_range_length(&range) > fv->max_gene_length) ||
             /* maximum number of genes */
             (fv->max_gene_num != GT_UNDEF_ULONG &&
              fv->gene_num >= fv->max_gene_num) ||
             /* minimum gene score */
             (fv->min_gene_score != GT_UNDEF_DOUBLE &&
              gt_feature_node_get_score(fn) < fv->min_gene_score) ||
             /* maximum gene score */
             (fv->max_gene_score != GT_UNDEF_DOUBLE &&
              gt_feature_node_get_score(fn) > fv->max_gene_score) ||
             /* a single feature number was requested */
             (fv->feature_num != GT_UNDEF_ULONG &&
              fv->feature_num != fv->current_feature);
      if (!drop)
        fv->gene_num++; /* gene passed filter */
    }
  }

  /* remaining filters, applied in order until one of them filters the node */
  drop = drop ||
         filter_contain_range(fn, fv->contain_range) ||
         filter_overlap_range(fn, fv->overlap_range) ||
         filter_strand(fn, fv->strand) ||
         filter_targetstrand(fn, fv->targetstrand) ||
         filter_has_CDS(fn, fv->has_CDS) ||
         filter_min_average_ssp(fn, fv->min_average_splice_site_prob);

  if (drop)
    gt_genome_node_delete(gn);
  else
    gt_queue_add(fv->node_buffer, fn);

  return 0;
}
Exemplo n.º 9
0
/* Set the strand of <lv>->ltr_retrotrans according to the protein_match
   features found below it.
   - No forward or reverse protein match: nothing is changed.
   - Matches on only one of the two strands: that strand is set.
   - Matches on both strands: the logarithms of the match scores are summed
     per strand; forward is chosen if the forward sum compares smaller,
     reverse otherwise. All protein matches whose strand differs from the
     chosen one are then removed from <lv>->ltr_retrotrans.
   Always returns 0. */
static int gt_ltrdigest_pdom_visitor_choose_strand(GtLTRdigestPdomVisitor *lv)
{
  int had_err = 0;
  double fwd_log_sum = 0.0,
         rev_log_sum = 0.0;
  bool has_fwd = false,
       has_rev = false;
  GtFeatureNodeIterator *it;
  GtFeatureNode *node = NULL;
  GtStrand chosen;
  GtArray *losers;
  GtUword idx;

  /* first pass: accumulate log scores per strand */
  it = gt_feature_node_iterator_new(lv->ltr_retrotrans);
  while (!had_err && (node = gt_feature_node_iterator_next(it))) {
    double node_score;
    GtStrand node_strand;
    if (strcmp(gt_feature_node_get_type(node), gt_ft_protein_match) != 0)
      continue;
    node_strand = gt_feature_node_get_strand(node);
    node_score = (double) gt_feature_node_get_score(node);
    switch (node_strand) {
      case GT_STRAND_FORWARD:
        fwd_log_sum += log(node_score);
        has_fwd = true;
        break;
      case GT_STRAND_REVERSE:
        rev_log_sum += log(node_score);
        has_rev = true;
        break;
      default: /* other strands do not take part in the decision */
        break;
    }
  }
  gt_feature_node_iterator_delete(it);

  /* no usable match at all -> leave the strand untouched */
  if (!has_fwd && !has_rev)
    return had_err;

  /* matches on exactly one strand -> take that strand, nothing to remove */
  if (has_fwd != has_rev) {
    gt_feature_node_set_strand(lv->ltr_retrotrans,
                               has_fwd ? GT_STRAND_FORWARD
                                       : GT_STRAND_REVERSE);
    return had_err;
  }

  /* matches on both strands -> the smaller log sum wins */
  gt_assert(has_rev && has_fwd);
  chosen = gt_double_compare(fwd_log_sum, rev_log_sum) < 0
             ? GT_STRAND_FORWARD
             : GT_STRAND_REVERSE;
  gt_feature_node_set_strand(lv->ltr_retrotrans, chosen);

  /* second pass: collect the protein matches on any other strand; they are
     removed only after the iteration has finished */
  losers = gt_array_new(sizeof (GtFeatureNode*));
  it = gt_feature_node_iterator_new(lv->ltr_retrotrans);
  while (!had_err && (node = gt_feature_node_iterator_next(it))) {
    if (strcmp(gt_feature_node_get_type(node), gt_ft_protein_match) == 0 &&
        chosen != gt_feature_node_get_strand(node)) {
      gt_array_add(losers, node);
    }
  }
  gt_feature_node_iterator_delete(it);
  gt_assert(gt_array_size(losers) > 0);

  for (idx = 0; idx < gt_array_size(losers); idx++) {
    GtFeatureNode *loser = *(GtFeatureNode**) gt_array_get(losers, idx);
    gt_feature_node_remove_leaf(lv->ltr_retrotrans, loser);
  }
  gt_array_delete(losers);

  return had_err;
}