Esempio n. 1
0
/*
 * Collect the portions of an mRNA's exons that are covered by an alignment.
 *
 * genemodel  genome node that must cast to a feature node of type "mRNA"
 * alignment  genome node that must cast to a feature node
 *
 * Returns NULL when the two features are on different strands. Otherwise
 * returns a newly created array of GtRange values: one entry for each
 * (exon, alignment feature) pair whose intersection differs from the range
 * {0, 0}. Features of type "match_gap" are skipped. The returned array is
 * created here and not deleted here, so the caller has to release it.
 */
static GtArray*
gaeval_visitor_intersect(GtGenomeNode *genemodel, GtGenomeNode *alignment)
{
  agn_assert(genemodel && alignment);

  GtFeatureNode *genefn = gt_feature_node_cast(genemodel);
  GtFeatureNode *algnfn = gt_feature_node_cast(alignment);
  agn_assert(gt_feature_node_has_type(genefn, "mRNA"));
  GtStrand genestrand = gt_feature_node_get_strand(genefn);
  GtStrand algnstrand = gt_feature_node_get_strand(algnfn);
  if(genestrand != algnstrand)
    return NULL;

  GtArray *covered_parts = gt_array_new( sizeof(GtRange) );
  GtArray *exons = agn_typecheck_select(genefn, agn_typecheck_exon);
  /* Sentinel compared against each intersection result; presumably what
     gaeval_visitor_range_intersect yields for "no overlap" -- TODO confirm. */
  GtRange nullrange = {0, 0};
  /* Unsigned index: gt_array_size() is compared against it and the inner
     verification loop below already uses GtUword. */
  GtUword i;
  for(i = 0; i < gt_array_size(exons); i++)
  {
    GtGenomeNode *exon = *(GtGenomeNode **)gt_array_get(exons, i);
    GtRange exonrange = gt_genome_node_get_range(exon);

    /* Walk the alignment's feature nodes anew for every exon. */
    GtFeatureNodeIterator *aniter = gt_feature_node_iterator_new(algnfn);
    GtFeatureNode *tempaln;
    for(tempaln  = gt_feature_node_iterator_next(aniter);
        tempaln != NULL;
        tempaln  = gt_feature_node_iterator_next(aniter))
    {
      if(gt_feature_node_has_type(tempaln, "match_gap"))
        continue;

      GtRange alnrange = gt_genome_node_get_range((GtGenomeNode *) tempaln);
      GtRange intr = gaeval_visitor_range_intersect(&exonrange, &alnrange);
      if(gt_range_compare(&intr, &nullrange) != 0)
        gt_array_add(covered_parts, intr);
    }
    gt_feature_node_iterator_delete(aniter);
  }
  gt_array_delete(exons);

  /* Sanity check: no two collected ranges may overlap each other. */
  for(i = 0; i < gt_array_size(covered_parts); i++)
  {
    GtRange *r1 = gt_array_get(covered_parts, i);
    GtUword j;
    for(j = i+1; j < gt_array_size(covered_parts); j++)
    {
      GtRange *r2 = gt_array_get(covered_parts, j);
      agn_assert(gt_range_overlap(r1, r2) == false);
    }
  }

  return covered_parts;
}
Esempio n. 2
0
/*
 * Traversal callback that sorts a feature node into the GTF visitor's
 * collections: exon features go to exon_features, CDS features go to
 * CDS_features, everything else is ignored. `data` is the GtGTFVisitor.
 * Always returns 0.
 */
static int save_exon_node(GtFeatureNode *fn, void *data, GT_UNUSED GtError *err)
{
  GtGTFVisitor *visitor;
  gt_error_check(err);
  gt_assert(fn && data);
  visitor = (GtGTFVisitor*) data;

  if (gt_feature_node_has_type(fn, gt_ft_exon)) {
    gt_array_add(visitor->exon_features, fn);
    return 0;
  }

  if (gt_feature_node_has_type(fn, gt_ft_CDS)) {
    gt_array_add(visitor->CDS_features, fn);
  }
  return 0;
}
Esempio n. 3
0
/*
 * Traversal callback that updates the distribution counters of the
 * GtStatVisitor passed in `data`: an exon feature increments
 * exon_number_for_distri, a CDS feature adds its range length to
 * cds_length_for_distri. Other feature types are ignored. Always returns 0.
 */
static int add_exon_or_cds_number(GtFeatureNode *fn, void *data,
                                  GT_UNUSED GtError *err)
{
    GtStatVisitor *stats = (GtStatVisitor*) data;
    gt_error_check(err);
    gt_assert(stats && fn);

    if (gt_feature_node_has_type(fn, gt_ft_exon)) {
        stats->exon_number_for_distri++;
        return 0;
    }

    if (gt_feature_node_has_type(fn, gt_ft_CDS)) {
        GtRange cds_range = gt_genome_node_get_range((GtGenomeNode*) fn);
        stats->cds_length_for_distri += gt_range_length(&cds_range);
    }
    return 0;
}
Esempio n. 4
0
/*
 * Feature-node callback of the locus map visitor.
 *
 * `fn` must be of type "locus". For every feature yielded by an iterator
 * over `fn`, a tab-separated "<feature label>\t<locus label>" line is
 * written: genes go to the visitor's genefh, mRNAs go to its mrnafh. A
 * stream that is NULL is skipped. Always returns 0.
 */
static int
visit_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn, GtError *error)
{
  AgnLocusMapVisitor *mapper = locus_map_visitor_cast(nv);
  gt_error_check(error);
  agn_assert(gt_feature_node_has_type(fn, "locus"));
  const char *locus_id = agn_feature_node_get_label(fn);

  GtFeatureNodeIterator *walker = gt_feature_node_iterator_new(fn);
  GtFeatureNode *feat;
  while((feat = gt_feature_node_iterator_next(walker)) != NULL)
  {
    /* The two checks are independent; both are evaluated for every node. */
    if(agn_typecheck_gene(feat) && mapper->genefh != NULL)
    {
      fprintf(mapper->genefh, "%s\t%s\n",
              agn_feature_node_get_label(feat), locus_id);
    }

    if(agn_typecheck_mrna(feat) && mapper->mrnafh != NULL)
    {
      fprintf(mapper->mrnafh, "%s\t%s\n",
              agn_feature_node_get_label(feat), locus_id);
    }
  }
  gt_feature_node_iterator_delete(walker);
  return 0;
}
Esempio n. 5
0
/*
 * Dispatch a feature node for GTF output. `data` is the GtGTFVisitor.
 *
 * - gene:  bump gene_id, reset transcript_id to 0, then show the transcript
 * - mRNA:  show the transcript
 * - CDS / exon: nothing is done here
 * - anything else: emit a warning that the feature is skipped
 *
 * Returns the result of gtf_show_transcript() for genes and mRNAs, and 0
 * otherwise.
 */
static int gtf_show_feature_node(GtFeatureNode *fn, void *data, GtError *err)
{
  GtGTFVisitor *visitor = (GtGTFVisitor*) data;

  if (gt_feature_node_has_type(fn, gt_ft_gene)) {
    visitor->gene_id++;
    visitor->transcript_id = 0;
    return gtf_show_transcript(fn, visitor, err);
  }

  if (gt_feature_node_has_type(fn, gt_ft_mRNA))
    return gtf_show_transcript(fn, visitor, err);

  if (gt_feature_node_has_type(fn, gt_ft_CDS) ||
      gt_feature_node_has_type(fn, gt_ft_exon))
    return 0;

  gt_warning("skipping GFF3 feature of type \"%s\" (from line %u in file "
             "\"%s\")",
             gt_feature_node_get_type(fn),
             gt_genome_node_get_line_number((GtGenomeNode*) fn),
             gt_genome_node_get_filename((GtGenomeNode*) fn));
  return 0;
}
Esempio n. 6
0
/*
 * Assign "five_prime_UTR" / "three_prime_UTR" types to the visitor's UTR
 * features that do not carry one of those two types yet.
 *
 * Nothing happens unless v->starts holds exactly one node; that node's start
 * coordinate is the reference point. On the forward strand a UTR starting
 * before the reference is 5', otherwise 3'. For any other strand value the
 * assignment is mirrored.
 */
static void infer_cds_visitor_set_utrs(AgnInferCDSVisitor *v)
{
  GtGenomeNode **start_node;
  GtUword idx, reference;

  if(!v->starts || gt_array_size(v->starts) != 1)
    return;
  start_node = gt_array_get(v->starts, 0);
  reference = gt_genome_node_get_start(*start_node);

  for(idx = 0; idx < gt_array_size(v->utrs); idx++)
  {
    GtFeatureNode *utr = *(GtFeatureNode **)gt_array_get(v->utrs, idx);
    GtStrand strand = gt_feature_node_get_strand(utr);
    GtUword utr_start = gt_genome_node_get_start((GtGenomeNode *)utr);
    const char *utr_type;

    /* Leave UTRs alone that already have an explicit 5'/3' type. */
    if(gt_feature_node_has_type(utr, "five_prime_UTR") ||
       gt_feature_node_has_type(utr, "three_prime_UTR"))
      continue;

    if(strand == GT_STRAND_FORWARD)
      utr_type = (utr_start < reference) ? "five_prime_UTR"
                                         : "three_prime_UTR";
    else
      utr_type = (utr_start < reference) ? "three_prime_UTR"
                                         : "five_prime_UTR";

    gt_feature_node_set_type(utr, utr_type);
  }
}
Esempio n. 7
0
/*
 * Gather the CDS features among the direct children of `fn` and run the
 * phase check on them.
 *
 * Non-multi CDS features are collected in one array and checked with
 * check_cds_phases(). Multi-features are grouped per multi-representative in
 * a hashmap and checked group by group via check_cds_phases_hm(), but only
 * if the first check did not report an error. Both containers are created
 * lazily and released before returning.
 *
 * Returns 0 on success or the error code of the failing check.
 */
static int check_cds_phases_if_necessary(GtFeatureNode *fn,
                                         GtCDSCheckVisitor *v,
                                         bool second_pass, GtError *err)
{
  GtFeatureNodeIterator *child_iter;
  GtFeatureNode *child;
  GtArray *plain_cds = NULL;
  GtHashmap *multi_cds = NULL;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(fn);

  child_iter = gt_feature_node_iterator_new_direct(fn);
  for (child = gt_feature_node_iterator_next(child_iter);
       child != NULL;
       child = gt_feature_node_iterator_next(child_iter)) {
    GtFeatureNode *representative;
    GtArray *group;

    if (!gt_feature_node_has_type(child, gt_ft_CDS))
      continue;

    if (!gt_feature_node_is_multi(child)) {
      if (!plain_cds)
        plain_cds = gt_array_new(sizeof (GtFeatureNode*));
      gt_array_add(plain_cds, child);
      continue;
    }

    /* Multi-feature: the hashmap owns the per-representative arrays and
       frees them through gt_array_delete when it is deleted. */
    if (!multi_cds)
      multi_cds = gt_hashmap_new(GT_HASH_DIRECT, NULL,
                                 (GtFree) gt_array_delete);

    representative = gt_feature_node_get_multi_representative(child);
    group = gt_hashmap_get(multi_cds, representative);
    if (group) {
      gt_array_add(group, child);
    }
    else {
      /* NOTE(review): the first entry of a new group is the representative,
         not the visited child; this is only equivalent if the representative
         is always the first member encountered -- confirm. */
      group = gt_array_new(sizeof (GtFeatureNode*));
      gt_array_add(group, representative);
      gt_hashmap_add(multi_cds, representative, group);
    }
  }

  if (plain_cds)
    had_err = check_cds_phases(plain_cds, v, false, second_pass, err);
  if (!had_err && multi_cds)
    had_err = gt_hashmap_foreach(multi_cds, check_cds_phases_hm, v, err);

  gt_array_delete(plain_cds);
  gt_hashmap_delete(multi_cds);
  gt_feature_node_iterator_delete(child_iter);
  return had_err;
}
Esempio n. 8
0
/*
 * Compute the fraction of an mRNA's combined exon length that is covered.
 *
 * genemodel      feature node of type "mRNA"
 * exon_coverage  array of GtRange values; their lengths are summed
 *
 * Returns covered length divided by combined exon length. When the combined
 * exon length is 0 the result is defined as 0.0 instead of dividing by zero
 * (which would yield NaN).
 */
static double gaeval_visitor_coverage_resolve(GtFeatureNode *genemodel,
                                              GtArray *exon_coverage)
{
  agn_assert(genemodel && exon_coverage);
  agn_assert(gt_feature_node_has_type(genemodel, "mRNA"));

  GtUword cum_exon_length =
      agn_typecheck_feature_combined_length(genemodel, agn_typecheck_exon);

  GtUword i, covered = 0;
  for(i = 0; i < gt_array_size(exon_coverage); i++)
  {
    GtRange *range = gt_array_get(exon_coverage, i);
    covered += gt_range_length(range);
  }
  agn_assert(covered <= cum_exon_length);

  /* No exon sequence at all: nothing can be covered. */
  if(cum_exon_length == 0)
    return 0.0;

  return (double)covered / (double)cum_exon_length;
}
Esempio n. 9
0
/*
 * Update the per-type counters and optional distributions of the stat
 * visitor for a single feature node.
 *
 * Handled types: gene, mRNA, exon, CDS, intron and LTR_retrotransposon.
 * Exactly one branch is taken per node; other types are ignored. A length or
 * score distribution is only filled when the corresponding pointer in `sv`
 * is set.
 */
static void compute_type_statistics(GtFeatureNode *fn, GtStatVisitor *sv)
{
    GtRange span;
    gt_assert(fn && sv);

    if (gt_feature_node_has_type(fn, gt_ft_gene)) {
        sv->number_of_genes++;
        if (gt_feature_node_has_CDS(fn))
            sv->number_of_protein_coding_genes++;
        if (sv->gene_length_distribution) {
            span = gt_genome_node_get_range((GtGenomeNode*) fn);
            gt_disc_distri_add(sv->gene_length_distribution,
                               gt_range_length(&span));
        }
        if (sv->gene_score_distribution) {
            /* The score is scaled by 100 before being recorded. */
            gt_disc_distri_add(sv->gene_score_distribution,
                               gt_feature_node_get_score(fn) * 100.0);
        }
        return;
    }

    if (gt_feature_node_has_type(fn, gt_ft_mRNA)) {
        sv->number_of_mRNAs++;
        if (gt_feature_node_has_CDS(fn))
            sv->number_of_protein_coding_mRNAs++;
        return;
    }

    if (gt_feature_node_has_type(fn, gt_ft_exon)) {
        sv->number_of_exons++;
        if (sv->exon_length_distribution) {
            span = gt_genome_node_get_range((GtGenomeNode*) fn);
            gt_disc_distri_add(sv->exon_length_distribution,
                               gt_range_length(&span));
        }
        return;
    }

    if (gt_feature_node_has_type(fn, gt_ft_CDS)) {
        sv->number_of_CDSs++;
        return;
    }

    if (gt_feature_node_has_type(fn, gt_ft_intron)) {
        if (sv->intron_length_distribution) {
            span = gt_genome_node_get_range((GtGenomeNode*) fn);
            gt_disc_distri_add(sv->intron_length_distribution,
                               gt_range_length(&span));
        }
        return;
    }

    if (gt_feature_node_has_type(fn, gt_ft_LTR_retrotransposon))
        sv->number_of_LTR_retrotransposons++;
}
/*
 * If `gn` is a feature of the given `type`, append its sequence to
 * `sequence` and update the strand/phase bookkeeping of a joined feature.
 *
 * Bookkeeping for a matching feature:
 * - reverse strand: *reverse_strand is set and *phase takes this feature's
 *   phase (every matching reverse-strand feature overwrites it);
 * - otherwise: the first matching feature sets *first_child_of_type_seen and
 *   supplies *phase, later ones set *phase to GT_PHASE_UNDEFINED.
 *
 * The sequence is fetched through `region_mapping`; the buffer it hands back
 * is freed here after being appended. Features of another type are ignored.
 *
 * Returns 0 on success, or the error code of
 * gt_region_mapping_get_sequence().
 */
static int extract_join_feature(GtGenomeNode *gn, const char *type,
                                GtRegionMapping *region_mapping,
                                GtStr *sequence, bool *reverse_strand,
                                bool *first_child_of_type_seen, GtPhase *phase,
                                GtError *err)
{
  char *raw_seq;
  GtFeatureNode *feature;
  GtRange span;
  int had_err;

  gt_error_check(err);
  feature = gt_feature_node_cast(gn);
  gt_assert(feature);

  if (!gt_feature_node_has_type(feature, type))
    return 0;

  if (gt_feature_node_get_strand(feature) == GT_STRAND_REVERSE) {
    *reverse_strand = true;
    *phase = gt_feature_node_get_phase(feature);
  }
  else if (!(*first_child_of_type_seen)) {
    *first_child_of_type_seen = true;
    *phase = gt_feature_node_get_phase(feature);
  }
  else {
    *phase = GT_PHASE_UNDEFINED;
  }

  span = gt_genome_node_get_range(gn);
  had_err = gt_region_mapping_get_sequence(region_mapping, &raw_seq,
                                           gt_genome_node_get_seqid(gn),
                                           span.start, span.end, err);
  if (had_err)
    return had_err;

  gt_str_append_cstr_nt(sequence, raw_seq, gt_range_length(&span));
  gt_free(raw_seq);
  return 0;
}
/*
 * Traversal callback that inserts an "inter" feature between two consecutive
 * features of the visitor's outside_type. `data` is the
 * GtInterFeatureVisitor.
 *
 * Features of another type are ignored. The first matching feature is only
 * remembered as previous_feature. For each later one, the gap between the
 * end of the previous feature and the start of the current one becomes a new
 * feature of inter_type, added as a child of parent_feature with the strand
 * of the previous feature and the parent's sequence id.
 *
 * If the two features overlap or are directly adjacent, a warning is issued
 * and the function returns without updating previous_feature.
 *
 * Always returns 0.
 */
static int inter_feature_in_children(GtFeatureNode *current_feature, void *data,
                                     GT_UNUSED GtError *err)
{
  GtInterFeatureVisitor *visitor = (GtInterFeatureVisitor*) data;
  GtFeatureNode *gap_node;
  GtRange prev, cur, gap;
  GtStrand gap_strand;
  GtStr *seqid;
  gt_error_check(err);
  gt_assert(current_feature);

  if (!gt_feature_node_has_type(current_feature, visitor->outside_type))
    return 0;

  /* First boundary feature seen: nothing to place yet. */
  if (!visitor->previous_feature) {
    visitor->previous_feature = current_feature;
    return 0;
  }

  /* determine inter range */
  prev = gt_genome_node_get_range((GtGenomeNode*) visitor->previous_feature);
  cur = gt_genome_node_get_range((GtGenomeNode*) current_feature);
  if (prev.end >= cur.start) {
    gt_warning("overlapping boundary features " GT_WU "-" GT_WU " and "
               GT_WU "-" GT_WU ", " "not placing '%s' inter-feature",
               prev.start,
               prev.end,
               cur.start,
               cur.end,
               visitor->inter_type);
    return 0;
  }
  if (cur.start - prev.end < 2) {
    gt_warning("no space for inter-feature '%s' between " GT_WU " and "
               GT_WU,
               visitor->inter_type,
               prev.end,
               cur.start);
    return 0;
  }
  gap.start = prev.end + 1;
  gap.end = cur.start - 1;

  /* determine inter strand: both neighbours must agree */
  gap_strand = gt_feature_node_get_strand(visitor->previous_feature);
  gt_assert(gap_strand == gt_feature_node_get_strand(current_feature));

  /* determine sequence id: both neighbours must match the parent */
  seqid = gt_genome_node_get_seqid((GtGenomeNode*) visitor->parent_feature);
  gt_assert(!gt_str_cmp(seqid,
                        gt_genome_node_get_seqid((GtGenomeNode*)
                                                 visitor->previous_feature)));
  gt_assert(!gt_str_cmp(seqid,
                        gt_genome_node_get_seqid((GtGenomeNode*)
                                                 current_feature)));

  /* create inter feature */
  gap_node = (GtFeatureNode*)
             gt_feature_node_new(seqid, visitor->inter_type,
                                 gap.start, gap.end, gap_strand);
  gt_feature_node_add_child(visitor->parent_feature, gap_node);

  visitor->previous_feature = current_feature;
  return 0;
}
Esempio n. 12
0
/*
 * Feature-node callback of the select visitor: decide whether `fn` passes
 * all configured filters.
 *
 * The running feature counter is incremented first. A node is dropped when
 * its seqid or source does not match a configured (non-empty) value. For
 * gene features the gene-specific limits are applied in this order: maximum
 * length, maximum number of genes, minimum score, maximum score, selected
 * feature number. A gene that passes them increments gene_num. Afterwards
 * the generic filters (contain range, overlap range, strand, target strand,
 * has-CDS, minimum average splice site probability) are applied in order,
 * stopping at the first one that rejects the node.
 *
 * A dropped node is deleted; a kept node is appended to node_buffer.
 * Always returns 0.
 */
static int select_visitor_feature_node(GtNodeVisitor *nv,
                                       GtFeatureNode *fn,
                                       GT_UNUSED GtError *err)
{
  GtSelectVisitor *sel;
  bool drop = false;
  bool matches;
  gt_error_check(err);
  sel = select_visitor_cast(nv);
  sel->current_feature++;

  /* An unset seqid/source matches everything; otherwise it must be equal. */
  matches =
    (!gt_str_length(sel->seqid) ||
     !gt_str_cmp(sel->seqid, gt_genome_node_get_seqid((GtGenomeNode*) fn))) &&
    (!gt_str_length(sel->source) ||
     !strcmp(gt_str_get(sel->source), gt_feature_node_get_source(fn)));

  if (!matches) {
    drop = true;
  }
  else {
    GtRange range = gt_genome_node_get_range((GtGenomeNode*) fn);
    /* enforce gene-level limits */
    /* XXX: we (spuriously) assume that genes are always root nodes */
    if (fn && gt_feature_node_has_type(fn, gt_ft_gene)) {
      drop = (sel->max_gene_length != GT_UNDEF_ULONG &&
              gt_range_length(&range) > sel->max_gene_length) ||
             (sel->max_gene_num != GT_UNDEF_ULONG &&
              sel->gene_num >= sel->max_gene_num) ||
             (sel->min_gene_score != GT_UNDEF_DOUBLE &&
              gt_feature_node_get_score(fn) < sel->min_gene_score) ||
             (sel->max_gene_score != GT_UNDEF_DOUBLE &&
              gt_feature_node_get_score(fn) > sel->max_gene_score) ||
             (sel->feature_num != GT_UNDEF_ULONG &&
              sel->feature_num != sel->current_feature);
      if (!drop)
        sel->gene_num++; /* gene passed filter */
    }
  }

  /* Generic filters; short-circuit keeps the original evaluation order and
     stops at the first rejection. */
  drop = drop ||
         filter_contain_range(fn, sel->contain_range) ||
         filter_overlap_range(fn, sel->overlap_range) ||
         filter_strand(fn, sel->strand) ||
         filter_targetstrand(fn, sel->targetstrand) ||
         filter_has_CDS(fn, sel->has_CDS) ||
         filter_min_average_ssp(fn, sel->min_average_splice_site_prob);

  if (drop)
    gt_genome_node_delete((GtGenomeNode*) fn);
  else
    gt_queue_add(sel->node_buffer, fn);

  return 0;
}
Esempio n. 13
0
/*
 * Type-check predicate: true if and only if `fn` has the feature type
 * "match_gap".
 */
static bool gaeval_visitor_typecheck_gap(GtFeatureNode *fn)
{
  bool is_gap = gt_feature_node_has_type(fn, "match_gap");
  return is_gap;
}
/*
 * "next" callback of the CpGIOverlap stream.
 *
 * Pulls the next node from the input stream and hands it on unchanged via
 * *gn. If the node is a gene feature (or a pseudo node containing one), the
 * gene has a "Name" attribute, and its sequence id has the form "Chr<N>",
 * the gene's TSS is looked up with CpGIOverlap_stream_find_gene_overlap()
 * and a hit is stored on the gene as the attribute "cpgi_at_tss". The TSS is
 * the start coordinate on the forward strand and the end coordinate
 * otherwise.
 *
 * Returns 0 on success (including when the node is passed through without
 * annotation or the input is exhausted, in which case *gn is NULL), or the
 * error code reported by the input stream.
 */
static int CpGIOverlap_stream_next(GtNodeStream * ns,
                                   GtGenomeNode ** gn,
                                   GtError * err)
{
    GtGenomeNode * cur_node = NULL;
    GtFeatureNode * gene;
    CpGIOverlap_stream * context;
    const char * overlap_name;
    int err_num;
    int chr_num;
    unsigned int TSS;

    *gn = NULL;
    context = CpGIOverlap_stream_cast(ns);

    /* Propagate upstream failures instead of reporting a clean end of
       stream while the error object is set. */
    err_num = gt_node_stream_next(context->in_stream, &cur_node, err);
    if (err_num || cur_node == NULL)
        return err_num;

    *gn = cur_node;

    /* try casting as a feature node so we can test type */
    gene = (GtFeatureNode *) gt_genome_node_try_cast(gt_feature_node_class(),
                                                     cur_node);
    if (gene == NULL)
        return 0;

    /* a pseudo node is searched for the first gene it contains */
    if (gt_feature_node_is_pseudo(gene))
    {
        GtFeatureNodeIterator * iter = gt_feature_node_iterator_new(gene);
        GtFeatureNode * candidate;

        if (iter == NULL)
            return 0;
        while ((candidate = gt_feature_node_iterator_next(iter)) != NULL &&
               !gt_feature_node_has_type(candidate, feature_type_gene))
            ; /* advance until a gene is found or the iterator is exhausted */
        gt_feature_node_iterator_delete(iter);
        if (candidate == NULL)
            return 0;
        gene = candidate;
    }

    if (!gt_feature_node_has_type(gene, feature_type_gene))
        return 0;

    /* genes without a name are passed through untouched */
    if (gt_feature_node_get_attribute(gene, "Name") == NULL)
        return 0;

    if (1 != sscanf(gt_str_get(gt_genome_node_get_seqid((GtGenomeNode *) gene)),
                    "Chr%d", &chr_num))
        return 0;

    TSS = (gt_feature_node_get_strand(gene) == GT_STRAND_FORWARD)
              ? gt_genome_node_get_start((GtGenomeNode *) gene)
              : gt_genome_node_get_end((GtGenomeNode *) gene);

    /* now figure out the overlapping gene */
    overlap_name = CpGIOverlap_stream_find_gene_overlap(context, TSS, chr_num);
    if (!overlap_name)
        return 0;

    /* save the overlap into the node */
    gt_feature_node_set_attribute(gene, "cpgi_at_tss", overlap_name);

    return 0;
}
/*
 * "next" callback of the CpGI score stream.
 *
 * Pulls the next node from the input stream and hands it on via *gn. If the
 * node is a feature of type feature_type_CpGI whose sequence id has the form
 * "Chr<N>" and which carries a "sumcg" attribute, a score is computed with
 * CpGI_score_stream_score_island() and stored as the feature's score. All
 * other nodes are passed through unchanged.
 *
 * Returns 0 on success (including pass-through and end of input, in which
 * case *gn is NULL), or the error code reported by the input stream.
 */
static int CpGI_score_stream_next(GtNodeStream * ns,
                                   GtGenomeNode ** gn,
                                   GtError * err)
{
    GtGenomeNode * cur_node = NULL;
    GtFeatureNode * island;
    CpGI_score_stream * score_stream;
    unsigned long island_start;
    unsigned long island_end;
    float island_score;
    int chromosome_num;
    const char * num_cg_str;
    unsigned long num_cg = 0;
    int err_num;

    *gn = NULL;
    score_stream = CpGI_score_stream_cast(ns);

    /* Propagate upstream failures instead of reporting a clean end of
       stream while the error object is set. */
    err_num = gt_node_stream_next(score_stream->in_stream, &cur_node, err);
    if (err_num || cur_node == NULL)
        return err_num;

    *gn = cur_node;

    /* try casting as a feature node so we can test type */
    island = (GtFeatureNode *) gt_genome_node_try_cast(gt_feature_node_class(),
                                                       cur_node);
    if (island == NULL)
        return 0;

    if (!gt_feature_node_has_type(island, feature_type_CpGI))
        return 0;

    #if DEBUG_SCORE
    printf("found CpGI\n");
    #endif

    island_start = gt_genome_node_get_start(cur_node);
    island_end   = gt_genome_node_get_end(cur_node);

    /* Without a parsable chromosome number there is nothing to score
       against; pass the node through rather than use an indeterminate
       value. */
    if (1 != sscanf(gt_str_get(gt_genome_node_get_seqid(cur_node)),
                    "Chr%d", &chromosome_num))
        return 0;

    num_cg_str = gt_feature_node_get_attribute(island, "sumcg");
    if (!num_cg_str)
        return 0;

    /* %lu matches the unsigned long destination; an unparsable value keeps
       the count at 0. */
    if (1 != sscanf(num_cg_str, "%lu", &num_cg))
        num_cg = 0;

    /* now figure out the score */
    island_score = CpGI_score_stream_score_island(score_stream,
                                                  chromosome_num,
                                                  num_cg,
                                                  island_start,
                                                  island_end);

    /* save the score into the node */
    gt_feature_node_set_score(island, island_score);

    return 0;
}