Exemplo n.º 1
0
static void gv_test_introns_confirmed(AgnUnitTest *test)
{
  GtGenomeNode *intron, *gap;
  GtStr *seqid = gt_str_new_cstr("chr");
  GtArray *introns = gt_array_new( sizeof(GtGenomeNode *) );
  intron = gt_feature_node_new(seqid, "intron", 1000, 1170, GT_STRAND_REVERSE);
  gt_array_add(introns, intron);
  intron = gt_feature_node_new(seqid, "intron", 1225, 1305, GT_STRAND_REVERSE);
  gt_array_add(introns, intron);
  intron = gt_feature_node_new(seqid, "intron", 1950, 2110, GT_STRAND_REVERSE);
  gt_array_add(introns, intron);
  intron = gt_feature_node_new(seqid, "intron", 2545, 2655, GT_STRAND_REVERSE);
  gt_array_add(introns, intron);
  intron = gt_feature_node_new(seqid, "intron", 2800, 2950, GT_STRAND_REVERSE);
  gt_array_add(introns, intron);

  GtArray *gaps = gt_array_new( sizeof(GtGenomeNode *) );

  double intcon = gaeval_visitor_introns_confirmed(introns, gaps);
  bool test1 = fabs(intcon - 0.0) < 0.0001;
  agn_unit_test_result(test, "introns confirmed (no gaps)", test1);

  gap = gt_feature_node_new(seqid, "match_gap", 1000, 1170, GT_STRAND_REVERSE);
  gt_array_add(gaps, gap);
  gap = gt_feature_node_new(seqid, "match_gap", 1225, 1302, GT_STRAND_REVERSE);
  gt_array_add(gaps, gap);
  gap = gt_feature_node_new(seqid, "match_gap", 1950, 2110, GT_STRAND_REVERSE);
  gt_array_add(gaps, gap);
  gap = gt_feature_node_new(seqid, "match_gap", 2575, 2655, GT_STRAND_REVERSE);
  gt_array_add(gaps, gap);
  gap = gt_feature_node_new(seqid, "match_gap", 2800, 2950, GT_STRAND_REVERSE);
  gt_array_add(gaps, gap);

  intcon = gaeval_visitor_introns_confirmed(introns, gaps);
  bool test2 = fabs(intcon - 0.6) < 0.0001;
  agn_unit_test_result(test, "introns confirmed (gaps)", test2);

  while(gt_array_size(introns) > 0)
  {
    intron = *(GtGenomeNode **)gt_array_pop(introns);
    gt_genome_node_delete(intron);
  }
  gt_array_delete(introns);

  while(gt_array_size(gaps) > 0)
  {
    gap = *(GtGenomeNode **)gt_array_pop(gaps);
    gt_genome_node_delete(gap);
  }
  gt_array_delete(gaps);
  gt_str_delete(seqid);
}
Exemplo n.º 2
0
static int feature_node_lua_new(lua_State *L)
{
  GtGenomeNode **gf;
  GtUword startpos, endpos;
  GtStrand strand;
  const char *seqid, *type, *strand_str;
  size_t length;
  GtStr *seqid_str;
  gt_assert(L);
  /* get/check parameters */
  seqid = luaL_checkstring(L, 1);
  type = luaL_checkstring(L, 2);
  startpos = luaL_checklong(L, 3);
  endpos   = luaL_checklong(L, 4);
  luaL_argcheck(L, startpos > 0, 3, "must be > 0");
  luaL_argcheck(L, endpos > 0, 4, "must be > 0");
  luaL_argcheck(L, startpos <= endpos, 3, "must be <= endpos");
  strand_str = luaL_checklstring(L, 5, &length);
  luaL_argcheck(L, length == 1, 5, "strand string must have length 1");
  luaL_argcheck(L, (strand = gt_strand_get(strand_str[0])) !=
                    GT_NUM_OF_STRAND_TYPES, 5, "invalid strand");
  /* construct object */
  gf = lua_newuserdata(L, sizeof (GtGenomeNode*));
  seqid_str = gt_str_new_cstr(seqid);
  *gf = gt_feature_node_new(seqid_str, type, startpos, endpos, strand);
  gt_str_delete(seqid_str);
  gt_assert(*gf);
  luaL_getmetatable(L, GENOME_NODE_METATABLE);
  lua_setmetatable(L, -2);
  return 1;
}
Exemplo n.º 3
0
static int pdom_hit_attach_gff3(GtPdomModel *model, GtPdomModelHit *hit,
                                void *data, GT_UNUSED GtError *err)
{
  unsigned long i;
  GtRange rng;
  GtLTRdigestStream *ls = (GtLTRdigestStream *) data;
  GtStrand strand;
  gt_assert(model && hit);

  strand = gt_pdom_model_hit_get_best_strand(hit);
  /* do not use the hits on the non-predicted strand
      -- maybe identify nested elements ? */
  if (strand != gt_feature_node_get_strand(ls->element.mainnode))
    return 0;

  for (i=0;i<gt_pdom_model_hit_best_chain_length(hit);i++)
  {
    GtGenomeNode *gf;
    GtStr *alignmentstring,
          *aastring;
    GtPdomSingleHit *singlehit;
    GtPhase frame;

    singlehit = gt_pdom_model_hit_best_single_hit(hit, i);
    alignmentstring = gt_str_new();
    aastring = gt_str_new();
    frame = gt_pdom_single_hit_get_phase(singlehit);
    rng = gt_pdom_single_hit_get_range(singlehit);

    gt_pdom_single_hit_format_alignment(singlehit, GT_ALIWIDTH,
                                        alignmentstring);
    gt_pdom_single_hit_get_aaseq(singlehit, aastring);

    rng.start++; rng.end++;  /* GFF3 is 1-based */
    gf = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                      ls->element.mainnode),
                             GT_PDOM_TYPE,
                             rng.start,
                             rng.end,
                             strand);
    gt_genome_node_add_user_data((GtGenomeNode*) gf, "pdom_alignment",
                                 alignmentstring, (GtFree) gt_str_delete);
    gt_genome_node_add_user_data((GtGenomeNode*) gf, "pdom_aaseq",
                                 aastring, (GtFree) gt_str_delete);
    gt_feature_node_set_source((GtFeatureNode*) gf, ls->ltrdigest_tag);
    gt_feature_node_set_score((GtFeatureNode*) gf,
                              gt_pdom_single_hit_get_evalue(singlehit));
    gt_feature_node_set_phase((GtFeatureNode*) gf, frame);
    if (gt_pdom_model_get_name(model)) {
      gt_feature_node_add_attribute((GtFeatureNode*) gf, "name",
                                    gt_pdom_model_get_name(model));
    }
    if (gt_pdom_model_get_acc(model)) {
      gt_feature_node_add_attribute((GtFeatureNode*) gf, "id",
                                    gt_pdom_model_get_acc(model));
    }
    gt_feature_node_add_child(ls->element.mainnode, (GtFeatureNode*) gf);
  }
  return 0;
}
Exemplo n.º 4
0
static GtFeatureIndex *in_stream_test_data(GtError *error)
{
  GtFeatureIndex *fi;
  GtFeatureNode *fn;
  GtGenomeNode *gn;
  GtRegionNode *rn;
  GtStr *seqid;

  fi = gt_feature_index_memory_new();

  seqid = gt_str_new_cstr("chr1");
  gn = gt_region_node_new(seqid, 1, 100000);
  rn = gt_region_node_cast(gn);
  gt_feature_index_add_region_node(fi, rn, error);
  gt_genome_node_delete(gn);

  gn = gt_feature_node_new(seqid, "region", 500, 5000, GT_STRAND_BOTH);
  fn = gt_feature_node_cast(gn);
  gt_feature_index_add_feature_node(fi, fn, error);
  gt_genome_node_delete(gn);

  gn = gt_feature_node_new(seqid, "region", 50000, 75000, GT_STRAND_BOTH);
  fn = gt_feature_node_cast(gn);
  gt_feature_index_add_feature_node(fi, fn, error);
  gt_genome_node_delete(gn);

  gt_str_delete(seqid);
  seqid = gt_str_new_cstr("scf0001");
  gn = gt_region_node_new(seqid, 1, 10000);
  rn = gt_region_node_cast(gn);
  gt_feature_index_add_region_node(fi, rn, error);
  gt_genome_node_delete(gn);

  gn = gt_feature_node_new(seqid, "mRNA", 4000, 6000, GT_STRAND_REVERSE);
  fn = gt_feature_node_cast(gn);
  gt_feature_index_add_feature_node(fi, fn, error);
  gt_genome_node_delete(gn);

  gn = gt_feature_node_new(seqid, "mRNA", 7000, 9500, GT_STRAND_FORWARD);
  fn = gt_feature_node_cast(gn);
  gt_feature_index_add_feature_node(fi, fn, error);
  gt_genome_node_delete(gn);

  gt_str_delete(seqid);

  return fi;
}
Exemplo n.º 5
0
static int gt_ltrdigest_pdom_visitor_attach_hit(GtLTRdigestPdomVisitor *lv,
                                                GtHMMERModelHit *modelhit,
                                                GtHMMERSingleHit *singlehit)
{
  GT_UNUSED GtUword i;
  GtGenomeNode *gf;
  int had_err = 0;
  GtRange rrng;
  gt_assert(lv && singlehit);

  rrng = gt_ltrdigest_pdom_visitor_coords(lv, singlehit);

  if (gt_array_size(singlehit->chains) > 0 || lv->output_all_chains) {
    char buf[32];
    gf = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                      lv->ltr_retrotrans),
                             gt_ft_protein_match,
                             rrng.start,
                             rrng.end,
                             singlehit->strand);
    gt_genome_node_add_user_data((GtGenomeNode*) gf, "pdom_alignment",
                                 gt_str_ref(singlehit->alignment),
                                 (GtFree) gt_str_delete);
    gt_genome_node_add_user_data((GtGenomeNode*) gf, "pdom_aaseq",
                                 gt_str_ref(singlehit->aastring),
                                 (GtFree) gt_str_delete);
    gt_feature_node_set_source((GtFeatureNode*) gf, lv->tag);
    gt_feature_node_set_score((GtFeatureNode*) gf, (float) singlehit->evalue);
    (void) snprintf(buf, (size_t) 32, "%d", (int) singlehit->frame);
    gt_feature_node_add_attribute((GtFeatureNode*) gf,
                                    "reading_frame", buf);
    if (modelhit->modelname != NULL) {
      gt_feature_node_add_attribute((GtFeatureNode*) gf, "name",
                                    modelhit->modelname);
    }
    if (gt_array_size(singlehit->chains) > 1UL && lv->output_all_chains) {
      GtStr *buffer;
      GtUword j;
      gt_assert(singlehit->chains != NULL);
      buffer = gt_str_new();
      for (j = 0UL; j < gt_array_size(singlehit->chains); j++) {
        gt_str_append_cstr(buffer, modelhit->modelname);
        gt_str_append_char(buffer, ':');
        gt_str_append_ulong(buffer,
                          *(GtUword*) gt_array_get(singlehit->chains, j));
        if (j != gt_array_size(singlehit->chains) - 1) {
          gt_str_append_char(buffer, ',');
        }
      }
      gt_feature_node_set_attribute((GtFeatureNode*) gf, "chains",
                                    gt_str_get(buffer));
      gt_str_delete(buffer);
    }
    gt_feature_node_add_child(lv->ltr_retrotrans, (GtFeatureNode*) gf);
  }
  gt_array_delete(singlehit->chains);
  singlehit->chains = NULL;
  return had_err;
}
Exemplo n.º 6
0
static void infer_cds_visitor_infer_cds(AgnInferCDSVisitor *v)
{
  GtFeatureNode **start_codon = NULL, **stop_codon = NULL;

  bool exonsexplicit    = gt_array_size(v->exons) > 0;
  bool startcodon_check = gt_array_size(v->starts) == 1 &&
                          (start_codon = gt_array_get(v->starts, 0)) != NULL;
  bool stopcodon_check  = gt_array_size(v->stops)  == 1 &&
                          (stop_codon  = gt_array_get(v->stops,  0)) != NULL;

  if(gt_array_size(v->cds) > 0)
  {
    return;
  }
  else if(!exonsexplicit || !startcodon_check || !stopcodon_check)
  {
    return;
  }

  GtRange left_codon_range, right_codon_range;
  left_codon_range  = gt_genome_node_get_range(*(GtGenomeNode **)start_codon);
  right_codon_range = gt_genome_node_get_range(*(GtGenomeNode **)stop_codon);
  if(gt_feature_node_get_strand(v->mrna) == GT_STRAND_REVERSE)
  {
    left_codon_range  = gt_genome_node_get_range(*(GtGenomeNode **)stop_codon);
    right_codon_range = gt_genome_node_get_range(*(GtGenomeNode **)start_codon);
  }
  GtUword i;
  for(i = 0; i < gt_array_size(v->exons); i++)
  {
    GtFeatureNode *exon = *(GtFeatureNode **)gt_array_get(v->exons, i);
    GtGenomeNode *exon_gn = (GtGenomeNode *)exon;
    GtRange exon_range = gt_genome_node_get_range(exon_gn);
    GtStrand exon_strand = gt_feature_node_get_strand(exon);

    GtRange cdsrange;
    bool exon_includes_cds = infer_cds_visitor_infer_range(&exon_range,
                                                           &left_codon_range,
                                                           &right_codon_range,
                                                           &cdsrange);
    if(exon_includes_cds)
    {
      GtGenomeNode *cdsfeat;
      cdsfeat = gt_feature_node_new(gt_genome_node_get_seqid(exon_gn), "CDS",
                                    cdsrange.start, cdsrange.end, exon_strand);
      if(v->source)
        gt_feature_node_set_source((GtFeatureNode *)cdsfeat, v->source);
      gt_feature_node_add_child(v->mrna, (GtFeatureNode *)cdsfeat);
      gt_array_add(v->cds, cdsfeat);
    }
  }
}
Exemplo n.º 7
0
static void infer_cds_visitor_check_stop(AgnInferCDSVisitor *v)
{
  if(gt_array_size(v->cds) == 0)
    return;

  const char *mrnaid = gt_feature_node_get_attribute(v->mrna, "ID");
  unsigned int ln = gt_genome_node_get_line_number((GtGenomeNode *)v->mrna);
  GtStrand strand = gt_feature_node_get_strand(v->mrna);

  GtRange stoprange;
  GtUword threeprimeindex = gt_array_size(v->cds) - 1;
  GtGenomeNode **threeprimesegment = gt_array_get(v->cds, threeprimeindex);
  stoprange = gt_genome_node_get_range(*threeprimesegment);
  stoprange.start = stoprange.end - 2;
  if(strand == GT_STRAND_REVERSE)
  {
    threeprimesegment = gt_array_get(v->cds, 0);
    stoprange = gt_genome_node_get_range(*threeprimesegment);
    stoprange.end = stoprange.start + 2;
  }

  if(gt_array_size(v->stops) > 1)
  {
    gt_logger_log(v->logger, "mRNA '%s' (line %u) has %lu stop codons", mrnaid,
                  ln, gt_array_size(v->starts));
  }
  else if(gt_array_size(v->stops) == 1)
  {
    GtGenomeNode **codon = gt_array_get(v->stops, 0);
    GtRange testrange = gt_genome_node_get_range(*codon);
    if(gt_range_compare(&stoprange, &testrange) != 0)
    {
      gt_logger_log(v->logger, "stop codon inferred from CDS [%lu, %lu] does "
                    "not match explicitly provided stop codon [%lu, %lu] for "
                    "mRNA '%s'", stoprange.start, stoprange.end,
                    testrange.start, testrange.end, mrnaid);
    }
  }
  else // agn_assert(gt_array_size(v->stops) == 0)
  {
    GtStr *seqid = gt_genome_node_get_seqid((GtGenomeNode *)v->mrna);
    GtGenomeNode *codonfeature = gt_feature_node_new(seqid, "stop_codon",
                                                     stoprange.start,
                                                     stoprange.end,
                                                     strand);
    if(v->source)
      gt_feature_node_set_source((GtFeatureNode *)codonfeature, v->source);
    GtFeatureNode *cf = (GtFeatureNode *)codonfeature;
    gt_feature_node_add_child(v->mrna, cf);
    gt_array_add(v->stops, cf);
  }
}
Exemplo n.º 8
0
static int create_block_features(GtBEDParser *bed_parser, GtFeatureNode *fn,
                                 GtUword block_count,
                                 GtSplitter *size_splitter,
                                 GtSplitter *start_splitter, GtIO *bed_file,
                                 GtError *err)
{
  GtUword i;
  int had_err = 0;
  gt_assert(fn && block_count && size_splitter && start_splitter);
  gt_assert(gt_splitter_size(size_splitter) == block_count);
  gt_assert(gt_splitter_size(start_splitter) == block_count);
  for (i = 0; !had_err && i < block_count; i++) {
    GtUword block_size, block_start, start, end;
    GtGenomeNode *block;
    const char *name;
    if (gt_parse_uword(&block_size, gt_splitter_get_token(size_splitter, i))) {
      gt_error_set(err,
                   "file \"%s\": line "GT_WU": could not parse blockSize '%s'",
                   gt_io_get_filename(bed_file),
                   gt_io_get_line_number(bed_file),
                   gt_splitter_get_token(size_splitter, i));
      had_err = -1;
    }
    if (!had_err && gt_parse_uword(&block_start,
                                   gt_splitter_get_token(start_splitter, i))) {
      gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockStart "
                   "'%s'", gt_io_get_filename(bed_file),
                   gt_io_get_line_number(bed_file),
                   gt_splitter_get_token(start_splitter, i));
      had_err = -1;
    }
    if (!had_err) {
      start = gt_genome_node_get_start((GtGenomeNode*) fn) + block_start;
      end = start + block_size - 1;
      block = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) fn),
                                  bed_parser->block_type
                                  ? bed_parser->block_type
                                  : BED_BLOCK_TYPE,
                                  start, end, gt_feature_node_get_strand(fn));
      if ((name = gt_feature_node_get_attribute(fn, GT_GFF_NAME))) {
        gt_feature_node_add_attribute((GtFeatureNode*) block, GT_GFF_NAME,
                                      name);
      }
      gt_feature_node_set_score((GtFeatureNode*) block,
                                gt_feature_node_get_score(fn));
      gt_feature_node_set_strand((GtFeatureNode*) block,
                                 gt_feature_node_get_strand(fn));
      gt_feature_node_add_child(fn, (GtFeatureNode*) block);
    }
  }
  return had_err;
}
Exemplo n.º 9
0
GtGenomeNode* gt_feature_node_new_pseudo(GtStr *seqid, GtUword start,
                                         GtUword end, GtStrand strand)
{
  GtFeatureNode *pf;
  GtGenomeNode *pn;
  gt_assert(seqid);
  gt_assert(start <= end);
  pn = gt_feature_node_new(seqid, "pseudo", start, end, strand);
  pf = gt_feature_node_cast(pn);
  pf->type = NULL; /* pseudo features do not have a type */
  pf->bit_field |= 1 << PSEUDO_FEATURE_OFFSET;
  return pn;
}
Exemplo n.º 10
0
static void pbs_attach_results_to_gff3(GtPBSResults *results,
                                       GtLTRElement *element,
                                       GtStrand *canonical_strand,
                                       GtStr *tag)
{
  GtRange pbs_range;
  GtGenomeNode *gf;
  unsigned long i = 0;
  char buffer[BUFSIZ];
  GtPBSHit* hit = gt_pbs_results_get_ranked_hit(results, i++);
  if (*canonical_strand == GT_STRAND_UNKNOWN)
    *canonical_strand = gt_pbs_hit_get_strand(hit);
  else
  {
    /* do we have to satisfy a strand constraint?
     * then find best-scoring PBS on the given canonical strand */
    while (gt_pbs_hit_get_strand(hit) != *canonical_strand
             && i < gt_pbs_results_get_number_of_hits(results))
    {
      gt_log_log("dropping PBS because of nonconsistent strand: %s\n",
                 gt_feature_node_get_attribute(element->mainnode, "ID"));
      hit = gt_pbs_results_get_ranked_hit(results, i++);
    }
    /* if there is none, do not report a PBS */
    if (gt_pbs_hit_get_strand(hit) != *canonical_strand)
      return;
  }
  pbs_range = gt_pbs_hit_get_coords(hit);
  pbs_range.start++; pbs_range.end++;  /* GFF3 is 1-based */
  gf = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                    element->mainnode),
                           GT_PBS_TYPE,
                           pbs_range.start,
                           pbs_range.end,
                           gt_pbs_hit_get_strand(hit));
  gt_feature_node_set_source((GtFeatureNode*) gf, tag);
  gt_feature_node_set_score((GtFeatureNode*) gf,
                            (float) gt_pbs_hit_get_score(hit));
  if (gt_pbs_hit_get_trna(hit) != NULL) {
    gt_feature_node_add_attribute((GtFeatureNode*) gf, "trna",
                                   gt_pbs_hit_get_trna(hit));
  }
  gt_feature_node_set_strand(element->mainnode, gt_pbs_hit_get_strand(hit));
  (void) snprintf(buffer, BUFSIZ-1, "%lu", gt_pbs_hit_get_tstart(hit));
  gt_feature_node_add_attribute((GtFeatureNode*) gf, "trnaoffset", buffer);
  (void) snprintf(buffer, BUFSIZ-1, "%lu", gt_pbs_hit_get_offset(hit));
  gt_feature_node_add_attribute((GtFeatureNode*) gf, "pbsoffset", buffer);
  (void) snprintf(buffer, BUFSIZ-1, "%lu", gt_pbs_hit_get_edist(hit));
  gt_feature_node_add_attribute((GtFeatureNode*) gf, "edist", buffer);
  gt_feature_node_add_child(element->mainnode, (GtFeatureNode*) gf);
}
Exemplo n.º 11
0
static void construct_thick_feature(GtBEDParser *bed_parser, GtFeatureNode *fn,
                                    GtRange range)
{
  GtGenomeNode *thick_feature;
  const char *name;
  gt_assert(fn);
  thick_feature = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                               fn),
                                      bed_parser->thick_feature_type
                                      ? bed_parser->thick_feature_type
                                      : BED_THICK_FEATURE_TYPE,
                                      range.start, range.end,
                                      gt_feature_node_get_strand(fn));
  if ((name = gt_feature_node_get_attribute(fn, "Name")))
    gt_feature_node_add_attribute((GtFeatureNode*) thick_feature, "Name", name);
  gt_feature_node_set_score((GtFeatureNode*) thick_feature,
                            gt_feature_node_get_score(fn));
  gt_feature_node_set_strand((GtFeatureNode*) thick_feature,
                             gt_feature_node_get_strand(fn));
  gt_feature_node_add_child(fn, (GtFeatureNode*) thick_feature);
}
Exemplo n.º 12
0
static void orf_attach_results_to_gff3(GtFeatureNode *gf,
                                       GtRange orf_rng, unsigned int orf_frame,
                                       GtStrand strand, GT_UNUSED GtError *err)
{
  GtGenomeNode *child;
  GtStr *tag;
  tag = gt_str_new_cstr(GT_ORF_FINDER_TAG);

  orf_rng.start++; orf_rng.end++;

  GtFeatureNodeIterator *gfi;
  GtFeatureNode *curnode = NULL, *parent_node = NULL;
  GtRange gfi_range;
  char frame_buf[3];
  sprintf(frame_buf, "%d", orf_frame);

  gfi = gt_feature_node_iterator_new(gf);

  while ((curnode = gt_feature_node_iterator_next(gfi))) {
    if (strcmp(gt_feature_node_get_type(curnode),
                                              (const char*) GT_ORF_TYPE) != 0) {
      gfi_range = gt_genome_node_get_range((GtGenomeNode*) curnode);
      if (gt_range_contains(&gfi_range, &orf_rng)) {
        parent_node = curnode;
      }
    }
  }
  if (parent_node) {
    child = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) gf),
                                GT_ORF_TYPE,
                                orf_rng.start,
                                orf_rng.end,
                                strand);
    gt_feature_node_set_source((GtFeatureNode*) child, tag);
    gt_feature_node_set_attribute((GtFeatureNode*) child, "frame", frame_buf);
    gt_feature_node_add_child(parent_node,(GtFeatureNode*) child);
  }
  gt_str_delete(tag);
  gt_feature_node_iterator_delete(gfi);
}
Exemplo n.º 13
0
static void ppt_attach_results_to_gff3(GtPPTResults *results,
                                       GtLTRElement *element,
                                       GtStrand *canonical_strand,
                                       GtStr *tag)
{
  GtRange ppt_range;
  unsigned long i = 0;
  GtGenomeNode *gf;
  GtPPTHit* hit = gt_ppt_results_get_ranked_hit(results, i++);
  if (*canonical_strand == GT_STRAND_UNKNOWN)
    *canonical_strand = gt_ppt_hit_get_strand(hit);
  else
  {
    /* find best-scoring PPT on the given canonical strand */
    while (gt_ppt_hit_get_strand(hit) != *canonical_strand
             && i < gt_ppt_results_get_number_of_hits(results))
    {
      gt_log_log("dropping PPT because of nonconsistent strand: %s\n",
                 gt_feature_node_get_attribute(element->mainnode, "ID"));
      hit = gt_ppt_results_get_ranked_hit(results, i++);
    }
    /* if there is none, do not report a PPT */
    if (gt_ppt_hit_get_strand(hit) != *canonical_strand)
      return;
  }
  ppt_range = gt_ppt_hit_get_coords(hit);
  ppt_range.start++; ppt_range.end++;  /* GFF3 is 1-based */
  gf = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                    element->mainnode),
                           GT_PPT_TYPE,
                           ppt_range.start,
                           ppt_range.end,
                           gt_ppt_hit_get_strand(hit));
  gt_feature_node_set_source((GtFeatureNode*) gf, tag);
  gt_feature_node_set_strand(element->mainnode, gt_ppt_hit_get_strand(hit));
  gt_feature_node_add_child(element->mainnode, (GtFeatureNode*) gf);
}
Exemplo n.º 14
0
static int gt_script_filter_runner(int argc, const char **argv, int parsed_args,
                                   void *tool_arguments, GT_UNUSED GtError *err)
{
  GtScriptFilterArguments *arguments = tool_arguments;
  int had_err = 0;
  GtUword i;

  gt_error_check(err);
  gt_assert(arguments);

  for (i = parsed_args; !had_err && i < argc; i++) {
    GtScriptFilter *sf = NULL;
    sf = gt_script_filter_new(argv[i], err);
    if (!sf) {
      had_err = -1;
      break;
    } else {
      if (arguments->showinfo) {
        if (arguments->oneline) {
          const char *name,
                     *version,
                     *author;

          if (!(name = gt_script_filter_get_name(sf, err)))
            had_err = -1;

          if (!had_err) {
            if (!(version = gt_script_filter_get_version(sf, err)))
              had_err = -1;
          }

          if (!had_err) {
            if (!(author = gt_script_filter_get_author(sf, err)))
              had_err = -1;
          }

          if (!had_err) {
            printf("%s v%s (by %s)\n", name, version, author);
          }
        } else {
          const char *out;

          if (arguments->scriptname)
            printf("script name:\t%s\n", argv[i]);

          out = gt_script_filter_get_name(sf, err);
          if (out)
            printf("filter name:\t%s\n", out);
          else
            had_err = -1;

          if (!had_err) {
            out = gt_script_filter_get_version(sf, err);
            if (out)
              printf("version:\t%s\n", out);
            else
              had_err = -1;
          }

          if (!had_err) {
            out = gt_script_filter_get_author(sf, err);
            if (out)
              printf("author:\t\t%s\n", out);
            else
              had_err = -1;
          }

          if (!had_err) {
            out = gt_script_filter_get_email(sf, err);
            if (out)
              printf("email:\t\t%s\n", out);
            else
              had_err = -1;
          }

          if (!had_err) {
            out = gt_script_filter_get_description(sf, err);
            if (out)
              printf("description:\t%s\n", out);
            else
              had_err = -1;
          }

          if (i != argc-1)
            printf("\n");
        }
      }
      if (arguments->validate) {
        GT_UNUSED bool select;
        GtStr *seqid = gt_str_new_cstr("foo");
        GtFeatureNode *fn = (GtFeatureNode*) gt_feature_node_new(seqid, "gene",
                                                             23, 42,
                                                             GT_STRAND_FORWARD);
        had_err = gt_script_filter_run(sf, fn, &select, err);
        gt_genome_node_delete((GtGenomeNode*) fn);
        gt_str_delete(seqid);
      }
      gt_script_filter_delete(sf);
    }
  }

  return had_err;
}
static int inter_feature_in_children(GtFeatureNode *current_feature, void *data,
                                     GT_UNUSED GtError *err)
{
  GtInterFeatureVisitor *aiv = (GtInterFeatureVisitor*) data;
  GtFeatureNode *inter_node;
  GtRange previous_range, current_range, inter_range;
  GtStrand previous_strand, /*current_strand, */inter_strand;
  GtStr *parent_seqid;
  gt_error_check(err);
  gt_assert(current_feature);
  if (gt_feature_node_has_type(current_feature, aiv->outside_type)) {
    if (aiv->previous_feature) {
      /* determine inter range */
      previous_range = gt_genome_node_get_range((GtGenomeNode*)
                                                aiv->previous_feature);
      current_range = gt_genome_node_get_range((GtGenomeNode*) current_feature);
      if (previous_range.end >= current_range.start) {
        gt_warning("overlapping boundary features " GT_WU "-" GT_WU " and "
                   GT_WU "-" GT_WU ", " "not placing '%s' inter-feature",
                   previous_range.start,
                   previous_range.end,
                   current_range.start,
                   current_range.end,
                   aiv->inter_type);
        return 0;
      }
      if (current_range.start - previous_range.end < 2) {
        gt_warning("no space for inter-feature '%s' between " GT_WU " and "
                   GT_WU,
                   aiv->inter_type,
                   previous_range.end,
                   current_range.start);
        return 0;
      }
      inter_range.start = previous_range.end + 1;
      inter_range.end = current_range.start - 1;

      /* determine inter strand */
      previous_strand = gt_feature_node_get_strand(aiv->previous_feature);
      /*current_strand = gt_feature_node_get_strand(current_feature);*/
      gt_assert(previous_strand == gt_feature_node_get_strand(current_feature));
      inter_strand = previous_strand;

      /* determine sequence id */
      parent_seqid =
        gt_genome_node_get_seqid((GtGenomeNode*) aiv->parent_feature);
      gt_assert(!gt_str_cmp(parent_seqid,
                            gt_genome_node_get_seqid((GtGenomeNode*)
                                                     aiv->previous_feature)));
      gt_assert(!gt_str_cmp(parent_seqid,
                            gt_genome_node_get_seqid((GtGenomeNode*)
                                                     current_feature)));

      /* create inter feature */
      inter_node = (GtFeatureNode*)
                   gt_feature_node_new(parent_seqid, aiv->inter_type,
                                       inter_range.start, inter_range.end,
                                       inter_strand);
      gt_feature_node_add_child(aiv->parent_feature, inter_node);
    }
    aiv->previous_feature = current_feature;
  }
  return 0;
}
Exemplo n.º 16
0
int gt_condenseq_output_to_gff3(const GtCondenseq *condenseq,
                                GtError *err)
{
  int had_err = 0;
  GtUword idx,
          name_len,
          seqnum = 0, seqstart = 0, seqend = 0,
          desclen;
  GtStr *filename = NULL,
        *id = gt_str_new_cstr("U"),
        *name = gt_str_new_cstr("unique"),
        *parent_unique = gt_str_new_cstr("U"),
        *seqid = gt_str_new(),
        *source = gt_str_new_cstr("Condenseq");
  GtFile *outfile = NULL;
  GtGFF3Visitor *gffv = NULL;
  GtNodeVisitor *nodev = NULL;
  GtFeatureNode *fnode = NULL;
  GtGenomeNode *node = NULL;
  GtRange range;

  gt_assert(condenseq != NULL);

  filename = gt_str_new_cstr(gt_condenseq_basefilename(condenseq));

  name_len = gt_str_length(name);
  gt_str_append_cstr(filename, ".gff3");
  outfile = gt_file_new(gt_str_get(filename), "w", err);
  nodev = gt_gff3_visitor_new(outfile);
  gffv = (GtGFF3Visitor *) nodev;
  gt_gff3_visitor_retain_id_attributes(gffv);

  node = gt_feature_node_new(seqid, "experimental_feature", (GtUword) 1,
                             (GtUword) 1, GT_STRAND_BOTH);
  fnode = (GtFeatureNode*) node;
  gt_feature_node_set_source(fnode, source);
  for (idx = 0; !had_err && idx < condenseq->udb_nelems; ++idx) {
    GtCondenseqUnique uq = condenseq->uniques[idx];
    if (seqend <= uq.orig_startpos) {
      const char *desc;
      gt_genome_node_delete(node);
      seqnum = gt_condenseq_pos2seqnum(condenseq, uq.orig_startpos);
      seqstart = gt_condenseq_seqstartpos(condenseq, seqnum);
      seqend = seqstart + condenseq_seqlength_help(condenseq, seqnum, seqstart);
      desc = gt_condenseq_description(condenseq, &desclen, seqnum);
      gt_str_reset(seqid);
      gt_str_append_cstr_nt(seqid, desc, desclen);
      node = gt_feature_node_new(seqid, "experimental_feature", (GtUword) 1,
                                 (GtUword) 1, GT_STRAND_BOTH);
      fnode = (GtFeatureNode*) node;
      gt_feature_node_set_source(fnode, source);
    }
    gt_str_set_length(name, name_len);
    gt_str_append_uword(name, idx);
    gt_str_set_length(id, (GtUword) 1);
    gt_str_append_uword(id, idx);
    gt_feature_node_set_attribute(fnode, "Name", gt_str_get(name));
    gt_feature_node_set_attribute(fnode, "ID", gt_str_get(id));
    /* 1 Based coordinates! */
    range.start = uq.orig_startpos + 1 - seqstart;
    range.end = uq.orig_startpos + uq.len - seqstart;
    gt_genome_node_set_range(node, &range);
    had_err = gt_genome_node_accept(node, nodev, err);
  }
  gt_str_reset(name);
  gt_str_append_cstr(name, "link");
  gt_str_reset(id);
  gt_str_append_cstr(id, "L");
  name_len = gt_str_length(name);
  seqend = 0;
  for (idx = 0; !had_err && idx < condenseq->ldb_nelems; ++idx) {
    GtCondenseqLink link = condenseq->links[idx];
    if (seqend <= link.orig_startpos) {
      const char *desc;
      gt_genome_node_delete(node);
      seqnum = gt_condenseq_pos2seqnum(condenseq, link.orig_startpos);
      seqstart = gt_condenseq_seqstartpos(condenseq, seqnum);
      seqend = seqstart + condenseq_seqlength_help(condenseq, seqnum, seqstart);
      desc = gt_condenseq_description(condenseq, &desclen, seqnum);
      gt_str_reset(seqid);
      gt_str_append_cstr_nt(seqid, desc, desclen);
      node = gt_feature_node_new(seqid, "experimental_feature", (GtUword) 1,
                                 (GtUword) 1, GT_STRAND_BOTH);
      fnode = (GtFeatureNode*) node;
      gt_feature_node_set_source(fnode, source);
    }
    gt_str_set_length(name, name_len);
    gt_str_append_uword(name, idx);
    gt_str_set_length(id, (GtUword) 1);
    gt_str_append_uword(id, idx);
    gt_feature_node_set_attribute(fnode, "Name", gt_str_get(name));
    gt_feature_node_set_attribute(fnode, "ID", gt_str_get(id));
    gt_str_set_length(parent_unique, (GtUword) 1);
    gt_str_append_uword(parent_unique, link.unique_id);
    gt_feature_node_set_attribute(fnode, "Derives_from",
                                  gt_str_get(parent_unique));
    /* 1 Based coordinates! */
    range.start = link.orig_startpos + 1 - seqstart;
    range.end = link.orig_startpos + link.len - seqstart;
    gt_genome_node_set_range(node, &range);
    had_err = gt_genome_node_accept(node, nodev, err);
  }
  gt_file_delete(outfile);
  gt_genome_node_delete(node);
  gt_node_visitor_delete(nodev);
  gt_str_delete(filename);
  gt_str_delete(id);
  gt_str_delete(name);
  gt_str_delete(parent_unique);
  gt_str_delete(seqid);
  gt_str_delete(source);
  return had_err;
}
Exemplo n.º 17
0
GtGenomeNode* gt_feature_node_new_standard_gene(void)
{
  GtGenomeNode *fn, *child, *grand;
  GtStr *seqid;
  seqid = gt_str_new_cstr("ctg123");

  /* gene */
  fn = gt_feature_node_new(seqid, gt_ft_gene, 1000, 9000, GT_STRAND_FORWARD);

  /* TF binding site */
  child = gt_feature_node_new(seqid, gt_ft_TF_binding_site, 1000, 1012,
                                GT_STRAND_FORWARD);
  gt_feature_node_add_child((GtFeatureNode*) fn, (GtFeatureNode*) child);

  /* first mRNA */
  child = gt_feature_node_new(seqid, gt_ft_mRNA, 1050, 9000, GT_STRAND_FORWARD);
  gt_feature_node_add_child((GtFeatureNode*) fn, (GtFeatureNode*) child);
  grand = gt_feature_node_new(seqid, gt_ft_exon, 1050, 1500, GT_STRAND_FORWARD);
  gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand);
  grand = gt_feature_node_new(seqid, gt_ft_exon, 3000, 3902, GT_STRAND_FORWARD);
  gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand);
  grand = gt_feature_node_new(seqid, gt_ft_exon, 5000, 5500, GT_STRAND_FORWARD);
  gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand);
  grand = gt_feature_node_new(seqid, gt_ft_exon, 7000, 9000, GT_STRAND_FORWARD);
  gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand);

  /* second mRNA */
  child = gt_feature_node_new(seqid, gt_ft_mRNA, 1050, 9000, GT_STRAND_FORWARD);
  gt_feature_node_add_child((GtFeatureNode*) fn, (GtFeatureNode*) child);
  grand = gt_feature_node_new(seqid, gt_ft_exon, 1050, 1500, GT_STRAND_FORWARD);
  gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand);
  grand = gt_feature_node_new(seqid, gt_ft_exon, 5000, 5500, GT_STRAND_FORWARD);
  gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand);
  grand = gt_feature_node_new(seqid, gt_ft_exon, 7000, 9000, GT_STRAND_FORWARD);
  gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand);

  /* third mRNA */
  child = gt_feature_node_new(seqid, gt_ft_mRNA, 1300, 9000, GT_STRAND_FORWARD);
  gt_feature_node_add_child((GtFeatureNode*) fn, (GtFeatureNode*) child);
  grand = gt_feature_node_new(seqid, gt_ft_exon, 1300, 1500, GT_STRAND_FORWARD);
  gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand);
  grand = gt_feature_node_new(seqid, gt_ft_exon, 3000, 3902, GT_STRAND_FORWARD);
  gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand);
  grand = gt_feature_node_new(seqid, gt_ft_exon, 5000, 5500, GT_STRAND_FORWARD);
  gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand);
  grand = gt_feature_node_new(seqid, gt_ft_exon, 7000, 9000,GT_STRAND_FORWARD);
  gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand);

  gt_str_delete(seqid);
  return fn;
}
Exemplo n.º 18
0
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
   /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* determine the range and the strand of the mRNA */
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);

  /* TODO: support discontinuous start/stop codons */
  for (i = 0; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    if (gt_feature_node_get_attribute((GtFeatureNode*) gn,
        GTF_PARSER_STOP_CODON_FLAG)) {
      GtUword j;
      GtRange stop_codon_rng = gt_genome_node_get_range(gn);
      bool found_cds = false;
      for (j = 0; !had_err && j < gt_array_size(gt_genome_node_array); j++) {
        GtGenomeNode* gn2;
        GtRange this_rng;
        const char *this_type;
        gn2 = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, j);
        if (gn == gn2) continue;
        this_rng = gt_genome_node_get_range(gn2);
        this_type = gt_feature_node_get_type((GtFeatureNode*) gn2);
        if (this_type == gt_symbol(gt_ft_CDS)) {
          if (gt_range_contains(&this_rng, &stop_codon_rng)) {
            if (cinfo->tidy) {
              gt_warning("stop codon on line %u in file %s is contained in "
                         "CDS in line %u",
                         gt_genome_node_get_line_number(gn),
                         gt_genome_node_get_filename(gn),
                         gt_genome_node_get_line_number(gn2));
              found_cds = true;
            } else {
              gt_error_set(err, "stop codon on line %u in file %s is "
                                "contained in CDS in line %u",
                           gt_genome_node_get_line_number(gn),
                           gt_genome_node_get_filename(gn),
                           gt_genome_node_get_line_number(gn2));
              had_err = -1;
            }
            break;
          }
          if (this_rng.end + 1 == stop_codon_rng.start) {
            this_rng.end = stop_codon_rng.end;
            gt_genome_node_set_range(gn2, &this_rng);
            found_cds = true;
            break;
          }
          if (this_rng.start == stop_codon_rng.end + 1) {
            this_rng.start = stop_codon_rng.start;
            gt_genome_node_set_range(gn2, &this_rng);
            found_cds = true;
            break;
          }
        }
      }
      if (!found_cds) {
        if (!had_err) {
          if (cinfo->tidy) {
            gt_warning("found stop codon on line %u in file %s with no "
                       "flanking CDS, ignoring it",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn));
          } else {
            gt_error_set(err, "found stop codon on line %u in file %s with no "
                              "flanking CDS",
                         gt_genome_node_get_line_number(gn),
                         gt_genome_node_get_filename(gn));
            had_err = -1;
            break;
          }
        }
      } else {
        gt_array_rem(gt_genome_node_array, i);
        gt_genome_node_delete(gn);
      }
    }
  }

  for (i = 1; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    GtStrand strand;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    if (strand != mRNA_strand) {
      gt_error_set(err, "feature %s on line %u has strand %c, but the "
                        "parent transcript has strand %c",
                   (const char*) key,
                   gt_genome_node_get_line_number(gn),
                   GT_STRAND_CHARS[strand],
                   GT_STRAND_CHARS[mRNA_strand]);
      had_err = -1;
      break;
    } else {
      mRNA_strand = gt_strand_join(mRNA_strand, strand);
    }
    if (!had_err && gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                "genomic sequences (``seqname''), although they have the same "
                "gene IDs (``gene_id'') which must be globally unique",
                gt_genome_node_get_line_number(first_node),
                gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "ID", key);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "transcript_id",
                                  key);

    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                              (const char*) key)) && strlen(tname) > 0) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                      tname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gt_genome_node_ref(gn));
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}
Exemplo n.º 19
0
GtFeatureNode* gt_feature_node_clone(const GtFeatureNode *template)
{
  GtFeatureNode *fn;
  GtStr *seqid;
  const char *type;
  GtUword i, start, end;
  GtStrand strand;
  GtStrArray *attributes;
  gt_assert(template && !gt_feature_node_has_children(template));
  seqid = gt_genome_node_get_seqid((GtGenomeNode*) template);
  type = gt_feature_node_get_type(template);
  start = gt_genome_node_get_start((GtGenomeNode*) template);
  end = gt_genome_node_get_end((GtGenomeNode*) template);
  strand = gt_feature_node_get_strand(template);
  fn = (GtFeatureNode*) gt_feature_node_new(seqid, type, start, end, strand);
  gt_genome_node_set_origin((GtGenomeNode*) fn,
                            template->parent_instance.filename,
                            template->parent_instance.line_number);
  if (gt_feature_node_has_source(template))
    gt_feature_node_set_source(fn, template->source);
  if (gt_feature_node_score_is_defined(template))
    gt_feature_node_set_score(fn, template->score);
  attributes = gt_feature_node_get_attribute_list(template);
  for (i = 0; i < gt_str_array_size(attributes); i++) {
    const char *tag, *value;
    tag = gt_str_array_get(attributes, i);
    value = gt_feature_node_get_attribute(template, tag);
    gt_feature_node_set_attribute(fn, tag, value);
  }
  gt_str_array_delete(attributes);
Exemplo n.º 20
0
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
   /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* determine the range and the strand of the mRNA */
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);
  for (i = 1; i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    /* XXX: an error check is necessary here, otherwise gt_strand_join() can
       cause a failed assertion */
    mRNA_strand = gt_strand_join(mRNA_strand,
                          gt_feature_node_get_strand((GtFeatureNode*) gn));
    if (gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                "genomic sequences (``seqname''), although they have the same "
                "gene IDs (``gene_id'') which must be globally unique",
                gt_genome_node_get_line_number(first_node),
                gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);

    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                              (const char*) key))) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                      tname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gn);
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}
Exemplo n.º 21
0
int gt_block_unit_test(GtError *err)
{
  GtRange r1, r2, r_temp, b_range;
  GtStrand s;
  GtGenomeNode *gn1, *gn2;
  GtElement *e1, *e2;
  double height;
  GtBlock *b;
  GtStr *seqid, *caption1, *caption2;
  int had_err = 0;
  GtStyle *sty;
  GtError *testerr;
  gt_error_check(err);

  seqid = gt_str_new_cstr("seqid");
  caption1 = gt_str_new_cstr("foo");
  caption2 = gt_str_new_cstr("bar");
  testerr = gt_error_new();

  r1.start = 10UL;
  r1.end = 50UL;

  r2.start = 40UL;
  r2.end = 50UL;

  gn1 = gt_feature_node_new(seqid, gt_ft_gene, r1.start, r1.end,
                            GT_STRAND_FORWARD);
  gn2 = gt_feature_node_new(seqid, gt_ft_exon, r2.start, r2.end,
                            GT_STRAND_FORWARD);

  e1 = gt_element_new((GtFeatureNode*) gn1);
  e2 = gt_element_new((GtFeatureNode*) gn2);

  b = gt_block_new();

  /* test gt_block_insert_elements */
  gt_ensure((0UL == gt_block_get_size(b)));
  gt_block_insert_element(b, (GtFeatureNode*) gn1);
  gt_ensure((1UL == gt_block_get_size(b)));
  gt_block_insert_element(b, (GtFeatureNode*) gn2);
  gt_ensure((2UL == gt_block_get_size(b)));

  /* test gt_block_set_range & gt_block_get_range */
  r_temp = gt_range_join(&r1, &r2);
  gt_block_set_range(b, r_temp);
  b_range = gt_block_get_range(b);
  gt_ensure((0 == gt_range_compare(&b_range, &r_temp)));
  gt_ensure((1 == gt_range_compare(&r2, &r_temp)));

  /* tests gt_block_set_caption & gt_block_get_caption */
  gt_block_set_caption(b, caption1);
  gt_ensure((0 == gt_str_cmp(gt_block_get_caption(b), caption1)));
  gt_ensure((0 != gt_str_cmp(gt_block_get_caption(b), caption2)));

  /* tests gt_block_set_strand & gt_block_get_range */
  s = gt_block_get_strand(b);
  gt_ensure((GT_STRAND_UNKNOWN == s));
  gt_block_set_strand(b, GT_STRAND_FORWARD);
  s = gt_block_get_strand(b);
  gt_ensure((GT_STRAND_FORWARD == s));

  /* test gt_block_get_max_height() */
  sty = gt_style_new(err);
  gt_ensure(gt_block_get_max_height(b, &height, sty, err) == 0);
  gt_ensure(!gt_error_is_set(testerr));
  gt_ensure(height == BAR_HEIGHT_DEFAULT);
  gt_style_set_num(sty, "exon", "bar_height", 42);
  gt_ensure(gt_block_get_max_height(b, &height, sty, err) == 0);
  gt_ensure(!gt_error_is_set(testerr));
  gt_ensure(height == 42);
  gt_style_set_num(sty, "gene", "bar_height", 23);
  gt_ensure(gt_block_get_max_height(b, &height, sty, err) == 0);
  gt_ensure(!gt_error_is_set(testerr));
  gt_ensure(height == 42);
  gt_style_unset(sty, "exon", "bar_height");
  gt_ensure(gt_block_get_max_height(b, &height, sty, err) == 0);
  gt_ensure(!gt_error_is_set(testerr));
  gt_ensure(height == 23);

  gt_str_delete(caption2);
  gt_str_delete(seqid);
  gt_element_delete(e1);
  gt_element_delete(e2);
  gt_block_delete(b);
  gt_style_delete(sty);
  gt_error_delete(testerr);
  gt_genome_node_delete(gn1);
  gt_genome_node_delete(gn2);

  return had_err;
}
Exemplo n.º 22
0
static int bed_rest(GtBEDParser *bed_parser, GtIO *bed_file, GtError *err)
{
  GtUword block_count = 0;
  GtGenomeNode *gn = NULL;
  GtRange range;
  GtStr *seqid;
  int had_err;
  gt_error_check(err);
  /* column 1.: chrom */
  seqid = get_seqid(bed_parser);
  had_err = skip_blanks(bed_file, err);
  /* column 2.: chromStart */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    had_err = skip_blanks(bed_file, err);
  }
  /* column 3.: chromEnd */
  if (!had_err) {
    word(bed_parser->another_word, bed_file);
    had_err = parse_bed_range(&range, bed_parser->word,
                              bed_parser->another_word, bed_parser->offset,
                              bed_file, false, err);
  }
  if (!had_err) {
    /* add region */
    gt_region_node_builder_add_region(bed_parser->region_node_builder,
                                      gt_str_get(seqid), range);
    /* create feature */
    gn = gt_feature_node_new(seqid,
                             bed_parser->feature_type
                             ? bed_parser->feature_type
                             : BED_FEATURE_TYPE,
                             range.start, range.end, GT_STRAND_BOTH);
    gt_queue_add(bed_parser->feature_nodes, gn);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 4.: name */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      gt_feature_node_add_attribute((GtFeatureNode*) gn, GT_GFF_NAME,
                                    gt_str_get(bed_parser->word));
    }
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 5.: score */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      bool score_is_defined;
      float score_value;
      had_err = gt_parse_score(&score_is_defined, &score_value,
                               gt_str_get(bed_parser->word),
                               gt_io_get_line_number(bed_file),
                               gt_io_get_filename(bed_file), err);
      if (!had_err && score_is_defined)
        gt_feature_node_set_score((GtFeatureNode*) gn, score_value);
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);
  /* optional column 6.: strand */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      GtStrand strand;
      had_err = gt_parse_strand(&strand, gt_str_get(bed_parser->word),
                                gt_io_get_line_number(bed_file),
                                gt_io_get_filename(bed_file), err);
      if (!had_err)
        gt_feature_node_set_strand((GtFeatureNode*) gn, strand);
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);
  /* optional column 7.: thickStart */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 8.: thickEnd */
  if (!had_err) {
    word(bed_parser->another_word, bed_file);
    if (gt_str_length(bed_parser->another_word)) {
      gt_assert(gt_str_length(bed_parser->word));
      /* got a thickStart and a thickEnd -> construct corresponding feature */
      had_err = parse_bed_range(&range, bed_parser->word,
                                bed_parser->another_word, bed_parser->offset,
                                bed_file, true, err);
      if (!had_err && range.start <= range.end)
        construct_thick_feature(bed_parser, (GtFeatureNode*) gn, range);
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);
  /* optional column 9.: itemRgb */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    /* we do not use the RGB values */
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 10.: blockCount */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      if (gt_parse_uword(&block_count, gt_str_get(bed_parser->word))) {
        gt_error_set(err,
                     "file \"%s\": line "GT_WU": could not parse blockCount",
                     gt_io_get_filename(bed_file),
                     gt_io_get_line_number(bed_file));
        had_err = -1;
      }
      else {
        /* reset to parse/process blockSizes and blockStarts properly */
        gt_str_reset(bed_parser->word);
        gt_str_reset(bed_parser->another_word);
      }
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);
  /* optional column 11.: blockSizes */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 12.: blockStarts */
  if (!had_err) {
    word(bed_parser->another_word, bed_file);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* process blocks if necessary */
  if (!had_err && block_count) {
    had_err = process_blocks(bed_parser, (GtFeatureNode*) gn, block_count,
                             bed_parser->word, bed_parser->another_word,
                             bed_file, err);
  }
  /* the end of the line should now be reached */
  if (!had_err)
    had_err = gt_io_expect(bed_file, GT_END_OF_LINE, err);
  return had_err;
}
Exemplo n.º 23
0
static int construct_genes(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  GtHashmap *transcript_id_hash = (GtHashmap*) value;
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtQueue *genome_nodes = cinfo->genome_nodes;
  const char *gname;
  GtArray *mRNAs = gt_array_new(sizeof (GtGenomeNode*));
  GtGenomeNode *gene_node, *gn;
  GtStrand gene_strand;
  GtRange gene_range;
  GtStr *gene_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
  cinfo->mRNAs = mRNAs;
  had_err = gt_hashmap_foreach(transcript_id_hash, construct_mRNAs, cinfo, err);
  if (!had_err) {
    gt_assert(gt_array_size(mRNAs)); /* at least one mRNA constructed */

    /* determine the range and the strand of the gene */
    gn = *(GtGenomeNode**) gt_array_get(mRNAs, 0);
    gene_range = gt_genome_node_get_range(gn);
    gene_strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    gene_seqid = gt_genome_node_get_seqid(gn);
    for (i = 1; i < gt_array_size(mRNAs); i++) {
      GtRange range;
      gn = *(GtGenomeNode**) gt_array_get(mRNAs, i);
      range = gt_genome_node_get_range(gn);
      gene_range = gt_range_join(&gene_range, &range);
      gene_strand = gt_strand_join(gene_strand,
                          gt_feature_node_get_strand((GtFeatureNode*) gn));
      gt_assert(gt_str_cmp(gene_seqid, gt_genome_node_get_seqid(gn)) == 0);
    }

    gene_node = gt_feature_node_new(gene_seqid, gt_ft_gene, gene_range.start,
                                    gene_range.end, gene_strand);

    if ((gname = gt_hashmap_get(cinfo->gene_id_to_name_mapping,
                              (const char*) key))) {
      gt_feature_node_add_attribute((GtFeatureNode*) gene_node, GT_GFF_NAME,
                                      gname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(mRNAs); i++) {
      gn = *(GtGenomeNode**) gt_array_get(mRNAs, i);
      gt_feature_node_add_child((GtFeatureNode*) gene_node,
                                (GtFeatureNode*) gn);
    }

    /* store the gene */
    gt_queue_add(genome_nodes, gene_node);

    /* free */
    gt_array_delete(mRNAs);
  }

  return had_err;
}
Exemplo n.º 24
0
int gt_track_unit_test(GtError *err)
{
    int had_err = 0;
    GtBlock *b[4];
    GtRange r[4];
    GtTrack *track;
    GtGenomeNode *parent[4], *gn[4];
    GtStr *title;
    double height, tmp;
    GtStyle *sty;
    unsigned long i;
    GtLineBreaker *lb;
    double t_rest = 0,
           l_rest = 0;
    gt_error_check(err);

    title = gt_str_new_cstr("test");

    r[0].start=100UL;
    r[0].end=1000UL;
    r[1].start=1001UL;
    r[1].end=1500UL;
    r[2].start=700UL;
    r[2].end=1200UL;
    r[3].start=10UL;
    r[3].end=200UL;

    for (i=0; i<4; i++)
    {
        parent[i] = gt_feature_node_new(title, gt_ft_gene, r[i].start, r[i].end,
                                        GT_STRAND_FORWARD);
        gn[i] = gt_feature_node_new(title, gt_ft_exon, r[i].start, r[i].end,
                                    GT_STRAND_FORWARD);

        gt_feature_node_add_child((GtFeatureNode*) parent[i],
                                  (GtFeatureNode*) gn[i]);

        gt_feature_node_add_attribute((GtFeatureNode*) parent[i], GT_GFF_NAME,
                                      "parent");
        gt_feature_node_add_attribute((GtFeatureNode*) gn[i], GT_GFF_NAME, "child");
    }

    for (i=0; i<4; i++)
    {
        b[i] = gt_block_new();
        gt_block_set_range(b[i], r[i]);
        gt_block_insert_element(b[i], (GtFeatureNode*) parent[i]);
        gt_block_insert_element(b[i], (GtFeatureNode*) gn[i]);
    }

    lb = gt_line_breaker_bases_new();

    sty = gt_style_new(err);

    if (gt_style_get_num(sty, "format", "track_caption_font_size", &tmp,
                         NULL, err) == GT_STYLE_QUERY_NOT_SET) {
        tmp = TEXT_SIZE_DEFAULT;
    }
    t_rest += tmp;
    if (gt_style_get_num(sty, "format", "track_caption_space", &tmp,
                         NULL, err) == GT_STYLE_QUERY_NOT_SET) {
        tmp = CAPTION_BAR_SPACE_DEFAULT;
    }
    t_rest += tmp;
    if (gt_style_get_num(sty, "format", "track_vspace", &tmp,
                         NULL, err) == GT_STYLE_QUERY_NOT_SET) {
        tmp = TRACK_VSPACE_DEFAULT;
    }
    t_rest += tmp;
    if (gt_style_get_num(sty, "format", "bar_vspace", &l_rest,
                         NULL, err) == GT_STYLE_QUERY_NOT_SET) {
        l_rest = BAR_VSPACE_DEFAULT;
    }

    track = gt_track_new(title, GT_UNDEF_ULONG, true, lb);
    gt_ensure(had_err, track);
    gt_ensure(had_err, gt_track_get_title(track) == title);

    gt_ensure(had_err, gt_track_get_number_of_lines(track) == 0);
    gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
    gt_ensure(had_err, height == t_rest);
    gt_ensure(had_err, !gt_error_is_set(err));

    gt_ensure(had_err, gt_track_insert_block(track, b[0], err) == 0);
    gt_ensure(had_err, !gt_error_is_set(err));
    gt_ensure(had_err, gt_track_get_number_of_lines(track) == 1);
    gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
    gt_ensure(had_err, height == t_rest + l_rest + BAR_HEIGHT_DEFAULT);
    gt_ensure(had_err, !gt_error_is_set(err));

    gt_ensure(had_err, gt_track_insert_block(track, b[1], err) == 0);
    gt_ensure(had_err, !gt_error_is_set(err));
    gt_ensure(had_err, gt_track_get_number_of_lines(track) == 1);
    gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
    gt_ensure(had_err, height == t_rest + l_rest + BAR_HEIGHT_DEFAULT);
    gt_ensure(had_err, !gt_error_is_set(err));

    gt_ensure(had_err, gt_track_insert_block(track, b[2], err) == 0);
    gt_ensure(had_err, !gt_error_is_set(err));
    gt_ensure(had_err, gt_track_get_number_of_lines(track) == 2);
    gt_ensure(had_err, gt_track_insert_block(track, b[3], err) == 0);
    gt_ensure(had_err, !gt_error_is_set(err));
    gt_ensure(had_err, gt_track_get_number_of_lines(track) == 2);
    gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
    gt_ensure(had_err, height == t_rest + 2*(l_rest + BAR_HEIGHT_DEFAULT));
    gt_ensure(had_err, !gt_error_is_set(err));

    gt_style_set_num(sty, "exon", "bar_height", 42);
    gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
    gt_ensure(had_err, height == t_rest + 2*(l_rest+42));
    gt_ensure(had_err, !gt_error_is_set(err));
    gt_style_set_num(sty, "gene", "bar_height", 23);
    gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
    gt_ensure(had_err, height == t_rest + 2*(l_rest+42));
    gt_ensure(had_err, !gt_error_is_set(err));
    gt_style_unset(sty, "exon", "bar_height");
    gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
    gt_ensure(had_err, height == t_rest + 2*(l_rest+23));
    gt_ensure(had_err, !gt_error_is_set(err));
    gt_style_unset(sty, "gene", "bar_height");
    gt_style_set_num(sty, "format", "bar_height", 99);
    gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
    gt_ensure(had_err, height == t_rest + 2*(l_rest+99));
    gt_ensure(had_err, !gt_error_is_set(err));

    gt_ensure(had_err, gt_track_get_number_of_discarded_blocks(track) == 0);

    gt_track_delete(track);
    gt_str_delete(title);
    gt_style_delete(sty);
    for (i=0; i<4; i++)
    {
        gt_block_delete(b[i]);
        gt_genome_node_delete(parent[i]);
    }
    return had_err;
}
Exemplo n.º 25
0
int gt_gtf_parser_parse(GtGTFParser *parser, GtQueue *genome_nodes,
                        GtStr *filenamestr, GtFile *fpin, bool be_tolerant,
                        GtError *err)
{
  GtStr *seqid_str, *source_str, *line_buffer;
  char *line;
  size_t line_length;
  GtUword i, line_number = 0;
  GtGenomeNode *gn;
  GtRange range;
  GtPhase phase_value;
  GtStrand gt_strand_value;
  GtSplitter *splitter, *attribute_splitter;
  float score_value;
  char *seqname,
       *source,
       *feature,
       *start,
       *end,
       *score,
       *strand,
       *frame,
       *attributes,
       *token,
       *gene_id,
       *gene_name = NULL,
       *transcript_id,
       *transcript_name = NULL,
       **tokens;
  GtHashmap *transcript_id_hash; /* map from transcript id to array of genome
                                    nodes */
  GtArray *gt_genome_node_array;
  ConstructionInfo cinfo;
  GTF_feature_type gtf_feature_type;
  GT_UNUSED bool gff_type_is_valid = false;
  const char *type = NULL;
  const char *filename;
  bool score_is_defined;
  int had_err = 0;

  gt_assert(parser && genome_nodes);
  gt_error_check(err);

  filename = gt_str_get(filenamestr);

  /* alloc */
  line_buffer = gt_str_new();
  splitter = gt_splitter_new(),
  attribute_splitter = gt_splitter_new();

#define HANDLE_ERROR                                                \
        if (had_err) {                                              \
          if (be_tolerant) {                                        \
            fprintf(stderr, "skipping line: %s\n", gt_error_get(err)); \
            gt_error_unset(err);                                       \
            gt_str_reset(line_buffer);                                 \
            had_err = 0;                                            \
            continue;                                               \
          }                                                         \
          else {                                                    \
            had_err = -1;                                           \
            break;                                                  \
          }                                                         \
        }

  while (gt_str_read_next_line_generic(line_buffer, fpin) != EOF) {
    line = gt_str_get(line_buffer);
    line_length = gt_str_length(line_buffer);
    line_number++;
    had_err = 0;

    if (line_length == 0) {
      gt_warning("skipping blank line " GT_WU " in file \"%s\"", line_number,
                 filename);
    }
    else if (line[0] == '#') {
      /* storing comment */
      if (line_length >= 2 && line[1] == '#')
        gn = gt_comment_node_new(line+2); /* store '##' line as '#' line */
      else
        gn = gt_comment_node_new(line+1);
      gt_genome_node_set_origin(gn, filenamestr, line_number);
      gt_queue_add(genome_nodes, gn);
    }
    else {
      /* process tab delimited GTF line */
      gt_splitter_reset(splitter);
      gt_splitter_split(splitter, line, line_length, '\t');
      if (gt_splitter_size(splitter) != 9UL) {
        gt_error_set(err, "line " GT_WU " in file \"%s\" contains " GT_WU
                     " tab (\\t) " "separated fields instead of 9", line_number,
                     filename,
                  gt_splitter_size(splitter));
        had_err = -1;
        break;
      }
      tokens = gt_splitter_get_tokens(splitter);
      seqname    = tokens[0];
      source     = tokens[1];
      feature    = tokens[2];
      start      = tokens[3];
      end        = tokens[4];
      score      = tokens[5];
      strand     = tokens[6];
      frame      = tokens[7];
      attributes = tokens[8];

      /* parse feature */
      if (GTF_feature_type_get(&gtf_feature_type, feature) == -1) {
        /* we skip unknown features */
        fprintf(stderr, "skipping line " GT_WU " in file \"%s\": unknown "
                "feature: \"%s\"\n", line_number, filename, feature);
        gt_str_reset(line_buffer);
        continue;
      }

      /* translate into GFF3 feature type */
      switch (gtf_feature_type) {
        case GTF_CDS:
        case GTF_stop_codon:
          gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker,
                                                       gt_ft_CDS);
          type = gt_ft_CDS;
          break;
        case GTF_exon:
          gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker,
                                                       gt_ft_exon);
          type = gt_ft_exon;
      }
      gt_assert(gff_type_is_valid);

      /* parse the range */
      had_err = gt_parse_range(&range, start, end, line_number, filename, err);
      HANDLE_ERROR;

      /* process seqname (we have to do it here because we need the range) */
      gt_region_node_builder_add_region(parser->region_node_builder, seqname,
                                        range);

      /* parse the score */
      had_err = gt_parse_score(&score_is_defined, &score_value, score,
                               line_number, filename, err);
      HANDLE_ERROR;

      /* parse the strand */
      had_err = gt_parse_strand(&gt_strand_value, strand, line_number, filename,
                               err);
      HANDLE_ERROR;

      /* parse the frame */
      had_err = gt_parse_phase(&phase_value, frame, line_number, filename, err);
      HANDLE_ERROR;

      /* parse the attributes */
      gt_splitter_reset(attribute_splitter);
      gene_id = NULL;
      transcript_id = NULL;
      gt_splitter_split(attribute_splitter, attributes, strlen(attributes),
                        ';');
      for (i = 0; i < gt_splitter_size(attribute_splitter); i++) {
        token = gt_splitter_get_token(attribute_splitter, i);
        /* skip leading blanks */
        while (*token == ' ')
          token++;
        /* look for the two mandatory attributes */
        if (strncmp(token, GENE_ID_ATTRIBUTE, strlen(GENE_ID_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(GENE_ID_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", GENE_ID_ATTRIBUTE, line_number,
                         filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          gene_id = token + strlen(GENE_ID_ATTRIBUTE) + 1;
        }
        else if (strncmp(token, TRANSCRIPT_ID_ATTRIBUTE,
                         strlen(TRANSCRIPT_ID_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(TRANSCRIPT_ID_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          transcript_id = token + strlen(TRANSCRIPT_ID_ATTRIBUTE) + 1;
        }
        else if (strncmp(token, GENE_NAME_ATTRIBUTE,
                         strlen(GENE_NAME_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(GENE_NAME_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", GENE_NAME_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          gene_name = token + strlen(GENE_NAME_ATTRIBUTE) + 1;
          /* for output we want to strip quotes */
          if (*gene_name == '"')
            gene_name++;
          if (gene_name[strlen(gene_name)-1] == '"')
            gene_name[strlen(gene_name)-1] = '\0';
        }
        else if (strncmp(token, TRANSCRIPT_NAME_ATTRIBUTE,
                         strlen(TRANSCRIPT_NAME_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(TRANSCRIPT_NAME_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", TRANSCRIPT_NAME_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          transcript_name = token + strlen(TRANSCRIPT_NAME_ATTRIBUTE) + 1;
          /* for output we want to strip quotes */
          if (*transcript_name == '"')
            transcript_name++;
          if (transcript_name[strlen(transcript_name)-1] == '"')
            transcript_name[strlen(transcript_name)-1] = '\0';
        }
      }

      /* check for the mandatory attributes */
      if (!gene_id) {
        gt_error_set(err, "missing attribute \"%s\" on line " GT_WU
                     " in file \"%s\"", GENE_ID_ATTRIBUTE, line_number,
                     filename);
        had_err = -1;
      }
      HANDLE_ERROR;
      if (!transcript_id) {
        gt_error_set(err, "missing attribute \"%s\" on line " GT_WU
                     " in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number,
                     filename);
        had_err = -1;
      }
      HANDLE_ERROR;

      /* process the mandatory attributes */
      if (!(transcript_id_hash = gt_hashmap_get(parser->gene_id_hash,
                                             gene_id))) {
        transcript_id_hash = gt_hashmap_new(GT_HASH_STRING, gt_free_func,
                                            (GtFree) gt_array_delete);
        gt_hashmap_add(parser->gene_id_hash, gt_cstr_dup(gene_id),
                    transcript_id_hash);
      }
      gt_assert(transcript_id_hash);

      if (!(gt_genome_node_array = gt_hashmap_get(transcript_id_hash,
                                            transcript_id))) {
        gt_genome_node_array = gt_array_new(sizeof (GtGenomeNode*));
        gt_hashmap_add(transcript_id_hash, gt_cstr_dup(transcript_id),
                    gt_genome_node_array);
      }
      gt_assert(gt_genome_node_array);

      /* save optional gene_name and transcript_name attributes */
      if (transcript_name
            && !gt_hashmap_get(parser->transcript_id_to_name_mapping,
                             transcript_id)) {
        gt_hashmap_add(parser->transcript_id_to_name_mapping,
                    gt_cstr_dup(transcript_id),
                    gt_cstr_dup(transcript_name));
      }
      if (gene_name && !gt_hashmap_get(parser->gene_id_to_name_mapping,
                                    gene_id)) {
        gt_hashmap_add(parser->gene_id_to_name_mapping,
                    gt_cstr_dup(gene_id),
                    gt_cstr_dup(gene_name));
      }

      /* get seqid */
      seqid_str = gt_hashmap_get(parser->seqid_to_str_mapping, seqname);
      if (!seqid_str) {
        seqid_str = gt_str_new_cstr(seqname);
        gt_hashmap_add(parser->seqid_to_str_mapping, gt_str_get(seqid_str),
                       seqid_str);
      }
      gt_assert(seqid_str);

      /* construct the new feature */
      gn = gt_feature_node_new(seqid_str, type, range.start, range.end,
                                 gt_strand_value);
      gt_genome_node_set_origin(gn, filenamestr, line_number);

      /* set source */
      source_str = gt_hashmap_get(parser->source_to_str_mapping, source);
      if (!source_str) {
        source_str = gt_str_new_cstr(source);
        gt_hashmap_add(parser->source_to_str_mapping, gt_str_get(source_str),
                    source_str);
      }
      gt_assert(source_str);
      gt_feature_node_set_source((GtFeatureNode*) gn, source_str);

      if (score_is_defined)
        gt_feature_node_set_score((GtFeatureNode*) gn, score_value);
      if (phase_value != GT_PHASE_UNDEFINED)
        gt_feature_node_set_phase((GtFeatureNode*) gn, phase_value);
      gt_array_add(gt_genome_node_array, gn);
    }

    gt_str_reset(line_buffer);
  }

  /* process all region nodes */
  if (!had_err)
    gt_region_node_builder_build(parser->region_node_builder, genome_nodes);

  /* process all feature nodes */
  cinfo.genome_nodes = genome_nodes;
  cinfo.gene_id_to_name_mapping = parser->gene_id_to_name_mapping;
  cinfo.transcript_id_to_name_mapping = parser->transcript_id_to_name_mapping;
  if (!had_err) {
    had_err = gt_hashmap_foreach(parser->gene_id_hash, construct_genes,
                              &cinfo, err);
  }

  /* free */
  gt_splitter_delete(splitter);
  gt_splitter_delete(attribute_splitter);
  gt_str_delete(line_buffer);

  return had_err;
}
Exemplo n.º 26
0
static void infer_cds_visitor_infer_utrs(AgnInferCDSVisitor *v)
{
  GtFeatureNode *start_codon, *stop_codon;

  bool exonsexplicit    = gt_array_size(v->exons) > 0;
  bool cdsexplicit      = gt_array_size(v->cds) > 0;
  bool startcodon_check = gt_array_size(v->starts) == 1 &&
                          (start_codon = gt_array_get(v->starts, 0)) != NULL;
  bool stopcodon_check  = gt_array_size(v->stops)  == 1 &&
                          (stop_codon  = gt_array_get(v->stops,  0)) != NULL;
  bool caninferutrs     = exonsexplicit && startcodon_check && stopcodon_check;

  if(gt_array_size(v->utrs) > 0)
  {
    return;
  }
  else if(!cdsexplicit && !caninferutrs)
  {
    return;
  }

  GtGenomeNode **leftcodon = gt_array_get(v->starts, 0);
  GtGenomeNode **rightcodon = gt_array_get(v->stops, 0);
  GtStrand strand = gt_feature_node_get_strand(v->mrna);
  const char *lefttype  = "five_prime_UTR";
  const char *righttype = "three_prime_UTR";
  if(strand == GT_STRAND_REVERSE)
  {
    lefttype   = "three_prime_UTR";
    righttype  = "five_prime_UTR";
    void *temp = leftcodon;
    leftcodon  = rightcodon;
    rightcodon = temp;
  }
  GtRange leftrange  = gt_genome_node_get_range(*leftcodon);
  GtRange rightrange = gt_genome_node_get_range(*rightcodon);

  GtUword i;
  for(i = 0; i < gt_array_size(v->exons); i++)
  {
    GtGenomeNode **exon = gt_array_get(v->exons, i);
    GtRange exonrange = gt_genome_node_get_range(*exon);
    if(exonrange.start < leftrange.start)
    {
      GtRange utrrange;
      if(gt_range_overlap(&exonrange, &leftrange))
      {
        utrrange.start = exonrange.start;
        utrrange.end   = leftrange.start - 1;
      }
      else
      {
        utrrange = exonrange;
      }
      GtGenomeNode *utr = gt_feature_node_new(gt_genome_node_get_seqid(*exon),
                                              lefttype, utrrange.start,
                                              utrrange.end, strand);
      if(v->source)
        gt_feature_node_set_source((GtFeatureNode *)utr, v->source);
      gt_feature_node_add_child(v->mrna, (GtFeatureNode *)utr);
      gt_array_add(v->utrs, utr);
    }

    if(exonrange.end > rightrange.end)
    {
      GtRange utrrange;
      if(gt_range_overlap(&exonrange, &rightrange))
      {
        utrrange.start = rightrange.end + 1;
        utrrange.end   = exonrange.end;
      }
      else
      {
        utrrange = exonrange;
      }
      GtGenomeNode *utr = gt_feature_node_new(gt_genome_node_get_seqid(*exon),
                                              righttype, utrrange.start,
                                              utrrange.end, strand);
      if(v->source)
        gt_feature_node_set_source((GtFeatureNode *)utr, v->source);
      gt_feature_node_add_child(v->mrna, (GtFeatureNode *)utr);
      gt_array_add(v->utrs, utr);
    }
  }
}
Exemplo n.º 27
0
/* to be called from implementing class! */
int gt_feature_index_unit_test(GtFeatureIndex *fi, GtError *err)
{
  int had_err = 0, i, rval;
  GtFeatureIndexTestShared sh;
  GtStrArray *seqids;
  GtStr *seqid;
  GtRange check_range;
  GtRegionNode *rn;
  bool has_seqid;
  gt_error_check(err);

  sh.mutex = gt_mutex_new();
  sh.nodes = gt_array_new(sizeof (GtFeatureNode*));
  sh.error_count = 0;
  sh.next_node_idx = 0;
  sh.fi = fi;
  sh.err = gt_error_new();

  /* create region */
  seqid = gt_str_new_cstr(GT_FI_TEST_SEQID);
  rn = (GtRegionNode*) gt_region_node_new(seqid, GT_FI_TEST_START,
                                          GT_FI_TEST_END);

  /* test seqid is not supposed to exist */
  gt_ensure(gt_feature_index_has_seqid(sh.fi, &has_seqid,
                                                 GT_FI_TEST_SEQID, err) == 0);
  gt_ensure(!has_seqid);

  /* add a sequence region directly and check if it has been added */
  rval = gt_feature_index_add_region_node(sh.fi, rn, err);
  gt_ensure(rval == 0);
  gt_genome_node_delete((GtGenomeNode*) rn);
  gt_ensure(gt_feature_index_has_seqid(sh.fi, &has_seqid,
                                                GT_FI_TEST_SEQID, err) == 0);
  gt_ensure(has_seqid);

  gt_feature_index_get_range_for_seqid(sh.fi, &check_range, GT_FI_TEST_SEQID,
                                       err);
  gt_ensure(check_range.start == GT_FI_TEST_START
                    && check_range.end == GT_FI_TEST_END);

  /* set up nodes to store */
  for (i=0;i<GT_FI_TEST_FEATURES_PER_THREAD*gt_jobs;i++) {
    GtUword start, end;
    GtFeatureNode *fn;
    start = random() % (GT_FI_TEST_END - GT_FI_TEST_FEATURE_WIDTH);
    end = start + random() % (GT_FI_TEST_FEATURE_WIDTH);
    fn = gt_feature_node_cast(gt_feature_node_new(seqid, "gene", start, end,
                                                  GT_STRAND_FORWARD));
    gt_array_add(sh.nodes, fn);
  }
  /* test parallel addition */
  gt_multithread(gt_feature_index_unit_test_add, &sh, err);
  seqids = gt_feature_index_get_seqids(fi, err);
  gt_ensure(seqids);
  gt_ensure(gt_feature_index_has_seqid(fi, &has_seqid,GT_FI_TEST_SEQID,
                                                err) == 0);
  gt_ensure(has_seqid);
  gt_ensure(gt_str_array_size(seqids) == 1);

  /* test parallel query */
  if (!had_err)
    gt_multithread(gt_feature_index_unit_test_query, &sh, err);
  gt_ensure(sh.error_count == 0);

  gt_mutex_delete(sh.mutex);
  gt_error_delete(sh.err);
  gt_str_array_delete(seqids);
  gt_array_delete(sh.nodes);
  gt_str_delete(seqid);
  return had_err;
}