Exemplo n.º 1
0
static int update_bioseq_if_necessary(GtRegionMapping *rm, GtStr *seqid,
                                      GtError *err)
{
  int had_err = 0;
  gt_error_check(err);
  gt_assert(rm && seqid);
  if (!rm->sequence_file || gt_str_cmp(rm->sequence_name, seqid)) {
    gt_str_delete(rm->sequence_file);
    rm->sequence_file = region_mapping_map(rm, gt_str_get(seqid), err);
    if (!rm->sequence_file)
      had_err = -1;
    else {
      if (!rm->sequence_name)
        rm->sequence_name = gt_str_new();
      else
        gt_str_reset(rm->sequence_name);
      gt_str_append_str(rm->sequence_name, seqid);
      gt_bioseq_delete(rm->bioseq);
      rm->bioseq = gt_bioseq_new_str(rm->sequence_file, err);
      if (!rm->bioseq)
        had_err = -1;
    }
  }
  return had_err;
}
Exemplo n.º 2
0
static int select_visitor_region_node(GtNodeVisitor *nv, GtRegionNode *rn,
                                      GT_UNUSED GtError *err)
{
  GtSelectVisitor *select_visitor;
  gt_error_check(err);
  select_visitor = select_visitor_cast(nv);
  if (!gt_str_length(select_visitor->seqid) || /* no seqid was specified */
      !gt_str_cmp(select_visitor->seqid,       /* or seqids are equal */
               gt_genome_node_get_seqid((GtGenomeNode*) rn))) {
    if (select_visitor->contain_range.start != GT_UNDEF_ULONG) {
      GtRange range = gt_genome_node_get_range((GtGenomeNode*) rn);
      if (gt_range_overlap(&range, &select_visitor->contain_range)) {
        /* an overlapping contain range was defined -> update range  */
        range.start = MAX(range.start, select_visitor->contain_range.start);
        range.end = MIN(range.end, select_visitor->contain_range.end);
        gt_genome_node_set_range((GtGenomeNode*) rn, &range);
        gt_queue_add(select_visitor->node_buffer, rn);
      }
      else /* contain range does not overlap with <rn> range -> delete <rn> */
        gt_genome_node_delete((GtGenomeNode*) rn);
    }
    else
      gt_queue_add(select_visitor->node_buffer, rn);
  }
  else
    gt_genome_node_delete((GtGenomeNode*) rn);
  return 0;
}
Exemplo n.º 3
0
static int gt_sort_stream_next(GtNodeStream *ns, GtGenomeNode **gn,
                               GtError *err)
{
  GtSortStream *sort_stream;
  GtGenomeNode *node, *eofn;
  int had_err = 0;
  gt_error_check(err);
  sort_stream = gt_sort_stream_cast(ns);

  if (!sort_stream->sorted) {
    while (!(had_err = gt_node_stream_next(sort_stream->in_stream, &node,
                                           err)) && node) {
      if ((eofn = gt_eof_node_try_cast(node)))
        gt_genome_node_delete(eofn); /* get rid of EOF nodes */
      else
        gt_array_add(sort_stream->nodes, node);
    }
    if (!had_err) {
      gt_genome_nodes_sort_stable(sort_stream->nodes);
      sort_stream->sorted = true;
    }
  }

  if (!had_err) {
    gt_assert(sort_stream->sorted);
    if (sort_stream->idx < gt_array_size(sort_stream->nodes)) {
      *gn = *(GtGenomeNode**) gt_array_get(sort_stream->nodes,
                                           sort_stream->idx);
      sort_stream->idx++;
      /* join region nodes with the same sequence ID */
      if (gt_region_node_try_cast(*gn)) {
        GtRange range_a, range_b;
        while (sort_stream->idx < gt_array_size(sort_stream->nodes)) {
          node = *(GtGenomeNode**) gt_array_get(sort_stream->nodes,
                                                sort_stream->idx);
          if (!gt_region_node_try_cast(node) ||
              gt_str_cmp(gt_genome_node_get_seqid(*gn),
                         gt_genome_node_get_seqid(node))) {
            /* the next node is not a region node with the same ID */
            break;
          }
          range_a = gt_genome_node_get_range(*gn);
          range_b = gt_genome_node_get_range(node);
          range_a = gt_range_join(&range_a, &range_b);
          gt_genome_node_set_range(*gn, &range_a);
          gt_genome_node_delete(node);
          sort_stream->idx++;
        }
      }
      return 0;
    }
  }

  if (!had_err) {
    gt_array_reset(sort_stream->nodes);
    *gn = NULL;
  }

  return had_err;
}
Exemplo n.º 4
0
void gt_region_node_consolidate(GtRegionNode *rn_a, GtRegionNode *rn_b)
{
  GtRange range_a, range_b;
  gt_assert(rn_a);
  gt_assert(rn_b);
  gt_assert(!gt_str_cmp(gt_genome_node_get_seqid((GtGenomeNode*) rn_a),
                        gt_genome_node_get_seqid((GtGenomeNode*) rn_b)));
  range_a = gt_genome_node_get_range((GtGenomeNode*) rn_a);
  range_b = gt_genome_node_get_range((GtGenomeNode*) rn_b);
  range_a = gt_range_join(&range_a, &range_b);
  gt_genome_node_set_range((GtGenomeNode*) rn_a, &range_a);
}
Exemplo n.º 5
0
static inline int parse_fastq_block(GtSeqIteratorFastQ *seqit, GtError *err)
{
  int had_err = 0;
  gt_assert(seqit);
  gt_error_check(err);

  /* parse @<seqname> */
  had_err = parse_fastq_seqname(seqit,
                                seqit->descbuffer,
                                GT_FASTQ_BLOCK_START_CHAR,
                                err);
  if (!had_err) {
    /* parse sequence */
    had_err = parse_fastq_sequence(seqit, err);
    gt_fastq_premature_end_check(had_err, seqit);
  }
  if (!had_err) {
    /* parse +[seqname] */
    had_err = parse_fastq_seqname(seqit,
                                  seqit->qdescbuffer,
                                  GT_FASTQ_QUAL_SEPARATOR_CHAR,
                                  err);
    gt_fastq_premature_end_check(had_err, seqit);
  }
  if (!had_err
      && gt_str_length(seqit->qdescbuffer)
      && gt_str_cmp(seqit->descbuffer, seqit->qdescbuffer) != 0)
  {
      gt_error_set(err, "sequence description '%s' is not equal to "
                        "qualities description '%s' in line %lu",
                        gt_str_get(seqit->descbuffer),
                        gt_str_get(seqit->qdescbuffer),
                        seqit->curline-1);
      return -2;
  }
  if (!had_err) {
    /* parse qualities */
    had_err = parse_fastq_qualities(seqit, err);
    if (gt_str_length(seqit->qualsbuffer)
          != gt_str_length(seqit->sequencebuffer))
    {
      gt_error_set(err, "lengths of character sequence and qualities "
                        "sequence differ (%lu <-> %lu)",
                        gt_str_length(seqit->qualsbuffer),
                        gt_str_length(seqit->sequencebuffer));
      return -2;
    }
  }
  return had_err;
}
Exemplo n.º 6
0
static int select_visitor_sequence_node(GtNodeVisitor *nv, GtSequenceNode *sn,
                                        GT_UNUSED GtError *err)
{
  GtSelectVisitor *select_visitor;
  gt_error_check(err);
  select_visitor = select_visitor_cast(nv);
  if (!gt_str_length(select_visitor->seqid) || /* no seqid was specified */
      !gt_str_cmp(select_visitor->seqid,       /* or seqids are equal */
                  gt_genome_node_get_seqid((GtGenomeNode*) sn))) {
    gt_queue_add(select_visitor->node_buffer, sn);
  }
  else
    gt_genome_node_delete((GtGenomeNode*) sn);
  return 0;
}
static int inter_feature_in_children(GtFeatureNode *current_feature, void *data,
                                     GT_UNUSED GtError *err)
{
  GtInterFeatureVisitor *aiv = (GtInterFeatureVisitor*) data;
  GtFeatureNode *inter_node;
  GtRange previous_range, current_range, inter_range;
  GtStrand previous_strand, /*current_strand, */inter_strand;
  GtStr *parent_seqid;
  gt_error_check(err);
  gt_assert(current_feature);
  if (gt_feature_node_has_type(current_feature, aiv->outside_type)) {
    if (aiv->previous_feature) {
      /* determine inter range */
      previous_range = gt_genome_node_get_range((GtGenomeNode*)
                                                aiv->previous_feature);
      current_range = gt_genome_node_get_range((GtGenomeNode*) current_feature);
      if (previous_range.end >= current_range.start) {
        gt_warning("overlapping boundary features " GT_WU "-" GT_WU " and "
                   GT_WU "-" GT_WU ", " "not placing '%s' inter-feature",
                   previous_range.start,
                   previous_range.end,
                   current_range.start,
                   current_range.end,
                   aiv->inter_type);
        return 0;
      }
      if (current_range.start - previous_range.end < 2) {
        gt_warning("no space for inter-feature '%s' between " GT_WU " and "
                   GT_WU,
                   aiv->inter_type,
                   previous_range.end,
                   current_range.start);
        return 0;
      }
      inter_range.start = previous_range.end + 1;
      inter_range.end = current_range.start - 1;

      /* determine inter strand */
      previous_strand = gt_feature_node_get_strand(aiv->previous_feature);
      /*current_strand = gt_feature_node_get_strand(current_feature);*/
      gt_assert(previous_strand == gt_feature_node_get_strand(current_feature));
      inter_strand = previous_strand;

      /* determine sequence id */
      parent_seqid =
        gt_genome_node_get_seqid((GtGenomeNode*) aiv->parent_feature);
      gt_assert(!gt_str_cmp(parent_seqid,
                            gt_genome_node_get_seqid((GtGenomeNode*)
                                                     aiv->previous_feature)));
      gt_assert(!gt_str_cmp(parent_seqid,
                            gt_genome_node_get_seqid((GtGenomeNode*)
                                                     current_feature)));

      /* create inter feature */
      inter_node = (GtFeatureNode*)
                   gt_feature_node_new(parent_seqid, aiv->inter_type,
                                       inter_range.start, inter_range.end,
                                       inter_strand);
      gt_feature_node_add_child(aiv->parent_feature, inter_node);
    }
    aiv->previous_feature = current_feature;
  }
  return 0;
}
Exemplo n.º 8
0
static int select_visitor_feature_node(GtNodeVisitor *nv,
                                       GtFeatureNode *fn,
                                       GT_UNUSED GtError *err)
{
  GtSelectVisitor *fv;
  bool filter_node = false;
  gt_error_check(err);
  fv = select_visitor_cast(nv);
  fv->current_feature++;
  if ((!gt_str_length(fv->seqid) || /* no seqid was specified or seqids are
                                       equal */
       !gt_str_cmp(fv->seqid, gt_genome_node_get_seqid((GtGenomeNode*) fn))) &&
      (!gt_str_length(fv->source) || /* no source was specified or sources are
                                        equal */
       !strcmp(gt_str_get(fv->source), gt_feature_node_get_source(fn)))) {
    GtRange range = gt_genome_node_get_range((GtGenomeNode*) fn);
    /* enforce maximum gene length */
    /* XXX: we (spuriously) assume that genes are always root nodes */
    if (fn && gt_feature_node_has_type(fn, gt_ft_gene)) {
      if (fv->max_gene_length != GT_UNDEF_ULONG &&
          gt_range_length(&range) > fv->max_gene_length) {
        filter_node = true;
      }
      else if (fv->max_gene_num != GT_UNDEF_ULONG &&
               fv->gene_num >= fv->max_gene_num) {
        filter_node = true;
      }
      else if (fv->min_gene_score != GT_UNDEF_DOUBLE &&
               gt_feature_node_get_score(fn) < fv->min_gene_score) {
        filter_node = true;
      }
      else if (fv->max_gene_score != GT_UNDEF_DOUBLE &&
               gt_feature_node_get_score(fn) > fv->max_gene_score) {
        filter_node = true;
      }
      else if (fv->feature_num != GT_UNDEF_ULONG &&
               fv->feature_num != fv->current_feature) {
        filter_node = true;
      }
      if (!filter_node)
        fv->gene_num++; /* gene passed filter */
    }
  }
  else
    filter_node = true;

  if (!filter_node)
    filter_node = filter_contain_range(fn, fv->contain_range);

  if (!filter_node)
    filter_node = filter_overlap_range(fn, fv->overlap_range);

  if (!filter_node)
    filter_node = filter_strand(fn, fv->strand);

  if (!filter_node)
    filter_node = filter_targetstrand(fn, fv->targetstrand);

  if (!filter_node)
    filter_node = filter_has_CDS(fn, fv->has_CDS);

  if (!filter_node)
    filter_node = filter_min_average_ssp(fn, fv->min_average_splice_site_prob);

  if (filter_node)
    gt_genome_node_delete((GtGenomeNode*) fn);
  else
    gt_queue_add(fv->node_buffer, fn);

  return 0;
}
Exemplo n.º 9
0
int gt_block_unit_test(GtError *err)
{
  GtRange r1, r2, r_temp, b_range;
  GtStrand s;
  GtGenomeNode *gn1, *gn2;
  GtElement *e1, *e2;
  double height;
  GtBlock *b;
  GtStr *seqid, *caption1, *caption2;
  int had_err = 0;
  GtStyle *sty;
  GtError *testerr;
  gt_error_check(err);

  seqid = gt_str_new_cstr("seqid");
  caption1 = gt_str_new_cstr("foo");
  caption2 = gt_str_new_cstr("bar");
  testerr = gt_error_new();

  r1.start = 10UL;
  r1.end = 50UL;

  r2.start = 40UL;
  r2.end = 50UL;

  gn1 = gt_feature_node_new(seqid, gt_ft_gene, r1.start, r1.end,
                            GT_STRAND_FORWARD);
  gn2 = gt_feature_node_new(seqid, gt_ft_exon, r2.start, r2.end,
                            GT_STRAND_FORWARD);

  e1 = gt_element_new((GtFeatureNode*) gn1);
  e2 = gt_element_new((GtFeatureNode*) gn2);

  b = gt_block_new();

  /* test gt_block_insert_elements */
  gt_ensure((0UL == gt_block_get_size(b)));
  gt_block_insert_element(b, (GtFeatureNode*) gn1);
  gt_ensure((1UL == gt_block_get_size(b)));
  gt_block_insert_element(b, (GtFeatureNode*) gn2);
  gt_ensure((2UL == gt_block_get_size(b)));

  /* test gt_block_set_range & gt_block_get_range */
  r_temp = gt_range_join(&r1, &r2);
  gt_block_set_range(b, r_temp);
  b_range = gt_block_get_range(b);
  gt_ensure((0 == gt_range_compare(&b_range, &r_temp)));
  gt_ensure((1 == gt_range_compare(&r2, &r_temp)));

  /* tests gt_block_set_caption & gt_block_get_caption */
  gt_block_set_caption(b, caption1);
  gt_ensure((0 == gt_str_cmp(gt_block_get_caption(b), caption1)));
  gt_ensure((0 != gt_str_cmp(gt_block_get_caption(b), caption2)));

  /* tests gt_block_set_strand & gt_block_get_range */
  s = gt_block_get_strand(b);
  gt_ensure((GT_STRAND_UNKNOWN == s));
  gt_block_set_strand(b, GT_STRAND_FORWARD);
  s = gt_block_get_strand(b);
  gt_ensure((GT_STRAND_FORWARD == s));

  /* test gt_block_get_max_height() */
  sty = gt_style_new(err);
  gt_ensure(gt_block_get_max_height(b, &height, sty, err) == 0);
  gt_ensure(!gt_error_is_set(testerr));
  gt_ensure(height == BAR_HEIGHT_DEFAULT);
  gt_style_set_num(sty, "exon", "bar_height", 42);
  gt_ensure(gt_block_get_max_height(b, &height, sty, err) == 0);
  gt_ensure(!gt_error_is_set(testerr));
  gt_ensure(height == 42);
  gt_style_set_num(sty, "gene", "bar_height", 23);
  gt_ensure(gt_block_get_max_height(b, &height, sty, err) == 0);
  gt_ensure(!gt_error_is_set(testerr));
  gt_ensure(height == 42);
  gt_style_unset(sty, "exon", "bar_height");
  gt_ensure(gt_block_get_max_height(b, &height, sty, err) == 0);
  gt_ensure(!gt_error_is_set(testerr));
  gt_ensure(height == 23);

  gt_str_delete(caption2);
  gt_str_delete(seqid);
  gt_element_delete(e1);
  gt_element_delete(e2);
  gt_block_delete(b);
  gt_style_delete(sty);
  gt_error_delete(testerr);
  gt_genome_node_delete(gn1);
  gt_genome_node_delete(gn2);

  return had_err;
}
Exemplo n.º 10
0
bool gth_sas_are_equal(const GthSA *saA, const GthSA *saB)
{
  Exoninfo *exoninfoA, *exoninfoB;
  Introninfo *introninfoA, *introninfoB;
  GtUword i;

  /* compare element 0 */
  if (gth_sa_alphatype(saA) != gth_sa_alphatype(saB))
    return false;

  /* compare element 1 */
  if (gth_backtrace_path_length(saA->backtrace_path) !=
      gth_backtrace_path_length(saB->backtrace_path)) {
    return false;
  }
  for (i = 0; i < gth_backtrace_path_length(saA->backtrace_path); i++) {
    if (((Editoperation*) gth_backtrace_path_get(saA->backtrace_path))[i] !=
        ((Editoperation*) gth_backtrace_path_get(saB->backtrace_path))[i]) {
      return false;
    }
  }

  /* element 2 has been removed (indelcount) */

  /* compare element 3 */
  if (gth_sa_gen_dp_length(saA) != gth_sa_gen_dp_length(saB))
    return false;

  /* compare element 4 */
  if (saA->gen_total_length != saB->gen_total_length)
    return false;

  /* compare element 5 */
  if (saA->gen_offset != saB->gen_offset)
    return false;

  /* compare element 6 */
  if (gth_sa_ref_total_length(saA) != gth_sa_ref_total_length(saB))
    return false;

  /* compare element 7 */
  if (gth_sa_gen_dp_start(saA) != gth_sa_gen_dp_start(saB))
    return false;

  /* element 8 has been removed (gen_dp_end) */

  /* compare element 9 */
  if (saA->gen_file_num != saB->gen_file_num)
    return false;

  /* compare element 10 */
  if (saA->gen_seq_num != saB->gen_seq_num)
    return false;

  /* compare element 11 */
  if (saA->ref_file_num != saB->ref_file_num)
    return false;

  /* compare element 12 */
  if (saA->ref_seq_num != saB->ref_seq_num)
    return false;

  /* compare element 13 */
  if (gt_str_cmp(saA->gen_id, saB->gen_id))
    return false;

  /* compare element 14 */
  if (gt_str_cmp(saA->ref_id, saB->ref_id))
    return false;

  /* compare element 15 */
  if (saA->gen_strand_forward != saB->gen_strand_forward)
    return false;

  /* compare element 16 */
  if (saA->ref_strand_forward != saB->ref_strand_forward)
    return false;

  /* compare element 17 */
  if (gth_sa_genomiccutoff_start(saA) != gth_sa_genomiccutoff_start(saB))
    return false;
  if (gth_sa_referencecutoff_start(saA) != gth_sa_referencecutoff_start(saB))
    return false;
  if (gth_sa_eopcutoff_start(saA) != gth_sa_eopcutoff_start(saB))
    return false;
  if (gth_sa_genomiccutoff_end(saA) != gth_sa_genomiccutoff_end(saB))
    return false;
  if (gth_sa_referencecutoff_end(saA) != gth_sa_referencecutoff_end(saB))
    return false;
  if (gth_sa_eopcutoff_end(saA) != gth_sa_eopcutoff_end(saB))
    return false;

  /* compare element 18 */
  if (gt_array_size(saA->exons) != gt_array_size(saB->exons))
    return false;
  for (i = 0; i < gt_array_size(saA->exons); i++) {
    exoninfoA = (Exoninfo*) gt_array_get(saA->exons, i);
    exoninfoB = (Exoninfo*) gt_array_get(saB->exons, i);
    if (exoninfoA->leftgenomicexonborder != exoninfoB->leftgenomicexonborder)
      return false;
    if (exoninfoA->rightgenomicexonborder != exoninfoB->rightgenomicexonborder)
      return false;
    if (exoninfoA->leftreferenceexonborder !=
        exoninfoB->leftreferenceexonborder) {
      return false;
    }
    if (exoninfoA->rightreferenceexonborder !=
        exoninfoB->rightreferenceexonborder) {
      return false;
    }
    if (!gt_double_equals_double(exoninfoA->exonscore, exoninfoB->exonscore)) {
      return false;
    }
  }

  /* compare element 19 */
  if (gt_array_size(saA->introns) != gt_array_size(saB->introns))
    return false;
  for (i = 0; i < gt_array_size(saA->introns); i++) {
    introninfoA = (Introninfo*) gt_array_get(saA->introns, i);
    introninfoB = (Introninfo*) gt_array_get(saB->introns, i);
    if (!gt_double_equals_double(introninfoA->donorsiteprobability,
                                 introninfoB->donorsiteprobability)) {
      return false;
    }
    if (!gt_double_equals_double(introninfoA->acceptorsiteprobability,
                                 introninfoB->acceptorsiteprobability)) {
      return false;
    }
    if (!gt_double_equals_double(introninfoA->donorsitescore,
                                 introninfoB->donorsitescore)) {
      return false;
    }
    if (!gt_double_equals_double(introninfoA->acceptorsitescore,
                                 introninfoB->acceptorsitescore)) {
      return false;
    }
  }

  /* compare element 20 */
  if (saA->polyAtailpos.start != saB->polyAtailpos.start)
    return false;
  if (saA->polyAtailpos.end != saB->polyAtailpos.end)
    return false;

  /* compare element 21 */
  if (saA->alignmentscore != saB->alignmentscore)
    return false;

  /* compare element 22 */
  if (saA->coverage != saB->coverage)
    return false;

  /* compare element 23 */
  if (saA->genomic_cov_is_highest != saB->genomic_cov_is_highest)
    return false;

  /* compare element 24 */
  if (saA->cumlen_scored_exons != saB->cumlen_scored_exons)
    return false;

  return true;
}
Exemplo n.º 11
0
static int update_seq_col_if_necessary(GtRegionMapping *rm, GtStr *seqid,
                                       GtError *err)
{
  int had_err = 0;
  gt_error_check(err);
  gt_assert(rm && seqid);
  /* for mappings, we need to load the changed sequence, if needed... */
  if (rm->mapping) {
    if (!rm->sequence_file || (gt_str_cmp(rm->sequence_name, seqid))) {
      gt_str_delete(rm->sequence_file);
      /* ignore MD5 hashes when using region mappings */
      if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
        rm->sequence_file = region_mapping_map(rm,
                                               gt_str_get(seqid)
                                                 +GT_MD5_SEQID_TOTAL_LEN,
                                               err);
      } else
        rm->sequence_file = region_mapping_map(rm, gt_str_get(seqid), err);
      if (!rm->sequence_file)
        had_err = -1;
      else {
        /* load new seqcol */
        if (!rm->sequence_filenames)
          rm->sequence_filenames = gt_str_array_new();
        else
          gt_str_array_reset(rm->sequence_filenames);
        gt_str_array_add(rm->sequence_filenames, rm->sequence_file);
        if (!rm->sequence_name)
          rm->sequence_name = gt_str_new();
        else
          gt_str_reset(rm->sequence_name);
        gt_str_append_str(rm->sequence_name, seqid);
        gt_seq_col_delete(rm->seq_col);
        rm->seq_col = gt_bioseq_col_new(rm->sequence_filenames, err);
        if (!rm->seq_col)
          had_err = -1;
      }
    }
  } else {
    /* ...otherwise, just make sure the seqcol is loaded */
    if (!rm->seq_col) {
      if (rm->encseq) {
        if (!(rm->seq_col = gt_encseq_col_new(rm->encseq, err)))
          had_err = -1;
      } else {
        gt_assert(rm->sequence_filenames);
        if (!(rm->seq_col = gt_bioseq_col_new(rm->sequence_filenames, err)))
          had_err = -1;
      }
    }
    if (!had_err && rm->usedesc) {
      if (rm->seqid2seqnum_mapping)
        gt_seqid2seqnum_mapping_delete(rm->seqid2seqnum_mapping);
      rm->seqid2seqnum_mapping =
                           gt_seqid2seqnum_mapping_new_seqcol(rm->seq_col, err);
      if (!rm->seqid2seqnum_mapping) {
        had_err = -1;
      }
    }
  }
  return had_err;
}
Exemplo n.º 12
0
static int construct_genes(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  GtHashmap *transcript_id_hash = (GtHashmap*) value;
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtQueue *genome_nodes = cinfo->genome_nodes;
  const char *gname;
  GtArray *mRNAs = gt_array_new(sizeof (GtGenomeNode*));
  GtGenomeNode *gene_node, *gn;
  GtStrand gene_strand;
  GtRange gene_range;
  GtStr *gene_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
  cinfo->mRNAs = mRNAs;
  had_err = gt_hashmap_foreach(transcript_id_hash, construct_mRNAs, cinfo, err);
  if (!had_err) {
    gt_assert(gt_array_size(mRNAs)); /* at least one mRNA constructed */

    /* determine the range and the strand of the gene */
    gn = *(GtGenomeNode**) gt_array_get(mRNAs, 0);
    gene_range = gt_genome_node_get_range(gn);
    gene_strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    gene_seqid = gt_genome_node_get_seqid(gn);
    for (i = 1; i < gt_array_size(mRNAs); i++) {
      GtRange range;
      gn = *(GtGenomeNode**) gt_array_get(mRNAs, i);
      range = gt_genome_node_get_range(gn);
      gene_range = gt_range_join(&gene_range, &range);
      gene_strand = gt_strand_join(gene_strand,
                          gt_feature_node_get_strand((GtFeatureNode*) gn));
      gt_assert(gt_str_cmp(gene_seqid, gt_genome_node_get_seqid(gn)) == 0);
    }

    gene_node = gt_feature_node_new(gene_seqid, gt_ft_gene, gene_range.start,
                                    gene_range.end, gene_strand);

    if ((gname = gt_hashmap_get(cinfo->gene_id_to_name_mapping,
                              (const char*) key))) {
      gt_feature_node_add_attribute((GtFeatureNode*) gene_node, GT_GFF_NAME,
                                      gname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(mRNAs); i++) {
      gn = *(GtGenomeNode**) gt_array_get(mRNAs, i);
      gt_feature_node_add_child((GtFeatureNode*) gene_node,
                                (GtFeatureNode*) gn);
    }

    /* store the gene */
    gt_queue_add(genome_nodes, gene_node);

    /* free */
    gt_array_delete(mRNAs);
  }

  return had_err;
}
Exemplo n.º 13
0
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
   /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* determine the range and the strand of the mRNA */
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);
  for (i = 1; i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    /* XXX: an error check is necessary here, otherwise gt_strand_join() can
       cause a failed assertion */
    mRNA_strand = gt_strand_join(mRNA_strand,
                          gt_feature_node_get_strand((GtFeatureNode*) gn));
    if (gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                "genomic sequences (``seqname''), although they have the same "
                "gene IDs (``gene_id'') which must be globally unique",
                gt_genome_node_get_line_number(first_node),
                gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);

    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                              (const char*) key))) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                      tname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gn);
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}
Exemplo n.º 14
0
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
   /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* determine the range and the strand of the mRNA */
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);

  /* TODO: support discontinuous start/stop codons */
  for (i = 0; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    if (gt_feature_node_get_attribute((GtFeatureNode*) gn,
        GTF_PARSER_STOP_CODON_FLAG)) {
      GtUword j;
      GtRange stop_codon_rng = gt_genome_node_get_range(gn);
      bool found_cds = false;
      for (j = 0; !had_err && j < gt_array_size(gt_genome_node_array); j++) {
        GtGenomeNode* gn2;
        GtRange this_rng;
        const char *this_type;
        gn2 = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, j);
        if (gn == gn2) continue;
        this_rng = gt_genome_node_get_range(gn2);
        this_type = gt_feature_node_get_type((GtFeatureNode*) gn2);
        if (this_type == gt_symbol(gt_ft_CDS)) {
          if (gt_range_contains(&this_rng, &stop_codon_rng)) {
            if (cinfo->tidy) {
              gt_warning("stop codon on line %u in file %s is contained in "
                         "CDS in line %u",
                         gt_genome_node_get_line_number(gn),
                         gt_genome_node_get_filename(gn),
                         gt_genome_node_get_line_number(gn2));
              found_cds = true;
            } else {
              gt_error_set(err, "stop codon on line %u in file %s is "
                                "contained in CDS in line %u",
                           gt_genome_node_get_line_number(gn),
                           gt_genome_node_get_filename(gn),
                           gt_genome_node_get_line_number(gn2));
              had_err = -1;
            }
            break;
          }
          if (this_rng.end + 1 == stop_codon_rng.start) {
            this_rng.end = stop_codon_rng.end;
            gt_genome_node_set_range(gn2, &this_rng);
            found_cds = true;
            break;
          }
          if (this_rng.start == stop_codon_rng.end + 1) {
            this_rng.start = stop_codon_rng.start;
            gt_genome_node_set_range(gn2, &this_rng);
            found_cds = true;
            break;
          }
        }
      }
      if (!found_cds) {
        if (!had_err) {
          if (cinfo->tidy) {
            gt_warning("found stop codon on line %u in file %s with no "
                       "flanking CDS, ignoring it",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn));
          } else {
            gt_error_set(err, "found stop codon on line %u in file %s with no "
                              "flanking CDS",
                         gt_genome_node_get_line_number(gn),
                         gt_genome_node_get_filename(gn));
            had_err = -1;
            break;
          }
        }
      } else {
        gt_array_rem(gt_genome_node_array, i);
        gt_genome_node_delete(gn);
      }
    }
  }

  for (i = 1; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    GtStrand strand;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    if (strand != mRNA_strand) {
      gt_error_set(err, "feature %s on line %u has strand %c, but the "
                        "parent transcript has strand %c",
                   (const char*) key,
                   gt_genome_node_get_line_number(gn),
                   GT_STRAND_CHARS[strand],
                   GT_STRAND_CHARS[mRNA_strand]);
      had_err = -1;
      break;
    } else {
      mRNA_strand = gt_strand_join(mRNA_strand, strand);
    }
    if (!had_err && gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                "genomic sequences (``seqname''), although they have the same "
                "gene IDs (``gene_id'') which must be globally unique",
                gt_genome_node_get_line_number(first_node),
                gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "ID", key);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "transcript_id",
                                  key);

    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                              (const char*) key)) && strlen(tname) > 0) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                      tname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gt_genome_node_ref(gn));
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}