Esempio n. 1
0
void gt_gff3_output_leading_str(GtFeatureNode *fn, GtStr *outstr)
{
  GtGenomeNode *gn;
  gt_assert(fn && outstr);
  gn = (GtGenomeNode*) fn;
  gt_str_append_str(outstr, gt_genome_node_get_seqid(gn));
  gt_str_append_char(outstr, '\t');
  gt_str_append_cstr(outstr, gt_feature_node_get_source(fn));
  gt_str_append_char(outstr, '\t');
  gt_str_append_cstr(outstr, gt_feature_node_get_type(fn));
  gt_str_append_char(outstr, '\t');
  gt_str_append_uword(outstr, gt_genome_node_get_start(gn));
  gt_str_append_char(outstr, '\t');
  gt_str_append_uword(outstr, gt_genome_node_get_end(gn));
  gt_str_append_char(outstr, '\t');
  if (gt_feature_node_score_is_defined(fn)) {
    char buf[BUFSIZ];
    (void) snprintf(buf, BUFSIZ, "%.3g", gt_feature_node_get_score(fn));
    gt_str_append_cstr(outstr, buf);
  } else
    gt_str_append_char(outstr, '.');
  gt_str_append_char(outstr, '\t');
  gt_str_append_char(outstr, GT_STRAND_CHARS[gt_feature_node_get_strand(fn)]);
  gt_str_append_char(outstr, '\t');
  gt_str_append_char(outstr, GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]);
  gt_str_append_char(outstr, '\t');
}
static int extract_join_feature(GtGenomeNode *gn, const char *type,
                                GtRegionMapping *region_mapping,
                                GtStr *sequence, bool *reverse_strand,
                                bool *first_child_of_type_seen, GtPhase *phase,
                                GtError *err)
{
  char *outsequence;
  GtFeatureNode *fn;
  GtRange range;
  int had_err = 0;

  gt_error_check(err);
  fn = gt_feature_node_cast(gn);
  gt_assert(fn);

  if (gt_feature_node_has_type(fn, type)) {
    if (gt_feature_node_get_strand(fn) == GT_STRAND_REVERSE) {
      *reverse_strand = true;
      *phase = gt_feature_node_get_phase(fn);
    } else {
      if (!(*first_child_of_type_seen)) {
        *first_child_of_type_seen = true;
        *phase = gt_feature_node_get_phase(fn);
      } else *phase = GT_PHASE_UNDEFINED;
    }
    range = gt_genome_node_get_range(gn);
    had_err = gt_region_mapping_get_sequence(region_mapping, &outsequence,
                                             gt_genome_node_get_seqid(gn),
                                             range.start, range.end, err);
    if (!had_err) {
      gt_str_append_cstr_nt(sequence, outsequence, gt_range_length(&range));
      gt_free(outsequence);
    }
  }
  return had_err;
}
Esempio n. 3
0
void gt_gff3_output_leading(GtFeatureNode *fn, GtFile *outfp)
{
  GtGenomeNode *gn;
  gt_assert(fn);
  gn = (GtGenomeNode*) fn;
  gt_file_xprintf(outfp, "%s\t%s\t%s\t"GT_WU"\t"GT_WU"\t",
                     gt_str_get(gt_genome_node_get_seqid(gn)),
                     gt_feature_node_get_source(fn),
                     gt_feature_node_get_type(fn),
                     gt_genome_node_get_start(gn),
                     gt_genome_node_get_end(gn));
  if (gt_feature_node_score_is_defined(fn))
    gt_file_xprintf(outfp, "%.3g", gt_feature_node_get_score(fn));
  else
    gt_file_xfputc('.', outfp);
  gt_file_xprintf(outfp, "\t%c\t%c\t",
                     GT_STRAND_CHARS[gt_feature_node_get_strand(fn)],
                     GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]);
}
Esempio n. 4
0
static int check_cds_phases(GtArray *cds_features, GtCDSCheckVisitor *v,
                            bool is_multi, bool second_pass, GtError *err)
{
  GtPhase current_phase, correct_phase = GT_PHASE_ZERO;
  GtFeatureNode *fn;
  GtStrand strand;
  unsigned long i, current_length;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(cds_features);
  gt_assert(gt_array_size(cds_features));
  fn = *(GtFeatureNode**) gt_array_get_first(cds_features);
  strand = gt_feature_node_get_strand(fn);
  if (strand == GT_STRAND_REVERSE)
    gt_array_reverse(cds_features);
  for (i = 0; !had_err && i < gt_array_size(cds_features); i++) {
    fn = *(GtFeatureNode**) gt_array_get(cds_features, i);
    /* the first phase can be anything (except being undefined), because the
       GFF3 spec says:

       NOTE 4 - CDS features MUST have have a defined phase field. Otherwise it
       is not possible to infer the correct polypeptides corresponding to
       partially annotated genes. */
    if ((!i && gt_feature_node_get_phase(fn) == GT_PHASE_UNDEFINED) ||
        (i && gt_feature_node_get_phase(fn) != correct_phase)) {
      if (gt_hashmap_get(v->cds_features, fn)) {
        if (v->tidy && !is_multi && !gt_feature_node_has_children(fn)) {
          /* we can split the feature */
          gt_warning("%s feature on line %u in file \"%s\" has multiple "
                     "parents which require different phases; split feature",
                     gt_ft_CDS,
                     gt_genome_node_get_line_number((GtGenomeNode*) fn),
                     gt_genome_node_get_filename((GtGenomeNode*) fn));
          gt_hashmap_add(v->cds_features_to_split, fn, fn);
          v->splitting_is_necessary = true; /* split later */
        }
        else {
          gt_error_set(err, "%s feature on line %u in file \"%s\" has multiple "
                       "parents which require different phases",
                       gt_ft_CDS,
                       gt_genome_node_get_line_number((GtGenomeNode*) fn),
                       gt_genome_node_get_filename((GtGenomeNode*) fn));
          had_err = -1;
        }
      }
      else {
        if (v->tidy) {
          if (!second_pass) {
            gt_warning("%s feature on line %u in file \"%s\" has the wrong "
                       "phase %c -> correcting it to %c", gt_ft_CDS,
                       gt_genome_node_get_line_number((GtGenomeNode*) fn),
                       gt_genome_node_get_filename((GtGenomeNode*) fn),
                       GT_PHASE_CHARS[gt_feature_node_get_phase(fn)],
                       GT_PHASE_CHARS[correct_phase]);
          }
          gt_feature_node_set_phase(fn, correct_phase);
        }
        else {
          gt_error_set(err, "%s feature on line %u in file \"%s\" has the "
                       "wrong phase %c (should be %c)", gt_ft_CDS,
                       gt_genome_node_get_line_number((GtGenomeNode*) fn),
                       gt_genome_node_get_filename((GtGenomeNode*) fn),
                       GT_PHASE_CHARS[gt_feature_node_get_phase(fn)],
                       GT_PHASE_CHARS[correct_phase]);
          had_err = -1;
        }
      }
    }
    if (!had_err) {
      current_phase = gt_feature_node_get_phase(fn);
      current_length = gt_genome_node_get_length((GtGenomeNode*) fn);
      correct_phase = (3 - (current_length - current_phase) % 3) % 3;
      gt_hashmap_add(v->cds_features, fn, fn); /* record CDS feature */
    }
  }
  return had_err;
}
static int gt_extract_feature_sequence_generic(GtStr *sequence,
                                GtGenomeNode *gn,
                                const char *type, bool join, GtStr *seqid,
                                GtStrArray *target_ids,
                                unsigned int *out_phase_offset,
                                GtRegionMapping *region_mapping, GtError *err)
{
  GtFeatureNode *fn;
  GtRange range;
  unsigned int phase_offset = 0;
  char *outsequence;
  const char *target;
  int had_err = 0;

  gt_error_check(err);
  fn = gt_genome_node_cast(gt_feature_node_class(), gn);
  gt_assert(fn);

  if (seqid)
    gt_str_append_str(seqid, gt_genome_node_get_seqid(gn));
  if (target_ids &&
      (target = gt_feature_node_get_attribute(fn, GT_GFF_TARGET))) {
    had_err = gt_gff3_parser_parse_all_target_attributes(target, false,
                                                         target_ids, NULL,
                                                         NULL, "", 0, err);
  }
  if (!had_err) {
    if (join) {
      GtFeatureNodeIterator *fni;
      GtFeatureNode *child;
      bool reverse_strand = false,
           first_child = true,
           first_child_of_type_seen = false;
      GtPhase phase = GT_PHASE_UNDEFINED;
      /* in this case we have to traverse the children */
      fni = gt_feature_node_iterator_new_direct(gt_feature_node_cast(gn));
      while (!had_err && (child = gt_feature_node_iterator_next(fni))) {
        if (first_child) {
          if (target_ids &&
               (target = gt_feature_node_get_attribute(child, GT_GFF_TARGET))) {
            gt_str_array_reset(target_ids);
            had_err = gt_gff3_parser_parse_all_target_attributes(target, false,
                                                                 target_ids,
                                                                 NULL,
                                                                 NULL, "", 0,
                                                                 err);
          }
          first_child = false;
        }
        if (!had_err) {
          if (extract_join_feature((GtGenomeNode*) child, type, region_mapping,
                                   sequence, &reverse_strand,
                                   &first_child_of_type_seen,
                                   &phase, err)) {
            had_err = -1;
          }
          if (phase != GT_PHASE_UNDEFINED) {
            phase_offset = (int) phase;
          }
        }
      }
      gt_feature_node_iterator_delete(fni);
      gt_assert(phase_offset <= (unsigned int) GT_PHASE_UNDEFINED);
      if (!had_err && gt_str_length(sequence)) {
        if (reverse_strand) {
          had_err = gt_reverse_complement(gt_str_get(sequence),
                                          gt_str_length(sequence), err);
        }
      }
    }
    else if (gt_feature_node_get_type(fn) == type) {
      GtPhase phase = gt_feature_node_get_phase(fn);
      gt_assert(!had_err);
      if (phase != GT_PHASE_UNDEFINED)
        phase_offset = (unsigned int) phase;
      /* otherwise we only have to look at this feature */
      range = gt_genome_node_get_range(gn);
      gt_assert(range.start); /* 1-based coordinates */
      had_err = gt_region_mapping_get_sequence(region_mapping, &outsequence,
                                               gt_genome_node_get_seqid(gn),
                                               range.start, range.end, err);
      if (!had_err) {
        gt_str_append_cstr_nt(sequence, outsequence, gt_range_length(&range));
        gt_free(outsequence);
        if (gt_feature_node_get_strand(fn) == GT_STRAND_REVERSE) {
          had_err = gt_reverse_complement(gt_str_get(sequence),
                                          gt_str_length(sequence), err);
        }
      }
    }
  }
  if (out_phase_offset && phase_offset != GT_PHASE_UNDEFINED) {
    *out_phase_offset = phase_offset;
  }
  return had_err;
}