/* Append the leading, tab-terminated GFF3 columns of <fn> to <outstr>:
   seqid, source, type, start, end, score, strand and phase (in this order).
   An undefined score is written as '.', a defined one with "%.3g".
   Both <fn> and <outstr> must be non-NULL. */
void gt_gff3_output_leading_str(GtFeatureNode *fn, GtStr *outstr)
{
  GtGenomeNode *node;

  gt_assert(fn && outstr);
  node = (GtGenomeNode*) fn;

  /* column 1: sequence ID */
  gt_str_append_str(outstr, gt_genome_node_get_seqid(node));
  gt_str_append_char(outstr, '\t');

  /* column 2: source */
  gt_str_append_cstr(outstr, gt_feature_node_get_source(fn));
  gt_str_append_char(outstr, '\t');

  /* column 3: feature type */
  gt_str_append_cstr(outstr, gt_feature_node_get_type(fn));
  gt_str_append_char(outstr, '\t');

  /* columns 4 and 5: start and end position */
  gt_str_append_uword(outstr, gt_genome_node_get_start(node));
  gt_str_append_char(outstr, '\t');
  gt_str_append_uword(outstr, gt_genome_node_get_end(node));
  gt_str_append_char(outstr, '\t');

  /* column 6: score, or '.' if the feature has none */
  if (!gt_feature_node_score_is_defined(fn)) {
    gt_str_append_char(outstr, '.');
  }
  else {
    char score_text[BUFSIZ];
    (void) snprintf(score_text, sizeof score_text, "%.3g",
                    gt_feature_node_get_score(fn));
    gt_str_append_cstr(outstr, score_text);
  }
  gt_str_append_char(outstr, '\t');

  /* column 7: strand character */
  gt_str_append_char(outstr,
                     GT_STRAND_CHARS[gt_feature_node_get_strand(fn)]);
  gt_str_append_char(outstr, '\t');

  /* column 8: phase character */
  gt_str_append_char(outstr, GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]);
  gt_str_append_char(outstr, '\t');
}
/* Helper for joined sequence extraction: if the feature node <gn> has the
   given <type>, fetch the sequence covered by its range via <region_mapping>
   and append it to <sequence>. Nodes of any other type are ignored.

   Side outputs for matching nodes:
   - <*reverse_strand> is set to true if the node lies on the reverse strand;
   - <*phase> receives the node's phase for every reverse-strand node, and
     for the first forward-strand node of the type only; later forward-strand
     nodes reset it to GT_PHASE_UNDEFINED;
   - <*first_child_of_type_seen> is set once the first forward-strand node of
     the type has been handled.

   Returns 0 on success, or the error code reported by
   gt_region_mapping_get_sequence() (with <err> set by that call). */
static int extract_join_feature(GtGenomeNode *gn, const char *type,
                                GtRegionMapping *region_mapping,
                                GtStr *sequence, bool *reverse_strand,
                                bool *first_child_of_type_seen, GtPhase *phase,
                                GtError *err)
{
  GtFeatureNode *feature;
  GtRange span;
  char *raw_sequence;
  int had_err;

  gt_error_check(err);
  feature = gt_feature_node_cast(gn);
  gt_assert(feature);

  /* features of a different type do not contribute to the join */
  if (!gt_feature_node_has_type(feature, type))
    return 0;

  if (gt_feature_node_get_strand(feature) == GT_STRAND_REVERSE) {
    *reverse_strand = true;
    *phase = gt_feature_node_get_phase(feature);
  }
  else if (*first_child_of_type_seen) {
    *phase = GT_PHASE_UNDEFINED;
  }
  else {
    *first_child_of_type_seen = true;
    *phase = gt_feature_node_get_phase(feature);
  }

  span = gt_genome_node_get_range(gn);
  had_err = gt_region_mapping_get_sequence(region_mapping, &raw_sequence,
                                           gt_genome_node_get_seqid(gn),
                                           span.start, span.end, err);
  if (had_err)
    return had_err;

  /* the fetched buffer is released here after it has been copied */
  gt_str_append_cstr_nt(sequence, raw_sequence, gt_range_length(&span));
  gt_free(raw_sequence);
  return had_err;
}
/* Write the leading, tab-terminated GFF3 columns of <fn> to <outfp>:
   seqid, source, type, start, end, score, strand and phase (in this order).
   An undefined score is written as '.', a defined one with "%.3g".
   <fn> must be non-NULL. */
void gt_gff3_output_leading(GtFeatureNode *fn, GtFile *outfp)
{
  GtGenomeNode *node;

  gt_assert(fn);
  node = (GtGenomeNode*) fn;

  /* columns 1-5: seqid, source, type, start, end */
  gt_file_xprintf(outfp, "%s\t%s\t%s\t"GT_WU"\t"GT_WU"\t",
                  gt_str_get(gt_genome_node_get_seqid(node)),
                  gt_feature_node_get_source(fn),
                  gt_feature_node_get_type(fn),
                  gt_genome_node_get_start(node),
                  gt_genome_node_get_end(node));

  /* column 6: score, or '.' if the feature has none */
  if (!gt_feature_node_score_is_defined(fn)) {
    gt_file_xfputc('.', outfp);
  }
  else {
    gt_file_xprintf(outfp, "%.3g", gt_feature_node_get_score(fn));
  }

  /* columns 7 and 8: strand and phase characters */
  gt_file_xprintf(outfp, "\t%c\t%c\t",
                  GT_STRAND_CHARS[gt_feature_node_get_strand(fn)],
                  GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]);
}
/* Check (and, in tidy mode, repair) the phases of the CDS features in
   <cds_features>, which must be a non-empty array of GtFeatureNode pointers.

   If the first feature lies on the reverse strand, <cds_features> is reversed
   in place before the features are walked. The first feature may carry any
   defined phase; each following feature must carry the phase derived from
   the length and phase of its predecessor.

   On a phase mismatch:
   - if the feature was already recorded in <v->cds_features> (i.e., it was
     seen before), it is scheduled for splitting when <v->tidy> is set, the
     feature is not a multi-feature (<is_multi> false) and it has no
     children; otherwise an error is set;
   - if the feature was not recorded yet, its phase is corrected when
     <v->tidy> is set (with a warning unless <second_pass> is true);
     otherwise an error is set.

   Every feature that passes is recorded in <v->cds_features>.
   Returns 0 on success and -1 (with <err> set) on failure. */
static int check_cds_phases(GtArray *cds_features, GtCDSCheckVisitor *v,
                            bool is_multi, bool second_pass, GtError *err)
{
  GtPhase current_phase, correct_phase = GT_PHASE_ZERO;
  GtFeatureNode *fn;
  GtStrand strand;
  unsigned long i, current_length;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(cds_features);
  gt_assert(gt_array_size(cds_features));
  fn = *(GtFeatureNode**) gt_array_get_first(cds_features);
  strand = gt_feature_node_get_strand(fn);
  if (strand == GT_STRAND_REVERSE)
    gt_array_reverse(cds_features);
  for (i = 0; !had_err && i < gt_array_size(cds_features); i++) {
    fn = *(GtFeatureNode**) gt_array_get(cds_features, i);
    /* the first phase can be anything (except being undefined), because the
       GFF3 spec says:

       NOTE 4 - CDS features MUST have a defined phase field. Otherwise it is
       not possible to infer the correct polypeptides corresponding to
       partially annotated genes. */
    if ((!i && gt_feature_node_get_phase(fn) == GT_PHASE_UNDEFINED) ||
        (i && gt_feature_node_get_phase(fn) != correct_phase)) {
      if (gt_hashmap_get(v->cds_features, fn)) {
        if (v->tidy && !is_multi && !gt_feature_node_has_children(fn)) {
          /* we can split the feature */
          gt_warning("%s feature on line %u in file \"%s\" has multiple "
                     "parents which require different phases; split feature",
                     gt_ft_CDS,
                     gt_genome_node_get_line_number((GtGenomeNode*) fn),
                     gt_genome_node_get_filename((GtGenomeNode*) fn));
          gt_hashmap_add(v->cds_features_to_split, fn, fn);
          v->splitting_is_necessary = true; /* split later */
        }
        else {
          gt_error_set(err, "%s feature on line %u in file \"%s\" has multiple "
                       "parents which require different phases", gt_ft_CDS,
                       gt_genome_node_get_line_number((GtGenomeNode*) fn),
                       gt_genome_node_get_filename((GtGenomeNode*) fn));
          had_err = -1;
        }
      }
      else {
        if (v->tidy) {
          if (!second_pass) {
            gt_warning("%s feature on line %u in file \"%s\" has the wrong "
                       "phase %c -> correcting it to %c", gt_ft_CDS,
                       gt_genome_node_get_line_number((GtGenomeNode*) fn),
                       gt_genome_node_get_filename((GtGenomeNode*) fn),
                       GT_PHASE_CHARS[gt_feature_node_get_phase(fn)],
                       GT_PHASE_CHARS[correct_phase]);
          }
          gt_feature_node_set_phase(fn, correct_phase);
        }
        else {
          gt_error_set(err, "%s feature on line %u in file \"%s\" has the "
                       "wrong phase %c (should be %c)", gt_ft_CDS,
                       gt_genome_node_get_line_number((GtGenomeNode*) fn),
                       gt_genome_node_get_filename((GtGenomeNode*) fn),
                       GT_PHASE_CHARS[gt_feature_node_get_phase(fn)],
                       GT_PHASE_CHARS[correct_phase]);
          had_err = -1;
        }
      }
    }
    if (!had_err) {
      unsigned long length_mod, phase_mod;
      current_phase = gt_feature_node_get_phase(fn);
      current_length = gt_genome_node_get_length((GtGenomeNode*) fn);
      /* Compute (3 - (length - phase) mod 3) mod 3 without ever forming
         <length - phase> directly: that difference is unsigned and would wrap
         around for a feature shorter than its phase (e.g., length 1 with
         phase 2), yielding a wrong phase for the next feature. Reducing both
         operands modulo 3 first and adding 3 keeps the intermediate value
         non-negative and gives the same result whenever length >= phase. */
      length_mod = current_length % 3;
      phase_mod = (unsigned long) current_phase % 3;
      correct_phase = (GtPhase) ((3 - (length_mod + 3 - phase_mod) % 3) % 3);
      gt_hashmap_add(v->cds_features, fn, fn); /* record CDS feature */
    }
  }
  return had_err;
}
static int gt_extract_feature_sequence_generic(GtStr *sequence, GtGenomeNode *gn, const char *type, bool join, GtStr *seqid, GtStrArray *target_ids, unsigned int *out_phase_offset, GtRegionMapping *region_mapping, GtError *err) { GtFeatureNode *fn; GtRange range; unsigned int phase_offset = 0; char *outsequence; const char *target; int had_err = 0; gt_error_check(err); fn = gt_genome_node_cast(gt_feature_node_class(), gn); gt_assert(fn); if (seqid) gt_str_append_str(seqid, gt_genome_node_get_seqid(gn)); if (target_ids && (target = gt_feature_node_get_attribute(fn, GT_GFF_TARGET))) { had_err = gt_gff3_parser_parse_all_target_attributes(target, false, target_ids, NULL, NULL, "", 0, err); } if (!had_err) { if (join) { GtFeatureNodeIterator *fni; GtFeatureNode *child; bool reverse_strand = false, first_child = true, first_child_of_type_seen = false; GtPhase phase = GT_PHASE_UNDEFINED; /* in this case we have to traverse the children */ fni = gt_feature_node_iterator_new_direct(gt_feature_node_cast(gn)); while (!had_err && (child = gt_feature_node_iterator_next(fni))) { if (first_child) { if (target_ids && (target = gt_feature_node_get_attribute(child, GT_GFF_TARGET))) { gt_str_array_reset(target_ids); had_err = gt_gff3_parser_parse_all_target_attributes(target, false, target_ids, NULL, NULL, "", 0, err); } first_child = false; } if (!had_err) { if (extract_join_feature((GtGenomeNode*) child, type, region_mapping, sequence, &reverse_strand, &first_child_of_type_seen, &phase, err)) { had_err = -1; } if (phase != GT_PHASE_UNDEFINED) { phase_offset = (int) phase; } } } gt_feature_node_iterator_delete(fni); gt_assert(phase_offset <= (unsigned int) GT_PHASE_UNDEFINED); if (!had_err && gt_str_length(sequence)) { if (reverse_strand) { had_err = gt_reverse_complement(gt_str_get(sequence), gt_str_length(sequence), err); } } } else if (gt_feature_node_get_type(fn) == type) { GtPhase phase = gt_feature_node_get_phase(fn); gt_assert(!had_err); if (phase != GT_PHASE_UNDEFINED) 
phase_offset = (unsigned int) phase; /* otherwise we only have to look at this feature */ range = gt_genome_node_get_range(gn); gt_assert(range.start); /* 1-based coordinates */ had_err = gt_region_mapping_get_sequence(region_mapping, &outsequence, gt_genome_node_get_seqid(gn), range.start, range.end, err); if (!had_err) { gt_str_append_cstr_nt(sequence, outsequence, gt_range_length(&range)); gt_free(outsequence); if (gt_feature_node_get_strand(fn) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(gt_str_get(sequence), gt_str_length(sequence), err); } } } } if (out_phase_offset && phase_offset != GT_PHASE_UNDEFINED) { *out_phase_offset = phase_offset; } return had_err; }