/* Append the leading, tab-terminated columns of the GFF3 line for <fn> to
   <outstr>: seqid, source, type, start, end, score, strand and phase (in that
   order). An undefined score is written as '.'; a defined score is formatted
   with "%.3g". Every column, including the last one, is followed by a tab. */
void gt_gff3_output_leading_str(GtFeatureNode *fn, GtStr *outstr)
{
  GtGenomeNode *node;
  gt_assert(fn && outstr);
  node = (GtGenomeNode*) fn;

  /* seqid */
  gt_str_append_str(outstr, gt_genome_node_get_seqid(node));
  gt_str_append_char(outstr, '\t');

  /* source */
  gt_str_append_cstr(outstr, gt_feature_node_get_source(fn));
  gt_str_append_char(outstr, '\t');

  /* type */
  gt_str_append_cstr(outstr, gt_feature_node_get_type(fn));
  gt_str_append_char(outstr, '\t');

  /* start */
  gt_str_append_uword(outstr, gt_genome_node_get_start(node));
  gt_str_append_char(outstr, '\t');

  /* end */
  gt_str_append_uword(outstr, gt_genome_node_get_end(node));
  gt_str_append_char(outstr, '\t');

  /* score, or the '.' placeholder if the feature carries none */
  if (!gt_feature_node_score_is_defined(fn))
    gt_str_append_char(outstr, '.');
  else {
    char score_text[BUFSIZ];
    (void) snprintf(score_text, sizeof score_text, "%.3g",
                    gt_feature_node_get_score(fn));
    gt_str_append_cstr(outstr, score_text);
  }
  gt_str_append_char(outstr, '\t');

  /* strand */
  gt_str_append_char(outstr, GT_STRAND_CHARS[gt_feature_node_get_strand(fn)]);
  gt_str_append_char(outstr, '\t');

  /* phase */
  gt_str_append_char(outstr, GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]);
  gt_str_append_char(outstr, '\t');
}
/* Decide what happens to <current_feature> based on its target attribute
   (TARGET_STRING).

   - If the attribute does not describe exactly one target, the feature is
     appended to <trees> unconditionally.
   - Otherwise a key is built from the feature and its first target id and
     looked up in <target_to_elem>:
       * no entry yet: the feature is included via include_feature();
       * an entry exists and the current feature has a strictly greater score:
         the previous element is replaced via replace_previous_elem();
       * an entry exists and the current feature is not better: the current
         feature is deleted.

   The feature must carry a target attribute and that attribute must parse
   without error (both are asserted). */
static void filter_targetbest(GtFeatureNode *current_feature,
                              GtDlist *trees, GtHashmap *target_to_elem)
{
  unsigned long num_of_targets;
  GtDlistelem *known_elem;
  GtStr *first_target_id,
        *lookup_key;
  const char *target;
  int had_err;

  gt_assert(current_feature && trees);
  target = gt_feature_node_get_attribute(current_feature, TARGET_STRING);
  gt_assert(target);

  first_target_id = gt_str_new();
  had_err = gt_gff3_parser_parse_target_attributes(target, &num_of_targets,
                                                   first_target_id, NULL, NULL,
                                                   "", 0, NULL);
  gt_assert(!had_err);

  /* anything but a single target is kept as is */
  if (num_of_targets != 1) {
    gt_dlist_add(trees, current_feature);
    gt_str_delete(first_target_id);
    return;
  }

  lookup_key = gt_str_new();
  build_key(lookup_key, current_feature, first_target_id);
  known_elem = gt_hashmap_get(target_to_elem, gt_str_get(lookup_key));

  if (known_elem) {
    /* a feature with this key was included already -> compare the two */
    GtFeatureNode *known_feature = gt_dlistelem_get_data(known_elem);
    if (gt_feature_node_get_score(current_feature) >
        gt_feature_node_get_score(known_feature)) {
      /* the newcomer wins -> it takes the place of the known feature */
      replace_previous_elem(known_elem, current_feature, trees,
                            target_to_elem, lookup_key);
    }
    else {
      /* the newcomer does not win -> discard it */
      gt_genome_node_delete((GtGenomeNode*) current_feature);
    }
  }
  else {
    /* first feature seen for this key -> include it */
    include_feature(trees, target_to_elem, current_feature, lookup_key);
  }

  gt_str_delete(lookup_key);
  gt_str_delete(first_target_id);
}
static int create_block_features(GtBEDParser *bed_parser, GtFeatureNode *fn, GtUword block_count, GtSplitter *size_splitter, GtSplitter *start_splitter, GtIO *bed_file, GtError *err) { GtUword i; int had_err = 0; gt_assert(fn && block_count && size_splitter && start_splitter); gt_assert(gt_splitter_size(size_splitter) == block_count); gt_assert(gt_splitter_size(start_splitter) == block_count); for (i = 0; !had_err && i < block_count; i++) { GtUword block_size, block_start, start, end; GtGenomeNode *block; const char *name; if (gt_parse_uword(&block_size, gt_splitter_get_token(size_splitter, i))) { gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockSize '%s'", gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file), gt_splitter_get_token(size_splitter, i)); had_err = -1; } if (!had_err && gt_parse_uword(&block_start, gt_splitter_get_token(start_splitter, i))) { gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockStart " "'%s'", gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file), gt_splitter_get_token(start_splitter, i)); had_err = -1; } if (!had_err) { start = gt_genome_node_get_start((GtGenomeNode*) fn) + block_start; end = start + block_size - 1; block = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) fn), bed_parser->block_type ? bed_parser->block_type : BED_BLOCK_TYPE, start, end, gt_feature_node_get_strand(fn)); if ((name = gt_feature_node_get_attribute(fn, GT_GFF_NAME))) { gt_feature_node_add_attribute((GtFeatureNode*) block, GT_GFF_NAME, name); } gt_feature_node_set_score((GtFeatureNode*) block, gt_feature_node_get_score(fn)); gt_feature_node_set_strand((GtFeatureNode*) block, gt_feature_node_get_strand(fn)); gt_feature_node_add_child(fn, (GtFeatureNode*) block); } } return had_err; }
/* Lua binding: pushes the score of the feature node given as argument 1, or
   nil if the node has no score defined. Raises a Lua argument error ("not a
   feature node") if argument 1 is a genome node that cannot be cast to a
   feature node. Always leaves exactly one result on the Lua stack. */
static int feature_node_lua_get_score(lua_State *L)
{
  GtGenomeNode **node_ref = check_genome_node(L, 1);
  GtFeatureNode *feature;

  /* only feature nodes carry a score */
  feature = gt_feature_node_try_cast(*node_ref);
  luaL_argcheck(L, feature, 1, "not a feature node");

  if (!gt_feature_node_score_is_defined(feature)) {
    lua_pushnil(L);
    return 1;
  }
  lua_pushnumber(L, gt_feature_node_get_score(feature));
  return 1;
}
/* Update the per-type statistics collected in <sv> for the feature <fn>.
   The feature type is tested in the order gene, mRNA, exon, CDS, intron,
   LTR_retrotransposon; only the first matching type is accounted for:

   - gene:   counts the gene (and, if it has a CDS, the protein-coding gene);
             records its length and its score * 100.0 in the corresponding
             distributions, if those are set up.
   - mRNA:   counts the mRNA (and, if it has a CDS, the protein-coding mRNA).
   - exon:   counts the exon and records its length, if the distribution is
             set up.
   - CDS:    counts the CDS.
   - intron: records its length, if the distribution is set up (no counter).
   - LTR_retrotransposon: counts it.

   Features of any other type leave <sv> untouched. */
static void compute_type_statistics(GtFeatureNode *fn, GtStatVisitor *sv)
{
  GtRange span;
  gt_assert(fn && sv);

  if (gt_feature_node_has_type(fn, gt_ft_gene)) {
    sv->number_of_genes++;
    if (gt_feature_node_has_CDS(fn))
      sv->number_of_protein_coding_genes++;
    if (sv->gene_length_distribution) {
      span = gt_genome_node_get_range((GtGenomeNode*) fn);
      gt_disc_distri_add(sv->gene_length_distribution,
                         gt_range_length(&span));
    }
    if (sv->gene_score_distribution) {
      gt_disc_distri_add(sv->gene_score_distribution,
                         gt_feature_node_get_score(fn) * 100.0);
    }
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_mRNA)) {
    sv->number_of_mRNAs++;
    if (gt_feature_node_has_CDS(fn))
      sv->number_of_protein_coding_mRNAs++;
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_exon)) {
    sv->number_of_exons++;
    if (sv->exon_length_distribution) {
      span = gt_genome_node_get_range((GtGenomeNode*) fn);
      gt_disc_distri_add(sv->exon_length_distribution,
                         gt_range_length(&span));
    }
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_CDS)) {
    sv->number_of_CDSs++;
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_intron)) {
    if (sv->intron_length_distribution) {
      span = gt_genome_node_get_range((GtGenomeNode*) fn);
      gt_disc_distri_add(sv->intron_length_distribution,
                         gt_range_length(&span));
    }
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_LTR_retrotransposon))
    sv->number_of_LTR_retrotransposons++;
}
/* Write the leading, tab-terminated columns of the GFF3 line for <fn> to
   <outfp>: seqid, source, type, start, end, score, strand and phase (in that
   order). An undefined score is written as '.'; a defined score is formatted
   with "%.3g". The output ends with a tab after the phase column. */
void gt_gff3_output_leading(GtFeatureNode *fn, GtFile *outfp)
{
  GtGenomeNode *node;
  const char *seqid, *source, *type;
  GtUword first, last;
  gt_assert(fn);
  node = (GtGenomeNode*) fn;

  /* columns up to and including the end position */
  seqid = gt_str_get(gt_genome_node_get_seqid(node));
  source = gt_feature_node_get_source(fn);
  type = gt_feature_node_get_type(fn);
  first = gt_genome_node_get_start(node);
  last = gt_genome_node_get_end(node);
  gt_file_xprintf(outfp, "%s\t%s\t%s\t"GT_WU"\t"GT_WU"\t",
                  seqid, source, type, first, last);

  /* score, or the '.' placeholder if the feature carries none */
  if (!gt_feature_node_score_is_defined(fn))
    gt_file_xfputc('.', outfp);
  else
    gt_file_xprintf(outfp, "%.3g", gt_feature_node_get_score(fn));

  /* strand and phase */
  gt_file_xprintf(outfp, "\t%c\t%c\t",
                  GT_STRAND_CHARS[gt_feature_node_get_strand(fn)],
                  GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]);
}
/* Create a "thick" child feature of <fn> covering <range>.
   The new feature gets the seqid, strand and score of <fn>, the type
   <bed_parser>->thick_feature_type (or BED_THICK_FEATURE_TYPE if that is not
   set) and, if <fn> has a "Name" attribute, the same "Name" value. It is then
   added to <fn> as a child. */
static void construct_thick_feature(GtBEDParser *bed_parser, GtFeatureNode *fn,
                                    GtRange range)
{
  GtFeatureNode *thick;
  GtStrand parent_strand;
  const char *parent_name;
  gt_assert(fn);

  thick = (GtFeatureNode*)
          gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) fn),
                              bed_parser->thick_feature_type
                              ? bed_parser->thick_feature_type
                              : BED_THICK_FEATURE_TYPE,
                              range.start, range.end,
                              gt_feature_node_get_strand(fn));

  /* carry the parent's name over, if it has one */
  parent_name = gt_feature_node_get_attribute(fn, "Name");
  if (parent_name)
    gt_feature_node_add_attribute(thick, "Name", parent_name);

  gt_feature_node_set_score(thick, gt_feature_node_get_score(fn));
  parent_strand = gt_feature_node_get_strand(fn);
  gt_feature_node_set_strand(thick, parent_strand);
  gt_feature_node_add_child(fn, thick);
}
/* Feature node callback of the select visitor: decides whether <fn> passes
   the configured filters. A node that passes is added to the visitor's node
   buffer; a node that is filtered out is deleted. Always returns 0.

   The running feature counter of the visitor is incremented for every node.
   A node is filtered out if
   - a seqid was specified and differs from the node's seqid, or
   - a source was specified and differs from the node's source, or
   - the node is a gene and violates one of the gene constraints (maximum
     length, maximum number of genes, minimum/maximum score, feature number),
     which are tested in that order, or
   - one of the remaining filters (contain range, overlap range, strand,
     target strand, has CDS, minimum average splice site probability) rejects
     it; these are tested in that order and stop at the first rejection.
   The gene counter is incremented for every gene which passes the gene
   constraints. */
static int select_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                       GT_UNUSED GtError *err)
{
  GtSelectVisitor *sel;
  bool matches, drop;
  gt_error_check(err);
  sel = select_visitor_cast(nv);
  sel->current_feature++;

  /* no seqid was specified or seqids are equal */
  matches = !gt_str_length(sel->seqid) ||
            !gt_str_cmp(sel->seqid,
                        gt_genome_node_get_seqid((GtGenomeNode*) fn));
  /* no source was specified or sources are equal (only looked at if the
     seqid condition holds) */
  if (matches) {
    matches = !gt_str_length(sel->source) ||
              !strcmp(gt_str_get(sel->source),
                      gt_feature_node_get_source(fn));
  }

  if (!matches)
    drop = true;
  else {
    GtRange range = gt_genome_node_get_range((GtGenomeNode*) fn);
    drop = false;
    /* gene constraints */
    /* XXX: we (spuriously) assume that genes are always root nodes */
    if (fn && gt_feature_node_has_type(fn, gt_ft_gene)) {
      drop = (sel->max_gene_length != GT_UNDEF_ULONG &&
              gt_range_length(&range) > sel->max_gene_length) ||
             (sel->max_gene_num != GT_UNDEF_ULONG &&
              sel->gene_num >= sel->max_gene_num) ||
             (sel->min_gene_score != GT_UNDEF_DOUBLE &&
              gt_feature_node_get_score(fn) < sel->min_gene_score) ||
             (sel->max_gene_score != GT_UNDEF_DOUBLE &&
              gt_feature_node_get_score(fn) > sel->max_gene_score) ||
             (sel->feature_num != GT_UNDEF_ULONG &&
              sel->feature_num != sel->current_feature);
      if (!drop)
        sel->gene_num++; /* gene passed filter */
    }
  }

  /* remaining filters; the first one which rejects the node ends the chain */
  drop = drop ||
         filter_contain_range(fn, sel->contain_range) ||
         filter_overlap_range(fn, sel->overlap_range) ||
         filter_strand(fn, sel->strand) ||
         filter_targetstrand(fn, sel->targetstrand) ||
         filter_has_CDS(fn, sel->has_CDS) ||
         filter_min_average_ssp(fn, sel->min_average_splice_site_prob);

  if (drop)
    gt_genome_node_delete((GtGenomeNode*) fn);
  else
    gt_queue_add(sel->node_buffer, fn);

  return 0;
}
/* Choose the strand of <lv>->ltr_retrotrans from the protein_match features
   found when iterating over it.

   For every protein_match feature, log(score) is added to a per-strand sum
   (forward or reverse; matches on any other strand are ignored).
   - No forward and no reverse match: nothing is changed.
   - Matches on one strand only: that strand is set.
   - Matches on both strands: forward is chosen if
     gt_double_compare(forward sum, reverse sum) < 0, reverse otherwise
     (presumably this means "forward sum is smaller" -- confirm against
     gt_double_compare()). The chosen strand is set and all protein_match
     features with a different strand are removed from the tree.

   Always returns 0 (<had_err> is never set in this function). */
static int gt_ltrdigest_pdom_visitor_choose_strand(GtLTRdigestPdomVisitor *lv)
{
  int had_err = 0;
  double fwd_log_sum = 0.0,
         rev_log_sum = 0.0;
  bool fwd_found = false,
       rev_found = false;
  GtFeatureNodeIterator *iter;
  GtFeatureNode *node = NULL;
  GtStrand chosen;
  GtArray *losers;
  GtUword idx;

  /* first pass: accumulate log(score) per strand */
  iter = gt_feature_node_iterator_new(lv->ltr_retrotrans);
  while (!had_err && (node = gt_feature_node_iterator_next(iter))) {
    GtStrand node_strand;
    double node_score;
    if (strcmp(gt_feature_node_get_type(node), gt_ft_protein_match) != 0)
      continue;
    node_strand = gt_feature_node_get_strand(node);
    node_score = (double) gt_feature_node_get_score(node);
    if (node_strand == GT_STRAND_FORWARD) {
      fwd_log_sum += log(node_score);
      fwd_found = true;
    }
    else if (node_strand == GT_STRAND_REVERSE) {
      rev_log_sum += log(node_score);
      rev_found = true;
    }
  }
  gt_feature_node_iterator_delete(iter);

  /* no stranded protein match at all -> leave the strand alone */
  if (!fwd_found && !rev_found)
    return had_err;

  /* matches on exactly one strand -> take that strand, nothing to remove */
  if (fwd_found != rev_found) {
    gt_feature_node_set_strand(lv->ltr_retrotrans,
                               fwd_found ? GT_STRAND_FORWARD
                                         : GT_STRAND_REVERSE);
    return had_err;
  }

  /* matches on both strands -> pick one and drop the matches of the other */
  gt_assert(rev_found && fwd_found);
  chosen = gt_double_compare(fwd_log_sum, rev_log_sum) < 0
             ? GT_STRAND_FORWARD
             : GT_STRAND_REVERSE;
  gt_feature_node_set_strand(lv->ltr_retrotrans, chosen);

  /* second pass: collect first, remove afterwards, so that the tree is not
     modified while it is being iterated */
  losers = gt_array_new(sizeof (GtFeatureNode*));
  iter = gt_feature_node_iterator_new(lv->ltr_retrotrans);
  while (!had_err && (node = gt_feature_node_iterator_next(iter))) {
    if (strcmp(gt_feature_node_get_type(node), gt_ft_protein_match) != 0)
      continue;
    if (chosen != gt_feature_node_get_strand(node))
      gt_array_add(losers, node);
  }
  gt_feature_node_iterator_delete(iter);

  gt_assert(gt_array_size(losers) > 0);
  for (idx = 0; idx < gt_array_size(losers); idx++) {
    gt_feature_node_remove_leaf(lv->ltr_retrotrans,
                                *(GtFeatureNode**) gt_array_get(losers, idx));
  }
  gt_array_delete(losers);

  return had_err;
}