/**
 * Collect the parts of a gene model's exons that are covered by an alignment.
 *
 * Both nodes are cast to feature nodes; the gene model is asserted to be of
 * type "mRNA". Each exon selected from the gene model is intersected with
 * every node visited while iterating over the alignment feature, skipping
 * nodes of type "match_gap". Every intersection that differs from the
 * {0, 0} range is appended to the result.
 *
 * @param genemodel  gene model node (must be an mRNA feature)
 * @param alignment  alignment node
 * @return a newly created array of GtRange values, or NULL if the two
 *         features are on different strands. The array is not deleted here;
 *         presumably the caller is responsible for it.
 */
static GtArray* gaeval_visitor_intersect(GtGenomeNode *genemodel,
                                         GtGenomeNode *alignment)
{
  agn_assert(genemodel && alignment);
  GtFeatureNode *genefn = gt_feature_node_cast(genemodel);
  GtFeatureNode *algnfn = gt_feature_node_cast(alignment);
  agn_assert(gt_feature_node_has_type(genefn, "mRNA"));

  GtStrand genestrand = gt_feature_node_get_strand(genefn);
  GtStrand algnstrand = gt_feature_node_get_strand(algnfn);
  if(genestrand != algnstrand)
    return NULL;

  GtArray *covered_parts = gt_array_new( sizeof(GtRange) );
  GtArray *exons = agn_typecheck_select(genefn, agn_typecheck_exon);
  /* Sentinel compared against each intersection; loop-invariant. */
  GtRange nullrange = {0, 0};
  /* Unsigned index: it is compared against gt_array_size() and mixed with
     the unsigned inner index below. */
  GtUword i;
  for(i = 0; i < gt_array_size(exons); i++)
  {
    GtGenomeNode *exon = *(GtGenomeNode **)gt_array_get(exons, i);
    GtRange exonrange = gt_genome_node_get_range(exon);

    GtFeatureNodeIterator *aniter = gt_feature_node_iterator_new(algnfn);
    GtFeatureNode *tempaln;
    for(tempaln  = gt_feature_node_iterator_next(aniter);
        tempaln != NULL;
        tempaln  = gt_feature_node_iterator_next(aniter))
    {
      if(gt_feature_node_has_type(tempaln, "match_gap"))
        continue;

      GtRange alnrange = gt_genome_node_get_range((GtGenomeNode *) tempaln);
      GtRange intr = gaeval_visitor_range_intersect(&exonrange, &alnrange);
      if(gt_range_compare(&intr, &nullrange) != 0)
        gt_array_add(covered_parts, intr);
    }
    gt_feature_node_iterator_delete(aniter);
  }
  gt_array_delete(exons);

  /* Sanity check: no two collected ranges may overlap. */
  for(i = 0; i < gt_array_size(covered_parts); i++)
  {
    GtRange *r1 = gt_array_get(covered_parts, i);
    GtUword j;
    for(j = i+1; j < gt_array_size(covered_parts); j++)
    {
      GtRange *r2 = gt_array_get(covered_parts, j);
      agn_assert(gt_range_overlap(r1, r2) == false);
    }
  }

  return covered_parts;
}
/* Traversal callback: store <fn> in the GTF visitor passed via <data>.
   Exon features are appended to exon_features, CDS features to CDS_features;
   features of any other type are ignored. Always returns 0. */
static int save_exon_node(GtFeatureNode *fn, void *data,
                          GT_UNUSED GtError *err)
{
  GtGTFVisitor *visitor;
  gt_error_check(err);
  gt_assert(fn && data);
  visitor = (GtGTFVisitor*) data;
  if (gt_feature_node_has_type(fn, gt_ft_exon)) {
    gt_array_add(visitor->exon_features, fn);
    return 0;
  }
  if (gt_feature_node_has_type(fn, gt_ft_CDS)) {
    gt_array_add(visitor->CDS_features, fn);
  }
  return 0;
}
/* Traversal callback updating the per-distribution counters of the stat
   visitor passed via <data>: an exon increments exon_number_for_distri, a CDS
   adds its range length to cds_length_for_distri. Other feature types are
   ignored. Always returns 0. */
static int add_exon_or_cds_number(GtFeatureNode *fn, void *data,
                                  GT_UNUSED GtError *err)
{
  GtStatVisitor *stats = (GtStatVisitor*) data;
  gt_error_check(err);
  gt_assert(stats && fn);
  if (gt_feature_node_has_type(fn, gt_ft_exon)) {
    stats->exon_number_for_distri++;
    return 0;
  }
  if (gt_feature_node_has_type(fn, gt_ft_CDS)) {
    GtRange cds_range = gt_genome_node_get_range((GtGenomeNode*) fn);
    stats->cds_length_for_distri += gt_range_length(&cds_range);
  }
  return 0;
}
static int visit_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn, GtError *error) { AgnLocusMapVisitor *v = locus_map_visitor_cast(nv); gt_error_check(error); agn_assert(gt_feature_node_has_type(fn, "locus")); const char *locuslabel = agn_feature_node_get_label(fn); GtFeatureNodeIterator *iter = gt_feature_node_iterator_new(fn); GtFeatureNode *current; for(current = gt_feature_node_iterator_next(iter); current != NULL; current = gt_feature_node_iterator_next(iter)) { if(agn_typecheck_gene(current) && v->genefh != NULL) { const char *genelabel = agn_feature_node_get_label(current); fprintf(v->genefh, "%s\t%s\n", genelabel, locuslabel); } if(agn_typecheck_mrna(current) && v->mrnafh != NULL) { const char *mrnalabel = agn_feature_node_get_label(current); fprintf(v->mrnafh, "%s\t%s\n", mrnalabel, locuslabel); } } gt_feature_node_iterator_delete(iter); return 0; }
/* Dispatch a feature node for GTF output.
   - gene: increments the visitor's gene_id, resets transcript_id to 0 and
     shows the node via gtf_show_transcript().
   - mRNA: shown via gtf_show_transcript() without touching the counters.
   - CDS / exon: nothing is done here.
   - anything else: a warning naming the type, line number and file is issued.
   Returns the result of gtf_show_transcript() where it was called, 0
   otherwise. */
static int gtf_show_feature_node(GtFeatureNode *fn, void *data, GtError *err)
{
  GtGTFVisitor *visitor = (GtGTFVisitor*) data;

  if (gt_feature_node_has_type(fn, gt_ft_gene)) {
    visitor->gene_id++;
    visitor->transcript_id = 0;
    return gtf_show_transcript(fn, visitor, err);
  }

  if (gt_feature_node_has_type(fn, gt_ft_mRNA))
    return gtf_show_transcript(fn, visitor, err);

  if (gt_feature_node_has_type(fn, gt_ft_CDS) ||
      gt_feature_node_has_type(fn, gt_ft_exon)) {
    return 0;
  }

  gt_warning("skipping GFF3 feature of type \"%s\" (from line %u in file "
             "\"%s\")",
             gt_feature_node_get_type(fn),
             gt_genome_node_get_line_number((GtGenomeNode*) fn),
             gt_genome_node_get_filename((GtGenomeNode*) fn));
  return 0;
}
/**
 * Assign a concrete UTR type to every entry of v->utrs that is not already
 * typed "five_prime_UTR" or "three_prime_UTR".
 *
 * Nothing is done unless v->starts holds exactly one node; that node's start
 * coordinate serves as the reference position. On the forward strand a UTR
 * starting before the reference becomes a 5' UTR and any other a 3' UTR; on
 * every other strand value the assignment is reversed.
 */
static void infer_cds_visitor_set_utrs(AgnInferCDSVisitor *v)
{
  if(v->starts == NULL || gt_array_size(v->starts) != 1)
    return;

  GtGenomeNode **first = gt_array_get(v->starts, 0);
  GtUword cds_start = gt_genome_node_get_start(*first);

  GtUword n;
  for(n = 0; n < gt_array_size(v->utrs); n++)
  {
    GtFeatureNode *utr = *(GtFeatureNode **)gt_array_get(v->utrs, n);
    GtStrand strand = gt_feature_node_get_strand(utr);
    GtUword utr_start = gt_genome_node_get_start((GtGenomeNode *)utr);

    /* Already has a specific UTR type: leave it alone. */
    if(gt_feature_node_has_type(utr, "five_prime_UTR") ||
       gt_feature_node_has_type(utr, "three_prime_UTR"))
    {
      continue;
    }

    int is_forward = (strand == GT_STRAND_FORWARD);
    int before_cds = (utr_start < cds_start);
    /* 5' exactly when "forward strand" and "starts before the reference"
       agree (both true or both false). */
    if(is_forward == before_cds)
      gt_feature_node_set_type(utr, "five_prime_UTR");
    else
      gt_feature_node_set_type(utr, "three_prime_UTR");
  }
}
/* Gather the CDS features among the direct children of <fn> and check their
   phases.
   Non-multi CDS features are collected in one array and checked with
   check_cds_phases(). Multi-features are grouped per multi-representative in
   a hashmap and, if the first check did not fail, each group is checked via
   check_cds_phases_hm(). Both containers are created lazily and deleted
   before returning. Returns the error status of the checks (0 if no CDS was
   found). */
static int check_cds_phases_if_necessary(GtFeatureNode *fn,
                                         GtCDSCheckVisitor *v,
                                         bool second_pass, GtError *err)
{
  GtFeatureNodeIterator *child_iter;
  GtFeatureNode *child;
  GtArray *plain_cds = NULL;
  GtHashmap *multi_cds = NULL;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(fn);
  child_iter = gt_feature_node_iterator_new_direct(fn);
  for (child = gt_feature_node_iterator_next(child_iter);
       child != NULL;
       child = gt_feature_node_iterator_next(child_iter)) {
    GtFeatureNode *representative;
    GtArray *group;

    if (!gt_feature_node_has_type(child, gt_ft_CDS))
      continue;

    if (!gt_feature_node_is_multi(child)) {
      if (!plain_cds)
        plain_cds = gt_array_new(sizeof (GtFeatureNode*));
      gt_array_add(plain_cds, child);
      continue;
    }

    /* multi-feature: file it under its representative */
    if (!multi_cds) {
      multi_cds = gt_hashmap_new(GT_HASH_DIRECT, NULL,
                                 (GtFree) gt_array_delete);
    }
    representative = gt_feature_node_get_multi_representative(child);
    group = gt_hashmap_get(multi_cds, representative);
    if (group) {
      gt_array_add(group, child);
    }
    else {
      /* a new group starts with the representative itself */
      group = gt_array_new(sizeof (GtFeatureNode*));
      gt_array_add(group, representative);
      gt_hashmap_add(multi_cds, representative, group);
    }
  }
  if (plain_cds)
    had_err = check_cds_phases(plain_cds, v, false, second_pass, err);
  if (!had_err && multi_cds)
    had_err = gt_hashmap_foreach(multi_cds, check_cds_phases_hm, v, err);
  gt_array_delete(plain_cds);
  gt_hashmap_delete(multi_cds);
  gt_feature_node_iterator_delete(child_iter);
  return had_err;
}
/**
 * Compute the fraction of a gene model's combined exon length that is covered
 * by the given ranges.
 *
 * @param genemodel      feature node asserted to be of type "mRNA"
 * @param exon_coverage  array of GtRange values whose lengths are summed
 * @return covered length divided by the combined exon length; 0.0 if the
 *         combined exon length is zero (avoids a 0/0 division producing NaN)
 */
static double gaeval_visitor_coverage_resolve(GtFeatureNode *genemodel,
                                              GtArray *exon_coverage)
{
  agn_assert(genemodel && exon_coverage);
  agn_assert(gt_feature_node_has_type(genemodel, "mRNA"));

  GtUword cum_exon_length =
      agn_typecheck_feature_combined_length(genemodel, agn_typecheck_exon);
  GtUword i, covered = 0;
  for(i = 0; i < gt_array_size(exon_coverage); i++)
  {
    GtRange *range = gt_array_get(exon_coverage, i);
    covered += gt_range_length(range);
  }
  agn_assert(covered <= cum_exon_length);

  /* Nothing to cover: report zero coverage instead of dividing by zero. */
  if(cum_exon_length == 0)
    return 0.0;

  return (double)covered / (double)cum_exon_length;
}
/* Update the type-specific counters and length/score distributions of <sv>
   for the feature <fn>.
   Handled types: gene, mRNA, exon, CDS, intron and LTR_retrotransposon; other
   types leave <sv> untouched. A distribution is only fed when its pointer in
   <sv> is set. Gene scores are added multiplied by 100.0. */
static void compute_type_statistics(GtFeatureNode *fn, GtStatVisitor *sv)
{
  GtRange span;
  gt_assert(fn && sv);

  if (gt_feature_node_has_type(fn, gt_ft_gene)) {
    sv->number_of_genes++;
    if (gt_feature_node_has_CDS(fn))
      sv->number_of_protein_coding_genes++;
    if (sv->gene_length_distribution) {
      span = gt_genome_node_get_range((GtGenomeNode*) fn);
      gt_disc_distri_add(sv->gene_length_distribution,
                         gt_range_length(&span));
    }
    if (sv->gene_score_distribution) {
      gt_disc_distri_add(sv->gene_score_distribution,
                         gt_feature_node_get_score(fn) * 100.0);
    }
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_mRNA)) {
    sv->number_of_mRNAs++;
    if (gt_feature_node_has_CDS(fn))
      sv->number_of_protein_coding_mRNAs++;
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_exon)) {
    sv->number_of_exons++;
    if (sv->exon_length_distribution) {
      span = gt_genome_node_get_range((GtGenomeNode*) fn);
      gt_disc_distri_add(sv->exon_length_distribution,
                         gt_range_length(&span));
    }
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_CDS)) {
    sv->number_of_CDSs++;
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_intron)) {
    if (sv->intron_length_distribution) {
      span = gt_genome_node_get_range((GtGenomeNode*) fn);
      gt_disc_distri_add(sv->intron_length_distribution,
                         gt_range_length(&span));
    }
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_LTR_retrotransposon))
    sv->number_of_LTR_retrotransposons++;
}
/* If <gn> is a feature of the given <type>, fetch its sequence through
   <region_mapping> and append it to <sequence>.
   Side effects on the out-parameters for a matching feature:
   - reverse strand: *reverse_strand is set and *phase takes this feature's
     phase;
   - otherwise: the first matching feature sets *first_child_of_type_seen and
     stores its phase in *phase, every later one sets *phase to
     GT_PHASE_UNDEFINED.
   Returns 0 for non-matching features, else the status of the sequence
   lookup. */
static int extract_join_feature(GtGenomeNode *gn, const char *type,
                                GtRegionMapping *region_mapping,
                                GtStr *sequence, bool *reverse_strand,
                                bool *first_child_of_type_seen,
                                GtPhase *phase, GtError *err)
{
  char *raw_seq;
  GtFeatureNode *fn;
  GtRange range;
  int had_err;

  gt_error_check(err);
  fn = gt_feature_node_cast(gn);
  gt_assert(fn);

  if (!gt_feature_node_has_type(fn, type))
    return 0;

  if (gt_feature_node_get_strand(fn) == GT_STRAND_REVERSE) {
    *reverse_strand = true;
    *phase = gt_feature_node_get_phase(fn);
  }
  else if (*first_child_of_type_seen) {
    *phase = GT_PHASE_UNDEFINED;
  }
  else {
    *first_child_of_type_seen = true;
    *phase = gt_feature_node_get_phase(fn);
  }

  range = gt_genome_node_get_range(gn);
  had_err = gt_region_mapping_get_sequence(region_mapping, &raw_seq,
                                           gt_genome_node_get_seqid(gn),
                                           range.start, range.end, err);
  if (had_err)
    return had_err;

  gt_str_append_cstr_nt(sequence, raw_seq, gt_range_length(&range));
  gt_free(raw_seq);
  return had_err;
}
/* Traversal callback that places an inter-feature between two consecutive
   features of the visitor's outside_type.
   For each feature of that type after the first, a new feature of inter_type
   spanning the gap between the previous and the current feature is created on
   the previous feature's strand and added as a child of parent_feature. If
   the two features overlap/touch, or the gap is empty, a warning is issued and
   the callback returns early WITHOUT updating previous_feature. Always returns
   0. */
static int inter_feature_in_children(GtFeatureNode *current_feature,
                                     void *data, GT_UNUSED GtError *err)
{
  GtInterFeatureVisitor *visitor = (GtInterFeatureVisitor*) data;
  GtFeatureNode *gap_node;
  GtRange prev, cur, gap;
  GtStrand prev_strand;
  GtStr *seqid;
  gt_error_check(err);
  gt_assert(current_feature);

  if (!gt_feature_node_has_type(current_feature, visitor->outside_type))
    return 0;

  if (!visitor->previous_feature) {
    /* first boundary feature: nothing to place yet, just remember it */
    visitor->previous_feature = current_feature;
    return 0;
  }

  /* determine inter range */
  prev = gt_genome_node_get_range((GtGenomeNode*) visitor->previous_feature);
  cur = gt_genome_node_get_range((GtGenomeNode*) current_feature);
  if (prev.end >= cur.start) {
    gt_warning("overlapping boundary features " GT_WU "-" GT_WU " and "
               GT_WU "-" GT_WU ", " "not placing '%s' inter-feature",
               prev.start, prev.end, cur.start, cur.end,
               visitor->inter_type);
    return 0;
  }
  if (cur.start - prev.end < 2) {
    gt_warning("no space for inter-feature '%s' between " GT_WU " and "
               GT_WU, visitor->inter_type, prev.end, cur.start);
    return 0;
  }
  gap.start = prev.end + 1;
  gap.end = cur.start - 1;

  /* determine inter strand */
  prev_strand = gt_feature_node_get_strand(visitor->previous_feature);
  gt_assert(prev_strand == gt_feature_node_get_strand(current_feature));

  /* determine sequence id */
  seqid = gt_genome_node_get_seqid((GtGenomeNode*) visitor->parent_feature);
  gt_assert(!gt_str_cmp(seqid,
            gt_genome_node_get_seqid((GtGenomeNode*)
                                     visitor->previous_feature)));
  gt_assert(!gt_str_cmp(seqid,
            gt_genome_node_get_seqid((GtGenomeNode*) current_feature)));

  /* create inter feature */
  gap_node = (GtFeatureNode*)
             gt_feature_node_new(seqid, visitor->inter_type, gap.start,
                                 gap.end, prev_strand);
  gt_feature_node_add_child(visitor->parent_feature, gap_node);

  visitor->previous_feature = current_feature;
  return 0;
}
/* Decide whether the feature node <fn> passes the select visitor's filters.
   The visitor's current_feature counter is incremented for every node. A node
   is discarded if its seqid or source does not match a specified one, if it
   is a gene violating the gene-specific limits (length, count, score bounds,
   feature number), or if any of the generic filters (contain/overlap range,
   strand, target strand, has-CDS, minimum average splice site probability)
   rejects it. Discarded nodes are deleted, all others are appended to the
   visitor's node buffer. Always returns 0. */
static int select_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                       GT_UNUSED GtError *err)
{
  GtSelectVisitor *sel;
  bool selected, discard;
  gt_error_check(err);
  sel = select_visitor_cast(nv);
  sel->current_feature++;

  /* no seqid was specified or seqids are equal ... */
  selected = !gt_str_length(sel->seqid) ||
             !gt_str_cmp(sel->seqid,
                         gt_genome_node_get_seqid((GtGenomeNode*) fn));
  /* ... and no source was specified or sources are equal */
  selected = selected &&
             (!gt_str_length(sel->source) ||
              !strcmp(gt_str_get(sel->source),
                      gt_feature_node_get_source(fn)));

  if (!selected) {
    discard = true;
  }
  else {
    GtRange range = gt_genome_node_get_range((GtGenomeNode*) fn);
    discard = false;
    /* enforce the gene-specific limits */
    /* XXX: we (spuriously) assume that genes are always root nodes */
    /* NOTE(review): <fn> was already handed to gt_genome_node_get_range()
       above, so the NULL test below looks redundant -- kept as is. */
    if (fn && gt_feature_node_has_type(fn, gt_ft_gene)) {
      /* evaluated left to right, stopping at the first violated limit */
      discard = (sel->max_gene_length != GT_UNDEF_ULONG &&
                 gt_range_length(&range) > sel->max_gene_length) ||
                (sel->max_gene_num != GT_UNDEF_ULONG &&
                 sel->gene_num >= sel->max_gene_num) ||
                (sel->min_gene_score != GT_UNDEF_DOUBLE &&
                 gt_feature_node_get_score(fn) < sel->min_gene_score) ||
                (sel->max_gene_score != GT_UNDEF_DOUBLE &&
                 gt_feature_node_get_score(fn) > sel->max_gene_score) ||
                (sel->feature_num != GT_UNDEF_ULONG &&
                 sel->feature_num != sel->current_feature);
      if (!discard)
        sel->gene_num++; /* gene passed filter */
    }
  }

  /* generic filters; each one only runs if nothing rejected the node yet */
  discard = discard ||
            filter_contain_range(fn, sel->contain_range) ||
            filter_overlap_range(fn, sel->overlap_range) ||
            filter_strand(fn, sel->strand) ||
            filter_targetstrand(fn, sel->targetstrand) ||
            filter_has_CDS(fn, sel->has_CDS) ||
            filter_min_average_ssp(fn, sel->min_average_splice_site_prob);

  if (discard)
    gt_genome_node_delete((GtGenomeNode*) fn);
  else
    gt_queue_add(sel->node_buffer, fn);

  return 0;
}
/**
 * Type check predicate: true if the feature node has type "match_gap".
 */
static bool gaeval_visitor_typecheck_gap(GtFeatureNode *fn)
{
  const char *gaptype = "match_gap";
  return gt_feature_node_has_type(fn, gaptype);
}
// Stream "next" callback: pull the next node from the input stream, pass it on
// through *gn and, if it is (or, for a pseudo node, contains) a gene feature,
// annotate that gene with the attribute "cpgi_at_tss".
//
// A gene is only annotated when it has a "Name" attribute, its seqid parses as
// "Chr<number>" and CpGIOverlap_stream_find_gene_overlap() returns a name for
// its TSS (start on the forward strand, end otherwise). In every other case
// the node is passed through unchanged.
//
// Returns 0 on success (with *gn == NULL when the input stream is exhausted),
// or the non-zero status of the input stream if fetching the node failed.
static int CpGIOverlap_stream_next(GtNodeStream * ns,
                                   GtGenomeNode ** gn,
                                   GtError * err)
{
    GtGenomeNode * cur_node = NULL, * next_node;
    GtFeatureNodeIterator * iter;
    CpGIOverlap_stream * context;
    const char * gene_name;
    const char * overlap_name;
    int chr_num;
    unsigned int TSS;
    int err_num;

    *gn = NULL;
    context = CpGIOverlap_stream_cast(ns);

    // fetch the next node; report upstream failures instead of hiding them
    err_num = gt_node_stream_next(context->in_stream, &cur_node, err);
    if (err_num || cur_node == NULL)
        return err_num;

    *gn = cur_node;

    // try casting as a feature node so we can test type
    if (!gt_genome_node_try_cast(gt_feature_node_class(), cur_node))
        return 0;

    // NOTE(review): cur_node/next_node are GtGenomeNode pointers handed to
    // gt_feature_node_* functions without an explicit cast -- confirm this
    // matches the prototypes in use.

    // first check if it is a pseudo node, if so find the gene in it if available
    if (gt_feature_node_is_pseudo(cur_node))
    {
        iter = gt_feature_node_iterator_new(cur_node);
        if (iter == NULL)
            return 0;
        do
        {
            next_node = gt_feature_node_iterator_next(iter);
        } while (next_node != NULL &&
                 !gt_feature_node_has_type(next_node, feature_type_gene));
        gt_feature_node_iterator_delete(iter);
        cur_node = next_node;
        if (cur_node == NULL)
            return 0;
    }

    if (!gt_feature_node_has_type(cur_node, feature_type_gene))
        return 0;

    // genes without a name are passed through untouched
    gene_name = gt_feature_node_get_attribute(cur_node, "Name");
    if (gene_name == NULL)
        return 0;

    if (1 != sscanf(gt_str_get(gt_genome_node_get_seqid(cur_node)),
                    "Chr%d", &chr_num))
        return 0;

    TSS = (gt_feature_node_get_strand(cur_node) == GT_STRAND_FORWARD)
              ? gt_genome_node_get_start(cur_node)
              : gt_genome_node_get_end(cur_node);

    // now figure out the overlapping gene
    overlap_name = CpGIOverlap_stream_find_gene_overlap(context, TSS, chr_num);
    if (!overlap_name)
        return 0;

    // save the overlap name into the node
    gt_feature_node_set_attribute(cur_node, "cpgi_at_tss", overlap_name);

    return 0;
}
// Stream "next" callback: pull the next node from the input stream, pass it on
// through *gn and, if it is a CpGI feature, set its score to the value
// computed by CpGI_score_stream_score_island().
//
// A CpGI feature is only scored when its seqid parses as "Chr<number>" and it
// carries a "sumcg" attribute; an unparsable "sumcg" value counts as 0. In
// every other case the node is passed through unchanged.
//
// Returns 0 on success (with *gn == NULL when the input stream is exhausted),
// or the non-zero status of the input stream if fetching the node failed.
static int CpGI_score_stream_next(GtNodeStream * ns,
                                  GtGenomeNode ** gn,
                                  GtError * err)
{
    GtGenomeNode * cur_node = NULL;
    CpGI_score_stream * score_stream;
    unsigned long island_start;
    unsigned long island_end;
    float island_score;
    int chromosome_num;
    GtStr * seqID_gtstr;
    const char * seqID_str;
    const char * num_cg_str;
    unsigned long num_cg = 0;
    int err_num;

    *gn = NULL;
    score_stream = CpGI_score_stream_cast(ns);

    // fetch the next node; report upstream failures instead of hiding them
    err_num = gt_node_stream_next(score_stream->in_stream, &cur_node, err);
    if (err_num || cur_node == NULL)
        return err_num;

    *gn = cur_node;

    // try casting as a feature node so we can test type
    if (!gt_genome_node_try_cast(gt_feature_node_class(), cur_node))
        return 0;

    if (!gt_feature_node_has_type(cur_node, feature_type_CpGI))
        return 0;
#if DEBUG_SCORE
    printf("found CpGI\n");
#endif

    island_start = gt_genome_node_get_start(cur_node);
    island_end = gt_genome_node_get_end(cur_node);

    // without a parsable chromosome number there is nothing to look up
    seqID_gtstr = gt_genome_node_get_seqid(cur_node);
    seqID_str = gt_str_get(seqID_gtstr);
    if (1 != sscanf(seqID_str, "Chr%d", &chromosome_num))
        return 0;

    num_cg_str = gt_feature_node_get_attribute(cur_node, "sumcg");
    if (!num_cg_str)
        return 0;
    // %lu matches the unsigned long destination
    if (1 != sscanf(num_cg_str, "%lu", &num_cg))
        num_cg = 0;

    // now figure out the score
    island_score = CpGI_score_stream_score_island(score_stream,
                                                  chromosome_num,
                                                  num_cg,
                                                  island_start,
                                                  island_end);

    // save the score into the node
    gt_feature_node_set_score(cur_node, island_score);

    return 0;
}