/* Warn (via gt_warning()) if the range of <child> is not contained in the
   range of <parent>, then recurse into the direct children of <child>, using
   <child> as their parent.
   Returns the status of the last recursive call, or 0 if <child> has no
   direct children. */
static int check_boundaries_visitor_check_rec(GtFeatureNode *parent,
                                              GtFeatureNode *child,
                                              GtError *err)
{
  GtGenomeNode *child_gn = (GtGenomeNode*) child,
               *parent_gn = (GtGenomeNode*) parent;
  GtRange child_rng = gt_genome_node_get_range(child_gn),
          parent_rng = gt_genome_node_get_range(parent_gn);
  GtFeatureNodeIterator *direct_children;
  GtFeatureNode *grandchild;
  int status = 0;

  /* the child sticks out of the parent on the left or on the right */
  if (parent_rng.start > child_rng.start || parent_rng.end < child_rng.end) {
    gt_warning("%s child range " GT_WU "-" GT_WU " (file %s, line %u) not "
               "contained in %s parent range " GT_WU "-" GT_WU " (file %s, "
               "line %u)",
               gt_feature_node_get_type(child),
               child_rng.start, child_rng.end,
               gt_genome_node_get_filename(child_gn),
               gt_genome_node_get_line_number(child_gn),
               gt_feature_node_get_type(parent),
               parent_rng.start, parent_rng.end,
               gt_genome_node_get_filename(parent_gn),
               gt_genome_node_get_line_number(parent_gn));
  }

  /* descend one level: <child> becomes the parent of its direct children */
  direct_children = gt_feature_node_iterator_new_direct(child);
  for (grandchild = gt_feature_node_iterator_next(direct_children);
       grandchild != NULL;
       grandchild = gt_feature_node_iterator_next(direct_children)) {
    status = check_boundaries_visitor_check_rec(child, grandchild, err);
  }
  gt_feature_node_iterator_delete(direct_children);

  return status;
}
/* Return the fraction of entries in <introns> whose range compares equal
   (gt_range_compare() == 0) to the range of at least one entry in <gaps>.
   Both arrays are read as arrays of GtGenomeNode pointers. <introns> must not
   be empty; if <gaps> is empty the result is 0.0. */
static double gaeval_visitor_introns_confirmed(GtArray *introns, GtArray *gaps)
{
  agn_assert(introns && gaps);
  GtUword num_introns = gt_array_size(introns);
  GtUword num_gaps = gt_array_size(gaps);
  agn_assert(num_introns > 0);
  if(num_gaps == 0)
    return 0.0;

  GtUword confirmed = 0;
  GtUword intron_idx = 0;
  while(intron_idx < num_introns)
  {
    GtGenomeNode **intron = gt_array_get(introns, intron_idx);
    GtRange intron_range = gt_genome_node_get_range(*intron);

    GtUword gap_idx;
    for(gap_idx = 0; gap_idx < num_gaps; gap_idx++)
    {
      GtGenomeNode **gap = gt_array_get(gaps, gap_idx);
      GtRange gap_range = gt_genome_node_get_range(*gap);
      if(gt_range_compare(&intron_range, &gap_range) != 0)
        continue;

      /* one matching gap is enough; count each intron at most once */
      confirmed++;
      break;
    }
    intron_idx++;
  }

  return (double)confirmed / (double)num_introns;
}
/* Compute the edit distance between transcripts <t1> and <t2>: each is
   wrapped in its own single-member transcript clique, the two cliques are
   combined into a clique pair spanning the union of both transcript ranges
   (with the sequence ID of <t1>), and the pair's comparative analysis yields
   the returned distance. All temporary objects are deleted before return. */
double agn_calc_edit_distance(GtFeatureNode *t1, GtFeatureNode *t2)
{
  GtGenomeNode *first = (GtGenomeNode *)t1;
  GtGenomeNode *second = (GtGenomeNode *)t2;

  AgnTranscriptClique *clique1 = agn_transcript_clique_new();
  agn_transcript_clique_add(clique1, t1);
  AgnTranscriptClique *clique2 = agn_transcript_clique_new();
  agn_transcript_clique_add(clique2, t2);

  /* smallest range covering both transcripts */
  GtRange range1 = gt_genome_node_get_range(first);
  GtRange range2 = gt_genome_node_get_range(second);
  GtRange span = range1;
  span.start = (range2.start < range1.start) ? range2.start : range1.start;
  span.end = (range2.end > range1.end) ? range2.end : range1.end;

  GtStr *seqid = gt_genome_node_get_seqid(first);
  AgnCliquePair *pair = agn_clique_pair_new(gt_str_get(seqid), clique1,
                                            clique2, &span);
  agn_clique_pair_build_model_vectors(pair);
  agn_clique_pair_comparative_analysis(pair);
  double distance = agn_clique_pair_get_edit_distance(pair);

  agn_transcript_clique_delete(clique1);
  agn_transcript_clique_delete(clique2);
  agn_clique_pair_delete(pair);
  return distance;
}
/* Deliver the next node of the sorted stream in <*gn>.
   On the first call the complete input stream is read into the node buffer
   (EOF nodes are deleted instead of buffered) and the buffer is sorted
   stably. Each call then hands out the next buffered node; if that node is a
   region node, all directly following region nodes with the same sequence ID
   are merged into it (ranges joined, merged nodes deleted).
   Once the buffer is exhausted it is reset and <*gn> is set to NULL.
   Returns 0 on success; if reading the input stream fails, its error code is
   returned and <*gn> is left untouched. */
static int gt_sort_stream_next(GtNodeStream *ns, GtGenomeNode **gn,
                               GtError *err)
{
  GtSortStream *sort_stream;
  GtGenomeNode *node, *eofn;
  GtRange joined, other;
  int had_err = 0;
  gt_error_check(err);
  sort_stream = gt_sort_stream_cast(ns);

  /* first call: slurp and sort the whole input */
  if (!sort_stream->sorted) {
    for (;;) {
      had_err = gt_node_stream_next(sort_stream->in_stream, &node, err);
      if (had_err || !node)
        break;
      eofn = gt_eof_node_try_cast(node);
      if (eofn)
        gt_genome_node_delete(eofn); /* get rid of EOF nodes */
      else
        gt_array_add(sort_stream->nodes, node);
    }
    if (had_err)
      return had_err;
    gt_genome_nodes_sort_stable(sort_stream->nodes);
    sort_stream->sorted = true;
  }
  gt_assert(sort_stream->sorted);

  /* nothing left to hand out */
  if (sort_stream->idx >= gt_array_size(sort_stream->nodes)) {
    gt_array_reset(sort_stream->nodes);
    *gn = NULL;
    return 0;
  }

  *gn = *(GtGenomeNode**) gt_array_get(sort_stream->nodes, sort_stream->idx);
  sort_stream->idx++;

  /* join region nodes with the same sequence ID */
  if (gt_region_node_try_cast(*gn)) {
    while (sort_stream->idx < gt_array_size(sort_stream->nodes)) {
      node = *(GtGenomeNode**) gt_array_get(sort_stream->nodes,
                                            sort_stream->idx);
      /* stop at the first node that is not a region node ... */
      if (!gt_region_node_try_cast(node))
        break;
      /* ... or that belongs to a different sequence */
      if (gt_str_cmp(gt_genome_node_get_seqid(*gn),
                     gt_genome_node_get_seqid(node)) != 0)
        break;
      joined = gt_genome_node_get_range(*gn);
      other = gt_genome_node_get_range(node);
      joined = gt_range_join(&joined, &other);
      gt_genome_node_set_range(*gn, &joined);
      gt_genome_node_delete(node);
      sort_stream->idx++;
    }
  }
  return 0;
}
/* Determine which parts of the exons of <genemodel> (which must be an mRNA
   feature) are covered by <alignment>.
   Returns NULL if the two features are on different strands. Otherwise
   returns a new array of GtRange values, one for every non-null intersection
   between an exon and a node visited by iterating over <alignment>, skipping
   nodes of type "match_gap". The caller owns the returned array.
   The collected ranges are asserted to be pairwise non-overlapping. */
static GtArray* gaeval_visitor_intersect(GtGenomeNode *genemodel,
                                         GtGenomeNode *alignment)
{
  agn_assert(genemodel && alignment);

  GtFeatureNode *genefn = gt_feature_node_cast(genemodel);
  GtFeatureNode *algnfn = gt_feature_node_cast(alignment);
  agn_assert(gt_feature_node_has_type(genefn, "mRNA"));
  GtStrand genestrand = gt_feature_node_get_strand(genefn);
  GtStrand algnstrand = gt_feature_node_get_strand(algnfn);
  if(genestrand != algnstrand)
    return NULL;

  GtArray *covered_parts = gt_array_new( sizeof(GtRange) );
  GtArray *exons = agn_typecheck_select(genefn, agn_typecheck_exon);
  /* unsigned index: it is compared against the (unsigned) array size and
     used to seed the unsigned inner index below */
  GtUword i;
  for(i = 0; i < gt_array_size(exons); i++)
  {
    GtGenomeNode *exon = *(GtGenomeNode **)gt_array_get(exons, i);
    GtRange exonrange = gt_genome_node_get_range(exon);

    GtFeatureNodeIterator *aniter = gt_feature_node_iterator_new(algnfn);
    GtFeatureNode *tempaln;
    /* gaeval_visitor_range_intersect() result is compared against this
       range to detect "no intersection" */
    GtRange nullrange = {0, 0};
    for(tempaln  = gt_feature_node_iterator_next(aniter);
        tempaln != NULL;
        tempaln  = gt_feature_node_iterator_next(aniter))
    {
      if(gt_feature_node_has_type(tempaln, "match_gap"))
        continue;

      GtRange alnrange = gt_genome_node_get_range((GtGenomeNode *) tempaln);
      GtRange intr = gaeval_visitor_range_intersect(&exonrange, &alnrange);
      if(gt_range_compare(&intr, &nullrange) != 0)
        gt_array_add(covered_parts, intr);
    }
    gt_feature_node_iterator_delete(aniter);
  }
  gt_array_delete(exons);

  /* sanity check: no two covered parts may overlap */
  for(i = 0; i < gt_array_size(covered_parts); i++)
  {
    GtRange *r1 = gt_array_get(covered_parts, i);
    GtUword j;
    for(j = i+1; j < gt_array_size(covered_parts); j++)
    {
      GtRange *r2 = gt_array_get(covered_parts, j);
      agn_assert(gt_range_overlap(r1, r2) == false);
    }
  }

  return covered_parts;
}
/* Extend the range of <rn_a> so that it also covers the range of <rn_b>
   (the two ranges are joined with gt_range_join()). Both region nodes must
   have the same sequence ID; <rn_b> is not modified. */
void gt_region_node_consolidate(GtRegionNode *rn_a, GtRegionNode *rn_b)
{
  GtGenomeNode *gn_a = (GtGenomeNode*) rn_a,
               *gn_b = (GtGenomeNode*) rn_b;
  GtRange joined, addend;

  gt_assert(rn_a);
  gt_assert(rn_b);
  gt_assert(!gt_str_cmp(gt_genome_node_get_seqid(gn_a),
                        gt_genome_node_get_seqid(gn_b)));

  joined = gt_genome_node_get_range(gn_a);
  addend = gt_genome_node_get_range(gn_b);
  joined = gt_range_join(&joined, &addend);
  gt_genome_node_set_range(gn_a, &joined);
}
/* Write the exon structure of <transcript> to <outstream> as a location
   string: a single exon is printed as "<start..>end", several exons as
   "join(<s1..e1,s2..e2,...,sN..>eN)". For transcripts on the reverse strand
   the whole expression is wrapped in "complement(...)".
   Exons are the direct children of <transcript> accepted by
   agn_gt_feature_node_is_exon_feature(); at least one must exist. They are
   sorted with agn_gt_genome_node_compare() before printing. */
void agn_transcript_structure_gbk(GtFeatureNode *transcript, FILE *outstream)
{
  gt_assert(transcript && outstream);

  /* collect the exon children; the array only borrows the node pointers */
  GtArray *exons = gt_array_new( sizeof(GtFeatureNode *) );
  GtFeatureNodeIterator *iter = gt_feature_node_iterator_new_direct(transcript);
  GtFeatureNode *child;
  for
  (
    child = gt_feature_node_iterator_next(iter);
    child != NULL;
    child = gt_feature_node_iterator_next(iter)
  )
  {
    if(agn_gt_feature_node_is_exon_feature(child))
      gt_array_add(exons, child);
  }
  gt_feature_node_iterator_delete(iter);

  gt_assert(gt_array_size(exons) > 0);
  gt_array_sort(exons, (GtCompare)agn_gt_genome_node_compare);

  bool reverse = gt_feature_node_get_strand(transcript) == GT_STRAND_REVERSE;
  GtUword numexons = gt_array_size(exons);

  if(reverse)
    fputs("complement(", outstream);

  if(numexons == 1)
  {
    GtGenomeNode *exon = *(GtGenomeNode **)gt_array_get(exons, 0);
    GtRange exonrange = gt_genome_node_get_range(exon);
    fprintf(outstream, "<%lu..>%lu", exonrange.start, exonrange.end);
  }
  else
  {
    fputs("join(", outstream);
    GtUword i;
    for(i = 0; i < numexons; i++)
    {
      GtGenomeNode *exon = *(GtGenomeNode **)gt_array_get(exons, i);
      GtRange exonrange = gt_genome_node_get_range(exon);

      if(i == 0)
        fprintf(outstream, "<%lu..%lu", exonrange.start, exonrange.end);
      else if(i+1 == numexons)
        fprintf(outstream, ",%lu..>%lu", exonrange.start, exonrange.end);
      else
        fprintf(outstream, ",%lu..%lu", exonrange.start, exonrange.end);
    }
    fputs(")", outstream);
  }

  if(reverse)
    fputs(")", outstream);

  /* release the temporary exon list (previously leaked) */
  gt_array_delete(exons);
}
/* Infer CDS features for the mRNA currently held by the visitor <v>.
   Nothing is done if the mRNA already has CDS features, has no exons, or
   does not have exactly one start codon and exactly one stop codon.
   Otherwise, for every exon for which infer_cds_visitor_infer_range() reports
   a CDS range between the two codons, a new "CDS" feature (exon seqid, exon
   strand, visitor source if set) is added as a child of the mRNA and appended
   to v->cds. */
static void infer_cds_visitor_infer_cds(AgnInferCDSVisitor *v)
{
  /* explicit CDS present: nothing to infer */
  if(gt_array_size(v->cds) > 0)
    return;
  /* inference needs exons plus exactly one start and one stop codon */
  if(gt_array_size(v->exons) == 0)
    return;
  if(gt_array_size(v->starts) != 1 || gt_array_size(v->stops) != 1)
    return;

  GtFeatureNode **start_codon = gt_array_get(v->starts, 0);
  GtFeatureNode **stop_codon = gt_array_get(v->stops, 0);
  if(start_codon == NULL || stop_codon == NULL)
    return;

  /* on the reverse strand the stop codon is the left one */
  GtRange start_range = gt_genome_node_get_range(*(GtGenomeNode **)start_codon);
  GtRange stop_range = gt_genome_node_get_range(*(GtGenomeNode **)stop_codon);
  bool reverse = gt_feature_node_get_strand(v->mrna) == GT_STRAND_REVERSE;
  GtRange left_codon_range = reverse ? stop_range : start_range;
  GtRange right_codon_range = reverse ? start_range : stop_range;

  GtUword exon_idx;
  for(exon_idx = 0; exon_idx < gt_array_size(v->exons); exon_idx++)
  {
    GtFeatureNode *exon = *(GtFeatureNode **)gt_array_get(v->exons, exon_idx);
    GtGenomeNode *exon_gn = (GtGenomeNode *)exon;
    GtRange exon_range = gt_genome_node_get_range(exon_gn);
    GtStrand exon_strand = gt_feature_node_get_strand(exon);

    GtRange cdsrange;
    if(!infer_cds_visitor_infer_range(&exon_range, &left_codon_range,
                                      &right_codon_range, &cdsrange))
    {
      continue;
    }

    GtGenomeNode *cdsfeat = gt_feature_node_new(
                                gt_genome_node_get_seqid(exon_gn), "CDS",
                                cdsrange.start, cdsrange.end, exon_strand);
    if(v->source)
      gt_feature_node_set_source((GtFeatureNode *)cdsfeat, v->source);
    gt_feature_node_add_child(v->mrna, (GtFeatureNode *)cdsfeat);
    gt_array_add(v->cds, cdsfeat);
  }
}
/* Check the stop codon of the mRNA currently held by the visitor <v>, or
   infer one if none was given. Does nothing if the mRNA has no CDS.
   The expected stop codon range is derived from the CDS: the last 3 bases of
   the last entry in v->cds, or, on the reverse strand, the first 3 bases of
   the first entry (this presumably relies on v->cds being sorted by position
   -- confirm at the call site).
   - more than one explicit stop codon: a message is logged;
   - exactly one: a message is logged if its range differs from the expected
     range;
   - none: a new "stop_codon" feature with the expected range is added as a
     child of the mRNA and appended to v->stops. */
static void infer_cds_visitor_check_stop(AgnInferCDSVisitor *v)
{
  if(gt_array_size(v->cds) == 0)
    return;

  const char *mrnaid = gt_feature_node_get_attribute(v->mrna, "ID");
  unsigned int ln = gt_genome_node_get_line_number((GtGenomeNode *)v->mrna);
  GtStrand strand = gt_feature_node_get_strand(v->mrna);

  /* expected stop codon: 3' end of the 3'-most CDS segment */
  GtRange stoprange;
  GtUword threeprimeindex = gt_array_size(v->cds) - 1;
  GtGenomeNode **threeprimesegment = gt_array_get(v->cds, threeprimeindex);
  stoprange = gt_genome_node_get_range(*threeprimesegment);
  stoprange.start = stoprange.end - 2;
  if(strand == GT_STRAND_REVERSE)
  {
    threeprimesegment = gt_array_get(v->cds, 0);
    stoprange = gt_genome_node_get_range(*threeprimesegment);
    stoprange.end = stoprange.start + 2;
  }

  if(gt_array_size(v->stops) > 1)
  {
    /* report the number of stop codons (not start codons) */
    gt_logger_log(v->logger, "mRNA '%s' (line %u) has %lu stop codons", mrnaid,
                  ln, gt_array_size(v->stops));
  }
  else if(gt_array_size(v->stops) == 1)
  {
    GtGenomeNode **codon = gt_array_get(v->stops, 0);
    GtRange testrange = gt_genome_node_get_range(*codon);
    if(gt_range_compare(&stoprange, &testrange) != 0)
    {
      gt_logger_log(v->logger, "stop codon inferred from CDS [%lu, %lu] does "
                    "not match explicitly provided stop codon [%lu, %lu] for "
                    "mRNA '%s'", stoprange.start, stoprange.end,
                    testrange.start, testrange.end, mrnaid);
    }
  }
  else /* gt_array_size(v->stops) == 0 */
  {
    GtStr *seqid = gt_genome_node_get_seqid((GtGenomeNode *)v->mrna);
    GtGenomeNode *codonfeature = gt_feature_node_new(seqid, "stop_codon",
                                                     stoprange.start,
                                                     stoprange.end, strand);
    if(v->source)
      gt_feature_node_set_source((GtFeatureNode *)codonfeature, v->source);
    GtFeatureNode *cf = (GtFeatureNode *)codonfeature;
    gt_feature_node_add_child(v->mrna, cf);
    gt_array_add(v->stops, cf);
  }
}
/* Region node handler of the select visitor.
   The region node <rn> is deleted if a sequence ID was specified and differs
   from the node's sequence ID, or if a contain range was defined that does
   not overlap the node's range. Otherwise the node is added to the visitor's
   node buffer; if a contain range was defined, the node's range is first
   clipped to that range. Always returns 0. */
static int select_visitor_region_node(GtNodeVisitor *nv, GtRegionNode *rn,
                                      GT_UNUSED GtError *err)
{
  GtSelectVisitor *select_visitor;
  GtGenomeNode *region = (GtGenomeNode*) rn;
  GtRange clipped;
  gt_error_check(err);
  select_visitor = select_visitor_cast(nv);

  /* a seqid was specified and it differs from the one of <rn> */
  if (gt_str_length(select_visitor->seqid) &&
      gt_str_cmp(select_visitor->seqid, gt_genome_node_get_seqid(region))) {
    gt_genome_node_delete(region);
    return 0;
  }

  /* no contain range defined -> keep <rn> unchanged */
  if (select_visitor->contain_range.start == GT_UNDEF_ULONG) {
    gt_queue_add(select_visitor->node_buffer, rn);
    return 0;
  }

  clipped = gt_genome_node_get_range(region);
  /* contain range does not overlap with <rn> range -> delete <rn> */
  if (!gt_range_overlap(&clipped, &select_visitor->contain_range)) {
    gt_genome_node_delete(region);
    return 0;
  }

  /* an overlapping contain range was defined -> update range */
  clipped.start = MAX(clipped.start, select_visitor->contain_range.start);
  clipped.end = MIN(clipped.end, select_visitor->contain_range.end);
  gt_genome_node_set_range(region, &clipped);
  gt_queue_add(select_visitor->node_buffer, rn);
  return 0;
}
/* Return the range spanned by the CDS features among the direct children of
   <transcript> (as identified by agn_gt_feature_node_is_cds_feature()).
   If there is no CDS child, {0, 0} is returned. For transcripts on the
   reverse strand, start and end of the result are swapped, i.e. start holds
   the larger coordinate. */
GtRange agn_transcript_cds_range(GtFeatureNode *transcript)
{
  gt_assert(transcript);
  /* 0 doubles as "not set yet" for both coordinates */
  GtRange trange;
  trange.start = 0;
  trange.end = 0;

  GtFeatureNodeIterator *iter = gt_feature_node_iterator_new_direct(transcript);
  GtFeatureNode *current;
  for
  (
    current = gt_feature_node_iterator_next(iter);
    current != NULL;
    current = gt_feature_node_iterator_next(iter)
  )
  {
    if(agn_gt_feature_node_is_cds_feature(current))
    {
      GtRange crange = gt_genome_node_get_range((GtGenomeNode *)current);
      if(trange.start == 0 || crange.start < trange.start)
        trange.start = crange.start;
      if(trange.end == 0 || crange.end > trange.end)
        trange.end = crange.end;
    }
  }
  /* the iterator was previously leaked */
  gt_feature_node_iterator_delete(iter);

  if(gt_feature_node_get_strand(transcript) == GT_STRAND_REVERSE)
  {
    GtUword temp = trange.start;
    trange.start = trange.end;
    trange.end = temp;
  }
  return trange;
}
/* Feature node handler of the region coverage visitor.
   Looks up the range list stored for the sequence ID of <fn> (it must exist)
   and folds the range of <fn> into it: if the last stored range, extended at
   its end by max_feature_dist, overlaps the new range, the last stored range
   is extended to cover the new range's end; otherwise (or if the list is
   empty) the new range is appended. Always returns 0. */
static int gt_regioncov_visitor_feature_node(GtNodeVisitor *nv,
                                             GtFeatureNode *fn,
                                             GT_UNUSED GtError *err)
{
  GtRegionCovVisitor *regioncov_visitor;
  GtArray *ranges;
  GtRange *last, reach, new_range;
  gt_error_check(err);
  regioncov_visitor = gt_regioncov_visitor_cast(nv);

  ranges = gt_hashmap_get(regioncov_visitor->region2rangelist,
                          gt_str_get(gt_genome_node_get_seqid((GtGenomeNode*)
                                                              fn)));
  gt_assert(ranges);
  new_range = gt_genome_node_get_range((GtGenomeNode*) fn);

  /* first range for this sequence */
  if (!gt_array_size(ranges)) {
    gt_array_add(ranges, new_range);
    return 0;
  }

  /* test against a copy of the last range widened by the allowed distance */
  last = gt_array_get_last(ranges);
  reach = *last;
  reach.end += regioncov_visitor->max_feature_dist;
  if (!gt_range_overlap(&reach, &new_range)) {
    gt_array_add(ranges, new_range);
    return 0;
  }

  last->end = MAX(last->end, new_range.end);
  return 0;
}
/* Return a new array (owned by the caller) holding every node in
   <feature_set> that is not <feature> itself and whose range does NOT
   overlap the range of <feature>. The array stores GtGenomeNode pointers;
   no additional references are taken. */
GtArray* agn_feature_neighbors(GtGenomeNode *feature, GtArray *feature_set)
{
  GtArray *neighbors = gt_array_new( sizeof(GtGenomeNode *) );
  GtUword num_features = gt_array_size(feature_set);
  GtUword idx = 0;

  while(idx < num_features)
  {
    GtGenomeNode **entry = gt_array_get(feature_set, idx++);
    GtGenomeNode *other = *entry;
    if(other == feature)
      continue;

    GtRange feature_range = gt_genome_node_get_range(feature);
    GtRange other_range = gt_genome_node_get_range(other);
    if(gt_range_overlap(&feature_range, &other_range))
      continue;

    gt_array_add(neighbors, other);
  }
  return neighbors;
}
/* Update the type-dependent counters and distributions of <sv> for the
   feature node <fn>. The first matching type in the order gene, mRNA, exon,
   CDS, intron, LTR_retrotransposon decides what is recorded; other types are
   ignored. Length/score distributions are only updated if they exist. */
static void compute_type_statistics(GtFeatureNode *fn, GtStatVisitor *sv)
{
  GtRange feature_range;
  gt_assert(fn && sv);

  if (gt_feature_node_has_type(fn, gt_ft_gene)) {
    sv->number_of_genes++;
    if (gt_feature_node_has_CDS(fn))
      sv->number_of_protein_coding_genes++;
    if (sv->gene_length_distribution) {
      feature_range = gt_genome_node_get_range((GtGenomeNode*) fn);
      gt_disc_distri_add(sv->gene_length_distribution,
                         gt_range_length(&feature_range));
    }
    if (sv->gene_score_distribution) {
      gt_disc_distri_add(sv->gene_score_distribution,
                         gt_feature_node_get_score(fn) * 100.0);
    }
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_mRNA)) {
    sv->number_of_mRNAs++;
    if (gt_feature_node_has_CDS(fn))
      sv->number_of_protein_coding_mRNAs++;
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_exon)) {
    sv->number_of_exons++;
    if (sv->exon_length_distribution) {
      feature_range = gt_genome_node_get_range((GtGenomeNode*) fn);
      gt_disc_distri_add(sv->exon_length_distribution,
                         gt_range_length(&feature_range));
    }
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_CDS)) {
    sv->number_of_CDSs++;
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_intron)) {
    /* introns are not counted here, only their lengths are recorded */
    if (sv->intron_length_distribution) {
      feature_range = gt_genome_node_get_range((GtGenomeNode*) fn);
      gt_disc_distri_add(sv->intron_length_distribution,
                         gt_range_length(&feature_range));
    }
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_LTR_retrotransposon))
    sv->number_of_LTR_retrotransposons++;
}
/* Return true if an overlap range is defined (its start is not
   GT_UNDEF_ULONG) and the range of <fn> does not overlap it; false
   otherwise. */
static bool filter_overlap_range(GtFeatureNode *fn, GtRange overlap_range)
{
  GtRange fn_range;
  gt_assert(fn);
  fn_range = gt_genome_node_get_range((GtGenomeNode*) fn);

  /* an undefined overlap range never filters anything */
  if (overlap_range.start == GT_UNDEF_ULONG)
    return false;
  return !gt_range_overlap(&overlap_range, &fn_range);
}
/* Return true if a contain range is defined (its start is not
   GT_UNDEF_ULONG) and it does not contain the range of <fn>; false
   otherwise. */
static bool filter_contain_range(GtFeatureNode *fn, GtRange contain_range)
{
  GtRange fn_range;
  gt_assert(fn);
  fn_range = gt_genome_node_get_range((GtGenomeNode*) fn);

  /* an undefined contain range never filters anything */
  if (contain_range.start == GT_UNDEF_ULONG)
    return false;
  return !gt_range_contains(&contain_range, &fn_range);
}
/* Region node handler of the stat visitor: counts the sequence region and
   adds its length to the total sequence region length. Always returns 0. */
static int stat_visitor_region_node(GtNodeVisitor *nv, GtRegionNode *rn,
                                    GT_UNUSED GtError *err)
{
  GtStatVisitor *stats;
  GtRange region_range;
  gt_error_check(err);
  stats = stat_visitor_cast(nv);

  region_range = gt_genome_node_get_range((GtGenomeNode*) rn);
  stats->total_length_of_sequence_regions += gt_range_length(&region_range);
  stats->number_of_sequence_regions++;
  return 0;
}
/* Feature node handler of the ORF finder visitor.
   Iterates over the nodes delivered by a feature node iterator on <gf> and
   runs run_orffinder() on every node whose type is registered in lv->types,
   or on the first node if the "all" entry of lv->types is set (iteration
   stops after that node). The zero-based range passed to run_orffinder() is
   the node range with both coordinates decremented by one.
   Iteration stops on the first error; the error status is returned. */
static int gt_orf_finder_visitor_feature_node(GtNodeVisitor *gv,
                                              GtFeatureNode *gf,
                                              GtError *err)
{
  GtORFFinderVisitor *lv;
  const char *gft = NULL;
  GtFeatureNodeIterator *gfi;
  GtFeatureNode *curnode = NULL;
  int had_err = 0;
  GtRange rng;

  lv = gt_orf_finder_visitor_cast(gv);
  gt_assert(lv);
  gt_error_check(err);

  gfi = gt_feature_node_iterator_new(gf);
  while (!had_err && (curnode = gt_feature_node_iterator_next(gfi))) {
    gft = gt_feature_node_get_type(curnode);

    /* skip nodes whose type was not requested (unless "all" is set) */
    if (gt_hashmap_get(lv->types, (void*) gft) == NULL &&
        gt_hashmap_get(lv->types, (void*) "all") != (void*) 1) {
      continue;
    }

    rng = gt_genome_node_get_range((GtGenomeNode*) curnode);
    had_err = run_orffinder(lv->rmap, curnode, rng.start - 1, rng.end - 1,
                            lv->min, lv->max, lv->all, err);

    /* with "all", only the first delivered node is processed */
    if (gt_hashmap_get(lv->types, (void*) "all") == (void*) 1)
      break;

    if (!gt_feature_node_has_children(curnode))
      continue;

    {
      /* walk the subtree below <curnode> (the first node delivered by the
         iterator is skipped); no action is taken for the visited nodes */
      GtFeatureNode *tmpnode = NULL;
      GtFeatureNodeIterator *tmpgfi = gt_feature_node_iterator_new(curnode);
      (void) gt_feature_node_iterator_next(tmpgfi);
      for (tmpnode = gt_feature_node_iterator_next(tmpgfi);
           tmpnode != NULL;
           tmpnode = gt_feature_node_iterator_next(tmpgfi)) {
        gft = gt_feature_node_get_type(tmpnode);
        if (strcmp(gft, (const char*) GT_ORF_TYPE) == 0)
          continue;
      }
      gt_feature_node_iterator_delete(tmpgfi);
    }
  }
  gt_feature_node_iterator_delete(gfi);

  return had_err;
}
/* Traversal callback; <data> is the GtStatVisitor. For an exon the
   per-distribution exon counter is incremented, for a CDS its length is
   added to the per-distribution CDS length. Other types are ignored.
   Always returns 0. */
static int add_exon_or_cds_number(GtFeatureNode *fn, void *data,
                                  GT_UNUSED GtError *err)
{
  GtStatVisitor *stats = (GtStatVisitor*) data;
  gt_error_check(err);
  gt_assert(stats && fn);

  if (gt_feature_node_has_type(fn, gt_ft_exon)) {
    stats->exon_number_for_distri++;
    return 0;
  }

  if (gt_feature_node_has_type(fn, gt_ft_CDS)) {
    GtRange cds_range = gt_genome_node_get_range((GtGenomeNode*) fn);
    stats->cds_length_for_distri += gt_range_length(&cds_range);
  }
  return 0;
}
/* Create a new block whose range, strand and type are taken from <node>.
   Unless <node> is a pseudo node, the block additionally stores a new
   reference to <node> as its top-level feature. */
GtBlock* gt_block_new_from_node(GtFeatureNode *node)
{
  GtBlock *new_block;
  GtGenomeNode *gn = (GtGenomeNode*) node;
  gt_assert(node);

  new_block = gt_block_new();
  new_block->range = gt_genome_node_get_range(gn);
  new_block->strand = gt_feature_node_get_strand(node);
  new_block->type = gt_feature_node_get_type(node);

  /* pseudo nodes are not recorded as top-level feature */
  if (gt_feature_node_is_pseudo(node))
    return new_block;

  new_block->top_level_feature = (GtFeatureNode*) gt_genome_node_ref(gn);
  return new_block;
}
/* Advance <seqpos_classifier> to the next feature whose type equals its
   specified feature type.
   If such a feature is found, its range is stored in <*range> converted to
   zero-based coordinates (start and end decremented by one), the counter of
   found features is incremented, <*end_of_annotation> is set to false and 0
   is returned.
   Whenever the current feature node iterator is exhausted (or none exists
   yet), gt_seqpos_classifier_next_fn() is called to move on; if that fails
   or yields no further feature node, <*end_of_annotation> is set to true and
   the error status is returned. */
static int gt_seqpos_classifier_next_specified_ft(
    GtSeqposClassifier *seqpos_classifier, GtRange *range,
    bool *end_of_annotation, GtError *err)
{
  int had_err = 0;
  GtFeatureNode *cfn;
  bool fni_exhausted;

  /* validate the pointers before the first dereference */
  gt_assert(seqpos_classifier != NULL);
  gt_assert(range != NULL);
  gt_assert(end_of_annotation != NULL);

  fni_exhausted = (seqpos_classifier->fni == NULL) ? true : false;
  while (true)
  {
    if (fni_exhausted)
    {
      had_err = gt_seqpos_classifier_next_fn(seqpos_classifier, err);
      if (had_err != 0 || seqpos_classifier->fn == NULL)
      {
        *end_of_annotation = true;
        return had_err;
      }
      fni_exhausted = false;
    }
    gt_assert(seqpos_classifier->fni != NULL);
    cfn = gt_feature_node_iterator_next(seqpos_classifier->fni);
    if (cfn == NULL)
    {
      fni_exhausted = true;
    }
    else if (strcmp(gt_feature_node_get_type(cfn),
                    seqpos_classifier->specified_ft) == 0)
    {
      seqpos_classifier->nof_specified_ft_found++;
      *range = gt_genome_node_get_range((GtGenomeNode*)cfn);
      /* the decrement below must not wrap around */
      gt_assert(range->start > 0);
      gt_assert(range->end > 0);
      range->start--;
      range->end--;
      *end_of_annotation = false;
      return had_err;
    }
  }
}
/* Attach an ORF found in <orf_rng> (zero-based; converted to one-based here
   by incrementing start and end) to the feature graph rooted at <gf>.
   All nodes delivered by a feature node iterator on <gf> that are not of type
   GT_ORF_TYPE are examined; the last one in iteration order whose range
   contains the ORF becomes the parent. If a parent was found, a new feature
   of type GT_ORF_TYPE with source GT_ORF_FINDER_TAG, the given <strand> and a
   "frame" attribute holding <orf_frame> is added as its child. If no node
   contains the ORF, nothing is attached. */
static void orf_attach_results_to_gff3(GtFeatureNode *gf, GtRange orf_rng,
                                       unsigned int orf_frame,
                                       GtStrand strand,
                                       GT_UNUSED GtError *err)
{
  GtGenomeNode *child;
  GtStr *tag;
  GtFeatureNodeIterator *gfi;
  GtFeatureNode *curnode = NULL, *parent_node = NULL;
  GtRange gfi_range;
  /* large enough for any unsigned int in decimal plus terminator */
  char frame_buf[16];

  tag = gt_str_new_cstr(GT_ORF_FINDER_TAG);
  orf_rng.start++;
  orf_rng.end++;

  /* bounded formatting with the conversion matching the argument type */
  (void) snprintf(frame_buf, sizeof frame_buf, "%u", orf_frame);

  gfi = gt_feature_node_iterator_new(gf);
  while ((curnode = gt_feature_node_iterator_next(gfi))) {
    if (strcmp(gt_feature_node_get_type(curnode),
               (const char*) GT_ORF_TYPE) != 0) {
      gfi_range = gt_genome_node_get_range((GtGenomeNode*) curnode);
      if (gt_range_contains(&gfi_range, &orf_rng)) {
        parent_node = curnode;
      }
    }
  }

  if (parent_node) {
    child = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) gf),
                                GT_ORF_TYPE, orf_rng.start, orf_rng.end,
                                strand);
    gt_feature_node_set_source((GtFeatureNode*) child, tag);
    gt_feature_node_set_attribute((GtFeatureNode*) child, "frame", frame_buf);
    gt_feature_node_add_child(parent_node, (GtFeatureNode*) child);
  }

  gt_str_delete(tag);
  gt_feature_node_iterator_delete(gfi);
}
static double gaeval_visitor_calculate_coverage(AgnGaevalVisitor *v, GtFeatureNode *genemodel, GtError *error) { agn_assert(v && genemodel); GtStr *seqid = gt_genome_node_get_seqid((GtGenomeNode *)genemodel); GtRange mrna_range = gt_genome_node_get_range((GtGenomeNode *)genemodel); GtArray *overlapping = gt_array_new( sizeof(GtFeatureNode *) ); bool hasseqid; gt_feature_index_has_seqid(v->alignments, &hasseqid, gt_str_get(seqid),error); if(hasseqid) { gt_feature_index_get_features_for_range(v->alignments, overlapping, gt_str_get(seqid), &mrna_range, error); } GtArray *exon_coverage = gt_array_new( sizeof(GtRange) ); GtUword i; for(i = 0; i < gt_array_size(overlapping); i++) { GtFeatureNode *alignment = *(GtFeatureNode **)gt_array_get(overlapping, i); GtArray *covered_parts = gaeval_visitor_intersect((GtGenomeNode*)genemodel, (GtGenomeNode*)alignment); if(covered_parts != NULL) { GtArray *temp = gaeval_visitor_union(exon_coverage, covered_parts); gt_array_delete(covered_parts); gt_array_delete(exon_coverage); exon_coverage = temp; } } double coverage = gaeval_visitor_coverage_resolve(genemodel, exon_coverage); gt_array_delete(exon_coverage); gt_array_delete(overlapping); return coverage; }
/* If <gn> is a feature node of the given <type>, fetch its sequence from
   <region_mapping> and append it to <sequence>; nodes of other types are
   ignored.
   For a matching node on the reverse strand, <*reverse_strand> is set and
   <*phase> is set to the node's phase. On other strands <*phase> is set to
   the node's phase only for the first matching node seen (tracked in
   <*first_child_of_type_seen>) and to GT_PHASE_UNDEFINED afterwards.
   Returns the status of the sequence lookup, or 0 if <gn> has another
   type. */
static int extract_join_feature(GtGenomeNode *gn, const char *type,
                                GtRegionMapping *region_mapping,
                                GtStr *sequence, bool *reverse_strand,
                                bool *first_child_of_type_seen,
                                GtPhase *phase, GtError *err)
{
  char *raw_seq;
  GtFeatureNode *fn;
  GtRange fn_range;
  int had_err = 0;

  gt_error_check(err);
  fn = gt_feature_node_cast(gn);
  gt_assert(fn);

  if (!gt_feature_node_has_type(fn, type))
    return had_err;

  if (gt_feature_node_get_strand(fn) == GT_STRAND_REVERSE) {
    *reverse_strand = true;
    *phase = gt_feature_node_get_phase(fn);
  }
  else if (!(*first_child_of_type_seen)) {
    *first_child_of_type_seen = true;
    *phase = gt_feature_node_get_phase(fn);
  }
  else {
    *phase = GT_PHASE_UNDEFINED;
  }

  fn_range = gt_genome_node_get_range(gn);
  had_err = gt_region_mapping_get_sequence(region_mapping, &raw_seq,
                                           gt_genome_node_get_seqid(gn),
                                           fn_range.start, fn_range.end, err);
  if (had_err)
    return had_err;

  gt_str_append_cstr_nt(sequence, raw_seq, gt_range_length(&fn_range));
  gt_free(raw_seq);
  return had_err;
}
/* Traversal callback; <data> is the GtInterFeatureVisitor.
   Only features of the visitor's outside type are considered. If a previous
   such feature was recorded, a new feature of the visitor's inter type is
   created that fills the gap between the previous and the current feature
   (strand of the previous feature, sequence ID of the parent feature) and is
   added as a child of the parent feature. The current feature is then
   recorded as the previous one.
   If the two features overlap or leave no room between them, a warning is
   issued and the function returns without recording the current feature.
   Always returns 0. */
static int inter_feature_in_children(GtFeatureNode *current_feature,
                                     void *data, GT_UNUSED GtError *err)
{
  GtInterFeatureVisitor *aiv = (GtInterFeatureVisitor*) data;
  GtFeatureNode *prev, *inter_node;
  GtRange prev_rng, cur_rng;
  GtStrand strand;
  GtStr *seqid;
  gt_error_check(err);
  gt_assert(current_feature);

  if (!gt_feature_node_has_type(current_feature, aiv->outside_type))
    return 0;

  prev = aiv->previous_feature;
  if (prev != NULL) {
    /* determine inter range */
    prev_rng = gt_genome_node_get_range((GtGenomeNode*) prev);
    cur_rng = gt_genome_node_get_range((GtGenomeNode*) current_feature);
    if (prev_rng.end >= cur_rng.start) {
      gt_warning("overlapping boundary features " GT_WU "-" GT_WU " and "
                 GT_WU "-" GT_WU ", " "not placing '%s' inter-feature",
                 prev_rng.start, prev_rng.end,
                 cur_rng.start, cur_rng.end,
                 aiv->inter_type);
      return 0;
    }
    if (cur_rng.start - prev_rng.end < 2) {
      gt_warning("no space for inter-feature '%s' between " GT_WU " and "
                 GT_WU, aiv->inter_type, prev_rng.end, cur_rng.start);
      return 0;
    }

    /* determine inter strand */
    strand = gt_feature_node_get_strand(prev);
    gt_assert(strand == gt_feature_node_get_strand(current_feature));

    /* determine sequence id */
    seqid = gt_genome_node_get_seqid((GtGenomeNode*) aiv->parent_feature);
    gt_assert(!gt_str_cmp(seqid,
                          gt_genome_node_get_seqid((GtGenomeNode*) prev)));
    gt_assert(!gt_str_cmp(seqid,
                          gt_genome_node_get_seqid((GtGenomeNode*)
                                                   current_feature)));

    /* create inter feature spanning exactly the gap */
    inter_node = (GtFeatureNode*) gt_feature_node_new(seqid, aiv->inter_type,
                                                      prev_rng.end + 1,
                                                      cur_rng.start - 1,
                                                      strand);
    gt_feature_node_add_child(aiv->parent_feature, inter_node);
  }

  aiv->previous_feature = current_feature;
  return 0;
}
/* Write the protein domain hits in <pdoms> (all belonging to the domain
   <pdomname>) for the candidate described by <desc>.
   The per-domain output files "<prefix>_pdom_<name>.fas", and, if enabled,
   "<prefix>_pdom_<name>.ali" and "<prefix>_pdom_<name>_aa.fas" are opened on
   first use and cached in the respective hashmaps of <ls>.
   If there is more than one hit, the hits must be sorted; they are reversed
   in place if the first one is on the reverse strand. For every hit the
   sequence is extracted and appended, the stored alignment is written (if
   enabled and present), the stored amino acid sequence is collected (if
   enabled and present) and both user data entries are released.
   Finally the joined sequence(s) are written as FASTA entries.
   Returns 0 on success, -1 if a sequence extraction failed (nothing is
   written to the FASTA files in that case). */
static int write_pdom(GtLTRdigestFileOutStream *ls, GtArray *pdoms,
                      const char *pdomname, GT_UNUSED GtRegionMapping *rmap,
                      char *desc, GtError *err)
{
  int had_err = 0;
  GtFile *seqfile = NULL,
         *alifile = NULL,
         *aafile = NULL;
  GtUword idx = 0,
          num_pdoms,
          seq_length = 0;
  GtStr *pdom_seq,
        *pdom_aaseq;
  gt_error_check(err);

  pdom_seq = gt_str_new();
  pdom_aaseq = gt_str_new();

  /* get protein domain output file */
  seqfile = (GtFile*) gt_hashmap_get(ls->pdomout_files, pdomname);
  if (seqfile == NULL) {
    /* no file opened for this domain yet, do it */
    char fname[GT_MAXFILENAMELEN];
    (void) snprintf(fname, (size_t) (GT_MAXFILENAMELEN-1), "%s_pdom_%s.fas",
                    ls->fileprefix, pdomname);
    seqfile = gt_file_xopen(fname, "w+");
    gt_hashmap_add(ls->pdomout_files, gt_cstr_dup(pdomname), seqfile);
  }

  /* get protein alignment output file */
  if (ls->write_pdom_alignments) {
    alifile = (GtFile*) gt_hashmap_get(ls->pdomali_files, pdomname);
    if (alifile == NULL) {
      /* no file opened for this domain yet, do it */
      char fname[GT_MAXFILENAMELEN];
      (void) snprintf(fname, (size_t) (GT_MAXFILENAMELEN-1), "%s_pdom_%s.ali",
                      ls->fileprefix, pdomname);
      alifile = gt_file_xopen(fname, "w+");
      gt_hashmap_add(ls->pdomali_files, gt_cstr_dup(pdomname), alifile);
    }
  }

  /* get amino acid sequence output file */
  if (ls->write_pdom_aaseqs) {
    aafile = (GtFile*) gt_hashmap_get(ls->pdomaa_files, pdomname);
    if (aafile == NULL) {
      /* no file opened for this domain yet, do it */
      char fname[GT_MAXFILENAMELEN];
      (void) snprintf(fname, (size_t) (GT_MAXFILENAMELEN-1),
                      "%s_pdom_%s_aa.fas", ls->fileprefix, pdomname);
      aafile = gt_file_xopen(fname, "w+");
      gt_hashmap_add(ls->pdomaa_files, gt_cstr_dup(pdomname), aafile);
    }
  }

  num_pdoms = gt_array_size(pdoms);
  if (num_pdoms > 1UL) {
    /* hits are expected in sorted order */
    for (idx = 1UL; idx < num_pdoms; idx++) {
      gt_assert(gt_genome_node_cmp(*(GtGenomeNode**)gt_array_get(pdoms, idx),
                                   *(GtGenomeNode**)gt_array_get(pdoms, idx-1))
                >= 0);
    }
    /* the strand of the first hit decides whether the order is reversed */
    if (gt_feature_node_get_strand(*(GtFeatureNode**)
                                   gt_array_get(pdoms, 0UL))
          == GT_STRAND_REVERSE) {
      gt_array_reverse(pdoms);
    }
  }

  /* output protein domain data */
  idx = 0;
  while (idx < num_pdoms) {
    GtFeatureNode *hit = *(GtFeatureNode**) gt_array_get(pdoms, idx);
    GtGenomeNode *hit_gn = (GtGenomeNode*) hit;
    GtStr *ali = gt_genome_node_get_user_data(hit_gn, "pdom_alignment");
    GtStr *aaseq = gt_genome_node_get_user_data(hit_gn, "pdom_aaseq");
    GtRange hit_rng = gt_genome_node_get_range(hit_gn);

    if (gt_extract_feature_sequence(pdom_seq, hit_gn,
                                    gt_symbol(gt_ft_protein_match), false,
                                    NULL, NULL, rmap, err)) {
      had_err = -1;
      break;
    }

    if (ls->write_pdom_alignments && ali) {
      char header[BUFSIZ];
      /* write away alignment */
      (void) snprintf(header, BUFSIZ-1,
                      "Protein domain alignment in translated "
                      "sequence for candidate\n'%s':\n\n", desc);
      gt_file_xwrite(alifile, header,
                     (size_t) strlen(header) * sizeof (char));
      gt_file_xwrite(alifile, gt_str_get(ali),
                     (size_t) gt_str_length(ali) * sizeof (char));
      gt_file_xwrite(alifile, "---\n\n", 5 * sizeof (char));
    }
    if (ls->write_pdom_aaseqs && aaseq) {
      /* append amino acid sequence */
      gt_str_append_str(pdom_aaseq, aaseq);
    }

    gt_genome_node_release_user_data(hit_gn, "pdom_alignment");
    gt_genome_node_release_user_data(hit_gn, "pdom_aaseq");
    seq_length += gt_range_length(&hit_rng);
    idx++;
  }

  if (!had_err) {
    gt_fasta_show_entry(desc, gt_str_get(pdom_seq), seq_length, GT_FSWIDTH,
                        seqfile);
    if (ls->write_pdom_aaseqs) {
      gt_fasta_show_entry(desc, gt_str_get(pdom_aaseq),
                          gt_str_length(pdom_aaseq), GT_FSWIDTH, aafile);
    }
  }

  gt_str_delete(pdom_seq);
  gt_str_delete(pdom_aaseq);
  return had_err;
}
/* Fetch the next node from the input stream and, if it is a feature node
   describing an LTR element (i.e. the LTR visitor found a main node), write
   its tabular summary line and the FASTA sequences of its components to the
   stream's output files.

   <ns>  the LTRdigest file-out stream
   <gn>  receives the node obtained from the input stream (passed through)
   <err> error object filled by failing callees

   Returns 0 on success and the callee's error code otherwise. Nodes which are
   not feature nodes are passed through untouched. */
int gt_ltrfileout_stream_next(GtNodeStream *ns, GtGenomeNode **gn,
                              GtError *err)
{
  GtLTRdigestFileOutStream *ls;
  GtFeatureNode *fn;
  GtRange lltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          rltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          ppt_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          pbs_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD};
  int had_err;
  GtUword i = 0;

  gt_error_check(err);
  ls = gt_ltrdigest_file_out_stream_cast(ns);

  /* initialize this element */
  memset(&ls->element, 0, sizeof (GtLTRElement));

  /* get annotations from parser */
  had_err = gt_node_stream_next(ls->in_stream, gn, err);
  if (!had_err && *gn) {
    GtFeatureNodeIterator *gni;
    GtFeatureNode *mygn;

    /* only process feature nodes; nothing has been allocated for the element
       yet, so returning here does not leak */
    if (!(fn = gt_feature_node_try_cast(*gn)))
      return 0;

    ls->element.pdomorder = gt_array_new(sizeof (const char*));

    /* fill LTRElement structure from GFF3 subgraph */
    gni = gt_feature_node_iterator_new(fn);
    for (mygn = fn; mygn != NULL; mygn = gt_feature_node_iterator_next(gni))
      (void) gt_genome_node_accept((GtGenomeNode*) mygn,
                                   (GtNodeVisitor*) ls->lv, err);
    gt_feature_node_iterator_delete(gni);
  }

  if (!had_err && ls->element.mainnode != NULL) {
    char desc[GT_MAXFASTAHEADER];
    GtFeatureNode *ltr3, *ltr5;
    GtStr *sdesc, *sreg, *seq;

    /* find sequence in GtEncseq */
    sreg = gt_genome_node_get_seqid((GtGenomeNode*) ls->element.mainnode);

    sdesc = gt_str_new();
    had_err = gt_region_mapping_get_description(ls->rmap, sdesc, sreg, err);

    if (!had_err) {
      GtRange rng;

      /* sequence name: description truncated to seqnamelen characters, with
         blanks replaced by underscores */
      ls->element.seqid = gt_calloc((size_t) ls->seqnamelen+1, sizeof (char));
      (void) snprintf(ls->element.seqid,
                      MIN((size_t) gt_str_length(sdesc),
                          (size_t) ls->seqnamelen)+1,
                      "%s", gt_str_get(sdesc));
      gt_cstr_rep(ls->element.seqid, ' ', '_');
      if (gt_str_length(sdesc) > (GtUword) ls->seqnamelen)
        ls->element.seqid[ls->seqnamelen] = '\0';

      (void) gt_ltrelement_format_description(&ls->element,
                                              ls->seqnamelen,
                                              desc,
                                              (size_t) (GT_MAXFASTAHEADER-1));

      /* output basic retrotransposon data */
      lltr_rng = gt_genome_node_get_range((GtGenomeNode*)
                                            ls->element.leftLTR);
      rltr_rng = gt_genome_node_get_range((GtGenomeNode*)
                                            ls->element.rightLTR);
      rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.mainnode);
      gt_file_xprintf(ls->tabout_file,
                      GT_WU "\t" GT_WU "\t" GT_WU "\t%s\t"
                      GT_WU "\t" GT_WU "\t" GT_WU "\t"
                      GT_WU "\t" GT_WU "\t" GT_WU "\t",
                      rng.start,
                      rng.end,
                      gt_ltrelement_length(&ls->element),
                      ls->element.seqid,
                      lltr_rng.start,
                      lltr_rng.end,
                      gt_ltrelement_leftltrlen(&ls->element),
                      rltr_rng.start,
                      rltr_rng.end,
                      gt_ltrelement_rightltrlen(&ls->element));
    }
    /* the description string is no longer needed; release it on the error
       path as well (it used to be freed only on success) */
    gt_str_delete(sdesc);

    seq = gt_str_new();

    /* output TSDs; a missing feature (or an earlier error) yields empty
       columns so that the table layout stays intact */
    if (!had_err && ls->element.leftTSD != NULL) {
      GtRange tsd_rng;
      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.leftTSD);
      had_err = gt_extract_feature_sequence(seq,
                                       (GtGenomeNode*) ls->element.leftTSD,
                                       gt_symbol(gt_ft_target_site_duplication),
                                       false,
                                       NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file,
                        GT_WU "\t" GT_WU "\t%s\t",
                        tsd_rng.start,
                        tsd_rng.end,
                        gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t");

    if (!had_err && ls->element.rightTSD != NULL) {
      GtRange tsd_rng;
      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.rightTSD);
      had_err = gt_extract_feature_sequence(seq,
                                       (GtGenomeNode*) ls->element.rightTSD,
                                       gt_symbol(gt_ft_target_site_duplication),
                                       false,
                                       NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file,
                        GT_WU "\t" GT_WU "\t%s\t",
                        tsd_rng.start,
                        tsd_rng.end,
                        gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t");

    /* output PPT */
    if (!had_err && ls->element.ppt != NULL) {
      GtStrand ppt_strand = gt_feature_node_get_strand(ls->element.ppt);
      ppt_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.ppt);
      had_err = gt_extract_feature_sequence(seq,
                                            (GtGenomeNode*) ls->element.ppt,
                                            gt_symbol(gt_ft_RR_tract), false,
                                            NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&ppt_rng),
                            GT_FSWIDTH, ls->pptout_file);
        /* last column: distance between the PPT and the adjacent LTR, which
           is the right LTR on the forward strand and the left LTR otherwise */
        gt_file_xprintf(ls->tabout_file,
                        GT_WU "\t" GT_WU "\t%s\t%c\t%d\t",
                        ppt_rng.start,
                        ppt_rng.end,
                        gt_str_get(seq),
                        GT_STRAND_CHARS[ppt_strand],
                        (ppt_strand == GT_STRAND_FORWARD ?
                            abs((int) (rltr_rng.start - ppt_rng.end)) :
                            abs((int) (lltr_rng.end - ppt_rng.start))));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t");

    /* output PBS */
    if (!had_err && ls->element.pbs != NULL) {
      GtStrand pbs_strand;

      pbs_strand = gt_feature_node_get_strand(ls->element.pbs);
      pbs_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.pbs);
      had_err = gt_extract_feature_sequence(seq,
                                           (GtGenomeNode*) ls->element.pbs,
                                           gt_symbol(gt_ft_primer_binding_site),
                                           false, NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&pbs_rng),
                            GT_FSWIDTH, ls->pbsout_file);
        gt_file_xprintf(ls->tabout_file,
                        GT_WU "\t" GT_WU "\t%c\t%s\t%s\t%s\t%s\t%s\t",
                        pbs_rng.start,
                        pbs_rng.end,
                        GT_STRAND_CHARS[pbs_strand],
                        gt_feature_node_get_attribute(ls->element.pbs, "trna"),
                        gt_str_get(seq),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "pbsoffset"),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "trnaoffset"),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "edist"));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t\t\t\t");

    /* output protein domains */
    if (!had_err && ls->element.pdoms != NULL) {
      GtStr *pdomorderstr = gt_str_new();

      /* write the per-domain files in the order the domains were recorded */
      for (i = 0; !had_err && i < gt_array_size(ls->element.pdomorder); i++) {
        const char *key = *(const char**) gt_array_get(ls->element.pdomorder,
                                                       i);
        GtArray *entry = (GtArray*) gt_hashmap_get(ls->element.pdoms, key);
        had_err = write_pdom(ls, entry, key, ls->rmap, desc, err);
      }

      /* the domain order column is reported relative to the element strand */
      if (GT_STRAND_REVERSE == gt_feature_node_get_strand(ls->element.mainnode))
        gt_array_reverse(ls->element.pdomorder);

      for (i = 0; !had_err && i < gt_array_size(ls->element.pdomorder); i++) {
        const char *name = *(const char**) gt_array_get(ls->element.pdomorder,
                                                        i);
        gt_str_append_cstr(pdomorderstr, name);
        if (i != gt_array_size(ls->element.pdomorder)-1)
          gt_str_append_cstr(pdomorderstr, "/");
      }
      gt_file_xprintf(ls->tabout_file, "%s", gt_str_get(pdomorderstr));
      gt_str_delete(pdomorderstr);
    }

    /* output LTRs (we just expect them to exist) */
    switch (gt_feature_node_get_strand(ls->element.mainnode)) {
      case GT_STRAND_REVERSE:
        ltr5 = ls->element.rightLTR;
        ltr3 = ls->element.leftLTR;
        break;
      case GT_STRAND_FORWARD:
      default:
        ltr5 = ls->element.leftLTR;
        ltr3 = ls->element.rightLTR;
        break;
    }

    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr5,
                                          gt_symbol(gt_ft_long_terminal_repeat),
                                          false,
                                          NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr5out_file);
      gt_str_reset(seq);
    }
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr3,
                                          gt_symbol(gt_ft_long_terminal_repeat),
                                          false,
                                          NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr3out_file);
      gt_str_reset(seq);
    }

    /* output complete oriented element */
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq,
                                           (GtGenomeNode*) ls->element.mainnode,
                                           gt_symbol(gt_ft_LTR_retrotransposon),
                                           false,
                                           NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->elemout_file);
      gt_str_reset(seq);
    }
    gt_file_xprintf(ls->tabout_file, "\n");
    gt_str_delete(seq);
  }

  /* per-element data is owned by this call; the fields are still zero if the
     element was never filled */
  gt_hashmap_delete(ls->element.pdoms);
  gt_array_delete(ls->element.pdomorder);
  gt_free(ls->element.seqid);
  return had_err;
}
/* Visitor callback: for a SNV/SNP feature <fn>, walk all mRNAs directly below
   the current gene (sav->gene), locate the CDS overlapping the variant, map
   the variant start onto the spliced mRNA sequence cached in sav->rnaseqs and
   hand every variant allele that differs from the mRNA base to
   snp_annotator_classify_snp(). Features of other types (and a NULL <fn>) are
   ignored. Returns 0 on success, or the first error code reported by
   gt_complement() or snp_annotator_classify_snp(). */
static int snp_annotator_visitor_feature_node(GtNodeVisitor *nv,
                                              GtFeatureNode *fn,
                                              GtError *err)
{
  int had_err = 0;
  GtSNPAnnotatorVisitor *sav;
  GtFeatureNodeIterator *fni, *mrnafni;
  GtFeatureNode *curnode, *curnode2;
  GtRange snp_rng;
  gt_error_check(err);
  sav = snp_annotator_visitor_cast(nv);

  /* ignore non-nodes */
  if (!fn) return 0;

  /* only process SNPs */
  if (!(gt_feature_node_get_type(fn) == sav->SNV_type ||
        gt_feature_node_get_type(fn) == sav->SNP_type)) {
    return 0;
  }

  fni = gt_feature_node_iterator_new_direct(sav->gene);
  snp_rng = gt_genome_node_get_range((GtGenomeNode*) fn);
  while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) {
    if (gt_feature_node_get_type(curnode) == sav->mRNA_type) {
      GtStrand mrna_strand = gt_feature_node_get_strand(curnode);
#ifndef NDEBUG
      const char *refstr;
#endif
      /* running offset into the spliced mRNA: total length of the CDS
         features seen so far that do not overlap the variant */
      GtUword mrnasnppos = 0;
      mrnafni = gt_feature_node_iterator_new(curnode);
      while (!had_err && (curnode2 = gt_feature_node_iterator_next(mrnafni))) {
        if (gt_feature_node_get_type(curnode2) == sav->CDS_type) {
          GtRange cds_rng = gt_genome_node_get_range((GtGenomeNode*) curnode2);
          if (gt_range_overlap(&snp_rng, &cds_rng)) {
            char *mRNA,
                 origchar;
            char *variantchars, *variantptr = NULL;
            GT_UNUSED char *refchars, *refptr = NULL;
            mRNA = (char*) gt_hashmap_get(sav->rnaseqs, curnode);
            gt_assert(mRNA);
            gt_assert(snp_rng.start >= cds_rng.start);
            mrnasnppos += (snp_rng.start - cds_rng.start);
            /* the cached sequence of a reverse-strand mRNA is indexed from
               the other end */
            if (mrna_strand == GT_STRAND_REVERSE)
              mrnasnppos = strlen(mRNA) - mrnasnppos - 1;
            gt_assert(mrnasnppos < strlen(mRNA));
            origchar = mRNA[mrnasnppos];
#ifndef NDEBUG
            /* NOTE(review): for reverse-strand mRNAs this debug-only block
               complements origchar in place, so the comparisons against
               origchar further down see a different value than in NDEBUG
               builds -- confirm which behaviour is intended. */
            refstr = refptr = gt_cstr_dup(gt_feature_node_get_attribute(fn,
                                                         GT_GVF_REFERENCE_SEQ));
            if (!had_err && refstr) {
              if (gt_feature_node_get_strand(curnode) == GT_STRAND_REVERSE) {
                int rval = gt_complement(&origchar, origchar, err);
                gt_assert(rval == 0);
              }
              /* <ctype.h> functions require values representable as
                 unsigned char */
              gt_assert(toupper((unsigned char) origchar) ==
                          toupper((unsigned char) refstr[0]));
            }
#endif
            variantchars = variantptr = gt_cstr_dup(
                         gt_feature_node_get_attribute(fn, GT_GVF_VARIANT_SEQ));
            if (!had_err && variantchars) {
              /* index of the current allele, passed on to the classifier */
              GtUword i = 0;

              /* alleles are separated by ','; the list ends at ';' or at the
                 end of the string */
              while (!had_err &&
                       (*variantchars != ';' && *variantchars != '\0')) {
                if (*variantchars != ',' && *variantchars != origchar) {
                  char variantchar = *variantchars;
#ifndef NDEBUG
                  char refchar = refstr ? refstr[0] : '-';  /* XXX */
                  if (!had_err && mrna_strand == GT_STRAND_REVERSE)
                    had_err = gt_complement(&refchar, refchar, err);
#endif
                  if (!had_err && mrna_strand == GT_STRAND_REVERSE)
                    had_err = gt_complement(&variantchar, variantchar, err);
                  if (!had_err) {
                    had_err = snp_annotator_classify_snp(sav, curnode, fn,
                                                         mrnasnppos,
                                                         i++,
                                                         variantchar,
#ifndef NDEBUG
                                                         refchar,
#endif
                                                         err);
                  }
                } else if (*variantchars == origchar) {
                  /* allele identical to the mRNA base: nothing to classify,
                     but it still occupies an allele slot */
                  i++;
                }
                variantchars++;
              }
            }
            /* release the attribute copies unconditionally: refptr is
               allocated (in debug builds) even if no variant string is
               available, and used to leak in that case */
            gt_free(variantptr);
            gt_free(refptr);
          } else {
            mrnasnppos += gt_range_length(&cds_rng);
          }
        }
      }
      gt_feature_node_iterator_delete(mrnafni);
    }
  }
  gt_feature_node_iterator_delete(fni);

  return had_err;
}
static int gt_snp_annotator_visitor_prepare_gene(GtSNPAnnotatorVisitor *sav, GtError *err) { GtFeatureNodeIterator *fni, *mrnafni; GtFeatureNode *curnode, *last_mRNA = NULL; GtStr *mrnaseq, *seqid; int had_err = 0; mrnaseq = gt_str_new(); seqid = gt_genome_node_get_seqid((GtGenomeNode*) sav->gene); fni = gt_feature_node_iterator_new(sav->gene); while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) { if (gt_feature_node_get_type(curnode) == sav->mRNA_type) { GtFeatureNode *curnode2; if (last_mRNA) { char *mrna_charseq = gt_calloc(gt_str_length(mrnaseq)+1, sizeof (char)); (void) strncpy(mrna_charseq, gt_str_get(mrnaseq), gt_str_length(mrnaseq)); if (gt_feature_node_get_strand(sav->gene) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(mrna_charseq, gt_str_length(mrnaseq), err); } if (!had_err) { gt_hashmap_add(sav->rnaseqs, last_mRNA, mrna_charseq); last_mRNA = curnode; gt_str_reset(mrnaseq); } } else last_mRNA = curnode; if (!had_err) { mrnafni = gt_feature_node_iterator_new(curnode); while (!had_err && (curnode2 = gt_feature_node_iterator_next(mrnafni))) { if (gt_feature_node_get_type(curnode2) == sav->CDS_type) { char *tmp; GtRange rng = gt_genome_node_get_range((GtGenomeNode*) curnode2); had_err = gt_region_mapping_get_sequence(sav->rmap, &tmp, seqid, rng.start, rng.end, err); if (!had_err) { gt_str_append_cstr_nt(mrnaseq, tmp, gt_range_length(&rng)); gt_free(tmp); } } } gt_feature_node_iterator_delete(mrnafni); } } } if (!had_err && last_mRNA) { char *mrna_charseq = gt_calloc(gt_str_length(mrnaseq)+1, sizeof (char)); (void) strncpy(mrna_charseq, gt_str_get(mrnaseq), gt_str_length(mrnaseq)); if (gt_feature_node_get_strand(sav->gene) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(mrna_charseq, gt_str_length(mrnaseq), err); } if (!had_err) { gt_hashmap_add(sav->rnaseqs, last_mRNA, mrna_charseq); } } gt_feature_node_iterator_delete(fni); gt_str_delete(mrnaseq); return had_err; }
/* Visitor callback deciding whether the feature node <fn> passes the
   selection criteria stored in the visitor. Nodes that pass are appended to
   the visitor's node buffer, all others are deleted. Always returns 0. */
static int select_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                       GT_UNUSED GtError *err)
{
  GtSelectVisitor *fv;
  bool seqid_matches, source_matches, drop;

  gt_error_check(err);
  fv = select_visitor_cast(nv);
  fv->current_feature++;

  /* an empty seqid/source filter matches every node; the source is only
     looked at if the seqid already matched */
  seqid_matches = !gt_str_length(fv->seqid) ||
                  !gt_str_cmp(fv->seqid,
                              gt_genome_node_get_seqid((GtGenomeNode*) fn));
  source_matches = seqid_matches &&
                   (!gt_str_length(fv->source) ||
                    !strcmp(gt_str_get(fv->source),
                            gt_feature_node_get_source(fn)));

  if (!source_matches) {
    drop = true;
  }
  else {
    GtRange range = gt_genome_node_get_range((GtGenomeNode*) fn);

    drop = false;
    /* gene-specific limits */
    /* XXX: we (spuriously) assume that genes are always root nodes */
    if (fn && gt_feature_node_has_type(fn, gt_ft_gene)) {
      /* the first violated limit rejects the gene; an undefined limit is
         skipped */
      drop = (fv->max_gene_length != GT_UNDEF_ULONG &&
              gt_range_length(&range) > fv->max_gene_length)
          || (fv->max_gene_num != GT_UNDEF_ULONG &&
              fv->gene_num >= fv->max_gene_num)
          || (fv->min_gene_score != GT_UNDEF_DOUBLE &&
              gt_feature_node_get_score(fn) < fv->min_gene_score)
          || (fv->max_gene_score != GT_UNDEF_DOUBLE &&
              gt_feature_node_get_score(fn) > fv->max_gene_score)
          || (fv->feature_num != GT_UNDEF_ULONG &&
              fv->feature_num != fv->current_feature);
      if (!drop)
        fv->gene_num++; /* gene passed filter */
    }
  }

  /* remaining filters are applied in order until one rejects the node */
  drop = drop
      || filter_contain_range(fn, fv->contain_range)
      || filter_overlap_range(fn, fv->overlap_range)
      || filter_strand(fn, fv->strand)
      || filter_targetstrand(fn, fv->targetstrand)
      || filter_has_CDS(fn, fv->has_CDS)
      || filter_min_average_ssp(fn, fv->min_average_splice_site_prob);

  if (drop)
    gt_genome_node_delete((GtGenomeNode*) fn);
  else
    gt_queue_add(fv->node_buffer, fn);

  return 0;
}