/*
 * Re-home a CDS feature that hangs below several parents.
 *
 * <cds_feature> is taken out of the tree rooted at <fn> and then attached
 * again to every parent returned by find_cds_parents(): the first parent
 * receives <cds_feature> itself, every further parent receives a clone.
 * In both cases the Parent attribute is set to the ID of the node the
 * feature is attached to.
 */
static void split_cds_feature(GtFeatureNode *cds_feature, GtFeatureNode *fn)
{
  GtArray *cds_parents;
  unsigned long idx;
  gt_assert(cds_feature && fn);

  /* collect the parents first, then detach the CDS from the tree */
  cds_parents = find_cds_parents(cds_feature, fn);
  gt_feature_node_remove_leaf(fn, cds_feature);

  for (idx = 0; idx < gt_array_size(cds_parents); idx++) {
    GtFeatureNode *parent = *(GtFeatureNode**) gt_array_get(cds_parents, idx);
    const char *parent_id = gt_feature_node_get_attribute(parent, GT_GFF_ID);
    bool is_first = (idx == 0);
    /* the original node goes to the first parent, clones to all others */
    GtFeatureNode *attached = is_first ? cds_feature
                                       : gt_feature_node_clone(cds_feature);

    gt_feature_node_set_attribute(attached, GT_GFF_PARENT, parent_id);
    gt_feature_node_add_child(parent, attached);

    if (!is_first) {
      /* NOTE(review): presumably drops the reference this parent used to hold
         on the original CDS node -- confirm against
         gt_feature_node_remove_leaf() */
      gt_genome_node_delete((GtGenomeNode*) cds_feature);
    }
  }
  gt_array_delete(cds_parents);
}
/*
 * Per-model callback: turns every single hit of the best chain of <hit> into
 * a GT_PDOM_TYPE feature below the main node of the LTR element stored in
 * the stream passed via <data>.
 *
 * Hits whose best strand differs from the strand of the main node are
 * skipped. Always returns 0.
 */
static int pdom_hit_attach_gff3(GtPdomModel *model, GtPdomModelHit *hit,
                                void *data, GT_UNUSED GtError *err)
{
  GtLTRdigestStream *stream = (GtLTRdigestStream*) data;
  GtStrand best_strand;
  unsigned long chain_idx;
  gt_assert(model && hit);

  best_strand = gt_pdom_model_hit_get_best_strand(hit);
  /* do not use the hits on the non-predicted strand
     -- maybe identify nested elements ? */
  if (best_strand != gt_feature_node_get_strand(stream->element.mainnode))
    return 0;

  for (chain_idx = 0;
       chain_idx < gt_pdom_model_hit_best_chain_length(hit);
       chain_idx++) {
    GtPdomSingleHit *single = gt_pdom_model_hit_best_single_hit(hit,
                                                                chain_idx);
    GtStr *alignment = gt_str_new(),
          *aaseq = gt_str_new();
    GtPhase phase = gt_pdom_single_hit_get_phase(single);
    GtRange hit_range = gt_pdom_single_hit_get_range(single);
    GtGenomeNode *pdom_gn;
    GtFeatureNode *pdom_fn;

    gt_pdom_single_hit_format_alignment(single, GT_ALIWIDTH, alignment);
    gt_pdom_single_hit_get_aaseq(single, aaseq);

    /* GFF3 is 1-based */
    hit_range.start += 1;
    hit_range.end += 1;

    pdom_gn = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                  stream->element.mainnode),
                                  GT_PDOM_TYPE,
                                  hit_range.start,
                                  hit_range.end,
                                  best_strand);
    pdom_fn = (GtFeatureNode*) pdom_gn;

    /* the strings are handed to the node together with their destructor */
    gt_genome_node_add_user_data(pdom_gn, "pdom_alignment", alignment,
                                 (GtFree) gt_str_delete);
    gt_genome_node_add_user_data(pdom_gn, "pdom_aaseq", aaseq,
                                 (GtFree) gt_str_delete);

    gt_feature_node_set_source(pdom_fn, stream->ltrdigest_tag);
    gt_feature_node_set_score(pdom_fn, gt_pdom_single_hit_get_evalue(single));
    gt_feature_node_set_phase(pdom_fn, phase);

    if (gt_pdom_model_get_name(model)) {
      gt_feature_node_add_attribute(pdom_fn, "name",
                                    gt_pdom_model_get_name(model));
    }
    if (gt_pdom_model_get_acc(model)) {
      gt_feature_node_add_attribute(pdom_fn, "id",
                                    gt_pdom_model_get_acc(model));
    }
    gt_feature_node_add_child(stream->element.mainnode, pdom_fn);
  }
  return 0;
}
/*
 * Attach <singlehit> of <modelhit> as a protein_match feature below the LTR
 * retrotransposon node of <lv>.
 *
 * A feature is only created if the hit belongs to at least one chain or if
 * <lv> is configured to output all chains. In every case the chain array of
 * <singlehit> is deleted and its pointer reset before returning.
 * The return value is always 0.
 */
static int gt_ltrdigest_pdom_visitor_attach_hit(GtLTRdigestPdomVisitor *lv,
                                                GtHMMERModelHit *modelhit,
                                                GtHMMERSingleHit *singlehit)
{
  int had_err = 0;
  GtRange hit_range;
  gt_assert(lv && singlehit);

  hit_range = gt_ltrdigest_pdom_visitor_coords(lv, singlehit);

  if (gt_array_size(singlehit->chains) > 0 || lv->output_all_chains) {
    char frame_str[32];
    GtGenomeNode *match_gn;
    GtFeatureNode *match_fn;

    match_gn = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                        lv->ltr_retrotrans),
                                   gt_ft_protein_match,
                                   hit_range.start,
                                   hit_range.end,
                                   singlehit->strand);
    match_fn = (GtFeatureNode*) match_gn;

    /* alignment and amino acid sequence travel with the node as user data */
    gt_genome_node_add_user_data(match_gn, "pdom_alignment",
                                 gt_str_ref(singlehit->alignment),
                                 (GtFree) gt_str_delete);
    gt_genome_node_add_user_data(match_gn, "pdom_aaseq",
                                 gt_str_ref(singlehit->aastring),
                                 (GtFree) gt_str_delete);

    gt_feature_node_set_source(match_fn, lv->tag);
    gt_feature_node_set_score(match_fn, (float) singlehit->evalue);

    (void) snprintf(frame_str, sizeof frame_str, "%d",
                    (int) singlehit->frame);
    gt_feature_node_add_attribute(match_fn, "reading_frame", frame_str);

    if (modelhit->modelname != NULL) {
      gt_feature_node_add_attribute(match_fn, "name", modelhit->modelname);
    }

    /* list all chains as "<model>:<chain>" entries, comma separated */
    if (gt_array_size(singlehit->chains) > 1UL && lv->output_all_chains) {
      GtStr *chain_list;
      GtUword chain_idx;
      gt_assert(singlehit->chains != NULL);
      chain_list = gt_str_new();
      for (chain_idx = 0UL;
           chain_idx < gt_array_size(singlehit->chains);
           chain_idx++) {
        if (chain_idx > 0UL) {
          gt_str_append_char(chain_list, ',');
        }
        gt_str_append_cstr(chain_list, modelhit->modelname);
        gt_str_append_char(chain_list, ':');
        gt_str_append_ulong(chain_list,
                            *(GtUword*) gt_array_get(singlehit->chains,
                                                     chain_idx));
      }
      gt_feature_node_set_attribute(match_fn, "chains",
                                    gt_str_get(chain_list));
      gt_str_delete(chain_list);
    }
    gt_feature_node_add_child(lv->ltr_retrotrans, match_fn);
  }

  gt_array_delete(singlehit->chains);
  singlehit->chains = NULL;
  return had_err;
}
/*
 * Check the stop codon of the mRNA currently held by visitor <v> against the
 * stop codon implied by its CDS segments.
 *
 * Nothing is done if the mRNA has no CDS. Otherwise the expected stop codon
 * range is the last 3 positions of the last CDS segment (or, on the reverse
 * strand, the first 3 positions of the first CDS segment). Then:
 *  - more than one explicit stop codon: a message is logged;
 *  - exactly one: a message is logged if its range differs from the
 *    expected one;
 *  - none: a "stop_codon" feature with the expected range is created,
 *    attached to the mRNA and recorded in v->stops.
 */
static void infer_cds_visitor_check_stop(AgnInferCDSVisitor *v)
{
  if(gt_array_size(v->cds) == 0)
    return;

  const char *mrnaid = gt_feature_node_get_attribute(v->mrna, "ID");
  unsigned int ln = gt_genome_node_get_line_number((GtGenomeNode *)v->mrna);
  GtStrand strand = gt_feature_node_get_strand(v->mrna);

  /* expected stop codon: 3' end of the 3'-most CDS segment */
  GtRange stoprange;
  GtUword threeprimeindex = gt_array_size(v->cds) - 1;
  GtGenomeNode **threeprimesegment = gt_array_get(v->cds, threeprimeindex);
  stoprange = gt_genome_node_get_range(*threeprimesegment);
  stoprange.start = stoprange.end - 2;
  if(strand == GT_STRAND_REVERSE)
  {
    threeprimesegment = gt_array_get(v->cds, 0);
    stoprange = gt_genome_node_get_range(*threeprimesegment);
    stoprange.end = stoprange.start + 2;
  }

  if(gt_array_size(v->stops) > 1)
  {
    /* report the number of stop codons (the stops array, not the starts) */
    gt_logger_log(v->logger, "mRNA '%s' (line %u) has %lu stop codons", mrnaid,
                  ln, gt_array_size(v->stops));
  }
  else if(gt_array_size(v->stops) == 1)
  {
    GtGenomeNode **codon = gt_array_get(v->stops, 0);
    GtRange testrange = gt_genome_node_get_range(*codon);
    if(gt_range_compare(&stoprange, &testrange) != 0)
    {
      gt_logger_log(v->logger, "stop codon inferred from CDS [%lu, %lu] does "
                    "not match explicitly provided stop codon [%lu, %lu] for "
                    "mRNA '%s'", stoprange.start, stoprange.end,
                    testrange.start, testrange.end, mrnaid);
    }
  }
  else /* gt_array_size(v->stops) == 0 */
  {
    /* no explicit stop codon: create one from the inferred range */
    GtStr *seqid = gt_genome_node_get_seqid((GtGenomeNode *)v->mrna);
    GtGenomeNode *codonfeature = gt_feature_node_new(seqid, "stop_codon",
                                                     stoprange.start,
                                                     stoprange.end, strand);
    if(v->source)
      gt_feature_node_set_source((GtFeatureNode *)codonfeature, v->source);
    GtFeatureNode *cf = (GtFeatureNode *)codonfeature;
    gt_feature_node_add_child(v->mrna, cf);
    gt_array_add(v->stops, cf);
  }
}
static int create_block_features(GtBEDParser *bed_parser, GtFeatureNode *fn, GtUword block_count, GtSplitter *size_splitter, GtSplitter *start_splitter, GtIO *bed_file, GtError *err) { GtUword i; int had_err = 0; gt_assert(fn && block_count && size_splitter && start_splitter); gt_assert(gt_splitter_size(size_splitter) == block_count); gt_assert(gt_splitter_size(start_splitter) == block_count); for (i = 0; !had_err && i < block_count; i++) { GtUword block_size, block_start, start, end; GtGenomeNode *block; const char *name; if (gt_parse_uword(&block_size, gt_splitter_get_token(size_splitter, i))) { gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockSize '%s'", gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file), gt_splitter_get_token(size_splitter, i)); had_err = -1; } if (!had_err && gt_parse_uword(&block_start, gt_splitter_get_token(start_splitter, i))) { gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockStart " "'%s'", gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file), gt_splitter_get_token(start_splitter, i)); had_err = -1; } if (!had_err) { start = gt_genome_node_get_start((GtGenomeNode*) fn) + block_start; end = start + block_size - 1; block = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) fn), bed_parser->block_type ? bed_parser->block_type : BED_BLOCK_TYPE, start, end, gt_feature_node_get_strand(fn)); if ((name = gt_feature_node_get_attribute(fn, GT_GFF_NAME))) { gt_feature_node_add_attribute((GtFeatureNode*) block, GT_GFF_NAME, name); } gt_feature_node_set_score((GtFeatureNode*) block, gt_feature_node_get_score(fn)); gt_feature_node_set_strand((GtFeatureNode*) block, gt_feature_node_get_strand(fn)); gt_feature_node_add_child(fn, (GtFeatureNode*) block); } } return had_err; }
/*
 * Infer CDS segments for the mRNA held by visitor <v> from its exons and its
 * start/stop codons.
 *
 * Inference only happens if the mRNA has no CDS yet, has at least one exon
 * and has exactly one start codon and exactly one stop codon. For every exon
 * for which infer_cds_visitor_infer_range() reports a coding part, a "CDS"
 * feature with that range is created on the exon's strand, attached to the
 * mRNA and recorded in v->cds.
 */
static void infer_cds_visitor_infer_cds(AgnInferCDSVisitor *v)
{
  GtGenomeNode **startslot, **stopslot;
  GtRange startrange, stoprange, leftrange, rightrange;
  GtUword exonidx;

  /* preconditions for inference */
  if(gt_array_size(v->cds) > 0)
    return;
  if(gt_array_size(v->exons) == 0)
    return;
  if(gt_array_size(v->starts) != 1 || gt_array_size(v->stops) != 1)
    return;
  startslot = gt_array_get(v->starts, 0);
  stopslot = gt_array_get(v->stops, 0);
  if(startslot == NULL || stopslot == NULL)
    return;

  /* on the reverse strand the stop codon is the left-most codon */
  startrange = gt_genome_node_get_range(*startslot);
  stoprange = gt_genome_node_get_range(*stopslot);
  if(gt_feature_node_get_strand(v->mrna) == GT_STRAND_REVERSE)
  {
    leftrange = stoprange;
    rightrange = startrange;
  }
  else
  {
    leftrange = startrange;
    rightrange = stoprange;
  }

  for(exonidx = 0; exonidx < gt_array_size(v->exons); exonidx++)
  {
    GtFeatureNode *exon_fn = *(GtFeatureNode **)gt_array_get(v->exons,
                                                             exonidx);
    GtGenomeNode *exon_gn = (GtGenomeNode *)exon_fn;
    GtRange exonrange = gt_genome_node_get_range(exon_gn);
    GtStrand exonstrand = gt_feature_node_get_strand(exon_fn);
    GtRange codingrange;
    GtGenomeNode *cds_gn;

    if(!infer_cds_visitor_infer_range(&exonrange, &leftrange, &rightrange,
                                      &codingrange))
    {
      continue;
    }

    cds_gn = gt_feature_node_new(gt_genome_node_get_seqid(exon_gn), "CDS",
                                 codingrange.start, codingrange.end,
                                 exonstrand);
    if(v->source)
      gt_feature_node_set_source((GtFeatureNode *)cds_gn, v->source);
    gt_feature_node_add_child(v->mrna, (GtFeatureNode *)cds_gn);
    gt_array_add(v->cds, cds_gn);
  }
}
/*
 * Attach the best usable PBS hit in <results> as a GT_PBS_TYPE feature below
 * the main node of <element>.
 *
 * If <*canonical_strand> is still unknown it is set to the strand of the
 * top-ranked hit. Otherwise the ranked hits are walked until one on the
 * canonical strand is found; if there is none, nothing is attached.
 * The main node's strand is set to the strand of the reported hit and the
 * new feature is tagged with <tag> as its source.
 */
static void pbs_attach_results_to_gff3(GtPBSResults *results,
                                       GtLTRElement *element,
                                       GtStrand *canonical_strand,
                                       GtStr *tag)
{
  char numbuf[BUFSIZ];
  unsigned long rank = 0;
  GtRange hit_range;
  GtGenomeNode *pbs_gn;
  GtFeatureNode *pbs_fn;
  GtPBSHit *hit = gt_pbs_results_get_ranked_hit(results, rank++);

  if (*canonical_strand != GT_STRAND_UNKNOWN) {
    /* do we have to satisfy a strand constraint?
     * then find best-scoring PBS on the given canonical strand */
    while (gt_pbs_hit_get_strand(hit) != *canonical_strand
             && rank < gt_pbs_results_get_number_of_hits(results)) {
      gt_log_log("dropping PBS because of nonconsistent strand: %s\n",
                 gt_feature_node_get_attribute(element->mainnode, "ID"));
      hit = gt_pbs_results_get_ranked_hit(results, rank++);
    }
    /* if there is none, do not report a PBS */
    if (gt_pbs_hit_get_strand(hit) != *canonical_strand)
      return;
  }
  else {
    *canonical_strand = gt_pbs_hit_get_strand(hit);
  }

  hit_range = gt_pbs_hit_get_coords(hit);
  /* GFF3 is 1-based */
  hit_range.start += 1;
  hit_range.end += 1;

  pbs_gn = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                        element->mainnode),
                               GT_PBS_TYPE,
                               hit_range.start,
                               hit_range.end,
                               gt_pbs_hit_get_strand(hit));
  pbs_fn = (GtFeatureNode*) pbs_gn;

  gt_feature_node_set_source(pbs_fn, tag);
  gt_feature_node_set_score(pbs_fn, (float) gt_pbs_hit_get_score(hit));
  if (gt_pbs_hit_get_trna(hit) != NULL) {
    gt_feature_node_add_attribute(pbs_fn, "trna", gt_pbs_hit_get_trna(hit));
  }
  gt_feature_node_set_strand(element->mainnode, gt_pbs_hit_get_strand(hit));

  /* numeric attributes are formatted through a shared scratch buffer */
  (void) snprintf(numbuf, BUFSIZ-1, "%lu", gt_pbs_hit_get_tstart(hit));
  gt_feature_node_add_attribute(pbs_fn, "trnaoffset", numbuf);
  (void) snprintf(numbuf, BUFSIZ-1, "%lu", gt_pbs_hit_get_offset(hit));
  gt_feature_node_add_attribute(pbs_fn, "pbsoffset", numbuf);
  (void) snprintf(numbuf, BUFSIZ-1, "%lu", gt_pbs_hit_get_edist(hit));
  gt_feature_node_add_attribute(pbs_fn, "edist", numbuf);

  gt_feature_node_add_child(element->mainnode, pbs_fn);
}
/*
 * Lua binding: add the genome node given as argument 2 as a child of the
 * genome node given as argument 1.
 *
 * Both arguments must be feature nodes, otherwise a Lua argument error
 * ("not a feature node") is raised for the offending argument. The child is
 * added via a new reference obtained with gt_genome_node_ref().
 * Pushes no results (returns 0).
 */
static int genome_node_lua_add_child(lua_State *L)
{
  GtGenomeNode **parent_gn = check_genome_node(L, 1);
  GtGenomeNode **child_gn = check_genome_node(L, 2);
  GtFeatureNode *parent_fn, *child_fn;

  parent_fn = gt_feature_node_try_cast(*parent_gn);
  luaL_argcheck(L, parent_fn, 1, "not a feature node");

  child_fn = gt_feature_node_try_cast(*child_gn);
  luaL_argcheck(L, child_fn, 2, "not a feature node");

  child_fn = (GtFeatureNode*) gt_genome_node_ref((GtGenomeNode*) child_fn);
  gt_feature_node_add_child(parent_fn, child_fn);
  return 0;
}
/*
 * Create the "thick" sub-feature of BED feature <fn> covering <range> and
 * attach it as a child of <fn>.
 *
 * The new feature takes seqid, strand and score from <fn>, copies the "Name"
 * attribute if <fn> has one, and uses the thick feature type configured in
 * <bed_parser> (falling back to BED_THICK_FEATURE_TYPE).
 */
static void construct_thick_feature(GtBEDParser *bed_parser, GtFeatureNode *fn,
                                    GtRange range)
{
  GtFeatureNode *thick_fn;
  const char *thick_type, *parent_name;
  gt_assert(fn);

  thick_type = bed_parser->thick_feature_type;
  if (!thick_type)
    thick_type = BED_THICK_FEATURE_TYPE;

  thick_fn = (GtFeatureNode*)
             gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) fn),
                                 thick_type, range.start, range.end,
                                 gt_feature_node_get_strand(fn));

  parent_name = gt_feature_node_get_attribute(fn, "Name");
  if (parent_name)
    gt_feature_node_add_attribute(thick_fn, "Name", parent_name);

  gt_feature_node_set_score(thick_fn, gt_feature_node_get_score(fn));
  gt_feature_node_set_strand(thick_fn, gt_feature_node_get_strand(fn));
  gt_feature_node_add_child(fn, thick_fn);
}
/*
 * Attach an ORF with 0-based range <orf_rng> as a GT_ORF_TYPE feature to the
 * tree rooted at <gf>.
 *
 * The range is shifted by one on both ends before use. The ORF becomes a
 * child of the last node visited by the feature node iterator that is not
 * itself of type GT_ORF_TYPE and whose range contains the shifted ORF range;
 * if no such node exists, nothing is attached. The new feature gets
 * GT_ORF_FINDER_TAG as source and a "frame" attribute holding <orf_frame>.
 */
static void orf_attach_results_to_gff3(GtFeatureNode *gf, GtRange orf_rng,
                                       unsigned int orf_frame, GtStrand strand,
                                       GT_UNUSED GtError *err)
{
  GtGenomeNode *child;
  GtStr *tag;
  GtFeatureNodeIterator *gfi;
  GtFeatureNode *curnode = NULL,
                *parent_node = NULL;
  GtRange gfi_range;
  /* large enough for any unsigned int; the former 3-byte buffer written with
     sprintf() overflowed for frame values of three or more digits */
  char frame_buf[16];

  tag = gt_str_new_cstr(GT_ORF_FINDER_TAG);
  orf_rng.start++;
  orf_rng.end++;

  /* bounded write, and a conversion specifier matching unsigned int */
  (void) snprintf(frame_buf, sizeof frame_buf, "%u", orf_frame);

  /* pick the last non-ORF node whose range contains the ORF */
  gfi = gt_feature_node_iterator_new(gf);
  while ((curnode = gt_feature_node_iterator_next(gfi))) {
    if (strcmp(gt_feature_node_get_type(curnode),
               (const char*) GT_ORF_TYPE) != 0) {
      gfi_range = gt_genome_node_get_range((GtGenomeNode*) curnode);
      if (gt_range_contains(&gfi_range, &orf_rng)) {
        parent_node = curnode;
      }
    }
  }

  if (parent_node) {
    child = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) gf),
                                GT_ORF_TYPE, orf_rng.start, orf_rng.end,
                                strand);
    gt_feature_node_set_source((GtFeatureNode*) child, tag);
    gt_feature_node_set_attribute((GtFeatureNode*) child, "frame", frame_buf);
    gt_feature_node_add_child(parent_node, (GtFeatureNode*) child);
  }

  gt_str_delete(tag);
  gt_feature_node_iterator_delete(gfi);
}
/*
 * Attach the best usable PPT hit in <results> as a GT_PPT_TYPE feature below
 * the main node of <element>.
 *
 * If <*canonical_strand> is still unknown it is set to the strand of the
 * top-ranked hit. Otherwise the ranked hits are walked until one on the
 * canonical strand is found; if there is none, nothing is attached.
 * The main node's strand is set to the strand of the reported hit and the
 * new feature is tagged with <tag> as its source.
 */
static void ppt_attach_results_to_gff3(GtPPTResults *results,
                                       GtLTRElement *element,
                                       GtStrand *canonical_strand,
                                       GtStr *tag)
{
  unsigned long rank = 0;
  GtRange hit_range;
  GtFeatureNode *ppt_fn;
  GtPPTHit *hit = gt_ppt_results_get_ranked_hit(results, rank++);

  if (*canonical_strand != GT_STRAND_UNKNOWN) {
    /* find best-scoring PPT on the given canonical strand */
    while (gt_ppt_hit_get_strand(hit) != *canonical_strand
             && rank < gt_ppt_results_get_number_of_hits(results)) {
      gt_log_log("dropping PPT because of nonconsistent strand: %s\n",
                 gt_feature_node_get_attribute(element->mainnode, "ID"));
      hit = gt_ppt_results_get_ranked_hit(results, rank++);
    }
    /* if there is none, do not report a PPT */
    if (gt_ppt_hit_get_strand(hit) != *canonical_strand)
      return;
  }
  else {
    *canonical_strand = gt_ppt_hit_get_strand(hit);
  }

  hit_range = gt_ppt_hit_get_coords(hit);
  /* GFF3 is 1-based */
  hit_range.start += 1;
  hit_range.end += 1;

  ppt_fn = (GtFeatureNode*)
           gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                        element->mainnode),
                               GT_PPT_TYPE,
                               hit_range.start,
                               hit_range.end,
                               gt_ppt_hit_get_strand(hit));
  gt_feature_node_set_source(ppt_fn, tag);
  gt_feature_node_set_strand(element->mainnode, gt_ppt_hit_get_strand(hit));
  gt_feature_node_add_child(element->mainnode, ppt_fn);
}
/*
 * Child traversal callback of the inter-feature visitor passed via <data>.
 *
 * Features that are not of the visitor's outside type are ignored. For a
 * feature of that type, an inter-feature of the visitor's inter type is
 * placed between the previously seen outside feature and this one (strictly
 * between their ranges, on the strand of the previous feature, as a child of
 * the visitor's parent feature). If the two features overlap or leave no
 * room in between, a warning is issued and the function returns without
 * updating the remembered previous feature. Always returns 0.
 */
static int inter_feature_in_children(GtFeatureNode *current_feature,
                                     void *data, GT_UNUSED GtError *err)
{
  GtInterFeatureVisitor *visitor = (GtInterFeatureVisitor*) data;
  gt_error_check(err);
  gt_assert(current_feature);

  if (!gt_feature_node_has_type(current_feature, visitor->outside_type))
    return 0;

  if (visitor->previous_feature) {
    GtFeatureNode *gap_node;
    GtStrand gap_strand;
    GtStr *seqid;
    GtRange gap_range,
            left = gt_genome_node_get_range((GtGenomeNode*)
                                            visitor->previous_feature),
            right = gt_genome_node_get_range((GtGenomeNode*)
                                             current_feature);

    if (left.end >= right.start) {
      gt_warning("overlapping boundary features " GT_WU "-" GT_WU " and "
                 GT_WU "-" GT_WU ", " "not placing '%s' inter-feature",
                 left.start, left.end, right.start, right.end,
                 visitor->inter_type);
      return 0;
    }
    if (right.start - left.end < 2) {
      gt_warning("no space for inter-feature '%s' between " GT_WU " and "
                 GT_WU, visitor->inter_type, left.end, right.start);
      return 0;
    }

    /* the gap lies strictly between the two boundary features */
    gap_range.start = left.end + 1;
    gap_range.end = right.start - 1;

    /* both boundary features are expected to share one strand */
    gap_strand = gt_feature_node_get_strand(visitor->previous_feature);
    gt_assert(gap_strand == gt_feature_node_get_strand(current_feature));

    /* ... and the sequence id of the parent */
    seqid = gt_genome_node_get_seqid((GtGenomeNode*) visitor->parent_feature);
    gt_assert(!gt_str_cmp(seqid,
                          gt_genome_node_get_seqid((GtGenomeNode*)
                                                 visitor->previous_feature)));
    gt_assert(!gt_str_cmp(seqid,
                          gt_genome_node_get_seqid((GtGenomeNode*)
                                                   current_feature)));

    gap_node = (GtFeatureNode*) gt_feature_node_new(seqid,
                                                    visitor->inter_type,
                                                    gap_range.start,
                                                    gap_range.end,
                                                    gap_strand);
    gt_feature_node_add_child(visitor->parent_feature, gap_node);
  }

  visitor->previous_feature = current_feature;
  return 0;
}
/*
 * Hashmap callback building one mRNA feature from the features of a single
 * transcript.
 *
 * <key> is the transcript id, <value> the GtArray of genome nodes belonging
 * to that transcript and <data> the ConstructionInfo.
 *
 * Steps:
 *  1. Nodes carrying GTF_PARSER_STOP_CODON_FLAG are merged into a directly
 *     adjacent CDS node (whose range is extended) and then removed from the
 *     array. A stop codon contained in a CDS, or without a flanking CDS, is
 *     a warning in tidy mode and an error otherwise.
 *  2. Range, strand and seqid of the mRNA are derived from the nodes; a
 *     strand or seqid mismatch is an error.
 *  3. The mRNA node is created (attributes ID, transcript_id and, if a
 *     non-empty name is mapped, Name), receives a new reference to every
 *     remaining node as child and is appended to cinfo->mRNAs.
 *
 * Returns 0 on success, -1 with <err> set otherwise.
 */
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
   /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* determine the range and the strand of the mRNA */
  /* Hold our own reference on the first node: the stop codon pass below may
     remove and release exactly this node, while first_node and mRNA_seqid
     (the seqid obtained from it) are still used afterwards. The reference is
     dropped right before returning. */
  first_node = gt_genome_node_ref(*(GtGenomeNode**)
                                  gt_array_get(gt_genome_node_array, 0));
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);

  /* TODO: support discontinuous start/stop codons */
  /* NOTE(review): after gt_array_rem() the loop still advances i, so the
     node that moves into slot i is not examined in this pass -- confirm
     whether that is intended before changing it. */
  for (i = 0; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    if (gt_feature_node_get_attribute((GtFeatureNode*) gn,
                                      GTF_PARSER_STOP_CODON_FLAG)) {
      GtUword j;
      GtRange stop_codon_rng = gt_genome_node_get_range(gn);
      bool found_cds = false;
      /* look for a CDS that contains or directly flanks the stop codon */
      for (j = 0; !had_err && j < gt_array_size(gt_genome_node_array); j++) {
        GtGenomeNode* gn2;
        GtRange this_rng;
        const char *this_type;
        gn2 = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, j);
        if (gn == gn2) continue;
        this_rng = gt_genome_node_get_range(gn2);
        this_type = gt_feature_node_get_type((GtFeatureNode*) gn2);
        if (this_type == gt_symbol(gt_ft_CDS)) {
          if (gt_range_contains(&this_rng, &stop_codon_rng)) {
            if (cinfo->tidy) {
              gt_warning("stop codon on line %u in file %s is contained in "
                         "CDS in line %u",
                         gt_genome_node_get_line_number(gn),
                         gt_genome_node_get_filename(gn),
                         gt_genome_node_get_line_number(gn2));
              found_cds = true;
            } else {
              gt_error_set(err, "stop codon on line %u in file %s is "
                                "contained in CDS in line %u",
                           gt_genome_node_get_line_number(gn),
                           gt_genome_node_get_filename(gn),
                           gt_genome_node_get_line_number(gn2));
              had_err = -1;
            }
            break;
          }
          /* CDS ends right before the stop codon: extend it to the right */
          if (this_rng.end + 1 == stop_codon_rng.start) {
            this_rng.end = stop_codon_rng.end;
            gt_genome_node_set_range(gn2, &this_rng);
            found_cds = true;
            break;
          }
          /* CDS starts right after the stop codon: extend it to the left */
          if (this_rng.start == stop_codon_rng.end + 1) {
            this_rng.start = stop_codon_rng.start;
            gt_genome_node_set_range(gn2, &this_rng);
            found_cds = true;
            break;
          }
        }
      }
      if (!found_cds) {
        if (!had_err) {
          if (cinfo->tidy) {
            gt_warning("found stop codon on line %u in file %s with no "
                       "flanking CDS, ignoring it",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn));
          } else {
            gt_error_set(err, "found stop codon on line %u in file %s with no "
                              "flanking CDS",
                         gt_genome_node_get_line_number(gn),
                         gt_genome_node_get_filename(gn));
            had_err = -1;
            break;
          }
        }
      } else {
        /* the stop codon is now covered by a CDS: drop it from the array */
        gt_array_rem(gt_genome_node_array, i);
        gt_genome_node_delete(gn);
      }
    }
  }

  for (i = 1; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    GtStrand strand;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    if (strand != mRNA_strand) {
      gt_error_set(err, "feature %s on line %u has strand %c, but the "
                        "parent transcript has strand %c",
                   (const char*) key,
                   gt_genome_node_get_line_number(gn),
                   GT_STRAND_CHARS[strand],
                   GT_STRAND_CHARS[mRNA_strand]);
      had_err = -1;
      break;
    } else {
      mRNA_strand = gt_strand_join(mRNA_strand, strand);
    }
    if (!had_err && gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                "genomic sequences (``seqname''), although they have the same "
                "gene IDs (``gene_id'') which must be globally unique",
                gt_genome_node_get_line_number(first_node),
                gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "ID", key);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "transcript_id",
                                  key);

    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                              (const char*) key)) && strlen(tname) > 0) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                    tname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gt_genome_node_ref(gn));
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  /* release the reference taken at the top of the function */
  gt_genome_node_delete(first_node);

  return had_err;
}
GtGenomeNode* gt_feature_node_new_standard_gene(void) { GtGenomeNode *fn, *child, *grand; GtStr *seqid; seqid = gt_str_new_cstr("ctg123"); /* gene */ fn = gt_feature_node_new(seqid, gt_ft_gene, 1000, 9000, GT_STRAND_FORWARD); /* TF binding site */ child = gt_feature_node_new(seqid, gt_ft_TF_binding_site, 1000, 1012, GT_STRAND_FORWARD); gt_feature_node_add_child((GtFeatureNode*) fn, (GtFeatureNode*) child); /* first mRNA */ child = gt_feature_node_new(seqid, gt_ft_mRNA, 1050, 9000, GT_STRAND_FORWARD); gt_feature_node_add_child((GtFeatureNode*) fn, (GtFeatureNode*) child); grand = gt_feature_node_new(seqid, gt_ft_exon, 1050, 1500, GT_STRAND_FORWARD); gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand); grand = gt_feature_node_new(seqid, gt_ft_exon, 3000, 3902, GT_STRAND_FORWARD); gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand); grand = gt_feature_node_new(seqid, gt_ft_exon, 5000, 5500, GT_STRAND_FORWARD); gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand); grand = gt_feature_node_new(seqid, gt_ft_exon, 7000, 9000, GT_STRAND_FORWARD); gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand); /* second mRNA */ child = gt_feature_node_new(seqid, gt_ft_mRNA, 1050, 9000, GT_STRAND_FORWARD); gt_feature_node_add_child((GtFeatureNode*) fn, (GtFeatureNode*) child); grand = gt_feature_node_new(seqid, gt_ft_exon, 1050, 1500, GT_STRAND_FORWARD); gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand); grand = gt_feature_node_new(seqid, gt_ft_exon, 5000, 5500, GT_STRAND_FORWARD); gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand); grand = gt_feature_node_new(seqid, gt_ft_exon, 7000, 9000, GT_STRAND_FORWARD); gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand); /* third mRNA */ child = gt_feature_node_new(seqid, gt_ft_mRNA, 1300, 9000, GT_STRAND_FORWARD); gt_feature_node_add_child((GtFeatureNode*) fn, (GtFeatureNode*) child); grand 
= gt_feature_node_new(seqid, gt_ft_exon, 1300, 1500, GT_STRAND_FORWARD); gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand); grand = gt_feature_node_new(seqid, gt_ft_exon, 3000, 3902, GT_STRAND_FORWARD); gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand); grand = gt_feature_node_new(seqid, gt_ft_exon, 5000, 5500, GT_STRAND_FORWARD); gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand); grand = gt_feature_node_new(seqid, gt_ft_exon, 7000, 9000,GT_STRAND_FORWARD); gt_feature_node_add_child((GtFeatureNode*) child, (GtFeatureNode*) grand); gt_str_delete(seqid); return fn; }
/*
 * Hashmap callback building one mRNA feature from the features of a single
 * transcript.
 *
 * <key> is the transcript id, <value> the GtArray of genome nodes belonging
 * to that transcript and <data> the ConstructionInfo.
 *
 * The mRNA spans the joined range of all nodes and takes strand and seqid
 * from them. All nodes must share one strand and one seqid, otherwise <err>
 * is set and -1 is returned. On success the mRNA node (with a Name attribute
 * if one is mapped for the transcript id) gets all nodes as children and is
 * appended to cinfo->mRNAs; 0 is returned.
 */
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
   /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* determine the range and the strand of the mRNA */
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);
  for (i = 1; i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    GtStrand strand;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    /* Report mismatching strands as a regular error before joining them;
       without this check gt_strand_join() can run into a failed assertion. */
    strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    if (strand != mRNA_strand) {
      gt_error_set(err, "feature %s on line %u is on a different strand than "
                        "the other features of its transcript",
                   (const char*) key, gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
    mRNA_strand = gt_strand_join(mRNA_strand, strand);
    if (gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                "genomic sequences (``seqname''), although they have the same "
                "gene IDs (``gene_id'') which must be globally unique",
                gt_genome_node_get_line_number(first_node),
                gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);

    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                              (const char*) key))) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                    tname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gn);
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}
/*
 * Unit test for GtTrack.
 *
 * Builds four gene/exon pairs wrapped in one block each, inserts the blocks
 * into a fresh track one by one and checks the number of lines and the
 * track height after each step. Afterwards the "bar_height" style value is
 * set and unset for "exon", "gene" and "format" and the expected height is
 * checked again. Returns the had_err value maintained by gt_ensure().
 */
int gt_track_unit_test(GtError *err) {
  int had_err = 0;
  GtBlock *b[4];
  GtRange r[4];
  GtTrack *track;
  GtGenomeNode *parent[4], *gn[4];
  GtStr *title;
  double height, tmp;
  GtStyle *sty;
  unsigned long i;
  GtLineBreaker *lb;
  /* t_rest: height contribution per track, l_rest: spacing per line */
  double t_rest = 0, l_rest = 0;
  gt_error_check(err);

  title = gt_str_new_cstr("test");

  /* ranges of the four test blocks */
  r[0].start=100UL; r[0].end=1000UL;
  r[1].start=1001UL; r[1].end=1500UL;
  r[2].start=700UL; r[2].end=1200UL;
  r[3].start=10UL; r[3].end=200UL;

  /* one gene with one exon child per range */
  for (i=0; i<4; i++) {
    parent[i] = gt_feature_node_new(title, gt_ft_gene, r[i].start, r[i].end, GT_STRAND_FORWARD);
    gn[i] = gt_feature_node_new(title, gt_ft_exon, r[i].start, r[i].end, GT_STRAND_FORWARD);
    gt_feature_node_add_child((GtFeatureNode*) parent[i], (GtFeatureNode*) gn[i]);
    gt_feature_node_add_attribute((GtFeatureNode*) parent[i], GT_GFF_NAME, "parent");
    gt_feature_node_add_attribute((GtFeatureNode*) gn[i], GT_GFF_NAME, "child");
  }

  /* one block per gene/exon pair */
  for (i=0; i<4; i++) {
    b[i] = gt_block_new();
    gt_block_set_range(b[i], r[i]);
    gt_block_insert_element(b[i], (GtFeatureNode*) parent[i]);
    gt_block_insert_element(b[i], (GtFeatureNode*) gn[i]);
  }

  lb = gt_line_breaker_bases_new();

  /* collect the style values the expected heights are computed from,
     falling back to the compiled-in defaults if a value is not set */
  sty = gt_style_new(err);
  if (gt_style_get_num(sty, "format", "track_caption_font_size", &tmp, NULL, err) == GT_STYLE_QUERY_NOT_SET) {
    tmp = TEXT_SIZE_DEFAULT;
  }
  t_rest += tmp;
  if (gt_style_get_num(sty, "format", "track_caption_space", &tmp, NULL, err) == GT_STYLE_QUERY_NOT_SET) {
    tmp = CAPTION_BAR_SPACE_DEFAULT;
  }
  t_rest += tmp;
  if (gt_style_get_num(sty, "format", "track_vspace", &tmp, NULL, err) == GT_STYLE_QUERY_NOT_SET) {
    tmp = TRACK_VSPACE_DEFAULT;
  }
  t_rest += tmp;
  if (gt_style_get_num(sty, "format", "bar_vspace", &l_rest, NULL, err) == GT_STYLE_QUERY_NOT_SET) {
    l_rest = BAR_VSPACE_DEFAULT;
  }

  /* empty track: no lines, height is the per-track part only */
  track = gt_track_new(title, GT_UNDEF_ULONG, true, lb);
  gt_ensure(had_err, track);
  gt_ensure(had_err, gt_track_get_title(track) == title);
  gt_ensure(had_err, gt_track_get_number_of_lines(track) == 0);
  gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
  gt_ensure(had_err, height == t_rest);
  gt_ensure(had_err, !gt_error_is_set(err));

  /* first block (100-1000): one line expected */
  gt_ensure(had_err, gt_track_insert_block(track, b[0], err) == 0);
  gt_ensure(had_err, !gt_error_is_set(err));
  gt_ensure(had_err, gt_track_get_number_of_lines(track) == 1);
  gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
  gt_ensure(had_err, height == t_rest + l_rest + BAR_HEIGHT_DEFAULT);
  gt_ensure(had_err, !gt_error_is_set(err));

  /* second block (1001-1500): still one line expected */
  gt_ensure(had_err, gt_track_insert_block(track, b[1], err) == 0);
  gt_ensure(had_err, !gt_error_is_set(err));
  gt_ensure(had_err, gt_track_get_number_of_lines(track) == 1);
  gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
  gt_ensure(had_err, height == t_rest + l_rest + BAR_HEIGHT_DEFAULT);
  gt_ensure(had_err, !gt_error_is_set(err));

  /* third block (700-1200): a second line is expected */
  gt_ensure(had_err, gt_track_insert_block(track, b[2], err) == 0);
  gt_ensure(had_err, !gt_error_is_set(err));
  gt_ensure(had_err, gt_track_get_number_of_lines(track) == 2);

  /* fourth block (10-200): line count is expected to stay at two */
  gt_ensure(had_err, gt_track_insert_block(track, b[3], err) == 0);
  gt_ensure(had_err, !gt_error_is_set(err));
  gt_ensure(had_err, gt_track_get_number_of_lines(track) == 2);
  gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
  gt_ensure(had_err, height == t_rest + 2*(l_rest + BAR_HEIGHT_DEFAULT));
  gt_ensure(had_err, !gt_error_is_set(err));

  /* bar height set for exons only: expected to be used for both lines */
  gt_style_set_num(sty, "exon", "bar_height", 42);
  gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
  gt_ensure(had_err, height == t_rest + 2*(l_rest+42));
  gt_ensure(had_err, !gt_error_is_set(err));

  /* additionally set a smaller value for genes: height expected unchanged */
  gt_style_set_num(sty, "gene", "bar_height", 23);
  gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
  gt_ensure(had_err, height == t_rest + 2*(l_rest+42));
  gt_ensure(had_err, !gt_error_is_set(err));

  /* exon value removed: the gene value is expected to apply */
  gt_style_unset(sty, "exon", "bar_height");
  gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
  gt_ensure(had_err, height == t_rest + 2*(l_rest+23));
  gt_ensure(had_err, !gt_error_is_set(err));

  /* gene value removed, "format" value set: that one is expected to apply */
  gt_style_unset(sty, "gene", "bar_height");
  gt_style_set_num(sty, "format", "bar_height", 99);
  gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
  gt_ensure(had_err, height == t_rest + 2*(l_rest+99));
  gt_ensure(had_err, !gt_error_is_set(err));

  gt_ensure(had_err, gt_track_get_number_of_discarded_blocks(track) == 0);

  /* clean up; the exon nodes gn[i] are not deleted separately here */
  gt_track_delete(track);
  gt_str_delete(title);
  gt_style_delete(sty);
  for (i=0; i<4; i++) {
    gt_block_delete(b[i]);
    gt_genome_node_delete(parent[i]);
  }
  return had_err;
}
/*
 * Infer UTR features for the mRNA held by visitor <v>.
 *
 * Nothing is done if the mRNA already has UTRs, or if it has neither an
 * explicit CDS nor the combination of exons, exactly one start codon and
 * exactly one stop codon. Inference also needs at least one start and one
 * stop codon to read coordinates from; if either array is empty the function
 * returns without creating anything.
 *
 * For every exon, the part lying left of the left-most codon and the part
 * lying right of the right-most codon become UTR features ("five_prime_UTR"
 * / "three_prime_UTR", swapped on the reverse strand). They are attached to
 * the mRNA and recorded in v->utrs.
 */
static void infer_cds_visitor_infer_utrs(AgnInferCDSVisitor *v)
{
  bool exonsexplicit = gt_array_size(v->exons) > 0;
  bool cdsexplicit = gt_array_size(v->cds) > 0;
  bool startcodon_check = gt_array_size(v->starts) == 1 &&
                          gt_array_get(v->starts, 0) != NULL;
  bool stopcodon_check  = gt_array_size(v->stops) == 1 &&
                          gt_array_get(v->stops, 0) != NULL;
  bool caninferutrs = exonsexplicit && startcodon_check && stopcodon_check;

  if(gt_array_size(v->utrs) > 0)
  {
    return;
  }
  else if(!cdsexplicit && !caninferutrs)
  {
    return;
  }

  /* With an explicit CDS the codon checks above are not required to hold, so
     make sure element 0 of both arrays exists before reading it. */
  if(gt_array_size(v->starts) == 0 || gt_array_size(v->stops) == 0)
    return;

  GtGenomeNode **leftcodon = gt_array_get(v->starts, 0);
  GtGenomeNode **rightcodon = gt_array_get(v->stops, 0);
  GtStrand strand = gt_feature_node_get_strand(v->mrna);
  const char *lefttype  = "five_prime_UTR";
  const char *righttype = "three_prime_UTR";
  if(strand == GT_STRAND_REVERSE)
  {
    /* on the reverse strand the stop codon is the left-most codon */
    lefttype  = "three_prime_UTR";
    righttype = "five_prime_UTR";
    GtGenomeNode **temp = leftcodon;
    leftcodon = rightcodon;
    rightcodon = temp;
  }
  GtRange leftrange  = gt_genome_node_get_range(*leftcodon);
  GtRange rightrange = gt_genome_node_get_range(*rightcodon);

  GtUword i;
  for(i = 0; i < gt_array_size(v->exons); i++)
  {
    GtGenomeNode **exon = gt_array_get(v->exons, i);
    GtRange exonrange = gt_genome_node_get_range(*exon);

    /* part of the exon left of the left codon */
    if(exonrange.start < leftrange.start)
    {
      GtRange utrrange;
      if(gt_range_overlap(&exonrange, &leftrange))
      {
        utrrange.start = exonrange.start;
        utrrange.end = leftrange.start - 1;
      }
      else
      {
        utrrange = exonrange;
      }
      GtGenomeNode *utr = gt_feature_node_new(gt_genome_node_get_seqid(*exon),
                                              lefttype, utrrange.start,
                                              utrrange.end, strand);
      if(v->source)
        gt_feature_node_set_source((GtFeatureNode *)utr, v->source);
      gt_feature_node_add_child(v->mrna, (GtFeatureNode *)utr);
      gt_array_add(v->utrs, utr);
    }

    /* part of the exon right of the right codon */
    if(exonrange.end > rightrange.end)
    {
      GtRange utrrange;
      if(gt_range_overlap(&exonrange, &rightrange))
      {
        utrrange.start = rightrange.end + 1;
        utrrange.end = exonrange.end;
      }
      else
      {
        utrrange = exonrange;
      }
      GtGenomeNode *utr = gt_feature_node_new(gt_genome_node_get_seqid(*exon),
                                              righttype, utrrange.start,
                                              utrrange.end, strand);
      if(v->source)
        gt_feature_node_set_source((GtFeatureNode *)utr, v->source);
      gt_feature_node_add_child(v->mrna, (GtFeatureNode *)utr);
      gt_array_add(v->utrs, utr);
    }
  }
}
/*
 * Hashmap callback building one gene feature from all transcripts of a gene.
 *
 * <key> is the gene id, <value> the hashmap of the gene's transcripts and
 * <data> the ConstructionInfo. The mRNAs are built via construct_mRNAs()
 * into a temporary array published through cinfo->mRNAs. On success the gene
 * spans the joined range of all mRNAs, takes strand and seqid from them,
 * gets a Name attribute if one is mapped for the gene id, receives the mRNAs
 * as children and is added to cinfo->genome_nodes.
 *
 * Returns 0 on success, otherwise the error code of the mRNA construction
 * (with <err> set by it).
 */
static int construct_genes(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  GtHashmap *transcript_id_hash = (GtHashmap*) value;
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtQueue *genome_nodes = cinfo->genome_nodes;
  const char *gname;
  GtArray *mRNAs = gt_array_new(sizeof (GtGenomeNode*));
  GtGenomeNode *gene_node, *gn;
  GtStrand gene_strand;
  GtRange gene_range;
  GtStr *gene_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);

  cinfo->mRNAs = mRNAs;
  had_err = gt_hashmap_foreach(transcript_id_hash, construct_mRNAs, cinfo,
                               err);
  if (!had_err) {
    gt_assert(gt_array_size(mRNAs)); /* at least one mRNA constructed */

    /* determine the range and the strand of the gene */
    gn = *(GtGenomeNode**) gt_array_get(mRNAs, 0);
    gene_range = gt_genome_node_get_range(gn);
    gene_strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    gene_seqid = gt_genome_node_get_seqid(gn);
    for (i = 1; i < gt_array_size(mRNAs); i++) {
      GtRange range;
      gn = *(GtGenomeNode**) gt_array_get(mRNAs, i);
      range = gt_genome_node_get_range(gn);
      gene_range = gt_range_join(&gene_range, &range);
      gene_strand = gt_strand_join(gene_strand,
                          gt_feature_node_get_strand((GtFeatureNode*) gn));
      gt_assert(gt_str_cmp(gene_seqid, gt_genome_node_get_seqid(gn)) == 0);
    }

    gene_node = gt_feature_node_new(gene_seqid, gt_ft_gene, gene_range.start,
                                    gene_range.end, gene_strand);

    if ((gname = gt_hashmap_get(cinfo->gene_id_to_name_mapping,
                              (const char*) key))) {
      gt_feature_node_add_attribute((GtFeatureNode*) gene_node, GT_GFF_NAME,
                                    gname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(mRNAs); i++) {
      gn = *(GtGenomeNode**) gt_array_get(mRNAs, i);
      gt_feature_node_add_child((GtFeatureNode*) gene_node,
                                (GtFeatureNode*) gn);
    }

    /* store the gene */
    gt_queue_add(genome_nodes, gene_node);
  }

  /* free the temporary array on every path, not only on success.
     NOTE(review): mRNA nodes already stored in it when an error occurs are
     left untouched here, since their ownership is not visible from this
     function -- confirm who releases them. */
  gt_array_delete(mRNAs);

  return had_err;
}