/**
 * Collect the portions of a gene model's exons that intersect an alignment.
 *
 * Every exon selected from the gene model is intersected with every feature
 * reached by iterating over the alignment node, except features of type
 * "match_gap". Intersections that compare equal to the {0, 0} range are
 * discarded. Before returning, the function asserts that no two recorded
 * ranges overlap.
 *
 * @param genemodel genome node that must cast to a feature node of type "mRNA"
 * @param alignment genome node of the alignment to intersect with
 * @return NULL if the two features are on different strands; otherwise a new
 *         array of GtRange values, which is created here and returned without
 *         being freed
 */
static GtArray* gaeval_visitor_intersect(GtGenomeNode *genemodel,
                                         GtGenomeNode *alignment)
{
  agn_assert(genemodel && alignment);

  GtFeatureNode *genefn = gt_feature_node_cast(genemodel);
  GtFeatureNode *algnfn = gt_feature_node_cast(alignment);
  agn_assert(gt_feature_node_has_type(genefn, "mRNA"));

  GtStrand genestrand = gt_feature_node_get_strand(genefn);
  GtStrand algnstrand = gt_feature_node_get_strand(algnfn);
  if(genestrand != algnstrand)
    return NULL;

  GtArray *covered_parts = gt_array_new( sizeof(GtRange) );
  GtArray *exons = agn_typecheck_select(genefn, agn_typecheck_exon);
  GtRange nullrange = {0, 0};
  /* Unsigned indices: gt_array_size() is compared against them directly */
  GtUword i, j;

  for(i = 0; i < gt_array_size(exons); i++)
  {
    GtGenomeNode *exon = *(GtGenomeNode **)gt_array_get(exons, i);
    GtRange exonrange = gt_genome_node_get_range(exon);
    GtFeatureNodeIterator *aniter = gt_feature_node_iterator_new(algnfn);
    GtFeatureNode *tempaln;

    while((tempaln = gt_feature_node_iterator_next(aniter)) != NULL)
    {
      if(gt_feature_node_has_type(tempaln, "match_gap"))
        continue;

      GtRange alnrange = gt_genome_node_get_range((GtGenomeNode *)tempaln);
      GtRange intr = gaeval_visitor_range_intersect(&exonrange, &alnrange);
      if(gt_range_compare(&intr, &nullrange) != 0)
        gt_array_add(covered_parts, intr);
    }
    gt_feature_node_iterator_delete(aniter);
  }
  gt_array_delete(exons);

  /* Sanity check: the recorded ranges must be pairwise non-overlapping */
  for(i = 0; i < gt_array_size(covered_parts); i++)
  {
    GtRange *r1 = gt_array_get(covered_parts, i);
    for(j = i + 1; j < gt_array_size(covered_parts); j++)
    {
      GtRange *r2 = gt_array_get(covered_parts, j);
      agn_assert(gt_range_overlap(r1, r2) == false);
    }
  }

  return covered_parts;
}
/**
 * Print the exon structure of a transcript as a location string.
 *
 * The direct children of the transcript that are exon features are collected
 * and sorted with agn_gt_genome_node_compare. A single exon is written as
 * "<start..>end"; several exons are written as "join(<s1..e1,...,sN..>eN)".
 * For transcripts on the reverse strand the whole string is wrapped in
 * "complement(...)".
 *
 * @param transcript feature whose direct exon children are printed; at least
 *                   one exon is required (asserted)
 * @param outstream  stream the location string is written to
 */
void agn_transcript_structure_gbk(GtFeatureNode *transcript, FILE *outstream)
{
  gt_assert(transcript && outstream);

  GtArray *exons = gt_array_new( sizeof(GtFeatureNode *) );
  GtFeatureNodeIterator *iter = gt_feature_node_iterator_new_direct(transcript);
  GtFeatureNode *child;
  while((child = gt_feature_node_iterator_next(iter)) != NULL)
  {
    if(agn_gt_feature_node_is_exon_feature(child))
      gt_array_add(exons, child);
  }
  gt_feature_node_iterator_delete(iter);

  gt_assert(gt_array_size(exons) > 0);
  gt_array_sort(exons, (GtCompare)agn_gt_genome_node_compare);

  GtStrand strand = gt_feature_node_get_strand(transcript);
  GtUword numexons = gt_array_size(exons);

  if(strand == GT_STRAND_REVERSE)
    fputs("complement(", outstream);

  if(numexons == 1)
  {
    GtGenomeNode *exon = *(GtGenomeNode **)gt_array_get(exons, 0);
    GtRange exonrange = gt_genome_node_get_range(exon);
    fprintf(outstream, "<%lu..>%lu", exonrange.start, exonrange.end);
  }
  else
  {
    GtUword i;
    fputs("join(", outstream);
    for(i = 0; i < numexons; i++)
    {
      GtGenomeNode *exon = *(GtGenomeNode **)gt_array_get(exons, i);
      GtRange exonrange = gt_genome_node_get_range(exon);

      if(i == 0)
        fprintf(outstream, "<%lu..%lu", exonrange.start, exonrange.end);
      else if(i + 1 == numexons)
        fprintf(outstream, ",%lu..>%lu", exonrange.start, exonrange.end);
      else
        fprintf(outstream, ",%lu..%lu", exonrange.start, exonrange.end);
    }
    fputs(")", outstream);
  }

  if(strand == GT_STRAND_REVERSE)
    fputs(")", outstream);

  /* The array only holds borrowed node pointers; release the array itself,
     which was previously leaked on every call */
  gt_array_delete(exons);
}
static int create_block_features(GtBEDParser *bed_parser, GtFeatureNode *fn, GtUword block_count, GtSplitter *size_splitter, GtSplitter *start_splitter, GtIO *bed_file, GtError *err) { GtUword i; int had_err = 0; gt_assert(fn && block_count && size_splitter && start_splitter); gt_assert(gt_splitter_size(size_splitter) == block_count); gt_assert(gt_splitter_size(start_splitter) == block_count); for (i = 0; !had_err && i < block_count; i++) { GtUword block_size, block_start, start, end; GtGenomeNode *block; const char *name; if (gt_parse_uword(&block_size, gt_splitter_get_token(size_splitter, i))) { gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockSize '%s'", gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file), gt_splitter_get_token(size_splitter, i)); had_err = -1; } if (!had_err && gt_parse_uword(&block_start, gt_splitter_get_token(start_splitter, i))) { gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockStart " "'%s'", gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file), gt_splitter_get_token(start_splitter, i)); had_err = -1; } if (!had_err) { start = gt_genome_node_get_start((GtGenomeNode*) fn) + block_start; end = start + block_size - 1; block = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) fn), bed_parser->block_type ? bed_parser->block_type : BED_BLOCK_TYPE, start, end, gt_feature_node_get_strand(fn)); if ((name = gt_feature_node_get_attribute(fn, GT_GFF_NAME))) { gt_feature_node_add_attribute((GtFeatureNode*) block, GT_GFF_NAME, name); } gt_feature_node_set_score((GtFeatureNode*) block, gt_feature_node_get_score(fn)); gt_feature_node_set_strand((GtFeatureNode*) block, gt_feature_node_get_strand(fn)); gt_feature_node_add_child(fn, (GtFeatureNode*) block); } } return had_err; }
/**
 * Create CDS features for the visitor's current mRNA from its exons and its
 * start/stop codons.
 *
 * Nothing is done if CDS features are already recorded, if no exons are
 * recorded, or unless exactly one start codon and exactly one stop codon are
 * recorded. Otherwise each exon is passed to infer_cds_visitor_infer_range
 * together with the left and right codon ranges; whenever that helper returns
 * true, a "CDS" feature spanning the range it filled in is created with the
 * exon's seqid and strand, given the visitor's source (if any), added as a
 * child of the mRNA and appended to v->cds.
 */
static void infer_cds_visitor_infer_cds(AgnInferCDSVisitor *v)
{
  /* Explicitly provided CDS segments take precedence over inference */
  if(gt_array_size(v->cds) > 0)
    return;

  /* Inference needs exons plus a unique start and a unique stop codon */
  if(gt_array_size(v->exons) == 0)
    return;
  if(gt_array_size(v->starts) != 1 || gt_array_size(v->stops) != 1)
    return;

  GtGenomeNode **startp = gt_array_get(v->starts, 0);
  GtGenomeNode **stopp  = gt_array_get(v->stops, 0);
  if(startp == NULL || stopp == NULL)
    return;

  /* On the reverse strand the stop codon is the leftmost of the two */
  GtRange startrange = gt_genome_node_get_range(*startp);
  GtRange stoprange  = gt_genome_node_get_range(*stopp);
  bool reverse = gt_feature_node_get_strand(v->mrna) == GT_STRAND_REVERSE;
  GtRange left_codon_range  = reverse ? stoprange  : startrange;
  GtRange right_codon_range = reverse ? startrange : stoprange;

  GtUword n;
  for(n = 0; n < gt_array_size(v->exons); n++)
  {
    GtFeatureNode *exonfn = *(GtFeatureNode **)gt_array_get(v->exons, n);
    GtGenomeNode *exongn = (GtGenomeNode *)exonfn;
    GtRange exonrange = gt_genome_node_get_range(exongn);
    GtStrand exonstrand = gt_feature_node_get_strand(exonfn);
    GtRange cdsrange;

    if(!infer_cds_visitor_infer_range(&exonrange, &left_codon_range,
                                      &right_codon_range, &cdsrange))
    {
      continue;
    }

    GtGenomeNode *cdsfeat = gt_feature_node_new(gt_genome_node_get_seqid(exongn),
                                                "CDS", cdsrange.start,
                                                cdsrange.end, exonstrand);
    if(v->source)
      gt_feature_node_set_source((GtFeatureNode *)cdsfeat, v->source);
    gt_feature_node_add_child(v->mrna, (GtFeatureNode *)cdsfeat);
    gt_array_add(v->cds, cdsfeat);
  }
}
/**
 * Determine the range spanned by the CDS features of a transcript.
 *
 * Only direct children for which agn_gt_feature_node_is_cds_feature returns
 * true are considered. The result holds the smallest CDS start and the largest
 * CDS end; for a transcript on the reverse strand the two coordinates are
 * swapped before returning. A coordinate value of 0 is treated as "not yet
 * set", so a transcript without CDS children yields {0, 0}.
 *
 * @param transcript the transcript whose direct children are inspected
 * @return the CDS range as described above
 */
GtRange agn_transcript_cds_range(GtFeatureNode *transcript)
{
  gt_assert(transcript);

  GtRange trange = {0, 0};
  GtFeatureNodeIterator *iter = gt_feature_node_iterator_new_direct(transcript);
  GtFeatureNode *current;

  while((current = gt_feature_node_iterator_next(iter)) != NULL)
  {
    if(!agn_gt_feature_node_is_cds_feature(current))
      continue;

    GtRange crange = gt_genome_node_get_range((GtGenomeNode *)current);
    if(trange.start == 0 || crange.start < trange.start)
      trange.start = crange.start;
    if(trange.end == 0 || crange.end > trange.end)
      trange.end = crange.end;
  }
  /* The iterator was previously never released, leaking it on every call */
  gt_feature_node_iterator_delete(iter);

  if(gt_feature_node_get_strand(transcript) == GT_STRAND_REVERSE)
  {
    GtUword temp = trange.start;
    trange.start = trange.end;
    trange.end = temp;
  }
  return trange;
}
/* Append the leading tab-separated columns of <fn> to <outstr>: seqid, source,
   type, start, end, score, strand and phase, each followed by a tab. The score
   is formatted with "%.3g" if it is defined and written as '.' otherwise. */
void gt_gff3_output_leading_str(GtFeatureNode *fn, GtStr *outstr)
{
  GtGenomeNode *node;
  gt_assert(fn && outstr);
  node = (GtGenomeNode*) fn;

  /* seqid, source, type */
  gt_str_append_str(outstr, gt_genome_node_get_seqid(node));
  gt_str_append_char(outstr, '\t');
  gt_str_append_cstr(outstr, gt_feature_node_get_source(fn));
  gt_str_append_char(outstr, '\t');
  gt_str_append_cstr(outstr, gt_feature_node_get_type(fn));
  gt_str_append_char(outstr, '\t');

  /* start, end */
  gt_str_append_uword(outstr, gt_genome_node_get_start(node));
  gt_str_append_char(outstr, '\t');
  gt_str_append_uword(outstr, gt_genome_node_get_end(node));
  gt_str_append_char(outstr, '\t');

  /* score */
  if (!gt_feature_node_score_is_defined(fn))
    gt_str_append_char(outstr, '.');
  else {
    char scorebuf[BUFSIZ];
    (void) snprintf(scorebuf, sizeof scorebuf, "%.3g",
                    gt_feature_node_get_score(fn));
    gt_str_append_cstr(outstr, scorebuf);
  }
  gt_str_append_char(outstr, '\t');

  /* strand, phase */
  gt_str_append_char(outstr, GT_STRAND_CHARS[gt_feature_node_get_strand(fn)]);
  gt_str_append_char(outstr, '\t');
  gt_str_append_char(outstr, GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]);
  gt_str_append_char(outstr, '\t');
}
/*
 * Attach the single hits of the best chain of <hit> as child features of the
 * main node of the LTRdigest stream passed in <data>.
 *
 * Hits whose best strand differs from the strand of the main node are
 * skipped entirely. For every single hit a feature of type GT_PDOM_TYPE is
 * created (range shifted by one on both ends), which carries the formatted
 * alignment and the amino acid sequence as user data, the stream's tag as
 * source, the hit's e-value as score, the hit's phase, and the model's name
 * and accession as "name" and "id" attributes when available.
 *
 * Always returns 0.
 */
static int pdom_hit_attach_gff3(GtPdomModel *model, GtPdomModelHit *hit,
                                void *data, GT_UNUSED GtError *err)
{
  GtLTRdigestStream *ls = (GtLTRdigestStream *) data;
  GtStrand best_strand;
  unsigned long idx = 0;
  gt_assert(model && hit);

  best_strand = gt_pdom_model_hit_get_best_strand(hit);
  /* do not use the hits on the non-predicted strand
      -- maybe identify nested elements ? */
  if (best_strand != gt_feature_node_get_strand(ls->element.mainnode))
    return 0;

  while (idx < gt_pdom_model_hit_best_chain_length(hit)) {
    GtPdomSingleHit *single = gt_pdom_model_hit_best_single_hit(hit, idx);
    GtStr *aln_str = gt_str_new(),
          *aa_str = gt_str_new();
    GtPhase frame = gt_pdom_single_hit_get_phase(single);
    GtRange hit_rng = gt_pdom_single_hit_get_range(single);
    GtGenomeNode *domain_gn;
    GtFeatureNode *domain_fn;
    const char *model_name, *model_acc;

    gt_pdom_single_hit_format_alignment(single, GT_ALIWIDTH, aln_str);
    gt_pdom_single_hit_get_aaseq(single, aa_str);

    /* GFF3 is 1-based */
    hit_rng.start++;
    hit_rng.end++;

    domain_gn = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                      ls->element.mainnode),
                                    GT_PDOM_TYPE, hit_rng.start, hit_rng.end,
                                    best_strand);
    domain_fn = (GtFeatureNode*) domain_gn;

    /* the strings are owned by the node from here on */
    gt_genome_node_add_user_data(domain_gn, "pdom_alignment", aln_str,
                                 (GtFree) gt_str_delete);
    gt_genome_node_add_user_data(domain_gn, "pdom_aaseq", aa_str,
                                 (GtFree) gt_str_delete);

    gt_feature_node_set_source(domain_fn, ls->ltrdigest_tag);
    gt_feature_node_set_score(domain_fn,
                              gt_pdom_single_hit_get_evalue(single));
    gt_feature_node_set_phase(domain_fn, frame);

    model_name = gt_pdom_model_get_name(model);
    if (model_name)
      gt_feature_node_add_attribute(domain_fn, "name", model_name);
    model_acc = gt_pdom_model_get_acc(model);
    if (model_acc)
      gt_feature_node_add_attribute(domain_fn, "id", model_acc);

    gt_feature_node_add_child(ls->element.mainnode, domain_fn);
    idx++;
  }
  return 0;
}
/* Returns true if <strand> is an actual strand value (i.e., not
   GT_NUM_OF_STRAND_TYPES) and the strand of <fn> differs from it; returns
   false otherwise. */
static bool filter_strand(GtFeatureNode *fn, GtStrand strand)
{
  gt_assert(fn);
  /* GT_NUM_OF_STRAND_TYPES disables the comparison altogether */
  if (strand == GT_NUM_OF_STRAND_TYPES)
    return false;
  return gt_feature_node_get_strand(fn) != strand;
}
/* Create a feature covering <range> and add it as a child of <fn>. The new
   feature uses bed_parser->thick_feature_type as its type if that is set and
   BED_THICK_FEATURE_TYPE otherwise; seqid, strand, score and (if present) the
   "Name" attribute are taken over from <fn>. */
static void construct_thick_feature(GtBEDParser *bed_parser, GtFeatureNode *fn,
                                    GtRange range)
{
  GtFeatureNode *thick;
  GtStrand parent_strand;
  const char *label;
  gt_assert(fn);

  parent_strand = gt_feature_node_get_strand(fn);
  thick = (GtFeatureNode*)
          gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) fn),
                              bed_parser->thick_feature_type
                              ? bed_parser->thick_feature_type
                              : BED_THICK_FEATURE_TYPE,
                              range.start, range.end, parent_strand);

  label = gt_feature_node_get_attribute(fn, "Name");
  if (label)
    gt_feature_node_add_attribute(thick, "Name", label);

  gt_feature_node_set_score(thick, gt_feature_node_get_score(fn));
  gt_feature_node_set_strand(thick, gt_feature_node_get_strand(fn));
  gt_feature_node_add_child(fn, thick);
}
/**
 * Check the stop codon of the visitor's current mRNA against its CDS, or
 * create one if none was provided.
 *
 * Nothing is done if no CDS features are recorded. Otherwise a 3 bp stop
 * codon range is derived from the CDS: the last three positions of the last
 * entry of v->cds, or, for a reverse-strand mRNA, the first three positions of
 * the first entry. Then:
 *  - more than one recorded stop codon: a message is logged;
 *  - exactly one: a message is logged if its range differs from the derived
 *    range;
 *  - none: a "stop_codon" feature with the derived range is created, added as
 *    a child of the mRNA and appended to v->stops.
 *
 * NOTE(review): this presumably relies on v->cds being ordered by position;
 * that is not visible here.
 */
static void infer_cds_visitor_check_stop(AgnInferCDSVisitor *v)
{
  if(gt_array_size(v->cds) == 0)
    return;

  const char *mrnaid = gt_feature_node_get_attribute(v->mrna, "ID");
  unsigned int ln = gt_genome_node_get_line_number((GtGenomeNode *)v->mrna);
  GtStrand strand = gt_feature_node_get_strand(v->mrna);

  /* Derive the expected stop codon range from the 3'-most CDS segment */
  GtRange stoprange;
  GtUword threeprimeindex = gt_array_size(v->cds) - 1;
  GtGenomeNode **threeprimesegment = gt_array_get(v->cds, threeprimeindex);
  stoprange = gt_genome_node_get_range(*threeprimesegment);
  stoprange.start = stoprange.end - 2;
  if(strand == GT_STRAND_REVERSE)
  {
    threeprimesegment = gt_array_get(v->cds, 0);
    stoprange = gt_genome_node_get_range(*threeprimesegment);
    stoprange.end = stoprange.start + 2;
  }

  if(gt_array_size(v->stops) > 1)
  {
    /* Report the number of stop codons (the number of start codons was
       printed here by mistake) */
    gt_logger_log(v->logger, "mRNA '%s' (line %u) has %lu stop codons", mrnaid,
                  ln, gt_array_size(v->stops));
  }
  else if(gt_array_size(v->stops) == 1)
  {
    GtGenomeNode **codon = gt_array_get(v->stops, 0);
    GtRange testrange = gt_genome_node_get_range(*codon);
    if(gt_range_compare(&stoprange, &testrange) != 0)
    {
      gt_logger_log(v->logger, "stop codon inferred from CDS [%lu, %lu] does "
                    "not match explicitly provided stop codon [%lu, %lu] for "
                    "mRNA '%s'", stoprange.start, stoprange.end,
                    testrange.start, testrange.end, mrnaid);
    }
  }
  else /* gt_array_size(v->stops) == 0 */
  {
    GtStr *seqid = gt_genome_node_get_seqid((GtGenomeNode *)v->mrna);
    GtGenomeNode *codonfeature = gt_feature_node_new(seqid, "stop_codon",
                                                     stoprange.start,
                                                     stoprange.end, strand);
    if(v->source)
      gt_feature_node_set_source((GtFeatureNode *)codonfeature, v->source);

    GtFeatureNode *cf = (GtFeatureNode *)codonfeature;
    gt_feature_node_add_child(v->mrna, cf);
    gt_array_add(v->stops, cf);
  }
}
/* Lua binding: takes a genome node as first argument, raises an argument
   error ("not a feature node") if it cannot be cast to a feature node, and
   otherwise pushes the node's strand as a one-character string.
   Returns 1 (the number of pushed values). */
static int feature_node_lua_get_strand(lua_State *L)
{
  GtGenomeNode **node_ptr = check_genome_node(L, 1);
  GtFeatureNode *feature;
  char buf[2] = { '\0', '\0' };
  /* make sure we get a feature node */
  feature = gt_feature_node_try_cast(*node_ptr);
  luaL_argcheck(L, feature, 1, "not a feature node");
  buf[0] = GT_STRAND_CHARS[gt_feature_node_get_strand(feature)];
  lua_pushstring(L, buf);
  return 1;
}
/* Create a new block whose range, strand and type are taken from <node>.
   Unless <node> is a pseudo node, a new reference to it is stored as the
   block's top-level feature. */
GtBlock* gt_block_new_from_node(GtFeatureNode *node)
{
  GtBlock *result;
  GtGenomeNode *gn;
  gt_assert(node);
  gn = (GtGenomeNode*) node;
  result = gt_block_new();
  result->range = gt_genome_node_get_range(gn);
  result->strand = gt_feature_node_get_strand(node);
  result->type = gt_feature_node_get_type(node);
  /* pseudo nodes are not recorded as top-level feature */
  if (gt_feature_node_is_pseudo(node))
    return result;
  result->top_level_feature = (GtFeatureNode*) gt_genome_node_ref(gn);
  return result;
}
/* Create a new pseudo feature node which takes its seqid, range, strand and
   source from the feature node <fn>. Returns the new node as a genome node. */
GtGenomeNode* gt_feature_node_new_pseudo_template(GtFeatureNode *fn)
{
  GtFeatureNode *pf;
  GtGenomeNode *pn;
  GtRange range;
  gt_assert(fn);
  /* two separate statements; they used to be chained by a stray comma
     operator */
  range = feature_node_get_range((GtGenomeNode*) fn);
  pn = gt_feature_node_new_pseudo(feature_node_get_seqid((GtGenomeNode*) fn),
                                  range.start, range.end,
                                  gt_feature_node_get_strand(fn));
  pf = gt_feature_node_cast(pn);
  gt_feature_node_set_source(pf, fn->source);
  return pn;
}
static void infer_cds_visitor_check_cds_phase(AgnInferCDSVisitor *v) { unsigned long num_cds_feats = gt_array_size(v->cds); if(num_cds_feats == 0) return; GtFeatureNode *cdsf1 = *(GtFeatureNode **)gt_array_get(v->cds, 0); GtStrand strand = gt_feature_node_get_strand(cdsf1); if(strand == GT_STRAND_REVERSE) cdsf1 = *(GtFeatureNode **)gt_array_get(v->cds, num_cds_feats - 1); gt_feature_node_set_phase(cdsf1, GT_PHASE_ZERO); if(num_cds_feats == 1) return; unsigned long cds_length = gt_genome_node_get_length((GtGenomeNode *)cdsf1); int i; if(strand == GT_STRAND_REVERSE) { for(i = num_cds_feats - 2; i >= 0; i--) { GtFeatureNode *cds = *(GtFeatureNode **)gt_array_get(v->cds, i); int phasenum = cds_length % 3; GtPhase phase = GT_PHASE_ZERO; if(phasenum == 1) phase = GT_PHASE_TWO; else if(phasenum == 2) phase = GT_PHASE_ONE; gt_feature_node_set_phase(cds, phase); cds_length += gt_genome_node_get_length((GtGenomeNode *)cds); } } else { for(i = 1; i < num_cds_feats; i++) { GtFeatureNode *cds = *(GtFeatureNode **)gt_array_get(v->cds, i); int phasenum = cds_length % 3; GtPhase phase = GT_PHASE_ZERO; if(phasenum == 1) phase = GT_PHASE_TWO; else if(phasenum == 2) phase = GT_PHASE_ONE; gt_feature_node_set_phase(cds, phase); cds_length += gt_genome_node_get_length((GtGenomeNode *)cds); } } }
/* Write the leading tab-separated columns of <fn> to <outfp>: seqid, source,
   type, start, end, score, strand and phase, each followed by a tab. The score
   is formatted with "%.3g" if it is defined and written as '.' otherwise. */
void gt_gff3_output_leading(GtFeatureNode *fn, GtFile *outfp)
{
  GtGenomeNode *node;
  char strand_char, phase_char;
  gt_assert(fn);
  node = (GtGenomeNode*) fn;

  /* seqid, source, type, start, end */
  gt_file_xprintf(outfp, "%s\t%s\t%s\t"GT_WU"\t"GT_WU"\t",
                  gt_str_get(gt_genome_node_get_seqid(node)),
                  gt_feature_node_get_source(fn),
                  gt_feature_node_get_type(fn),
                  gt_genome_node_get_start(node),
                  gt_genome_node_get_end(node));

  /* score */
  if (!gt_feature_node_score_is_defined(fn))
    gt_file_xfputc('.', outfp);
  else
    gt_file_xprintf(outfp, "%.3g", gt_feature_node_get_score(fn));

  /* strand, phase */
  strand_char = GT_STRAND_CHARS[gt_feature_node_get_strand(fn)];
  phase_char = GT_PHASE_CHARS[gt_feature_node_get_phase(fn)];
  gt_file_xprintf(outfp, "\t%c\t%c\t", strand_char, phase_char);
}
static int extract_join_feature(GtGenomeNode *gn, const char *type, GtRegionMapping *region_mapping, GtStr *sequence, bool *reverse_strand, bool *first_child_of_type_seen, GtPhase *phase, GtError *err) { char *outsequence; GtFeatureNode *fn; GtRange range; int had_err = 0; gt_error_check(err); fn = gt_feature_node_cast(gn); gt_assert(fn); if (gt_feature_node_has_type(fn, type)) { if (gt_feature_node_get_strand(fn) == GT_STRAND_REVERSE) { *reverse_strand = true; *phase = gt_feature_node_get_phase(fn); } else { if (!(*first_child_of_type_seen)) { *first_child_of_type_seen = true; *phase = gt_feature_node_get_phase(fn); } else *phase = GT_PHASE_UNDEFINED; } range = gt_genome_node_get_range(gn); had_err = gt_region_mapping_get_sequence(region_mapping, &outsequence, gt_genome_node_get_seqid(gn), range.start, range.end, err); if (!had_err) { gt_str_append_cstr_nt(sequence, outsequence, gt_range_length(&range)); gt_free(outsequence); } } return had_err; }
/**
 * Set the type of the recorded UTR features to "five_prime_UTR" or
 * "three_prime_UTR".
 *
 * Requires exactly one recorded start codon; otherwise nothing is done.
 * Features already typed "five_prime_UTR" or "three_prime_UTR" are left
 * untouched. For the others, the UTR start is compared with the start
 * coordinate of the start codon: on the forward strand a UTR starting before
 * it becomes 5' and any other 3'; for every other strand value the assignment
 * is the opposite.
 */
static void infer_cds_visitor_set_utrs(AgnInferCDSVisitor *v)
{
  if(!v->starts || gt_array_size(v->starts) != 1)
    return;

  GtGenomeNode **startcodon = gt_array_get(v->starts, 0);
  GtUword cds_start = gt_genome_node_get_start(*startcodon);

  GtUword n;
  for(n = 0; n < gt_array_size(v->utrs); n++)
  {
    GtFeatureNode *utr = *(GtFeatureNode **)gt_array_get(v->utrs, n);

    /* Respect UTR types that are already specific */
    if(gt_feature_node_has_type(utr, "five_prime_UTR") ||
       gt_feature_node_has_type(utr, "three_prime_UTR"))
    {
      continue;
    }

    bool forward = gt_feature_node_get_strand(utr) == GT_STRAND_FORWARD;
    bool before_start =
                   gt_genome_node_get_start((GtGenomeNode *)utr) < cds_start;

    /* 5' UTR: left of the start codon on the forward strand, not left of it
       otherwise */
    if(forward == before_start)
      gt_feature_node_set_type(utr, "five_prime_UTR");
    else
      gt_feature_node_set_type(utr, "three_prime_UTR");
  }
}
static int check_cds_phases(GtArray *cds_features, GtCDSCheckVisitor *v, bool is_multi, bool second_pass, GtError *err) { GtPhase current_phase, correct_phase = GT_PHASE_ZERO; GtFeatureNode *fn; GtStrand strand; unsigned long i, current_length; int had_err = 0; gt_error_check(err); gt_assert(cds_features); gt_assert(gt_array_size(cds_features)); fn = *(GtFeatureNode**) gt_array_get_first(cds_features); strand = gt_feature_node_get_strand(fn); if (strand == GT_STRAND_REVERSE) gt_array_reverse(cds_features); for (i = 0; !had_err && i < gt_array_size(cds_features); i++) { fn = *(GtFeatureNode**) gt_array_get(cds_features, i); /* the first phase can be anything (except being undefined), because the GFF3 spec says: NOTE 4 - CDS features MUST have have a defined phase field. Otherwise it is not possible to infer the correct polypeptides corresponding to partially annotated genes. */ if ((!i && gt_feature_node_get_phase(fn) == GT_PHASE_UNDEFINED) || (i && gt_feature_node_get_phase(fn) != correct_phase)) { if (gt_hashmap_get(v->cds_features, fn)) { if (v->tidy && !is_multi && !gt_feature_node_has_children(fn)) { /* we can split the feature */ gt_warning("%s feature on line %u in file \"%s\" has multiple " "parents which require different phases; split feature", gt_ft_CDS, gt_genome_node_get_line_number((GtGenomeNode*) fn), gt_genome_node_get_filename((GtGenomeNode*) fn)); gt_hashmap_add(v->cds_features_to_split, fn, fn); v->splitting_is_necessary = true; /* split later */ } else { gt_error_set(err, "%s feature on line %u in file \"%s\" has multiple " "parents which require different phases", gt_ft_CDS, gt_genome_node_get_line_number((GtGenomeNode*) fn), gt_genome_node_get_filename((GtGenomeNode*) fn)); had_err = -1; } } else { if (v->tidy) { if (!second_pass) { gt_warning("%s feature on line %u in file \"%s\" has the wrong " "phase %c -> correcting it to %c", gt_ft_CDS, gt_genome_node_get_line_number((GtGenomeNode*) fn), gt_genome_node_get_filename((GtGenomeNode*) fn), 
GT_PHASE_CHARS[gt_feature_node_get_phase(fn)], GT_PHASE_CHARS[correct_phase]); } gt_feature_node_set_phase(fn, correct_phase); } else { gt_error_set(err, "%s feature on line %u in file \"%s\" has the " "wrong phase %c (should be %c)", gt_ft_CDS, gt_genome_node_get_line_number((GtGenomeNode*) fn), gt_genome_node_get_filename((GtGenomeNode*) fn), GT_PHASE_CHARS[gt_feature_node_get_phase(fn)], GT_PHASE_CHARS[correct_phase]); had_err = -1; } } } if (!had_err) { current_phase = gt_feature_node_get_phase(fn); current_length = gt_genome_node_get_length((GtGenomeNode*) fn); correct_phase = (3 - (current_length - current_phase) % 3) % 3; gt_hashmap_add(v->cds_features, fn, fn); /* record CDS feature */ } } return had_err; }
/*
 * Child-traversal callback of the inter-feature visitor passed in <data>.
 *
 * Features whose type is not the visitor's outside type are ignored. For a
 * matching feature that has a recorded predecessor, a new feature of the
 * visitor's inter type is created that spans the gap between the end of the
 * predecessor and the start of the current feature; it is added as a child of
 * the visitor's parent feature. A warning is issued and nothing is created if
 * the two features overlap or are directly adjacent; in that case the function
 * returns before the current feature is recorded as the new predecessor.
 *
 * Always returns 0.
 */
static int inter_feature_in_children(GtFeatureNode *current_feature, void *data,
                                     GT_UNUSED GtError *err)
{
  GtInterFeatureVisitor *visitor = (GtInterFeatureVisitor*) data;
  GtFeatureNode *prev, *gap_node;
  GtRange prev_rng, cur_rng, gap_rng;
  GtStrand gap_strand;
  GtStr *seqid;
  gt_error_check(err);
  gt_assert(current_feature);

  if (!gt_feature_node_has_type(current_feature, visitor->outside_type))
    return 0;

  prev = visitor->previous_feature;
  if (prev) {
    /* determine inter range */
    prev_rng = gt_genome_node_get_range((GtGenomeNode*) prev);
    cur_rng = gt_genome_node_get_range((GtGenomeNode*) current_feature);
    if (prev_rng.end >= cur_rng.start) {
      gt_warning("overlapping boundary features " GT_WU "-" GT_WU " and "
                 GT_WU "-" GT_WU ", " "not placing '%s' inter-feature",
                 prev_rng.start, prev_rng.end,
                 cur_rng.start, cur_rng.end,
                 visitor->inter_type);
      return 0;
    }
    if (cur_rng.start - prev_rng.end < 2) {
      gt_warning("no space for inter-feature '%s' between " GT_WU " and "
                 GT_WU, visitor->inter_type, prev_rng.end, cur_rng.start);
      return 0;
    }
    gap_rng.start = prev_rng.end + 1;
    gap_rng.end = cur_rng.start - 1;

    /* determine inter strand */
    gap_strand = gt_feature_node_get_strand(prev);
    gt_assert(gap_strand == gt_feature_node_get_strand(current_feature));

    /* determine sequence id */
    seqid = gt_genome_node_get_seqid((GtGenomeNode*) visitor->parent_feature);
    gt_assert(!gt_str_cmp(seqid,
                          gt_genome_node_get_seqid((GtGenomeNode*) prev)));
    gt_assert(!gt_str_cmp(seqid,
                          gt_genome_node_get_seqid((GtGenomeNode*)
                                                   current_feature)));

    /* create inter feature */
    gap_node = (GtFeatureNode*)
               gt_feature_node_new(seqid, visitor->inter_type, gap_rng.start,
                                   gap_rng.end, gap_strand);
    gt_feature_node_add_child(visitor->parent_feature, gap_node);
  }

  visitor->previous_feature = current_feature;
  return 0;
}
/*
 * Hashmap iteration callback: <key> is used to look up the transcript name,
 * <value> is the (non-empty) array of genome nodes belonging to one
 * transcript, and <data> is the ConstructionInfo.
 *
 * The range and strand of all nodes are joined and used to create an mRNA
 * feature on the seqid of the first node. The mRNA gets a Name attribute if
 * the key has an entry in cinfo->transcript_id_to_name_mapping, all nodes of
 * the array become its children, and it is appended to cinfo->mRNAs.
 *
 * Returns -1 (with <err> set) if the nodes refer to different sequence ids,
 * 0 otherwise.
 */
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *nodes = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *member;
  const char *tname;
  GtStrand joined_strand;
  GtRange joined_range;
  GtStr *seqid;
  GtUword idx;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
  /* at least one node in array */
  gt_assert(gt_array_size(nodes));

  /* determine the range and the strand of the mRNA */
  first_node = *(GtGenomeNode**) gt_array_get(nodes, 0);
  joined_range = gt_genome_node_get_range(first_node);
  joined_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  seqid = gt_genome_node_get_seqid(first_node);
  for (idx = 1; !had_err && idx < gt_array_size(nodes); idx++) {
    GtRange member_range;
    member = *(GtGenomeNode**) gt_array_get(nodes, idx);
    member_range = gt_genome_node_get_range(member);
    joined_range = gt_range_join(&joined_range, &member_range);
    /* XXX: an error check is necessary here, otherwise gt_strand_join() can
       cause a failed assertion */
    joined_strand = gt_strand_join(joined_strand,
                                   gt_feature_node_get_strand((GtFeatureNode*)
                                                              member));
    if (gt_str_cmp(seqid, gt_genome_node_get_seqid(member))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                   "genomic sequences (``seqname''), although they have the "
                   "same gene IDs (``gene_id'') which must be globally unique",
                   gt_genome_node_get_line_number(first_node),
                   gt_genome_node_get_line_number(member));
      had_err = -1;
    }
  }

  if (had_err)
    return had_err;

  mRNA_node = gt_feature_node_new(seqid, gt_ft_mRNA, joined_range.start,
                                  joined_range.end, joined_strand);

  tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                         (const char*) key);
  if (tname) {
    gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                  tname);
  }

  /* register children */
  for (idx = 0; idx < gt_array_size(nodes); idx++) {
    member = *(GtGenomeNode**) gt_array_get(nodes, idx);
    gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                              (GtFeatureNode*) member);
  }

  /* store the mRNA */
  gt_array_add(mRNAs, mRNA_node);
  return 0;
}
static int gt_extract_feature_sequence_generic(GtStr *sequence, GtGenomeNode *gn, const char *type, bool join, GtStr *seqid, GtStrArray *target_ids, unsigned int *out_phase_offset, GtRegionMapping *region_mapping, GtError *err) { GtFeatureNode *fn; GtRange range; unsigned int phase_offset = 0; char *outsequence; const char *target; int had_err = 0; gt_error_check(err); fn = gt_genome_node_cast(gt_feature_node_class(), gn); gt_assert(fn); if (seqid) gt_str_append_str(seqid, gt_genome_node_get_seqid(gn)); if (target_ids && (target = gt_feature_node_get_attribute(fn, GT_GFF_TARGET))) { had_err = gt_gff3_parser_parse_all_target_attributes(target, false, target_ids, NULL, NULL, "", 0, err); } if (!had_err) { if (join) { GtFeatureNodeIterator *fni; GtFeatureNode *child; bool reverse_strand = false, first_child = true, first_child_of_type_seen = false; GtPhase phase = GT_PHASE_UNDEFINED; /* in this case we have to traverse the children */ fni = gt_feature_node_iterator_new_direct(gt_feature_node_cast(gn)); while (!had_err && (child = gt_feature_node_iterator_next(fni))) { if (first_child) { if (target_ids && (target = gt_feature_node_get_attribute(child, GT_GFF_TARGET))) { gt_str_array_reset(target_ids); had_err = gt_gff3_parser_parse_all_target_attributes(target, false, target_ids, NULL, NULL, "", 0, err); } first_child = false; } if (!had_err) { if (extract_join_feature((GtGenomeNode*) child, type, region_mapping, sequence, &reverse_strand, &first_child_of_type_seen, &phase, err)) { had_err = -1; } if (phase != GT_PHASE_UNDEFINED) { phase_offset = (int) phase; } } } gt_feature_node_iterator_delete(fni); gt_assert(phase_offset <= (unsigned int) GT_PHASE_UNDEFINED); if (!had_err && gt_str_length(sequence)) { if (reverse_strand) { had_err = gt_reverse_complement(gt_str_get(sequence), gt_str_length(sequence), err); } } } else if (gt_feature_node_get_type(fn) == type) { GtPhase phase = gt_feature_node_get_phase(fn); gt_assert(!had_err); if (phase != GT_PHASE_UNDEFINED) 
phase_offset = (unsigned int) phase; /* otherwise we only have to look at this feature */ range = gt_genome_node_get_range(gn); gt_assert(range.start); /* 1-based coordinates */ had_err = gt_region_mapping_get_sequence(region_mapping, &outsequence, gt_genome_node_get_seqid(gn), range.start, range.end, err); if (!had_err) { gt_str_append_cstr_nt(sequence, outsequence, gt_range_length(&range)); gt_free(outsequence); if (gt_feature_node_get_strand(fn) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(gt_str_get(sequence), gt_str_length(sequence), err); } } } } if (out_phase_offset && phase_offset != GT_PHASE_UNDEFINED) { *out_phase_offset = phase_offset; } return had_err; }
/*
 * Fetch the next node from the input stream and, if it is (or contains) a
 * gene, annotate that gene with the result of
 * CpGIOverlap_stream_find_gene_overlap().
 *
 * The node delivered by the input stream is always handed on unchanged in
 * <*gn> (NULL at the end of the stream or on error). The annotation is only
 * attempted if
 *  - the node is a feature node; for a pseudo node, the first feature of type
 *    feature_type_gene found while iterating over it is used instead,
 *  - the feature is of type feature_type_gene and has a "Name" attribute,
 *  - its sequence id matches the pattern "Chr<number>", and
 *  - an overlap name is found for the start coordinate (forward strand) or
 *    the end coordinate (any other strand) of the gene.
 * In that case the overlap name is stored in the attribute "cpgi_at_tss".
 *
 * Returns the error code of the input stream if fetching the node failed
 * (this used to be swallowed) and 0 in all other cases.
 */
static int CpGIOverlap_stream_next(GtNodeStream * ns,
                                   GtGenomeNode ** gn,
                                   GtError * err)
{
  CpGIOverlap_stream * context;
  GtGenomeNode * cur_node = NULL;
  GtFeatureNode * gene;
  const char * gene_name = NULL;
  const char * overlap_name = NULL;
  int chr_num;
  int err_num;
  unsigned int TSS;

  *gn = NULL;
  context = CpGIOverlap_stream_cast(ns);

  /* fetch the next node; propagate stream errors to the caller */
  err_num = gt_node_stream_next(context->in_stream, &cur_node, err);
  if (err_num || cur_node == NULL)
    return err_num;
  *gn = cur_node;

  /* try casting as a feature node so we can test type */
  gene = gt_genome_node_try_cast(gt_feature_node_class(), cur_node);
  if (!gene)
    return 0;

  /* if it is a pseudo node, find the gene in it if available */
  if (gt_feature_node_is_pseudo(gene))
  {
    GtFeatureNodeIterator * iter = gt_feature_node_iterator_new(gene);
    if (iter == NULL)
      return 0;
    do
    {
      gene = gt_feature_node_iterator_next(iter);
    } while (gene && !gt_feature_node_has_type(gene, feature_type_gene));
    gt_feature_node_iterator_delete(iter);
    if (gene == NULL)
      return 0;
  }

  if (!gt_feature_node_has_type(gene, feature_type_gene))
    return 0;

  /* genes without a name are passed through without annotation */
  gene_name = gt_feature_node_get_attribute(gene, "Name");
  if (gene_name == NULL)
    return 0;

  if (1 != sscanf(gt_str_get(gt_genome_node_get_seqid((GtGenomeNode*) gene)),
                  "Chr%d", &chr_num))
    return 0;

  TSS = (gt_feature_node_get_strand(gene) == GT_STRAND_FORWARD)
          ? gt_genome_node_get_start((GtGenomeNode*) gene)
          : gt_genome_node_get_end((GtGenomeNode*) gene);

  /* now figure out the overlapping gene */
  overlap_name = CpGIOverlap_stream_find_gene_overlap(context, TSS, chr_num);
  if (!overlap_name)
    return 0;

  /* save the overlap name into the node */
  gt_feature_node_set_attribute(gene, "cpgi_at_tss", overlap_name);
  return 0;
}
/*
 * Hashmap iteration callback: <key> is used to look up the gene name, <value>
 * is the hashmap of the gene's transcripts, and <data> is the
 * ConstructionInfo.
 *
 * construct_mRNAs() is run on every transcript, collecting the resulting mRNA
 * nodes in a temporary array that is made available via cinfo->mRNAs. On
 * success the ranges and strands of the mRNAs are joined and used to create a
 * gene feature, which gets a Name attribute if the key has an entry in
 * cinfo->gene_id_to_name_mapping, adopts all mRNAs as children and is added to
 * the queue cinfo->genome_nodes.
 *
 * Returns 0 on success and the error code of the mRNA construction otherwise.
 */
static int construct_genes(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  GtHashmap *transcript_id_hash = (GtHashmap*) value;
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtQueue *genome_nodes = cinfo->genome_nodes;
  const char *gname;
  GtArray *mRNAs = gt_array_new(sizeof (GtGenomeNode*));
  GtGenomeNode *gene_node, *gn;
  GtStrand gene_strand;
  GtRange gene_range;
  GtStr *gene_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
  cinfo->mRNAs = mRNAs;
  had_err = gt_hashmap_foreach(transcript_id_hash, construct_mRNAs, cinfo, err);
  if (!had_err) {
    gt_assert(gt_array_size(mRNAs)); /* at least one mRNA constructed */

    /* determine the range and the strand of the gene */
    gn = *(GtGenomeNode**) gt_array_get(mRNAs, 0);
    gene_range = gt_genome_node_get_range(gn);
    gene_strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    gene_seqid = gt_genome_node_get_seqid(gn);
    for (i = 1; i < gt_array_size(mRNAs); i++) {
      GtRange range;
      gn = *(GtGenomeNode**) gt_array_get(mRNAs, i);
      range = gt_genome_node_get_range(gn);
      gene_range = gt_range_join(&gene_range, &range);
      gene_strand = gt_strand_join(gene_strand,
                          gt_feature_node_get_strand((GtFeatureNode*) gn));
      gt_assert(gt_str_cmp(gene_seqid, gt_genome_node_get_seqid(gn)) == 0);
    }

    gene_node = gt_feature_node_new(gene_seqid, gt_ft_gene, gene_range.start,
                                    gene_range.end, gene_strand);

    if ((gname = gt_hashmap_get(cinfo->gene_id_to_name_mapping,
                                (const char*) key))) {
      gt_feature_node_add_attribute((GtFeatureNode*) gene_node, GT_GFF_NAME,
                                    gname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(mRNAs); i++) {
      gn = *(GtGenomeNode**) gt_array_get(mRNAs, i);
      gt_feature_node_add_child((GtFeatureNode*) gene_node,
                                (GtFeatureNode*) gn);
    }

    /* store the gene */
    gt_queue_add(genome_nodes, gene_node);
  }

  /* free the temporary array on every path; it used to be leaked whenever the
     mRNA construction failed */
  gt_array_delete(mRNAs);

  return had_err;
}
/*
 * Visitor callback for feature nodes: for a SNV/SNP node <fn>, walks every
 * mRNA that is a direct child of sav->gene, locates the CDS overlapping the
 * SNP, translates the SNP position into an offset within the mRNA sequence
 * cached in sav->rnaseqs and calls snp_annotator_classify_snp() for each
 * variant allele that differs from the original base.
 *
 * Nodes that are NULL or of any other type are ignored.  Returns 0 on success
 * and a non-zero error code (with <err> set) otherwise.
 */
static int snp_annotator_visitor_feature_node(GtNodeVisitor *nv,
                                              GtFeatureNode *fn,
                                              GtError *err)
{
  int had_err = 0;
  GtSNPAnnotatorVisitor *sav;
  GtFeatureNodeIterator *fni, *mrnafni;
  GtFeatureNode *curnode, *curnode2;
  GtRange snp_rng;
  gt_error_check(err);
  sav = snp_annotator_visitor_cast(nv);

  /* ignore non-nodes */
  if (!fn) return 0;

  /* only process SNPs */
  if (!(gt_feature_node_get_type(fn) == sav->SNV_type
          || gt_feature_node_get_type(fn) == sav->SNP_type)) {
    return 0;
  }

  fni = gt_feature_node_iterator_new_direct(sav->gene);
  snp_rng = gt_genome_node_get_range((GtGenomeNode*) fn);
  while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) {
    if (gt_feature_node_get_type(curnode) == sav->mRNA_type) {
      GtStrand mrna_strand = gt_feature_node_get_strand(curnode);
#ifndef NDEBUG
      const char *refstr;
#endif
      /* running offset into the mRNA: lengths of the CDS parts seen so far */
      GtUword mrnasnppos = 0;
      mrnafni = gt_feature_node_iterator_new(curnode);
      while (!had_err && (curnode2 = gt_feature_node_iterator_next(mrnafni))) {
        if (gt_feature_node_get_type(curnode2) == sav->CDS_type) {
          GtRange cds_rng = gt_genome_node_get_range((GtGenomeNode*) curnode2);
          if (gt_range_overlap(&snp_rng, &cds_rng)) {
            char *mRNA,
                 origchar;
            char *variantchars, *variantptr = NULL;
            char *refptr = NULL;
            mRNA = (char*) gt_hashmap_get(sav->rnaseqs, curnode);
            gt_assert(mRNA);
            gt_assert(snp_rng.start >= cds_rng.start);
            mrnasnppos += (snp_rng.start - cds_rng.start);
            /* the cached sequence of a reverse-strand mRNA is indexed from
               the opposite end */
            if (mrna_strand == GT_STRAND_REVERSE)
              mrnasnppos = strlen(mRNA) - mrnasnppos - 1;
            gt_assert(mrnasnppos < strlen(mRNA));
            origchar = mRNA[mrnasnppos];
            /* Complement the original base for reverse-strand mRNAs in every
               build.  This used to happen only inside the NDEBUG-guarded
               block below (and only if a reference attribute was present),
               so the comparison against the variant characters differed
               between debug and release builds. */
            if (mrna_strand == GT_STRAND_REVERSE)
              had_err = gt_complement(&origchar, origchar, err);
#ifndef NDEBUG
            refstr = refptr = gt_cstr_dup(gt_feature_node_get_attribute(fn,
                                                         GT_GVF_REFERENCE_SEQ));
            if (!had_err && refstr) {
              gt_assert(toupper(origchar) == toupper(refstr[0]));
            }
#endif
            variantchars = variantptr = gt_cstr_dup(
                         gt_feature_node_get_attribute(fn, GT_GVF_VARIANT_SEQ));
            if (!had_err && variantchars) {
              /* index of the current allele within the variant list */
              GtUword i = 0;

              while (!had_err &&
                       (*variantchars != ';' && *variantchars != '\0')) {
                if (*variantchars != ',' && *variantchars != origchar) {
                  char variantchar = *variantchars;
#ifndef NDEBUG
                  char refchar = refstr ? refstr[0] : '-';  /* XXX */
                  if (!had_err && mrna_strand == GT_STRAND_REVERSE)
                    had_err = gt_complement(&refchar, refchar, err);
#endif
                  if (!had_err && mrna_strand == GT_STRAND_REVERSE)
                    had_err = gt_complement(&variantchar, variantchar, err);
                  if (!had_err) {
                    had_err = snp_annotator_classify_snp(sav, curnode, fn,
                                                         mrnasnppos,
                                                         i++,
                                                         variantchar,
#ifndef NDEBUG
                                                         refchar,
#endif
                                                         err);
                  }
                } else if (*variantchars == origchar) {
                  /* allele equals the original base: nothing to classify,
                     but it still occupies a slot in the variant list */
                  i++;
                }
                variantchars++;
              }
            }
            /* release the attribute copies on every path */
            gt_free(variantptr);
            gt_free(refptr);
          } else {
            mrnasnppos += gt_range_length(&cds_rng);
          }
        }
      }
      gt_feature_node_iterator_delete(mrnafni);
    }
  }
  gt_feature_node_iterator_delete(fni);

  return had_err;
}
static int gt_snp_annotator_visitor_prepare_gene(GtSNPAnnotatorVisitor *sav, GtError *err) { GtFeatureNodeIterator *fni, *mrnafni; GtFeatureNode *curnode, *last_mRNA = NULL; GtStr *mrnaseq, *seqid; int had_err = 0; mrnaseq = gt_str_new(); seqid = gt_genome_node_get_seqid((GtGenomeNode*) sav->gene); fni = gt_feature_node_iterator_new(sav->gene); while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) { if (gt_feature_node_get_type(curnode) == sav->mRNA_type) { GtFeatureNode *curnode2; if (last_mRNA) { char *mrna_charseq = gt_calloc(gt_str_length(mrnaseq)+1, sizeof (char)); (void) strncpy(mrna_charseq, gt_str_get(mrnaseq), gt_str_length(mrnaseq)); if (gt_feature_node_get_strand(sav->gene) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(mrna_charseq, gt_str_length(mrnaseq), err); } if (!had_err) { gt_hashmap_add(sav->rnaseqs, last_mRNA, mrna_charseq); last_mRNA = curnode; gt_str_reset(mrnaseq); } } else last_mRNA = curnode; if (!had_err) { mrnafni = gt_feature_node_iterator_new(curnode); while (!had_err && (curnode2 = gt_feature_node_iterator_next(mrnafni))) { if (gt_feature_node_get_type(curnode2) == sav->CDS_type) { char *tmp; GtRange rng = gt_genome_node_get_range((GtGenomeNode*) curnode2); had_err = gt_region_mapping_get_sequence(sav->rmap, &tmp, seqid, rng.start, rng.end, err); if (!had_err) { gt_str_append_cstr_nt(mrnaseq, tmp, gt_range_length(&rng)); gt_free(tmp); } } } gt_feature_node_iterator_delete(mrnafni); } } } if (!had_err && last_mRNA) { char *mrna_charseq = gt_calloc(gt_str_length(mrnaseq)+1, sizeof (char)); (void) strncpy(mrna_charseq, gt_str_get(mrnaseq), gt_str_length(mrnaseq)); if (gt_feature_node_get_strand(sav->gene) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(mrna_charseq, gt_str_length(mrnaseq), err); } if (!had_err) { gt_hashmap_add(sav->rnaseqs, last_mRNA, mrna_charseq); } } gt_feature_node_iterator_delete(fni); gt_str_delete(mrnaseq); return had_err; }
/*
 * Node-stream "next" callback of the LTRdigest file-output stream.
 *
 * Fetches the next node from ls->in_stream into <gn>.  For a feature node,
 * every node of its subgraph is run through the visitor ls->lv, which fills
 * ls->element.  If a main node was found, one row is written to
 * ls->tabout_file (element, LTRs, TSDs, PPT, PBS, protein domain order) and
 * the extracted sequences are written as FASTA entries to the PPT, PBS,
 * 5'/3' LTR, whole-element and per-domain files.
 *
 * Non-feature nodes are passed through untouched.  Returns 0 on success,
 * otherwise a non-zero code with <err> set.
 */
int gt_ltrfileout_stream_next(GtNodeStream *ns, GtGenomeNode **gn,
                              GtError *err)
{
  GtLTRdigestFileOutStream *ls;
  GtFeatureNode *fn;
  GtRange lltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          rltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          ppt_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          pbs_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD};
  int had_err;
  GtUword i=0;

  gt_error_check(err);
  ls = gt_ltrdigest_file_out_stream_cast(ns);

  /* initialize this element */
  memset(&ls->element, 0, sizeof (GtLTRElement));

  /* get annotations from parser */
  had_err = gt_node_stream_next(ls->in_stream, gn, err);
  if (!had_err && *gn) {
    GtFeatureNodeIterator* gni;
    GtFeatureNode *mygn;

    /* only process feature nodes */
    if (!(fn = gt_feature_node_try_cast(*gn)))
      return 0;

    ls->element.pdomorder = gt_array_new(sizeof (const char*));

    /* fill LTRElement structure from GFF3 subgraph */
    gni = gt_feature_node_iterator_new(fn);
    for (mygn = fn; mygn != NULL; mygn = gt_feature_node_iterator_next(gni))
      (void) gt_genome_node_accept((GtGenomeNode*) mygn,
                                   (GtNodeVisitor*) ls->lv,
                                   err);
    gt_feature_node_iterator_delete(gni);
  }

  if (!had_err && ls->element.mainnode != NULL) {
    char desc[GT_MAXFASTAHEADER];
    GtFeatureNode *ltr3, *ltr5;
    GtStr *sdesc, *sreg, *seq;

    /* find sequence in GtEncseq */
    sreg = gt_genome_node_get_seqid((GtGenomeNode*) ls->element.mainnode);

    sdesc = gt_str_new();
    had_err = gt_region_mapping_get_description(ls->rmap, sdesc, sreg, err);

    if (!had_err) {
      GtRange rng;
      /* sequence description, truncated to seqnamelen characters and with
         blanks replaced by underscores */
      ls->element.seqid = gt_calloc((size_t) ls->seqnamelen+1, sizeof (char));
      (void) snprintf(ls->element.seqid,
                      MIN((size_t) gt_str_length(sdesc),
                          (size_t) ls->seqnamelen)+1,
                      "%s",
                      gt_str_get(sdesc));
      gt_cstr_rep(ls->element.seqid, ' ', '_');
      if (gt_str_length(sdesc) > (GtUword) ls->seqnamelen)
        ls->element.seqid[ls->seqnamelen] = '\0';

      (void) gt_ltrelement_format_description(&ls->element,
                                              ls->seqnamelen,
                                              desc,
                                              (size_t) (GT_MAXFASTAHEADER-1));

      /* output basic retrotransposon data */
      lltr_rng = gt_genome_node_get_range((GtGenomeNode*)
                                          ls->element.leftLTR);
      rltr_rng = gt_genome_node_get_range((GtGenomeNode*)
                                          ls->element.rightLTR);
      rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.mainnode);
      gt_file_xprintf(ls->tabout_file,
                      GT_WU "\t" GT_WU "\t" GT_WU "\t%s\t" GT_WU "\t" GT_WU
                      "\t" GT_WU "\t" GT_WU "\t" GT_WU "\t" GT_WU "\t",
                      rng.start,
                      rng.end,
                      gt_ltrelement_length(&ls->element),
                      ls->element.seqid,
                      lltr_rng.start,
                      lltr_rng.end,
                      gt_ltrelement_leftltrlen(&ls->element),
                      rltr_rng.start,
                      rltr_rng.end,
                      gt_ltrelement_rightltrlen(&ls->element));
    }
    /* released on both paths; it used to leak when the lookup failed */
    gt_str_delete(sdesc);

    seq = gt_str_new();

    /* output TSDs */
    if (!had_err && ls->element.leftTSD != NULL) {
      GtRange tsd_rng;
      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.leftTSD);
      had_err = gt_extract_feature_sequence(seq,
                                       (GtGenomeNode*) ls->element.leftTSD,
                                       gt_symbol(gt_ft_target_site_duplication),
                                       false,
                                       NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file,
                        "" GT_WU "\t" GT_WU "\t%s\t",
                        tsd_rng.start,
                        tsd_rng.end,
                        gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else gt_file_xprintf(ls->tabout_file, "\t\t\t");

    if (!had_err && ls->element.rightTSD != NULL) {
      GtRange tsd_rng;
      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.rightTSD);
      had_err = gt_extract_feature_sequence(seq,
                                       (GtGenomeNode*) ls->element.rightTSD,
                                       gt_symbol(gt_ft_target_site_duplication),
                                       false,
                                       NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file,
                        "" GT_WU "\t" GT_WU "\t%s\t",
                        tsd_rng.start,
                        tsd_rng.end,
                        gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else gt_file_xprintf(ls->tabout_file, "\t\t\t");

    /* output PPT */
    if (!had_err && ls->element.ppt != NULL) {
      GtStrand ppt_strand = gt_feature_node_get_strand(ls->element.ppt);
      ppt_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.ppt);
      had_err = gt_extract_feature_sequence(seq,
                                            (GtGenomeNode*) ls->element.ppt,
                                            gt_symbol(gt_ft_RR_tract),
                                            false,
                                            NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&ppt_rng),
                            GT_FSWIDTH, ls->pptout_file);
        /* last column: distance between the PPT and the LTR selected by
           the PPT's strand */
        gt_file_xprintf(ls->tabout_file,
                        "" GT_WU "\t" GT_WU "\t%s\t%c\t%d\t",
                        ppt_rng.start,
                        ppt_rng.end,
                        gt_str_get(seq),
                        GT_STRAND_CHARS[ppt_strand],
                        (ppt_strand == GT_STRAND_FORWARD ?
                            abs((int) (rltr_rng.start - ppt_rng.end)) :
                            abs((int) (lltr_rng.end - ppt_rng.start))));
      }
      gt_str_reset(seq);
    } else gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t");

    /* output PBS */
    if (!had_err && ls->element.pbs != NULL) {
      GtStrand pbs_strand;

      pbs_strand = gt_feature_node_get_strand(ls->element.pbs);
      pbs_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.pbs);
      had_err = gt_extract_feature_sequence(seq,
                                           (GtGenomeNode*) ls->element.pbs,
                                           gt_symbol(gt_ft_primer_binding_site),
                                           false,
                                           NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&pbs_rng),
                            GT_FSWIDTH, ls->pbsout_file);
        /* NOTE(review): the attribute lookups are printed with %s without a
           NULL check -- confirm the attributes are always present */
        gt_file_xprintf(ls->tabout_file,
                        "" GT_WU "\t" GT_WU "\t%c\t%s\t%s\t%s\t%s\t%s\t",
                        pbs_rng.start,
                        pbs_rng.end,
                        GT_STRAND_CHARS[pbs_strand],
                        gt_feature_node_get_attribute(ls->element.pbs, "trna"),
                        gt_str_get(seq),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "pbsoffset"),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "trnaoffset"),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "edist"));
      }
      gt_str_reset(seq);
    } else gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t\t\t\t");

    /* output protein domains */
    if (!had_err && ls->element.pdoms != NULL) {
      GtStr *pdomorderstr = gt_str_new();
      for (i=0; !had_err && i<gt_array_size(ls->element.pdomorder); i++) {
        const char* key = *(const char**) gt_array_get(ls->element.pdomorder,
                                                       i);
        GtArray *entry = (GtArray*) gt_hashmap_get(ls->element.pdoms, key);
        had_err = write_pdom(ls, entry, key, ls->rmap, desc, err);
      }

      /* list the domain names in reverse for reverse-strand elements */
      if (GT_STRAND_REVERSE == gt_feature_node_get_strand(ls->element.mainnode))
        gt_array_reverse(ls->element.pdomorder);

      for (i=0 ;!had_err && i<gt_array_size(ls->element.pdomorder); i++) {
        const char* name = *(const char**) gt_array_get(ls->element.pdomorder,
                                                        i);
        gt_str_append_cstr(pdomorderstr, name);
        if (i != gt_array_size(ls->element.pdomorder)-1)
          gt_str_append_cstr(pdomorderstr, "/");
      }
      gt_file_xprintf(ls->tabout_file, "%s", gt_str_get(pdomorderstr));
      gt_str_delete(pdomorderstr);
    }

    /* output LTRs (we just expect them to exist) */
    switch (gt_feature_node_get_strand(ls->element.mainnode)) {
      case GT_STRAND_REVERSE:
        ltr5 = ls->element.rightLTR;
        ltr3 = ls->element.leftLTR;
        break;
      case GT_STRAND_FORWARD:
      default:
        ltr5 = ls->element.leftLTR;
        ltr3 = ls->element.rightLTR;
        break;
    }

    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr5,
                                          gt_symbol(gt_ft_long_terminal_repeat),
                                          false,
                                          NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr5out_file);
      gt_str_reset(seq);
    }
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr3,
                                          gt_symbol(gt_ft_long_terminal_repeat),
                                          false,
                                          NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr3out_file);
      gt_str_reset(seq);
    }

    /* output complete oriented element */
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq,
                                           (GtGenomeNode*) ls->element.mainnode,
                                           gt_symbol(gt_ft_LTR_retrotransposon),
                                           false,
                                           NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc,gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->elemout_file);
      gt_str_reset(seq);
    }
    gt_file_xprintf(ls->tabout_file, "\n");
    gt_str_delete(seq);
  }

  /* per-element scratch data; all of it was zeroed at the start of the call */
  gt_hashmap_delete(ls->element.pdoms);
  gt_array_delete(ls->element.pdomorder);
  gt_free(ls->element.seqid);
  return had_err;
}
/*
 * Writes all hits of one protein domain for the current candidate.
 *
 * ls:       output stream state (file prefix, per-domain file tables, flags)
 * pdoms:    array of GtFeatureNode* hits for the domain; reversed in place
 *           when it holds more than one hit and the first is on the reverse
 *           strand
 * pdomname: domain name; used as file-table key and as part of file names
 * rmap:     region mapping used for sequence extraction
 * desc:     FASTA header for every entry written
 *
 * Output files "<prefix>_pdom_<name>.fas", ".ali" and "_aa.fas" are opened on
 * first use and remembered in the stream's hashmaps.  Returns 0 on success
 * and -1 if a sequence could not be extracted.
 */
static int write_pdom(GtLTRdigestFileOutStream *ls, GtArray *pdoms,
                      const char *pdomname, GT_UNUSED GtRegionMapping *rmap,
                      char *desc, GtError *err)
{
  GtFile *nt_out, *ali_out = NULL, *aa_out = NULL;
  GtStr *nt_seq, *aa_seq;
  GtUword idx, total_len = 0;
  int had_err = 0;

  gt_error_check(err);

  nt_seq = gt_str_new();
  aa_seq = gt_str_new();

  /* nucleotide sequence file for this domain */
  nt_out = (GtFile*) gt_hashmap_get(ls->pdomout_files, pdomname);
  if (nt_out == NULL) {
    char path[GT_MAXFILENAMELEN];
    (void) snprintf(path, (size_t) (GT_MAXFILENAMELEN-1),
                    "%s_pdom_%s.fas", ls->fileprefix, pdomname);
    nt_out = gt_file_xopen(path, "w+");
    gt_hashmap_add(ls->pdomout_files, gt_cstr_dup(pdomname), nt_out);
  }

  /* alignment file, only if requested */
  if (ls->write_pdom_alignments) {
    ali_out = (GtFile*) gt_hashmap_get(ls->pdomali_files, pdomname);
    if (ali_out == NULL) {
      char path[GT_MAXFILENAMELEN];
      (void) snprintf(path, (size_t) (GT_MAXFILENAMELEN-1),
                      "%s_pdom_%s.ali", ls->fileprefix, pdomname);
      ali_out = gt_file_xopen(path, "w+");
      gt_hashmap_add(ls->pdomali_files, gt_cstr_dup(pdomname), ali_out);
    }
  }

  /* amino acid sequence file, only if requested */
  if (ls->write_pdom_aaseqs) {
    aa_out = (GtFile*) gt_hashmap_get(ls->pdomaa_files, pdomname);
    if (aa_out == NULL) {
      char path[GT_MAXFILENAMELEN];
      (void) snprintf(path, (size_t) (GT_MAXFILENAMELEN-1),
                      "%s_pdom_%s_aa.fas", ls->fileprefix, pdomname);
      aa_out = gt_file_xopen(path, "w+");
      gt_hashmap_add(ls->pdomaa_files, gt_cstr_dup(pdomname), aa_out);
    }
  }

  if (gt_array_size(pdoms) > 1UL) {
    /* the hits are expected to arrive sorted */
    for (idx = 1UL; idx < gt_array_size(pdoms); idx++) {
      gt_assert(gt_genome_node_cmp(*(GtGenomeNode**)gt_array_get(pdoms, idx),
                                *(GtGenomeNode**)gt_array_get(pdoms, idx-1))
                >= 0);
    }
    /* reverse-strand hits are emitted in the opposite order */
    if (gt_feature_node_get_strand(*(GtFeatureNode**) gt_array_get(pdoms, 0UL))
          == GT_STRAND_REVERSE) {
      gt_array_reverse(pdoms);
    }
  }

  /* output protein domain data */
  for (idx = 0; idx < gt_array_size(pdoms); idx++) {
    GtFeatureNode *hit = *(GtFeatureNode**) gt_array_get(pdoms, idx);
    GtStr *hit_ali = gt_genome_node_get_user_data((GtGenomeNode*) hit,
                                                  "pdom_alignment");
    GtStr *hit_aa = gt_genome_node_get_user_data((GtGenomeNode*) hit,
                                                 "pdom_aaseq");
    GtRange hit_rng = gt_genome_node_get_range((GtGenomeNode*) hit);

    if (gt_extract_feature_sequence(nt_seq, (GtGenomeNode*) hit,
                                    gt_symbol(gt_ft_protein_match), false,
                                    NULL, NULL, rmap, err)) {
      had_err = -1;
      break;
    }

    if (ls->write_pdom_alignments && hit_ali) {
      char header[BUFSIZ];
      /* write away alignment */
      (void) snprintf(header, BUFSIZ-1,
                      "Protein domain alignment in translated "
                      "sequence for candidate\n'%s':\n\n",
                      desc);
      gt_file_xwrite(ali_out, header, (size_t) strlen(header) * sizeof (char));
      gt_file_xwrite(ali_out, gt_str_get(hit_ali),
                     (size_t) gt_str_length(hit_ali) * sizeof (char));
      gt_file_xwrite(ali_out, "---\n\n", 5 * sizeof (char));
    }

    /* append amino acid sequence */
    if (ls->write_pdom_aaseqs && hit_aa)
      gt_str_append_str(aa_seq, hit_aa);

    gt_genome_node_release_user_data((GtGenomeNode*) hit, "pdom_alignment");
    gt_genome_node_release_user_data((GtGenomeNode*) hit, "pdom_aaseq");
    total_len += gt_range_length(&hit_rng);
  }

  if (!had_err) {
    gt_fasta_show_entry(desc, gt_str_get(nt_seq), total_len, GT_FSWIDTH,
                        nt_out);
    if (ls->write_pdom_aaseqs) {
      gt_fasta_show_entry(desc, gt_str_get(aa_seq), gt_str_length(aa_seq),
                          GT_FSWIDTH, aa_out);
    }
  }

  gt_str_delete(nt_seq);
  gt_str_delete(aa_seq);
  return had_err;
}
/*
 * Sets the strand of lv->ltr_retrotrans from its protein_match features.
 *
 * If all matches lie on one strand, that strand is used.  If both strands
 * occur, the logarithms of the match scores are summed per strand, the
 * forward strand is chosen when its sum compares smaller (reverse otherwise),
 * and every protein_match whose strand differs from the chosen one is removed
 * from the element.  Without any forward or reverse match nothing is changed.
 *
 * Always returns 0.
 */
static int gt_ltrdigest_pdom_visitor_choose_strand(GtLTRdigestPdomVisitor *lv)
{
  int had_err = 0;
  double fwd_log_sum = 0.0,
         rev_log_sum = 0.0;
  bool have_fwd = false,
       have_rev = false;
  GtFeatureNodeIterator *it;
  GtFeatureNode *node = NULL;
  GtStrand strand;
  double score;
  GtArray *to_delete;
  GtUword k;

  /* pass 1: accumulate log scores per strand */
  it = gt_feature_node_iterator_new(lv->ltr_retrotrans);
  for (node = gt_feature_node_iterator_next(it);
       !had_err && node;
       node = gt_feature_node_iterator_next(it)) {
    if (strcmp(gt_feature_node_get_type(node), gt_ft_protein_match) != 0)
      continue;
    strand = gt_feature_node_get_strand(node);
    score = (double) gt_feature_node_get_score(node);
    switch (strand) {
      case GT_STRAND_FORWARD:
        fwd_log_sum += log(score);
        have_fwd = true;
        break;
      case GT_STRAND_REVERSE:
        rev_log_sum += log(score);
        have_rev = true;
        break;
      default:
        break;
    }
  }
  gt_feature_node_iterator_delete(it);

  /* no usable match at all: leave the element as it is */
  if (!have_fwd && !have_rev)
    return had_err;

  /* matches on exactly one strand: adopt it, nothing to prune */
  if (have_rev && !have_fwd) {
    gt_feature_node_set_strand(lv->ltr_retrotrans, GT_STRAND_REVERSE);
    return had_err;
  }
  if (have_fwd && !have_rev) {
    gt_feature_node_set_strand(lv->ltr_retrotrans, GT_STRAND_FORWARD);
    return had_err;
  }

  /* matches on both strands: decide by the summed log scores */
  gt_assert(have_rev && have_fwd);
  strand = (gt_double_compare(fwd_log_sum, rev_log_sum) < 0)
             ? GT_STRAND_FORWARD
             : GT_STRAND_REVERSE;
  gt_feature_node_set_strand(lv->ltr_retrotrans, strand);

  /* pass 2: collect the matches that disagree with the decision ... */
  to_delete = gt_array_new(sizeof (GtFeatureNode*));
  it = gt_feature_node_iterator_new(lv->ltr_retrotrans);
  for (node = gt_feature_node_iterator_next(it);
       !had_err && node;
       node = gt_feature_node_iterator_next(it)) {
    if (strcmp(gt_feature_node_get_type(node), gt_ft_protein_match) == 0
          && strand != gt_feature_node_get_strand(node)) {
      gt_array_add(to_delete, node);
    }
  }
  gt_feature_node_iterator_delete(it);
  gt_assert(gt_array_size(to_delete) > 0);

  /* ... and remove them once the iteration is over */
  for (k = 0; k < gt_array_size(to_delete); k++) {
    gt_feature_node_remove_leaf(lv->ltr_retrotrans,
                                *(GtFeatureNode**) gt_array_get(to_delete, k));
  }
  gt_array_delete(to_delete);

  return had_err;
}
/*
 * Unit test for the CDS inference visitor.
 *
 * infer_cds_visitor_test_data() fills a queue that must hold exactly four
 * feature nodes.  For each one the CDS features are selected and compared
 * against the expected count and the expected coordinates (plus strand, in
 * the second case) of one of them.  Each outcome is recorded in <test>; the
 * overall result is returned.
 */
bool agn_infer_cds_visitor_unit_test(AgnUnitTest *test)
{
  GtQueue *models = gt_queue_new();
  GtFeatureNode *model;
  GtArray *segments;
  bool passed;

  infer_cds_visitor_test_data(models);
  agn_assert(gt_queue_size(models) == 4);

  /* case 1: four CDS segments, check the second */
  model = gt_queue_get(models);
  segments = agn_typecheck_select(model, agn_typecheck_cds);
  passed = (gt_array_size(segments) == 4);
  if(passed)
  {
    GtGenomeNode **seg = gt_array_get(segments, 1);
    GtRange r = gt_genome_node_get_range(*seg);
    passed = (r.start == 349 && r.end == 522);
  }
  agn_unit_test_result(test, "grape test sans UTRs", passed);
  gt_genome_node_delete((GtGenomeNode *)model);
  gt_array_delete(segments);

  /* case 2: a single CDS segment on the reverse strand */
  model = gt_queue_get(models);
  segments = agn_typecheck_select(model, agn_typecheck_cds);
  passed = (gt_array_size(segments) == 1);
  if(passed)
  {
    GtGenomeNode **seg = gt_array_get(segments, 0);
    GtRange r = gt_genome_node_get_range(*seg);
    GtStrand s = gt_feature_node_get_strand((GtFeatureNode *)*seg);
    passed = (r.start == 10747 && r.end == 11577 && s == GT_STRAND_REVERSE);
  }
  agn_unit_test_result(test, "grape test with UTRs, strand check", passed);
  gt_genome_node_delete((GtGenomeNode *)model);
  gt_array_delete(segments);

  /* case 3: two CDS segments, check the second */
  model = gt_queue_get(models);
  segments = agn_typecheck_select(model, agn_typecheck_cds);
  passed = (gt_array_size(segments) == 2);
  if(passed)
  {
    GtGenomeNode **seg = gt_array_get(segments, 1);
    GtRange r = gt_genome_node_get_range(*seg);
    passed = (r.start == 22651 && r.end == 23022);
  }
  agn_unit_test_result(test, "grape test 3", passed);
  gt_genome_node_delete((GtGenomeNode *)model);
  gt_array_delete(segments);

  /* case 4: twelve CDS segments, check the seventh */
  model = gt_queue_get(models);
  segments = agn_typecheck_select(model, agn_typecheck_cds);
  passed = (gt_array_size(segments) == 12);
  if(passed)
  {
    GtGenomeNode **seg = gt_array_get(segments, 6);
    GtRange r = gt_genome_node_get_range(*seg);
    passed = (r.start == 27956 && r.end == 27996);
  }
  agn_unit_test_result(test, "grape test 4", passed);
  gt_genome_node_delete((GtGenomeNode *)model);
  gt_array_delete(segments);

  /* release whatever is still queued */
  while(gt_queue_size(models) > 0)
  {
    GtGenomeNode *leftover = gt_queue_get(models);
    gt_genome_node_delete(leftover);
  }
  gt_queue_delete(models);

  return agn_unit_test_success(test);
}
static void infer_cds_visitor_infer_utrs(AgnInferCDSVisitor *v) { GtFeatureNode *start_codon, *stop_codon; bool exonsexplicit = gt_array_size(v->exons) > 0; bool cdsexplicit = gt_array_size(v->cds) > 0; bool startcodon_check = gt_array_size(v->starts) == 1 && (start_codon = gt_array_get(v->starts, 0)) != NULL; bool stopcodon_check = gt_array_size(v->stops) == 1 && (stop_codon = gt_array_get(v->stops, 0)) != NULL; bool caninferutrs = exonsexplicit && startcodon_check && stopcodon_check; if(gt_array_size(v->utrs) > 0) { return; } else if(!cdsexplicit && !caninferutrs) { return; } GtGenomeNode **leftcodon = gt_array_get(v->starts, 0); GtGenomeNode **rightcodon = gt_array_get(v->stops, 0); GtStrand strand = gt_feature_node_get_strand(v->mrna); const char *lefttype = "five_prime_UTR"; const char *righttype = "three_prime_UTR"; if(strand == GT_STRAND_REVERSE) { lefttype = "three_prime_UTR"; righttype = "five_prime_UTR"; void *temp = leftcodon; leftcodon = rightcodon; rightcodon = temp; } GtRange leftrange = gt_genome_node_get_range(*leftcodon); GtRange rightrange = gt_genome_node_get_range(*rightcodon); GtUword i; for(i = 0; i < gt_array_size(v->exons); i++) { GtGenomeNode **exon = gt_array_get(v->exons, i); GtRange exonrange = gt_genome_node_get_range(*exon); if(exonrange.start < leftrange.start) { GtRange utrrange; if(gt_range_overlap(&exonrange, &leftrange)) { utrrange.start = exonrange.start; utrrange.end = leftrange.start - 1; } else { utrrange = exonrange; } GtGenomeNode *utr = gt_feature_node_new(gt_genome_node_get_seqid(*exon), lefttype, utrrange.start, utrrange.end, strand); if(v->source) gt_feature_node_set_source((GtFeatureNode *)utr, v->source); gt_feature_node_add_child(v->mrna, (GtFeatureNode *)utr); gt_array_add(v->utrs, utr); } if(exonrange.end > rightrange.end) { GtRange utrrange; if(gt_range_overlap(&exonrange, &rightrange)) { utrrange.start = rightrange.end + 1; utrrange.end = exonrange.end; } else { utrrange = exonrange; } GtGenomeNode *utr = 
gt_feature_node_new(gt_genome_node_get_seqid(*exon), righttype, utrrange.start, utrrange.end, strand); if(v->source) gt_feature_node_set_source((GtFeatureNode *)utr, v->source); gt_feature_node_add_child(v->mrna, (GtFeatureNode *)utr); gt_array_add(v->utrs, utr); } } }