static int cds_check_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn, GtError *err) { GtCDSCheckVisitor *v = cds_check_visitor_cast(nv); GtFeatureNodeIterator *fni; GtFeatureNode *node; int had_err = 0; gt_error_check(err); gt_assert(v && fn); fni = gt_feature_node_iterator_new(fn); while (!had_err && (node = gt_feature_node_iterator_next(fni))) had_err = check_cds_phases_if_necessary(node, v, false, err); gt_feature_node_iterator_delete(fni); gt_hashmap_reset(v->cds_features); while (v->splitting_is_necessary) { split_cds_features(v->cds_features_to_split, fn); gt_hashmap_reset(v->cds_features_to_split); v->splitting_is_necessary = false; /* perform second pass to correct phases */ fni = gt_feature_node_iterator_new(fn); while (!had_err && (node = gt_feature_node_iterator_next(fni))) had_err = check_cds_phases_if_necessary(node, v, false, err); gt_feature_node_iterator_delete(fni); gt_hashmap_reset(v->cds_features); } return had_err; }
/**
 * Determine the coordinates spanned by the CDS features that are direct
 * children of a transcript.
 *
 * A coordinate of 0 is used internally as the "not yet set" marker.
 *
 * @param transcript  the transcript feature; must not be NULL
 * @return            the smallest start and the largest end over all direct
 *                    children accepted by
 *                    agn_gt_feature_node_is_cds_feature(); {0, 0} if there is
 *                    no such child. For transcripts on GT_STRAND_REVERSE the
 *                    two values are swapped, i.e. `start` holds the larger
 *                    coordinate.
 */
GtRange agn_transcript_cds_range(GtFeatureNode *transcript)
{
  gt_assert(transcript);
  GtRange trange;
  trange.start = 0;
  trange.end = 0;

  GtFeatureNodeIterator *iter = gt_feature_node_iterator_new_direct(transcript);
  GtFeatureNode *current;
  for
  (
    current = gt_feature_node_iterator_next(iter);
    current != NULL;
    current = gt_feature_node_iterator_next(iter)
  )
  {
    if(agn_gt_feature_node_is_cds_feature(current))
    {
      GtRange crange = gt_genome_node_get_range((GtGenomeNode *)current);
      if(trange.start == 0 || crange.start < trange.start)
        trange.start = crange.start;
      if(trange.end == 0 || crange.end > trange.end)
        trange.end = crange.end;
    }
  }
  /* The iterator was previously never released, leaking it on every call. */
  gt_feature_node_iterator_delete(iter);

  if(gt_feature_node_get_strand(transcript) == GT_STRAND_REVERSE)
  {
    GtUword temp = trange.start;
    trange.start = trange.end;
    trange.end = temp;
  }
  return trange;
}
static int visit_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn, GtError *error) { AgnLocusMapVisitor *v = locus_map_visitor_cast(nv); gt_error_check(error); agn_assert(gt_feature_node_has_type(fn, "locus")); const char *locuslabel = agn_feature_node_get_label(fn); GtFeatureNodeIterator *iter = gt_feature_node_iterator_new(fn); GtFeatureNode *current; for(current = gt_feature_node_iterator_next(iter); current != NULL; current = gt_feature_node_iterator_next(iter)) { if(agn_typecheck_gene(current) && v->genefh != NULL) { const char *genelabel = agn_feature_node_get_label(current); fprintf(v->genefh, "%s\t%s\n", genelabel, locuslabel); } if(agn_typecheck_mrna(current) && v->mrnafh != NULL) { const char *mrnalabel = agn_feature_node_get_label(current); fprintf(v->mrnafh, "%s\t%s\n", mrnalabel, locuslabel); } } gt_feature_node_iterator_delete(iter); return 0; }
static int filter_stream_next(GtNodeStream *ns, GtGenomeNode **gn, GtError *error) { AgnFilterStream *stream; GtFeatureNode *fn; int had_err; gt_error_check(error); stream = filter_stream_cast(ns); if(gt_queue_size(stream->cache) > 0) { *gn = gt_queue_get(stream->cache); return 0; } while(1) { had_err = gt_node_stream_next(stream->in_stream, gn, error); if(had_err) return had_err; if(!*gn) return 0; fn = gt_feature_node_try_cast(*gn); if(!fn) return 0; GtFeatureNode *current; GtFeatureNodeIterator *iter = gt_feature_node_iterator_new(fn); for(current = gt_feature_node_iterator_next(iter); current != NULL; current = gt_feature_node_iterator_next(iter)) { const char *type = gt_feature_node_get_type(current); bool keepfeature = false; if(gt_hashmap_get(stream->typestokeep, type) != NULL) keepfeature = true; if(keepfeature) { gt_genome_node_ref((GtGenomeNode *)current); gt_queue_add(stream->cache, current); } } gt_feature_node_iterator_delete(iter); gt_genome_node_delete((GtGenomeNode *)fn); if(gt_queue_size(stream->cache) > 0) { *gn = gt_queue_get(stream->cache); return 0; } } return 0; }
/**
 * Collect the parts of a gene model's exons that intersect with the features
 * of an alignment.
 *
 * Every exon selected from the gene model is intersected (via
 * gaeval_visitor_range_intersect()) with the range of every node delivered by
 * an iterator over the alignment, skipping nodes of type "match_gap".
 * Intersections that compare equal to the range {0, 0} are discarded. In
 * builds with assertions enabled, the collected ranges are checked to be
 * pairwise non-overlapping.
 *
 * @param genemodel  gene model; must be a feature node of type "mRNA"
 * @param alignment  alignment feature node
 * @return           NULL if the two features are on different strands,
 *                   otherwise a newly created array of GtRange values
 */
static GtArray* gaeval_visitor_intersect(GtGenomeNode *genemodel,
                                         GtGenomeNode *alignment)
{
  agn_assert(genemodel && alignment);

  GtFeatureNode *genefn = gt_feature_node_cast(genemodel);
  GtFeatureNode *algnfn = gt_feature_node_cast(alignment);
  agn_assert(gt_feature_node_has_type(genefn, "mRNA"));
  GtStrand genestrand = gt_feature_node_get_strand(genefn);
  GtStrand algnstrand = gt_feature_node_get_strand(algnfn);
  if(genestrand != algnstrand)
    return NULL;

  GtArray *covered_parts = gt_array_new( sizeof(GtRange) );
  GtArray *exons = agn_typecheck_select(genefn, agn_typecheck_exon);
  GtRange nullrange = {0, 0};
  /* Unsigned index: array sizes are unsigned, and the previous signed index
     was compared against them and mixed with the unsigned inner index. */
  GtUword i;
  for(i = 0; i < gt_array_size(exons); i++)
  {
    GtGenomeNode *exon = *(GtGenomeNode **)gt_array_get(exons, i);
    GtRange exonrange = gt_genome_node_get_range(exon);

    GtFeatureNodeIterator *aniter = gt_feature_node_iterator_new(algnfn);
    GtFeatureNode *tempaln;
    for(tempaln = gt_feature_node_iterator_next(aniter);
        tempaln != NULL;
        tempaln = gt_feature_node_iterator_next(aniter))
    {
      if(gt_feature_node_has_type(tempaln, "match_gap"))
        continue;

      GtRange alnrange = gt_genome_node_get_range((GtGenomeNode *) tempaln);
      GtRange intr = gaeval_visitor_range_intersect(&exonrange, &alnrange);
      if(gt_range_compare(&intr, &nullrange) != 0)
        gt_array_add(covered_parts, intr);
    }
    gt_feature_node_iterator_delete(aniter);
  }
  gt_array_delete(exons);

  /* sanity check only: no two collected ranges may overlap */
  for(i = 0; i < gt_array_size(covered_parts); i++)
  {
    GtRange *r1 = gt_array_get(covered_parts, i);
    GtUword j;
    for(j = i+1; j < gt_array_size(covered_parts); j++)
    {
      GtRange *r2 = gt_array_get(covered_parts, j);
      agn_assert(gt_range_overlap(r1, r2) == false);
    }
  }

  return covered_parts;
}
/**
 * Write the exon structure of a transcript to a stream in a GenBank-like
 * location syntax.
 *
 * The direct children accepted by agn_gt_feature_node_is_exon_feature() are
 * sorted with agn_gt_genome_node_compare(). A single exon is written as
 * "<start..>end"; several exons are written as
 * "join(<start..end,start..end,...,start..>end)". For transcripts on
 * GT_STRAND_REVERSE the whole expression is wrapped in "complement(...)".
 *
 * @param transcript  the transcript; must have at least one exon child
 * @param outstream   stream the structure is written to
 */
void agn_transcript_structure_gbk(GtFeatureNode *transcript, FILE *outstream)
{
  gt_assert(transcript && outstream);

  GtArray *exons = gt_array_new( sizeof(GtFeatureNode *) );
  GtFeatureNodeIterator *iter = gt_feature_node_iterator_new_direct(transcript);
  GtFeatureNode *child;
  for
  (
    child = gt_feature_node_iterator_next(iter);
    child != NULL;
    child = gt_feature_node_iterator_next(iter)
  )
  {
    if(agn_gt_feature_node_is_exon_feature(child))
      gt_array_add(exons, child);
  }
  gt_feature_node_iterator_delete(iter);

  gt_assert(gt_array_size(exons) > 0);
  gt_array_sort(exons, (GtCompare)agn_gt_genome_node_compare);

  if(gt_feature_node_get_strand(transcript) == GT_STRAND_REVERSE)
    fputs("complement(", outstream);

  if(gt_array_size(exons) == 1)
  {
    GtGenomeNode *exon = *(GtGenomeNode **)gt_array_get(exons, 0);
    GtRange exonrange = gt_genome_node_get_range(exon);
    fprintf(outstream, "<%lu..>%lu", exonrange.start, exonrange.end);
  }
  else
  {
    fputs("join(", outstream);
    GtUword i;
    for(i = 0; i < gt_array_size(exons); i++)
    {
      GtGenomeNode *exon = *(GtGenomeNode **)gt_array_get(exons, i);
      GtRange exonrange = gt_genome_node_get_range(exon);

      if(i == 0)
        fprintf(outstream, "<%lu..%lu", exonrange.start, exonrange.end);
      else if(i+1 == gt_array_size(exons))
        fprintf(outstream, ",%lu..>%lu", exonrange.start, exonrange.end);
      else
        fprintf(outstream, ",%lu..%lu", exonrange.start, exonrange.end);
    }
    fputs(")", outstream);
  }

  if(gt_feature_node_get_strand(transcript) == GT_STRAND_REVERSE)
    fputs(")", outstream);

  /* The array only stores pointers to the exons; it was previously never
     released, leaking it on every call. */
  gt_array_delete(exons);
}
/* Feature node callback of the ORF finder visitor.
   Walks the subgraph of <gf> and runs run_orffinder() on every node whose
   type is registered in the visitor's <types> hashmap, or on the first node
   visited if the key "all" is mapped to the value 1 (the walk then stops).
   The range handed to run_orffinder() is the node's range with both
   coordinates decremented by one.
   Returns 0 on success, otherwise the status of run_orffinder(). */
static int gt_orf_finder_visitor_feature_node(GtNodeVisitor *gv,
                                              GtFeatureNode *gf,
                                              GtError *err)
{
  GtORFFinderVisitor *visitor;
  GtFeatureNodeIterator *walker;
  GtFeatureNode *cur = NULL;
  const char *type_name = NULL;
  GtRange span;
  int status = 0;

  visitor = gt_orf_finder_visitor_cast(gv);
  gt_assert(visitor);
  gt_error_check(err);

  walker = gt_feature_node_iterator_new(gf);
  while (!status && (cur = gt_feature_node_iterator_next(walker))) {
    type_name = gt_feature_node_get_type(cur);

    /* neither a selected type nor "all" mode: nothing to do for this node */
    if (gt_hashmap_get(visitor->types, (void*) type_name) == NULL
          && gt_hashmap_get(visitor->types, (void*) "all") != (void*) 1) {
      continue;
    }

    span = gt_genome_node_get_range((GtGenomeNode*) cur);
    status = run_orffinder(visitor->rmap, cur, span.start - 1, span.end - 1,
                           visitor->min, visitor->max, visitor->all, err);

    if (gt_hashmap_get(visitor->types, (void*) "all") == (void*) 1)
      break;

    if (gt_feature_node_has_children(cur)) {
      /* Walk the remaining nodes below <cur> (the first node delivered by
         the sub-iterator is skipped). The walk only inspects node types;
         it does not change the result of this function. */
      GtFeatureNodeIterator *sub = gt_feature_node_iterator_new(cur);
      GtFeatureNode *below = NULL;
      (void) gt_feature_node_iterator_next(sub);
      for (below = gt_feature_node_iterator_next(sub);
           below != NULL;
           below = gt_feature_node_iterator_next(sub)) {
        type_name = gt_feature_node_get_type(below);
        if (strcmp(type_name, (const char*) GT_ORF_TYPE) == 0) {
          continue;
        }
      }
      gt_feature_node_iterator_delete(sub);
    }
  }
  gt_feature_node_iterator_delete(walker);

  return status;
}
/* Recursively checks that the range of <child> lies within the range of
   <parent> and emits a warning (naming type, range, file and line of both
   features) if it does not. Afterwards each direct child of <child> is
   checked against <child> in the same way.
   Returns the status of the check of the last direct child of <child>, or 0
   if <child> has no direct children. */
static int check_boundaries_visitor_check_rec(GtFeatureNode *parent,
                                              GtFeatureNode *child,
                                              GtError *err)
{
  GtRange child_rng = gt_genome_node_get_range((GtGenomeNode*) child),
          parent_rng = gt_genome_node_get_range((GtGenomeNode*) parent);
  GtFeatureNodeIterator *direct;
  GtFeatureNode *grandchild;
  int status = 0;

  if (!(child_rng.start >= parent_rng.start &&
        child_rng.end <= parent_rng.end)) {
    gt_warning("%s child range " GT_WU "-" GT_WU " (file %s, line %u) not "
               "contained in %s parent range " GT_WU "-" GT_WU " (file %s, "
               "line %u)",
               gt_feature_node_get_type(child),
               child_rng.start,
               child_rng.end,
               gt_genome_node_get_filename((GtGenomeNode*) child),
               gt_genome_node_get_line_number((GtGenomeNode*) child),
               gt_feature_node_get_type(parent),
               parent_rng.start,
               parent_rng.end,
               gt_genome_node_get_filename((GtGenomeNode*) parent),
               gt_genome_node_get_line_number((GtGenomeNode*) parent));
  }

  /* descend: <child> becomes the parent of its own direct children */
  direct = gt_feature_node_iterator_new_direct(child);
  for (grandchild = gt_feature_node_iterator_next(direct);
       grandchild != NULL;
       grandchild = gt_feature_node_iterator_next(direct)) {
    status = check_boundaries_visitor_check_rec(child, grandchild, err);
  }
  gt_feature_node_iterator_delete(direct);

  return status;
}
/* Feature node callback of the LTR input check visitor.
   Scans the subgraph of <fn> for a node of type LTR_retrotransposon (the last
   one found is used) and for nodes of type long_terminal_repeat (the first
   one found is taken as the left LTR, the last further one as the right LTR).
   Fails with an error if
   - the visitor's <only_ltrs> flag is set and no LTR_retrotransposon node was
     found, or
   - an LTR_retrotransposon node was found but fewer than two
     long_terminal_repeat nodes.
   Returns 0 on success, -1 (with <err> set) on failure. */
static int gt_ltr_input_check_visitor_feature_node(GtNodeVisitor *nv,
                                                   GtFeatureNode *fn,
                                                   GtError *err)
{
  GT_UNUSED GtLTRInputCheckVisitor *visitor;
  GtFeatureNodeIterator *walker;
  GtFeatureNode *cur = NULL,
                *element = NULL,
                *left_ltr = NULL,
                *right_ltr = NULL;
  int status = 0;

  visitor = gt_ltr_input_check_visitor_cast(nv);
  gt_assert(visitor);
  gt_error_check(err);

  /* collect the LTR components from the annotation subgraph */
  walker = gt_feature_node_iterator_new(fn);
  while (!status && (cur = gt_feature_node_iterator_next(walker))) {
    if (strcmp(gt_feature_node_get_type(cur),
               gt_ft_LTR_retrotransposon) == 0) {
      element = cur;
    }
    if (strcmp(gt_feature_node_get_type(cur),
               gt_ft_long_terminal_repeat) == 0) {
      if (left_ltr == NULL)
        left_ltr = cur;
      else
        right_ltr = cur;
    }
  }
  gt_feature_node_iterator_delete(walker);

  if (visitor->only_ltrs && !status && !element) {
    gt_error_set(err, "connected component with %s entry node (%s, line %u) "
                      "does not contain a '%s' node, which is required",
                 gt_feature_node_get_type(fn),
                 gt_genome_node_get_filename((GtGenomeNode*) fn),
                 gt_genome_node_get_line_number((GtGenomeNode*) fn),
                 gt_ft_LTR_retrotransposon);
    status = -1;
  }

  if (!status && element && !(left_ltr && right_ltr)) {
    gt_error_set(err, "LTR_retrotransposon feature (%s, line %u) "
                      "does not contain two %s child features, both of which "
                      "are required",
                 gt_genome_node_get_filename((GtGenomeNode*) element),
                 gt_genome_node_get_line_number((GtGenomeNode*) element),
                 gt_ft_long_terminal_repeat);
    status = -1;
  }

  return status;
}
static int gaeval_visitor_visit_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn, GtError *error) { AgnGaevalVisitor *v = gaeval_visitor_cast(nv); gt_error_check(error); GtFeatureNodeIterator *feats = gt_feature_node_iterator_new(fn); GtFeatureNode *tempfeat; for(tempfeat = gt_feature_node_iterator_next(feats); tempfeat != NULL; tempfeat = gt_feature_node_iterator_next(feats)) { if(agn_typecheck_mrna(tempfeat) == false) continue; double coverage = gaeval_visitor_calculate_coverage(v, tempfeat, error); char covstr[16]; sprintf(covstr, "%.3lf", coverage); gt_feature_node_add_attribute(tempfeat, "gaeval_coverage", covstr); double integrity_components[5]; double integrity = gaeval_visitor_calculate_integrity( v, tempfeat, coverage, integrity_components, error ); char intstr[16]; sprintf(intstr, "%.3lf", integrity); gt_feature_node_add_attribute(tempfeat, "gaeval_integrity", intstr); if(v->tsvout) { const char *mrnaid = gt_feature_node_get_attribute(tempfeat, "ID"); const char *mrnalabel = agn_feature_node_get_label(tempfeat); GtUword num_introns = agn_typecheck_count(tempfeat, agn_typecheck_intron); fprintf(v->tsvout, "%s\t%s\t%s\t%s\t%lu\t%.3lf\t%.3lf\t%.3lf\t%.3lf\n", mrnaid, mrnalabel, intstr, covstr, num_introns, integrity_components[0], integrity_components[1], integrity_components[2], integrity_components[3]); } } gt_feature_node_iterator_delete(feats); return 0; }
/* Returns a new array holding every node of the subgraph rooted at <fn>
   which has <cds_feature> as a direct child. A node is added once for each
   time <cds_feature> shows up among its direct children. */
static GtArray* find_cds_parents(GtFeatureNode *cds_feature, GtFeatureNode *fn)
{
  GtArray *found;
  GtFeatureNodeIterator *all_nodes;
  GtFeatureNode *candidate;

  gt_assert(cds_feature && fn);

  found = gt_array_new(sizeof (GtFeatureNode*));
  all_nodes = gt_feature_node_iterator_new(fn);
  for (candidate = gt_feature_node_iterator_next(all_nodes);
       candidate != NULL;
       candidate = gt_feature_node_iterator_next(all_nodes)) {
    GtFeatureNodeIterator *direct =
      gt_feature_node_iterator_new_direct(candidate);
    GtFeatureNode *kid;
    for (kid = gt_feature_node_iterator_next(direct);
         kid != NULL;
         kid = gt_feature_node_iterator_next(direct)) {
      if (kid == cds_feature)
        gt_array_add(found, candidate);
    }
    gt_feature_node_iterator_delete(direct);
  }
  gt_feature_node_iterator_delete(all_nodes);

  return found;
}
static int infer_cds_visitor_visit_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn, GtError *error) { AgnInferCDSVisitor *v = infer_cds_visitor_cast(nv); gt_error_check(error); GtFeatureNodeIterator *iter = gt_feature_node_iterator_new(fn); GtFeatureNode *current; for(current = gt_feature_node_iterator_next(iter); current != NULL; current = gt_feature_node_iterator_next(iter)) { if(!agn_typecheck_mrna(current)) continue; v->cds = agn_typecheck_select(current, agn_typecheck_cds); v->utrs = agn_typecheck_select(current, agn_typecheck_utr); v->exons = agn_typecheck_select(current, agn_typecheck_exon); v->starts = agn_typecheck_select(current, agn_typecheck_start_codon); v->stops = agn_typecheck_select(current, agn_typecheck_stop_codon); v->mrna = current; infer_cds_visitor_infer_cds(v); infer_cds_visitor_check_start(v); infer_cds_visitor_check_stop(v); infer_cds_visitor_infer_utrs(v); infer_cds_visitor_check_cds_multi(v); infer_cds_visitor_check_cds_phase(v); infer_cds_visitor_set_utrs(v); v->mrna = NULL; gt_array_delete(v->cds); gt_array_delete(v->utrs); gt_array_delete(v->exons); gt_array_delete(v->starts); gt_array_delete(v->stops); } gt_feature_node_iterator_delete(iter); return 0; }
/* Lua binding: advances the feature node iterator given as first argument.
   Pushes a new reference to the next feature node, or nil if the iterator
   delivers no further node. Always returns one Lua value. */
static int feature_node_iterator_lua_next(lua_State *L)
{
  GtFeatureNodeIterator **iter = check_gt_feature_node_iterator(L, 1);
  GtFeatureNode *next_node = gt_feature_node_iterator_next(*iter);

  if (next_node == NULL) {
    lua_pushnil(L);
    return 1;
  }

  gt_lua_genome_node_push(L, gt_genome_node_ref((GtGenomeNode*) next_node));
  return 1;
}
/* Feature node callback of the extract feature visitor.
   For every node in the subgraph of <fn> the sequence of the configured
   feature type is extracted; each non-empty sequence gets a description and
   is shown via show_entry(), and the running FASTA sequence counter of the
   visitor is incremented. Sequence IDs and target IDs are only collected if
   the visitor's <seqid> and <target> options are set, respectively.
   Returns 0 on success, -1 if extraction failed, otherwise the status of
   show_entry(). */
static int extract_feature_visitor_feature_node(GtNodeVisitor *nv,
                                                GtFeatureNode *fn,
                                                GtError *err)
{
  GtExtractFeatureVisitor *visitor;
  GtFeatureNodeIterator *walker;
  GtFeatureNode *cur;
  GtStrArray *targets = NULL;
  GtStr *seq_name = NULL,
        *desc,
        *seq;
  int status = 0;

  gt_error_check(err);
  visitor = gt_extract_feature_visitor_cast(nv);
  gt_assert(visitor->region_mapping);

  walker = gt_feature_node_iterator_new(fn);
  if (visitor->target)
    targets = gt_str_array_new();
  if (visitor->seqid)
    seq_name = gt_str_new();
  desc = gt_str_new();
  seq = gt_str_new();

  while (!status) {
    cur = gt_feature_node_iterator_next(walker);
    if (!cur)
      break;

    /* start with clean buffers for this node */
    if (seq_name)
      gt_str_reset(seq_name);
    if (targets)
      gt_str_array_reset(targets);

    if (gt_extract_feature_sequence(seq, (GtGenomeNode*) cur, visitor->type,
                                    visitor->join, seq_name, targets,
                                    visitor->region_mapping, err)) {
      status = -1;
    }

    if (!status && gt_str_length(seq)) {
      visitor->fastaseq_counter++;
      construct_description(desc, visitor->type, visitor->fastaseq_counter,
                            visitor->join, visitor->translate, seq_name,
                            targets);
      status = show_entry(desc, seq, visitor->translate, visitor->width,
                          visitor->outfp);
      gt_str_reset(desc);
      gt_str_reset(seq);
    }
  }

  gt_str_delete(seq);
  gt_str_delete(desc);
  gt_str_delete(seq_name);
  gt_str_array_delete(targets);
  gt_feature_node_iterator_delete(walker);

  return status;
}
/* Checks the phases of the CDS features among the direct children of <fn>.
   CDS children which are not multi-features are collected in one array and
   checked together via check_cds_phases(). CDS children which are
   multi-features are grouped by their multi-feature representative and each
   group is checked via check_cds_phases_hm().
   Returns 0 on success, otherwise the status of the failing check. */
static int check_cds_phases_if_necessary(GtFeatureNode *fn,
                                         GtCDSCheckVisitor *v,
                                         bool second_pass, GtError *err)
{
  GtFeatureNodeIterator *direct;
  GtFeatureNode *kid;
  GtArray *plain_cds = NULL;
  GtHashmap *multi_cds = NULL;
  int status = 0;

  gt_error_check(err);
  gt_assert(fn);

  direct = gt_feature_node_iterator_new_direct(fn);
  for (kid = gt_feature_node_iterator_next(direct);
       kid != NULL;
       kid = gt_feature_node_iterator_next(direct)) {
    if (!gt_feature_node_has_type(kid, gt_ft_CDS))
      continue;

    if (!gt_feature_node_is_multi(kid)) {
      /* ordinary CDS: goes into the shared array (created on demand) */
      if (!plain_cds)
        plain_cds = gt_array_new(sizeof (GtFeatureNode*));
      gt_array_add(plain_cds, kid);
      continue;
    }

    /* multi-feature CDS: group by representative (map created on demand) */
    {
      GtFeatureNode *rep;
      GtArray *group;
      if (!multi_cds) {
        multi_cds = gt_hashmap_new(GT_HASH_DIRECT, NULL,
                                   (GtFree) gt_array_delete);
      }
      rep = gt_feature_node_get_multi_representative(kid);
      group = gt_hashmap_get(multi_cds, rep);
      if (group) {
        gt_array_add(group, kid);
      }
      else {
        /* a new group is opened with the representative itself */
        group = gt_array_new(sizeof (GtFeatureNode*));
        gt_array_add(group, rep);
        gt_hashmap_add(multi_cds, rep, group);
      }
    }
  }

  if (plain_cds)
    status = check_cds_phases(plain_cds, v, false, second_pass, err);
  if (!status && multi_cds)
    status = gt_hashmap_foreach(multi_cds, check_cds_phases_hm, v, err);

  gt_array_delete(plain_cds);
  gt_hashmap_delete(multi_cds);
  gt_feature_node_iterator_delete(direct);

  return status;
}
/* Example showing the typical use of a feature node iterator: a standard
   gene is created, all nodes delivered by an iterator over it are visited,
   and both the iterator and the gene are released again. Always returns 0. */
int gt_feature_node_iterator_example(GT_UNUSED GtError *err)
{
  GtFeatureNode *gene = (GtFeatureNode*) gt_feature_node_new_standard_gene();
  GtFeatureNodeIterator *walker = gt_feature_node_iterator_new(gene);
  GtFeatureNode *cur;

  /* an example genome node iterator use case */
  for (cur = gt_feature_node_iterator_next(walker);
       cur != NULL;
       cur = gt_feature_node_iterator_next(walker)) {
    /* do something with <cur> */
  }

  gt_feature_node_iterator_delete(walker);
  gt_genome_node_delete((GtGenomeNode*) gene);

  return 0;
}
/* Feature node callback of the check boundaries visitor.
   Runs check_boundaries_visitor_check_rec() for every direct child of <fn>
   (with <fn> as the parent), stopping at the first child for which a
   non-zero status is reported.
   Returns that status, or 0 if all children were checked without one.
   The status used to be computed, used to stop the loop and then dropped by
   an unconditional `return 0`; it is now handed to the caller. */
static int check_boundaries_visitor_feature_node(GT_UNUSED GtNodeVisitor *nv,
                                                 GtFeatureNode *fn,
                                                 GT_UNUSED GtError *err)
{
  GtFeatureNodeIterator *fni;
  GtFeatureNode *node;
  int had_err = 0;

  fni = gt_feature_node_iterator_new_direct(fn);
  while (!had_err && (node = gt_feature_node_iterator_next(fni))) {
    had_err = check_boundaries_visitor_check_rec(fn, node, err);
  }
  gt_feature_node_iterator_delete(fni);

  return had_err;
}
/* Feature node callback of the CDS check visitor.
   Calls check_cds_phases_if_necessary() for every node delivered by an
   iterator over <fn> until one call reports an error, then resets the
   visitor's <cds_features> hashmap.
   Returns 0 on success, otherwise the status of the failing check. */
static int cds_check_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                          GtError *err)
{
  GtCDSCheckVisitor *visitor = cds_check_visitor_cast(nv);
  GtFeatureNodeIterator *walker;
  GtFeatureNode *cur;
  int status = 0;

  gt_error_check(err);
  gt_assert(visitor && fn);

  walker = gt_feature_node_iterator_new(fn);
  for (;;) {
    if (status)
      break;
    cur = gt_feature_node_iterator_next(walker);
    if (!cur)
      break;
    status = check_cds_phases_if_necessary(cur, visitor, err);
  }
  gt_feature_node_iterator_delete(walker);

  gt_hashmap_reset(visitor->cds_features);

  return status;
}
/* If <gn> is a feature node, runs extracttarget_from_seqfiles() for the
   "Target" attribute of every node in its subgraph that has one, stopping at
   the first failure. Nodes of other classes are ignored.
   Returns 0 on success, otherwise the status of the failing extraction. */
static int extracttarget_from_node(GtGenomeNode *gn, GtStrArray *seqfiles,
                                   GtError *err)
{
  GtFeatureNodeIterator *walker;
  GtFeatureNode *cur;
  const char *target_attr;
  int status = 0;

  gt_error_check(err);
  gt_assert(gn && seqfiles);

  /* only feature nodes can carry a Target attribute */
  if (!gt_genome_node_cast(gt_feature_node_class(), gn))
    return status;

  walker = gt_feature_node_iterator_new(gt_feature_node_cast(gn));
  while (!status) {
    /* XXX remove cast */
    cur = (GtFeatureNode*) gt_feature_node_iterator_next(walker);
    if (!cur)
      break;
    target_attr = gt_feature_node_get_attribute(cur, "Target");
    if (target_attr)
      status = extracttarget_from_seqfiles(target_attr, seqfiles, err);
  }
  gt_feature_node_iterator_delete(walker);

  return status;
}
/* Advances <seqpos_classifier> to the next feature whose type equals its
   <specified_ft>.
   On a hit, the found-counter of the classifier is incremented, the range of
   the feature is stored in <range> with both coordinates decremented by one,
   <end_of_annotation> is set to false, and 0 is returned.
   When the current feature node iterator delivers no further node,
   gt_seqpos_classifier_next_fn() is used to move on; if that fails or
   leaves no current feature node, <end_of_annotation> is set to true and the
   status of that call is returned. */
static int gt_seqpos_classifier_next_specified_ft(
    GtSeqposClassifier *seqpos_classifier, GtRange *range,
    bool *end_of_annotation, GtError *err)
{
  int had_err = 0;
  GtFeatureNode *cfn;
  bool fni_exhausted;

  /* Validate the arguments before the first dereference; previously the
     classifier was dereferenced in an initializer ahead of its assertion. */
  gt_assert(seqpos_classifier != NULL);
  gt_assert(range != NULL);
  gt_assert(end_of_annotation != NULL);

  fni_exhausted = (seqpos_classifier->fni == NULL) ? true : false;

  while (true) {
    if (fni_exhausted) {
      had_err = gt_seqpos_classifier_next_fn(seqpos_classifier, err);
      if (had_err != 0 || seqpos_classifier->fn == NULL) {
        *end_of_annotation = true;
        return had_err;
      }
      fni_exhausted = false;
    }
    gt_assert(seqpos_classifier->fni != NULL);
    cfn = gt_feature_node_iterator_next(seqpos_classifier->fni);
    if (cfn == NULL) {
      fni_exhausted = true;
    }
    else if (strcmp(gt_feature_node_get_type(cfn),
                    seqpos_classifier->specified_ft) == 0) {
      seqpos_classifier->nof_specified_ft_found++;
      *range = gt_genome_node_get_range((GtGenomeNode*)cfn);
      gt_assert(range->start > 0);
      gt_assert(range->end > 0);
      /* hand out the range with both coordinates shifted down by one */
      range->start--;
      range->end--;
      *end_of_annotation = false;
      return had_err;
    }
  }
}
/* Attaches an ORF to the annotation graph rooted at <gf>.
   Both coordinates of <orf_rng> are incremented by one first. The subgraph
   of <gf> is then searched for nodes which are not of type GT_ORF_TYPE and
   whose range contains the ORF range; the last such node delivered by the
   iterator becomes the parent. If a parent was found, a new feature of type
   GT_ORF_TYPE with the given strand, the source GT_ORF_FINDER_TAG and a
   "frame" attribute holding <orf_frame> is added to it as a child. */
static void orf_attach_results_to_gff3(GtFeatureNode *gf, GtRange orf_rng,
                                       unsigned int orf_frame,
                                       GtStrand strand,
                                       GT_UNUSED GtError *err)
{
  GtGenomeNode *child;
  GtStr *tag;
  GtFeatureNodeIterator *gfi;
  GtFeatureNode *curnode = NULL, *parent_node = NULL;
  GtRange gfi_range;
  /* Large enough for any unsigned int. The previous 3-byte buffer with an
     unbounded sprintf() and a signed conversion specifier could overflow for
     frame values of more than two digits. */
  char frame_buf[16];

  tag = gt_str_new_cstr(GT_ORF_FINDER_TAG);
  orf_rng.start++;
  orf_rng.end++;
  (void) snprintf(frame_buf, sizeof frame_buf, "%u", orf_frame);

  gfi = gt_feature_node_iterator_new(gf);
  while ((curnode = gt_feature_node_iterator_next(gfi))) {
    if (strcmp(gt_feature_node_get_type(curnode),
               (const char*) GT_ORF_TYPE) != 0) {
      gfi_range = gt_genome_node_get_range((GtGenomeNode*) curnode);
      if (gt_range_contains(&gfi_range, &orf_rng)) {
        parent_node = curnode;
      }
    }
  }

  if (parent_node) {
    child = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) gf),
                                GT_ORF_TYPE, orf_rng.start, orf_rng.end,
                                strand);
    gt_feature_node_set_source((GtFeatureNode*) child, tag);
    gt_feature_node_set_attribute((GtFeatureNode*) child, "frame",
                                  frame_buf);
    gt_feature_node_add_child(parent_node,(GtFeatureNode*) child);
  }

  gt_str_delete(tag);
  gt_feature_node_iterator_delete(gfi);
}
/*
 * Deliver the next node of the CpGI overlap stream.
 *
 * The next node is pulled from the input stream and passed on through <gn>
 * unchanged in identity. If it is a feature node of the gene type (or a
 * pseudo node containing one), has a "Name" attribute, and its sequence ID
 * matches "Chr<number>", the start (forward strand) or end (otherwise) of the
 * gene is looked up with CpGIOverlap_stream_find_gene_overlap() and a hit is
 * stored in the gene's "cpgi_at_tss" attribute. Nodes that do not qualify are
 * passed on without annotation.
 *
 * Returns 0 on success, otherwise the status of the input stream. Compared
 * with the earlier version, a failing input stream is no longer reported as
 * success, and every early exit returns a value (two of them used a bare
 * `return;` in this int function).
 */
static int CpGIOverlap_stream_next(GtNodeStream * ns, GtGenomeNode ** gn,
                                   GtError * err)
{
    CpGIOverlap_stream * context;
    GtGenomeNode * cur_node = NULL;
    GtFeatureNode * gene;
    const char * gene_name;
    const char * overlap_name;
    int chr_num;
    unsigned int TSS;
    int err_num;

    *gn = NULL;
    context = CpGIOverlap_stream_cast(ns);

    err_num = gt_node_stream_next(context->in_stream, &cur_node, err);
    if (err_num != 0 || cur_node == NULL)
        return err_num;

    *gn = cur_node;

    /* only feature nodes can be genes */
    if (!gt_genome_node_try_cast(gt_feature_node_class(), cur_node))
        return 0;
    gene = (GtFeatureNode *) cur_node;

    /* for a pseudo node, use the first gene found inside it, if any */
    if (gt_feature_node_is_pseudo(gene))
    {
        GtFeatureNodeIterator * iter = gt_feature_node_iterator_new(gene);
        GtFeatureNode * candidate;

        if (iter == NULL)
            return 0;
        while ((candidate = gt_feature_node_iterator_next(iter)) &&
               !gt_feature_node_has_type(candidate, feature_type_gene))
            ;
        gt_feature_node_iterator_delete(iter);
        if (candidate == NULL)
            return 0;
        gene = candidate;
    }

    if (!gt_feature_node_has_type(gene, feature_type_gene))
        return 0;

    /* genes without a name are passed on without annotation */
    gene_name = gt_feature_node_get_attribute(gene, "Name");
    if (gene_name == NULL)
        return 0;

    if (1 != sscanf(gt_str_get(gt_genome_node_get_seqid((GtGenomeNode *) gene)),
                    "Chr%d", &chr_num))
        return 0;

    TSS = (gt_feature_node_get_strand(gene) == GT_STRAND_FORWARD) ?
              gt_genome_node_get_start((GtGenomeNode *) gene) :
              gt_genome_node_get_end((GtGenomeNode *) gene);

    /* look up what overlaps this position */
    overlap_name = CpGIOverlap_stream_find_gene_overlap(context, TSS, chr_num);
    if (!overlap_name)
        return 0;

    /* record the hit on the gene itself */
    gt_feature_node_set_attribute(gene, "cpgi_at_tss", overlap_name);

    return 0;
}
/* Annotates features with the number of the cluster they belong to.
   Phase 1 builds a map from description strings to feature nodes: for every
   feature node in <nodes>, each repeat_region node sets the current prefix
   "<seqid>_<number from its ID>", and each protein_match node named <feature>
   or node of the feature type (long_terminal_repeat for "lLTR"/"rLTR") with
   a length of at least 10 is registered under "<prefix>_<start>_<end>".
   Phase 2 walks all clusters of <cs>, looks up the description of each
   element in <encseq> and sets the "clid" attribute of the matching node to
   the cluster index.
   Fixes over the earlier version: the prefix buffer starts out empty instead
   of uninitialized, a repeat_region without a parsable ID yields the number
   0 instead of undefined behaviour, and descriptions without a registered
   node are skipped instead of dereferencing NULL.
   Returns the value of <had_err>, which no code path in this function
   changes from 0. */
static int cluster_annotate_nodes(GtClusteredSet *cs, GtEncseq *encseq,
                                  const char *feature, GtArray *nodes,
                                  GtError *err)
{
  GtFeatureNodeIterator *fni;
  GtFeatureNode *curnode = NULL, *tmp;
  GtClusteredSetIterator *csi = NULL;
  GtGenomeNode *gn;
  GtHashmap *desc2node;
  GtStr *seqid = NULL;
  int had_err = 0;
  unsigned long num_of_clusters, i, elm;
  const char *fnt = NULL;
  char buffer[BUFSIZ], *real_feature;

  gt_error_check(err);

  /* no prefix known before the first repeat_region was seen */
  buffer[0] = '\0';

  if ((strcmp(feature, "lLTR") == 0) || (strcmp(feature, "rLTR") == 0))
    real_feature = gt_cstr_dup(gt_ft_long_terminal_repeat);
  else
    real_feature = gt_cstr_dup(feature);

  desc2node = gt_hashmap_new(GT_HASH_STRING, free_hash, NULL);

  /* phase 1: description -> node */
  for (i = 0; i < gt_array_size(nodes); i++) {
    gn = *(GtGenomeNode**) gt_array_get(nodes, i);
    if (gt_feature_node_try_cast(gn) == NULL)
      continue;
    fni = gt_feature_node_iterator_new((GtFeatureNode*) gn);
    while ((curnode = gt_feature_node_iterator_next(fni)) != NULL) {
      char header[BUFSIZ];
      fnt = gt_feature_node_get_type(curnode);
      if (strcmp(fnt, gt_ft_repeat_region) == 0) {
        const char *rid;
        unsigned long id = 0;
        seqid = gt_genome_node_get_seqid((GtGenomeNode*) curnode);
        rid = gt_feature_node_get_attribute(curnode, "ID");
        if (rid != NULL)
          (void) sscanf(rid, "repeat_region%lu", &id);
        (void) snprintf(buffer, BUFSIZ, "%s_%lu", gt_str_get(seqid), id);
      }
      else if (strcmp(fnt, gt_ft_protein_match) == 0) {
        GtRange range;
        const char *attr;
        attr = gt_feature_node_get_attribute(curnode, "name");
        if (!attr)
          continue;
        if (strcmp(feature, attr) != 0)
          continue;
        range = gt_genome_node_get_range((GtGenomeNode*) curnode);
        if ((range.end - range.start + 1) < 10UL)
          continue;
        (void) snprintf(header, BUFSIZ, "%s_%lu_%lu", buffer, range.start,
                        range.end);
        gt_hashmap_add(desc2node, (void*) gt_cstr_dup(header),
                       (void*) curnode);
      }
      else if (strcmp(fnt, real_feature) == 0) {
        GtRange range;
        range = gt_genome_node_get_range((GtGenomeNode*) curnode);
        if ((range.end - range.start + 1) < 10UL)
          continue;
        (void) snprintf(header, BUFSIZ, "%s_%lu_%lu", buffer, range.start,
                        range.end);
        gt_hashmap_add(desc2node, (void*) gt_cstr_dup(header),
                       (void*) curnode);
      }
    }
    gt_feature_node_iterator_delete(fni);
  }
  gt_free(real_feature);

  /* phase 2: tag the nodes with their cluster index */
  num_of_clusters = gt_clustered_set_num_of_clusters(cs, err);
  for (i = 0; i < num_of_clusters; i++) {
    csi = gt_clustered_set_get_iterator(cs, i ,err);
    if (csi != NULL) {
      while (!had_err && (gt_clustered_set_iterator_next(csi, &elm, err)
             != GT_CLUSTERED_SET_ITERATOR_STATUS_END)) {
        char clid[BUFSIZ];
        const char *encseqdesc;
        char *encseqid;
        unsigned long desclen;
        encseqdesc = gt_encseq_description(encseq, &desclen, elm);
        /* the description comes with an explicit length, so a terminated
           copy is made for the lookup */
        encseqid = gt_calloc((size_t) (desclen + 1), sizeof (char));
        (void) strncpy(encseqid, encseqdesc, (size_t) desclen);
        encseqid[desclen] = '\0';
        tmp = (GtFeatureNode*) gt_hashmap_get(desc2node, (void*) encseqid);
        if (tmp != NULL) {
          (void) snprintf(clid, BUFSIZ, "%lu", i);
          gt_feature_node_set_attribute(tmp, "clid", clid);
        }
        gt_free(encseqid);
      }
    }
    gt_clustered_set_iterator_delete(csi, err);
    csi = NULL;
  }
  gt_hashmap_delete(desc2node);

  return had_err;
}
/* Feature node callback of the LTRdigest protein domain visitor.
   Steps, as far as visible in this function:
   1. The subgraph of <fn> is searched for nodes whose type equals the
      visitor's <root_type>; the last match is stored in
      <lv->ltr_retrotrans>.
   2. If <lv->ltr_retrotrans> is set, its sequence is extracted and
      translated on the forward strand and on the reverse complement, the
      results being collected per frame in <lv->fwd> and <lv->rev>.
   3. On non-Windows builds a child process running "hmmscan" with
      <lv->args> is forked; the six translations are written to its stdin as
      FASTA-like entries, and its stdout is handed to
      gt_ltrdigest_pdom_visitor_parse_output() followed by
      gt_ltrdigest_pdom_visitor_process_hits(). On Windows an error is set.
   4. gt_ltrdigest_pdom_visitor_choose_strand() is called.
   Returns 0 on success and a non-zero status otherwise.
   NOTE(review): <lv->ltr_retrotrans> is not cleared at the start of this
   function, so a value from an earlier call would still be in place if no
   matching node is found -- TODO confirm whether it is reset elsewhere.
   NOTE(review): step 4 is reached even if <lv->ltr_retrotrans> is NULL --
   confirm that choose_strand copes with that. */
static int gt_ltrdigest_pdom_visitor_feature_node(GtNodeVisitor *nv,
                                                  GtFeatureNode *fn,
                                                  GtError *err)
{
  GtLTRdigestPdomVisitor *lv;
  GtFeatureNodeIterator *fni;
  GtFeatureNode *curnode = NULL;
  int had_err = 0;
  GtRange rng;
  GtUword i;
  lv = gt_ltrdigest_pdom_visitor_cast(nv);
  gt_assert(lv);
  gt_error_check(err);

  /* traverse annotation subgraph and find LTR element */
  fni = gt_feature_node_iterator_new(fn);
  while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) {
    if (strcmp(gt_feature_node_get_type(curnode), lv->root_type) == 0) {
      lv->ltr_retrotrans = curnode;
    }
  }
  gt_feature_node_iterator_delete(fni);

  if (!had_err && lv->ltr_retrotrans != NULL) {
    GtCodonIterator *ci;
    GtTranslator *tr;
    GtTranslatorStatus status;
    GtUword seqlen;
    char translated, *rev_seq;
#ifndef _WIN32
    FILE *instream;
    GtHMMERParseStatus *pstatus;
#endif
    unsigned int frame;
    GtStr *seq;

    seq = gt_str_new();
    rng = gt_genome_node_get_range((GtGenomeNode*) lv->ltr_retrotrans);
    /* both element boundaries are stored decremented by one */
    lv->leftLTR_5 = rng.start - 1;
    lv->rightLTR_3 = rng.end - 1;
    seqlen = gt_range_length(&rng);

    had_err = gt_extract_feature_sequence(seq,
                                          (GtGenomeNode*) lv->ltr_retrotrans,
                                          lv->root_type,
                                          false, NULL, NULL, lv->rmap, err);

    if (!had_err) {
      /* discard translations left over from a previous element */
      for (i = 0UL; i < 3UL; i++) {
        gt_str_reset(lv->fwd[i]);
        gt_str_reset(lv->rev[i]);
      }

      /* create translations */
      ci = gt_codon_iterator_simple_new(gt_str_get(seq), seqlen, NULL);
      gt_assert(ci);
      tr = gt_translator_new(ci);
      /* only the first call reports into <err>; later calls pass NULL */
      status = gt_translator_next(tr, &translated, &frame, err);
      while (status == GT_TRANSLATOR_OK && translated) {
        gt_str_append_char(lv->fwd[frame], translated);
        status = gt_translator_next(tr, &translated, &frame, NULL);
      }
      if (status == GT_TRANSLATOR_ERROR) had_err = -1;
      if (!had_err) {
        /* work on a copy of the sequence; the copy is not NUL-terminated and
           is only ever used together with <seqlen> */
        rev_seq = gt_malloc((size_t) seqlen * sizeof (char));
        strncpy(rev_seq, gt_str_get(seq), (size_t) seqlen * sizeof (char));
        (void) gt_reverse_complement(rev_seq, seqlen, NULL);
        /* swap the codon iterator of the translator for one over the copy */
        gt_codon_iterator_delete(ci);
        ci = gt_codon_iterator_simple_new(rev_seq, seqlen, NULL);
        gt_translator_set_codon_iterator(tr, ci);
        status = gt_translator_next(tr, &translated, &frame, err);
        while (status == GT_TRANSLATOR_OK && translated) {
          gt_str_append_char(lv->rev[frame], translated);
          status = gt_translator_next(tr, &translated, &frame, NULL);
        }
        if (status == GT_TRANSLATOR_ERROR) had_err = -1;
        gt_free(rev_seq);
      }
      gt_codon_iterator_delete(ci);
      gt_translator_delete(tr);
    }

    /* run HMMER and handle results */
    if (!had_err) {
#ifndef _WIN32
      /* pc: parent -> child (child's stdin), cp: child -> parent (child's
         stdout) */
      int pid, pc[2], cp[2];
      GT_UNUSED int rval;

      (void) signal(SIGCHLD, SIG_IGN); /* XXX: for now, ignore child's
                                               exit status */
      rval = pipe(pc);
      gt_assert(rval == 0);
      rval = pipe(cp);
      gt_assert(rval == 0);

      switch ((pid = (int) fork())) {
        case -1:
          perror("Can't fork");
          exit(1);   /* XXX: error handling */
        case 0:    /* child */
          (void) close(1);    /* close current stdout. */
          rval = dup(cp[1]);  /* make stdout go to write end of pipe. */
          (void) close(0);    /* close current stdin. */
          rval = dup(pc[0]);  /* make stdin come from read end of pipe. */
          /* the original pipe descriptors are not needed after the dup()s */
          (void) close(pc[0]);
          (void) close(pc[1]);
          (void) close(cp[0]);
          (void) close(cp[1]);
          (void) execvp("hmmscan", lv->args); /* XXX: read path from env */
          /* only reached if execvp() returned */
          perror("couldn't execute hmmscan!");
          exit(1);
        default:    /* parent */
          /* one entry per frame and strand: a 4-character header line
             ('>', frame digit, '+' or '-', newline) followed by the
             translation and a newline */
          for (i = 0UL; i < 3UL; i++) {
            char buf[5];
            GT_UNUSED ssize_t written;
            (void) sprintf(buf, ">"GT_WU"%c\n", i, '+');
            written = write(pc[1], buf, 4 * sizeof (char));
            written = write(pc[1], gt_str_get(lv->fwd[i]),
                            (size_t) gt_str_length(lv->fwd[i]) * sizeof (char));
            written = write(pc[1], "\n", 1 * sizeof (char));
            (void) sprintf(buf, ">"GT_WU"%c\n", i, '-');
            written = write(pc[1], buf, 4 * sizeof (char));
            written = write(pc[1], gt_str_get(lv->rev[i]),
                            (size_t) gt_str_length(lv->rev[i]) * sizeof (char));
            written = write(pc[1], "\n", 1 * sizeof (char));
          }
          /* all input has been written; close everything except the read
             end of the child's output pipe */
          (void) close(pc[0]);
          (void) close(pc[1]);
          (void) close(cp[1]);
          instream = fdopen(cp[0], "r");
          pstatus = gt_hmmer_parse_status_new();
          had_err = gt_ltrdigest_pdom_visitor_parse_output(lv, pstatus,
                                                           instream, err);
          (void) fclose(instream);
          if (!had_err)
            had_err = gt_ltrdigest_pdom_visitor_process_hits(lv, pstatus, err);
          gt_hmmer_parse_status_delete(pstatus);
      }
#else
      /* XXX */
      gt_error_set(err, "HMMER call not implemented on Windows\n");
      had_err = -1;
#endif
    }
    gt_str_delete(seq);
  }
  if (!had_err)
    had_err = gt_ltrdigest_pdom_visitor_choose_strand(lv);
  return had_err;
}
/* Decides on the strand of <lv->ltr_retrotrans> based on the protein_match
   nodes in its subgraph.
   - matches on one strand only: the element gets that strand;
   - no matches on either strand: nothing is changed;
   - matches on both strands: log(score) is summed up per strand and the
     forward strand is chosen if gt_double_compare() of the forward sum and
     the reverse sum is negative, the reverse strand otherwise. All
     protein_match nodes on the other strand are then removed from the
     element.
   Returns the value of the local status, which no code path in this function
   changes from 0. */
static int gt_ltrdigest_pdom_visitor_choose_strand(GtLTRdigestPdomVisitor *lv)
{
  GtFeatureNodeIterator *walker;
  GtFeatureNode *cur = NULL;
  GtStrand chosen;
  GtArray *losers;
  GtUword idx;
  double sum_fwd = 0.0,
         sum_rev = 0.0,
         match_score;
  bool have_fwd = false,
       have_rev = false;
  int status = 0;

  /* gather evidence from all protein matches */
  walker = gt_feature_node_iterator_new(lv->ltr_retrotrans);
  while (!status && (cur = gt_feature_node_iterator_next(walker))) {
    if (strcmp(gt_feature_node_get_type(cur), gt_ft_protein_match) != 0)
      continue;
    chosen = gt_feature_node_get_strand(cur);
    match_score = (double) gt_feature_node_get_score(cur);
    if (chosen == GT_STRAND_FORWARD) {
      sum_fwd += log(match_score);
      have_fwd = true;
    }
    else if (chosen == GT_STRAND_REVERSE) {
      sum_rev += log(match_score);
      have_rev = true;
    }
  }
  gt_feature_node_iterator_delete(walker);

  /* nothing to decide on */
  if (!have_fwd && !have_rev)
    return status;

  /* unambiguous: evidence on exactly one strand */
  if (have_fwd != have_rev) {
    gt_feature_node_set_strand(lv->ltr_retrotrans,
                               have_rev ? GT_STRAND_REVERSE
                                        : GT_STRAND_FORWARD);
    return status;
  }

  /* evidence on both strands: compare the accumulated sums */
  gt_assert(have_rev && have_fwd);
  chosen = (gt_double_compare(sum_fwd, sum_rev) < 0) ? GT_STRAND_FORWARD
                                                     : GT_STRAND_REVERSE;
  gt_feature_node_set_strand(lv->ltr_retrotrans, chosen);

  /* collect the matches on the losing strand first, remove them afterwards */
  losers = gt_array_new(sizeof (GtFeatureNode*));
  walker = gt_feature_node_iterator_new(lv->ltr_retrotrans);
  while (!status && (cur = gt_feature_node_iterator_next(walker))) {
    if (strcmp(gt_feature_node_get_type(cur), gt_ft_protein_match) == 0
          && chosen != gt_feature_node_get_strand(cur)) {
      gt_array_add(losers, cur);
    }
  }
  gt_feature_node_iterator_delete(walker);

  gt_assert(gt_array_size(losers) > 0);
  for (idx = 0; idx < gt_array_size(losers); idx++) {
    gt_feature_node_remove_leaf(lv->ltr_retrotrans,
                                *(GtFeatureNode**) gt_array_get(losers, idx));
  }
  gt_array_delete(losers);

  return status;
}
static int gt_extract_feature_sequence_generic(GtStr *sequence, GtGenomeNode *gn, const char *type, bool join, GtStr *seqid, GtStrArray *target_ids, unsigned int *out_phase_offset, GtRegionMapping *region_mapping, GtError *err) { GtFeatureNode *fn; GtRange range; unsigned int phase_offset = 0; char *outsequence; const char *target; int had_err = 0; gt_error_check(err); fn = gt_genome_node_cast(gt_feature_node_class(), gn); gt_assert(fn); if (seqid) gt_str_append_str(seqid, gt_genome_node_get_seqid(gn)); if (target_ids && (target = gt_feature_node_get_attribute(fn, GT_GFF_TARGET))) { had_err = gt_gff3_parser_parse_all_target_attributes(target, false, target_ids, NULL, NULL, "", 0, err); } if (!had_err) { if (join) { GtFeatureNodeIterator *fni; GtFeatureNode *child; bool reverse_strand = false, first_child = true, first_child_of_type_seen = false; GtPhase phase = GT_PHASE_UNDEFINED; /* in this case we have to traverse the children */ fni = gt_feature_node_iterator_new_direct(gt_feature_node_cast(gn)); while (!had_err && (child = gt_feature_node_iterator_next(fni))) { if (first_child) { if (target_ids && (target = gt_feature_node_get_attribute(child, GT_GFF_TARGET))) { gt_str_array_reset(target_ids); had_err = gt_gff3_parser_parse_all_target_attributes(target, false, target_ids, NULL, NULL, "", 0, err); } first_child = false; } if (!had_err) { if (extract_join_feature((GtGenomeNode*) child, type, region_mapping, sequence, &reverse_strand, &first_child_of_type_seen, &phase, err)) { had_err = -1; } if (phase != GT_PHASE_UNDEFINED) { phase_offset = (int) phase; } } } gt_feature_node_iterator_delete(fni); gt_assert(phase_offset <= (unsigned int) GT_PHASE_UNDEFINED); if (!had_err && gt_str_length(sequence)) { if (reverse_strand) { had_err = gt_reverse_complement(gt_str_get(sequence), gt_str_length(sequence), err); } } } else if (gt_feature_node_get_type(fn) == type) { GtPhase phase = gt_feature_node_get_phase(fn); gt_assert(!had_err); if (phase != GT_PHASE_UNDEFINED) 
phase_offset = (unsigned int) phase; /* otherwise we only have to look at this feature */ range = gt_genome_node_get_range(gn); gt_assert(range.start); /* 1-based coordinates */ had_err = gt_region_mapping_get_sequence(region_mapping, &outsequence, gt_genome_node_get_seqid(gn), range.start, range.end, err); if (!had_err) { gt_str_append_cstr_nt(sequence, outsequence, gt_range_length(&range)); gt_free(outsequence); if (gt_feature_node_get_strand(fn) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(gt_str_get(sequence), gt_str_length(sequence), err); } } } } if (out_phase_offset && phase_offset != GT_PHASE_UNDEFINED) { *out_phase_offset = phase_offset; } return had_err; }
/*
 * Visitor callback for feature nodes.
 *
 * If the seqid of <fn> is already contained in aiv->defined_seqids, the node
 * is appended to aiv->node_buffer. Otherwise:
 *   - with aiv->ensure_sorting set, an error is reported and -1 is returned;
 *   - without it, the node is attached to an automatically created sequence
 *     region for that seqid (created on first use with a warning, otherwise
 *     its range or circularity is updated).
 *
 * Returns 0 on success, -1 if the input is considered unsorted.
 */
static int add_ids_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                        GtError *err)
{
  GtAddIDsVisitor *aiv = add_ids_visitor_cast(nv);
  GtGenomeNode *gn = (GtGenomeNode*) fn;
  const char *seqid = gt_str_get(gt_genome_node_get_seqid(gn));
  bool seqid_is_defined = gt_cstr_table_get(aiv->defined_seqids, seqid) != NULL;
  AutomaticSequenceRegion *auto_sr;
  GtRange range;
  bool is_circular;

  /* seqid was introduced before: simply buffer the node */
  if (seqid_is_defined) {
    gt_queue_add(aiv->node_buffer, fn);
    return 0;
  }

  if (aiv->ensure_sorting) {
    gt_error_set(err, "the file %s is not sorted (seqid \"%s\" on line %u has "
                 "not been previously introduced with a \"%s\" line)",
                 gt_genome_node_get_filename(gn), seqid,
                 gt_genome_node_get_line_number(gn), GT_GFF_SEQUENCE_REGION);
    return -1;
  }

  /* determine the range this feature contributes; for features flagged as
     circular only the range of the top-level node itself is used */
  range = gt_genome_node_get_range(gn);
  is_circular = gt_feature_node_get_attribute(fn, GT_GFF_IS_CIRCULAR) != NULL;
  if (!is_circular) {
    GtFeatureNodeIterator *iter = gt_feature_node_iterator_new(fn);
    GtFeatureNode *sub;
    while ((sub = gt_feature_node_iterator_next(iter)) != NULL) {
      GtRange sub_range = gt_genome_node_get_range((GtGenomeNode*) sub);
      range = gt_range_join(&range, &sub_range);
    }
    gt_feature_node_iterator_delete(iter);
  }

  /* the sequence region has not been introduced explicitly -> check whether
     one has already been created automatically */
  auto_sr = gt_hashmap_get(aiv->undefined_sequence_regions, seqid);
  if (auto_sr == NULL) {
    GtStr *seqid_str;
    /* no automatically created sequence region yet -> create it now */
    gt_warning("seqid \"%s\" on line %u in file \"%s\" has not been "
               "previously introduced with a \"%s\" line, create such a line "
               "automatically", seqid, gt_genome_node_get_line_number(gn),
               gt_genome_node_get_filename(gn), GT_GFF_SEQUENCE_REGION);
    auto_sr = automatic_sequence_region_new(is_circular);
    seqid_str = gt_genome_node_get_seqid(gn);
    auto_sr->sequence_region = gt_region_node_new(seqid_str, range.start,
                                                  range.end);
    gt_hashmap_add(aiv->undefined_sequence_regions, gt_str_get(seqid_str),
                   auto_sr);
  }
  else if (auto_sr->is_circular) {
    /* region is already circular: its range is left alone */
    gt_assert(!is_circular); /* XXX */
  }
  else if (is_circular) {
    /* first circular feature for this region: take over its range */
    gt_assert(!auto_sr->is_circular); /* XXX */
    auto_sr->is_circular = true;
    gt_genome_node_set_range(auto_sr->sequence_region, &range);
  }
  else {
    /* linear case: extend the range of the sequence region */
    GtRange region_range = gt_genome_node_get_range(auto_sr->sequence_region);
    GtRange joined_range = gt_range_join(&range, &region_range);
    gt_genome_node_set_range(auto_sr->sequence_region, &joined_range);
  }

  gt_array_add(auto_sr->feature_nodes, fn);
  return 0;
}
/*
 * Node stream callback: fetch the next node from ls->in_stream into <*gn>
 * and, if it is a feature node, write the LTRdigest output for it.
 *
 * Every node of the feature graph is passed to the visitor ls->lv, which is
 * expected to fill ls->element. If ls->element.mainnode is set afterwards,
 * one row is written to the tabular output file (basic element data, TSDs,
 * PPT, PBS, protein domain order) and the corresponding sequences are
 * written to the PPT, PBS, 5'/3' LTR and whole-element FASTA files.
 *
 * Non-feature nodes are passed through untouched (return value 0).
 * Returns 0 on success and a non-zero value on error; <err> is set by the
 * failing callee.
 */
int gt_ltrfileout_stream_next(GtNodeStream *ns, GtGenomeNode **gn,
                              GtError *err)
{
  GtLTRdigestFileOutStream *ls;
  GtFeatureNode *fn;
  GtRange lltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          rltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          ppt_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          pbs_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD};
  int had_err;
  GtUword i=0;

  gt_error_check(err);
  ls = gt_ltrdigest_file_out_stream_cast(ns);

  /* initialize this element */
  memset(&ls->element, 0, sizeof (GtLTRElement));

  /* get annotations from parser */
  had_err = gt_node_stream_next(ls->in_stream, gn, err);
  if (!had_err && *gn) {
    GtFeatureNodeIterator* gni;
    GtFeatureNode *mygn;

    /* only process feature nodes; nothing has been allocated for the
       element at this point, so returning directly does not leak */
    if (!(fn = gt_feature_node_try_cast(*gn)))
      return 0;

    ls->element.pdomorder = gt_array_new(sizeof (const char*));

    /* fill LTRElement structure from GFF3 subgraph */
    gni = gt_feature_node_iterator_new(fn);
    for (mygn = fn; mygn != NULL; mygn = gt_feature_node_iterator_next(gni))
      (void) gt_genome_node_accept((GtGenomeNode*) mygn,
                                   (GtNodeVisitor*) ls->lv,
                                   err);
    gt_feature_node_iterator_delete(gni);
  }

  if (!had_err && ls->element.mainnode != NULL) {
    char desc[GT_MAXFASTAHEADER];
    GtFeatureNode *ltr3, *ltr5;
    GtStr *sdesc, *sreg, *seq;

    /* find sequence in GtEncseq */
    sreg = gt_genome_node_get_seqid((GtGenomeNode*) ls->element.mainnode);

    sdesc = gt_str_new();
    had_err = gt_region_mapping_get_description(ls->rmap, sdesc, sreg, err);

    if (!had_err) {
      GtRange rng;
      /* the element's seqid is the sequence description, truncated to
         ls->seqnamelen characters and with blanks replaced by '_' */
      ls->element.seqid = gt_calloc((size_t) ls->seqnamelen+1, sizeof (char));
      (void) snprintf(ls->element.seqid,
                      MIN((size_t) gt_str_length(sdesc),
                          (size_t) ls->seqnamelen)+1,
                      "%s", gt_str_get(sdesc));
      gt_cstr_rep(ls->element.seqid, ' ', '_');
      if (gt_str_length(sdesc) > (GtUword) ls->seqnamelen)
        ls->element.seqid[ls->seqnamelen] = '\0';

      (void) gt_ltrelement_format_description(&ls->element, ls->seqnamelen,
                                              desc,
                                              (size_t) (GT_MAXFASTAHEADER-1));

      /* output basic retrotransposon data */
      lltr_rng = gt_genome_node_get_range((GtGenomeNode*)
                                          ls->element.leftLTR);
      rltr_rng = gt_genome_node_get_range((GtGenomeNode*)
                                          ls->element.rightLTR);
      rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.mainnode);
      gt_file_xprintf(ls->tabout_file,
                      GT_WU"\t"GT_WU"\t"GT_WU"\t%s\t"GT_WU"\t"GT_WU"\t"GT_WU"\t"
                      GT_WU"\t"GT_WU"\t"GT_WU"\t",
                      rng.start, rng.end,
                      gt_ltrelement_length(&ls->element),
                      ls->element.seqid,
                      lltr_rng.start, lltr_rng.end,
                      gt_ltrelement_leftltrlen(&ls->element),
                      rltr_rng.start, rltr_rng.end,
                      gt_ltrelement_rightltrlen(&ls->element));
    }
    /* release the description on the error path as well; it used to be
       deleted only on success and leaked when the lookup failed */
    gt_str_delete(sdesc);

    seq = gt_str_new();

    /* output TSDs */
    if (!had_err && ls->element.leftTSD != NULL) {
      GtRange tsd_rng;
      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.leftTSD);
      had_err = gt_extract_feature_sequence(seq,
                                    (GtGenomeNode*) ls->element.leftTSD,
                                    gt_symbol(gt_ft_target_site_duplication),
                                    false, NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file, ""GT_WU"\t"GT_WU"\t%s\t",
                        tsd_rng.start, tsd_rng.end, gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t");

    if (!had_err && ls->element.rightTSD != NULL) {
      GtRange tsd_rng;
      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.rightTSD);
      had_err = gt_extract_feature_sequence(seq,
                                    (GtGenomeNode*) ls->element.rightTSD,
                                    gt_symbol(gt_ft_target_site_duplication),
                                    false, NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file, ""GT_WU"\t"GT_WU"\t%s\t",
                        tsd_rng.start, tsd_rng.end, gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t");

    /* output PPT */
    if (!had_err && ls->element.ppt != NULL) {
      GtStrand ppt_strand = gt_feature_node_get_strand(ls->element.ppt);
      ppt_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.ppt);
      had_err = gt_extract_feature_sequence(seq,
                                            (GtGenomeNode*) ls->element.ppt,
                                            gt_symbol(gt_ft_RR_tract), false,
                                            NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&ppt_rng),
                            GT_FSWIDTH, ls->pptout_file);
        /* last column: distance between the PPT and the LTR it is compared
           with (right LTR start on the forward strand, left LTR end
           otherwise) */
        gt_file_xprintf(ls->tabout_file,
                        ""GT_WU"\t"GT_WU"\t%s\t%c\t%d\t",
                        ppt_rng.start, ppt_rng.end, gt_str_get(seq),
                        GT_STRAND_CHARS[ppt_strand],
                        (ppt_strand == GT_STRAND_FORWARD ?
                          abs((int) (rltr_rng.start - ppt_rng.end)) :
                          abs((int) (lltr_rng.end - ppt_rng.start))));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t");

    /* output PBS */
    if (!had_err && ls->element.pbs != NULL) {
      GtStrand pbs_strand;
      pbs_strand = gt_feature_node_get_strand(ls->element.pbs);
      pbs_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.pbs);
      had_err = gt_extract_feature_sequence(seq,
                                          (GtGenomeNode*) ls->element.pbs,
                                          gt_symbol(gt_ft_primer_binding_site),
                                          false, NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&pbs_rng),
                            GT_FSWIDTH, ls->pbsout_file);
        gt_file_xprintf(ls->tabout_file,
                        ""GT_WU"\t"GT_WU"\t%c\t%s\t%s\t%s\t%s\t%s\t",
                        pbs_rng.start, pbs_rng.end,
                        GT_STRAND_CHARS[pbs_strand],
                        gt_feature_node_get_attribute(ls->element.pbs, "trna"),
                        gt_str_get(seq),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "pbsoffset"),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "trnaoffset"),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "edist"));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t\t\t\t");

    /* output protein domains */
    if (!had_err && ls->element.pdoms != NULL) {
      GtStr *pdomorderstr = gt_str_new();
      for (i=0; !had_err && i<gt_array_size(ls->element.pdomorder); i++) {
        const char* key = *(const char**) gt_array_get(ls->element.pdomorder,
                                                       i);
        GtArray *entry = (GtArray*) gt_hashmap_get(ls->element.pdoms, key);
        had_err = write_pdom(ls, entry, key, ls->rmap, desc, err);
      }

      /* the domain order is reported relative to the element's strand */
      if (GT_STRAND_REVERSE == gt_feature_node_get_strand(ls->element.mainnode))
        gt_array_reverse(ls->element.pdomorder);

      for (i=0 ;!had_err && i<gt_array_size(ls->element.pdomorder); i++) {
        const char* name = *(const char**) gt_array_get(ls->element.pdomorder,
                                                        i);
        gt_str_append_cstr(pdomorderstr, name);
        if (i != gt_array_size(ls->element.pdomorder)-1)
          gt_str_append_cstr(pdomorderstr, "/");
      }
      gt_file_xprintf(ls->tabout_file, "%s", gt_str_get(pdomorderstr));
      gt_str_delete(pdomorderstr);
    }

    /* output LTRs (we just expect them to exist) */
    switch (gt_feature_node_get_strand(ls->element.mainnode)) {
      case GT_STRAND_REVERSE:
        ltr5 = ls->element.rightLTR;
        ltr3 = ls->element.leftLTR;
        break;
      case GT_STRAND_FORWARD:
      default:
        ltr5 = ls->element.leftLTR;
        ltr3 = ls->element.rightLTR;
        break;
    }

    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr5,
                                        gt_symbol(gt_ft_long_terminal_repeat),
                                        false, NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr5out_file);
      gt_str_reset(seq);
    }
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr3,
                                        gt_symbol(gt_ft_long_terminal_repeat),
                                        false, NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr3out_file);
      gt_str_reset(seq);
    }

    /* output complete oriented element */
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq,
                                         (GtGenomeNode*) ls->element.mainnode,
                                         gt_symbol(gt_ft_LTR_retrotransposon),
                                         false, NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc,gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->elemout_file);
      gt_str_reset(seq);
    }
    gt_file_xprintf(ls->tabout_file, "\n");
    gt_str_delete(seq);
  }

  /* per-element resources are released on every path that reaches here */
  gt_hashmap_delete(ls->element.pdoms);
  gt_array_delete(ls->element.pdomorder);
  gt_free(ls->element.seqid);
  return had_err;
}
static int gt_snp_annotator_visitor_prepare_gene(GtSNPAnnotatorVisitor *sav, GtError *err) { GtFeatureNodeIterator *fni, *mrnafni; GtFeatureNode *curnode, *last_mRNA = NULL; GtStr *mrnaseq, *seqid; int had_err = 0; mrnaseq = gt_str_new(); seqid = gt_genome_node_get_seqid((GtGenomeNode*) sav->gene); fni = gt_feature_node_iterator_new(sav->gene); while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) { if (gt_feature_node_get_type(curnode) == sav->mRNA_type) { GtFeatureNode *curnode2; if (last_mRNA) { char *mrna_charseq = gt_calloc(gt_str_length(mrnaseq)+1, sizeof (char)); (void) strncpy(mrna_charseq, gt_str_get(mrnaseq), gt_str_length(mrnaseq)); if (gt_feature_node_get_strand(sav->gene) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(mrna_charseq, gt_str_length(mrnaseq), err); } if (!had_err) { gt_hashmap_add(sav->rnaseqs, last_mRNA, mrna_charseq); last_mRNA = curnode; gt_str_reset(mrnaseq); } } else last_mRNA = curnode; if (!had_err) { mrnafni = gt_feature_node_iterator_new(curnode); while (!had_err && (curnode2 = gt_feature_node_iterator_next(mrnafni))) { if (gt_feature_node_get_type(curnode2) == sav->CDS_type) { char *tmp; GtRange rng = gt_genome_node_get_range((GtGenomeNode*) curnode2); had_err = gt_region_mapping_get_sequence(sav->rmap, &tmp, seqid, rng.start, rng.end, err); if (!had_err) { gt_str_append_cstr_nt(mrnaseq, tmp, gt_range_length(&rng)); gt_free(tmp); } } } gt_feature_node_iterator_delete(mrnafni); } } } if (!had_err && last_mRNA) { char *mrna_charseq = gt_calloc(gt_str_length(mrnaseq)+1, sizeof (char)); (void) strncpy(mrna_charseq, gt_str_get(mrnaseq), gt_str_length(mrnaseq)); if (gt_feature_node_get_strand(sav->gene) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(mrna_charseq, gt_str_length(mrnaseq), err); } if (!had_err) { gt_hashmap_add(sav->rnaseqs, last_mRNA, mrna_charseq); } } gt_feature_node_iterator_delete(fni); gt_str_delete(mrnaseq); return had_err; }
/*
 * Visitor callback for feature nodes: classify a SNP/SNV feature <fn>
 * against every mRNA that is a direct child of sav->gene.
 *
 * For each such mRNA the CDS nodes below it are walked in iteration order
 * while the SNP's offset within the stored mRNA sequence (sav->rnaseqs) is
 * tracked. For a CDS overlapping the SNP, every variant character of the
 * GT_GVF_VARIANT_SEQ attribute that differs from the original character is
 * passed to snp_annotator_classify_snp().
 *
 * Nodes that are NULL or not of type sav->SNV_type / sav->SNP_type are
 * ignored. Returns 0 on success, otherwise the error code of the failing
 * callee (<err> is set by it).
 *
 * NOTE(review): in builds without NDEBUG, <origchar> is complemented in
 * place for reverse-strand mRNAs (inside the debug-only reference check),
 * and <origchar> is later compared against the variant characters. With
 * NDEBUG defined that complement does not happen, so the comparison sees a
 * different character on the reverse strand. Confirm which behaviour is
 * intended.
 */
static int snp_annotator_visitor_feature_node(GtNodeVisitor *nv,
                                              GtFeatureNode *fn,
                                              GtError *err)
{
  int had_err = 0;
  GtSNPAnnotatorVisitor *sav;
  GtFeatureNodeIterator *fni,
                        *mrnafni;
  GtFeatureNode *curnode,
                *curnode2;
  GtRange snp_rng;
  gt_error_check(err);
  sav = snp_annotator_visitor_cast(nv);

  /* ignore non-nodes */
  if (!fn) return 0;

  /* only process SNPs */
  if (!(gt_feature_node_get_type(fn) == sav->SNV_type ||
        gt_feature_node_get_type(fn) == sav->SNP_type)) {
    return 0;
  }

  /* only the direct children of the gene are inspected for mRNAs */
  fni = gt_feature_node_iterator_new_direct(sav->gene);
  snp_rng = gt_genome_node_get_range((GtGenomeNode*) fn);
  while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) {
    if (gt_feature_node_get_type(curnode) == sav->mRNA_type) {
      GtStrand mrna_strand = gt_feature_node_get_strand(curnode);
#ifndef NDEBUG
      const char *refstr;
#endif
      /* running offset into the mRNA sequence: sum of the lengths of the
         non-overlapping CDS nodes seen so far for this mRNA */
      GtUword mrnasnppos = 0;
      mrnafni = gt_feature_node_iterator_new(curnode);
      while (!had_err && (curnode2 = gt_feature_node_iterator_next(mrnafni))) {
        if (gt_feature_node_get_type(curnode2) == sav->CDS_type) {
          GtRange cds_rng = gt_genome_node_get_range((GtGenomeNode*) curnode2);
          if (gt_range_overlap(&snp_rng, &cds_rng)) {
            char *mRNA,
                 origchar;
            char *variantchars, *variantptr = NULL;
            GT_UNUSED char *refchars, *refptr = NULL;
            /* sequence stored for this mRNA in sav->rnaseqs */
            mRNA = (char*) gt_hashmap_get(sav->rnaseqs, curnode);
            gt_assert(mRNA);
            gt_assert(snp_rng.start >= cds_rng.start);
            mrnasnppos += (snp_rng.start - cds_rng.start);
            /* on the reverse strand the position is counted from the other
               end of the stored sequence */
            if (mrna_strand == GT_STRAND_REVERSE)
              mrnasnppos = strlen(mRNA) - mrnasnppos - 1;
            gt_assert(mrnasnppos < strlen(mRNA));
            origchar = mRNA[mrnasnppos];
#ifndef NDEBUG
            /* debug builds only: check the character found in the mRNA
               against the first character of the reference attribute;
               note that this complements origchar in place on the reverse
               strand */
            refstr = refptr = gt_cstr_dup(gt_feature_node_get_attribute(fn,
                                                         GT_GVF_REFERENCE_SEQ));
            if (!had_err && refstr) {
              if (gt_feature_node_get_strand(curnode) == GT_STRAND_REVERSE) {
                int rval = gt_complement(&origchar, origchar, err);
                gt_assert(rval == 0);
              }
              gt_assert(toupper(origchar) == toupper(refstr[0]));
            }
#endif
            /* variantptr keeps the start of the copy for gt_free();
               variantchars is advanced while scanning */
            variantchars = variantptr = gt_cstr_dup(
                         gt_feature_node_get_attribute(fn, GT_GVF_VARIANT_SEQ));
            if (!had_err && variantchars) {
              /* index of the current variant; not advanced for ',' */
              GtUword i = 0;

              /* scan the attribute value up to the first ';' or its end */
              while (!had_err &&
                       (*variantchars != ';' && *variantchars != '\0')) {
                if (*variantchars != ',' && *variantchars != origchar) {
                  char variantchar = *variantchars;
#ifndef NDEBUG
                  char refchar = refstr ? refstr[0] : '-';  /* XXX */
                  if (!had_err && mrna_strand == GT_STRAND_REVERSE)
                    had_err = gt_complement(&refchar, refchar, err);
#endif
                  /* the variant is complemented for reverse-strand mRNAs
                     before it is classified */
                  if (!had_err && mrna_strand == GT_STRAND_REVERSE)
                    had_err = gt_complement(&variantchar, variantchar, err);
                  if (!had_err) {
                    /* refchar is only passed in builds without NDEBUG */
                    had_err = snp_annotator_classify_snp(sav, curnode, fn,
                                                         mrnasnppos,
                                                         i++,
                                                         variantchar,
#ifndef NDEBUG
                                                         refchar,
#endif
                                                         err);
                  }
                } else if (*variantchars == origchar) {
                  /* variant equal to the original character: counted but
                     not classified */
                  i++;
                }
                variantchars++;
              }
              /* both copies are released only inside this branch */
              gt_free(variantptr);
              gt_free(refptr);
            }
          } else {
            /* CDS does not overlap the SNP: add its whole length to the
               running offset */
            mrnasnppos += gt_range_length(&cds_rng);
          }
        }
      }
      gt_feature_node_iterator_delete(mrnafni);
    }
  }
  gt_feature_node_iterator_delete(fni);

  return had_err;
}