static int check_boundaries_visitor_check_rec(GtFeatureNode *parent, GtFeatureNode *child, GtError *err) { GtFeatureNodeIterator *fni; GtFeatureNode *node; GtRange range, p_range; int had_err = 0; range = gt_genome_node_get_range((GtGenomeNode*) child); p_range = gt_genome_node_get_range((GtGenomeNode*) parent); if (range.start < p_range.start || range.end > p_range.end) { gt_warning("%s child range " GT_WU "-" GT_WU " (file %s, line %u) not " "contained in %s parent range " GT_WU "-" GT_WU " (file %s, " "line %u)", gt_feature_node_get_type(child), range.start, range.end, gt_genome_node_get_filename((GtGenomeNode*) child), gt_genome_node_get_line_number((GtGenomeNode*) child), gt_feature_node_get_type(parent), p_range.start, p_range.end, gt_genome_node_get_filename((GtGenomeNode*) parent), gt_genome_node_get_line_number((GtGenomeNode*) parent)); } fni = gt_feature_node_iterator_new_direct(child); while ((node = gt_feature_node_iterator_next(fni))) { had_err = check_boundaries_visitor_check_rec(child, node, err); } gt_feature_node_iterator_delete(fni); return had_err; }
static int add_to_parent(GtDiagram *d, GtFeatureNode *node, GtFeatureNode *parent, GtError *err) { GtBlock *block = NULL; NodeInfoElement *par_ni, *ni; gt_assert(d && node); if (!parent) return 0; par_ni = nodeinfo_get(d, parent); ni = nodeinfo_get(d, node); gt_log_log("adding %s to parent %p", gt_feature_node_get_type(node), parent); ni->parent = parent; block = nodeinfo_find_block(par_ni, gt_feature_node_get_type(node), parent); if (!block) { block = gt_block_new_from_node(parent); gt_block_set_type(block, gt_feature_node_get_type(node)); if (assign_block_caption(d, node, parent, block, err) < 0) { gt_block_delete(block); return -1; } nodeinfo_add_block(par_ni, gt_feature_node_get_type((GtFeatureNode*) node), parent, block); } gt_assert(block); gt_block_insert_element(block, node); return 0; }
static int gt_ltr_input_check_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn, GtError *err) { GT_UNUSED GtLTRInputCheckVisitor *lv; GtFeatureNodeIterator *fni; bool seen_left = false; GtFeatureNode *curnode = NULL, *ltr_retrotrans = NULL, *lltr = NULL, *rltr = NULL; int had_err = 0; lv = gt_ltr_input_check_visitor_cast(nv); gt_assert(lv); gt_error_check(err); /* traverse annotation subgraph and find LTR components */ fni = gt_feature_node_iterator_new(fn); while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) { if (strcmp(gt_feature_node_get_type(curnode), gt_ft_LTR_retrotransposon) == 0) { ltr_retrotrans = curnode; } if (strcmp(gt_feature_node_get_type(curnode), gt_ft_long_terminal_repeat) == 0) { if (seen_left) rltr = curnode; else { lltr = curnode; seen_left = true; } } } gt_feature_node_iterator_delete(fni); if (lv->only_ltrs) { if (!had_err && !ltr_retrotrans) { gt_error_set(err, "connected component with %s entry node (%s, line %u) " "does not contain a '%s' node, which is required", gt_feature_node_get_type(fn), gt_genome_node_get_filename((GtGenomeNode*) fn), gt_genome_node_get_line_number((GtGenomeNode*) fn), gt_ft_LTR_retrotransposon); had_err = -1; } } if (!had_err && ltr_retrotrans && (!lltr || !rltr)) { gt_error_set(err, "LTR_retrotransposon feature (%s, line %u) " "does not contain two %s child features, both of which " "are required", gt_genome_node_get_filename((GtGenomeNode*) ltr_retrotrans), gt_genome_node_get_line_number((GtGenomeNode*) ltr_retrotrans), gt_ft_long_terminal_repeat); had_err = -1; } return had_err; }
static int add_to_current(GtDiagram *d, GtFeatureNode *node, GtFeatureNode *parent, GtError *err) { GtBlock *block; NodeInfoElement *ni; GtStyleQueryStatus rval; GtStr *caption = NULL; bool status = true; const char *nnid_p = NULL, *nnid_n = NULL, *nodetype; gt_assert(d && node); nodetype = gt_feature_node_get_type(node); if (get_caption_display_status(d, nodetype, &status, err) < 0) { return -1; } /* Get nodeinfo element and set itself as parent */ ni = nodeinfo_get(d, node); gt_log_log("adding %s to self", nodetype); ni->parent = node; /* create new GtBlock tuple and add to node info */ block = gt_block_new_from_node(node); caption = gt_str_new(); rval = gt_style_get_str(d->style, nodetype, "block_caption", caption, node, err); if (rval == GT_STYLE_QUERY_ERROR) { gt_str_delete(caption); gt_block_delete(block); return -1; } else if (rval == GT_STYLE_QUERY_NOT_SET) { nnid_p = get_node_name_or_id(parent); nnid_n = get_node_name_or_id(node); if ((nnid_p || nnid_n) && status) { if (parent) { if (nnid_p && gt_feature_node_has_children(parent)) gt_str_append_cstr(caption, nnid_p); else gt_str_append_cstr(caption, "-"); gt_str_append_cstr(caption, "/"); } if (nnid_n) gt_str_append_cstr(caption, nnid_n); } else { gt_str_delete(caption); caption = NULL; } } gt_block_set_caption(block, caption); gt_block_insert_element(block, node); nodeinfo_add_block(ni, gt_feature_node_get_type(node), GT_UNDEF_REPR, block); return 0; }
static int gt_orf_finder_visitor_feature_node(GtNodeVisitor *gv, GtFeatureNode *gf, GtError *err) { GtORFFinderVisitor *lv; const char *gft = NULL; GtFeatureNodeIterator *gfi; GtFeatureNode *curnode = NULL; int had_err = 0; GtRange rng; lv = gt_orf_finder_visitor_cast(gv); gt_assert(lv); gt_error_check(err); gfi = gt_feature_node_iterator_new(gf); while (!had_err && (curnode = gt_feature_node_iterator_next(gfi))) { gft = gt_feature_node_get_type(curnode); if (gt_hashmap_get(lv->types, (void*) gft) != NULL || gt_hashmap_get(lv->types, (void*) "all") == (void*) 1) { if (!had_err) { rng = gt_genome_node_get_range((GtGenomeNode*) curnode); had_err = run_orffinder(lv->rmap, curnode, rng.start - 1, rng.end - 1, lv->min, lv->max, lv->all, err); if (gt_hashmap_get(lv->types, (void*) "all") == (void*) 1) { break; } else if (gt_feature_node_has_children(curnode)) { GtFeatureNode *tmpnode = NULL; GtFeatureNodeIterator *tmpgfi = gt_feature_node_iterator_new(curnode); (void) gt_feature_node_iterator_next(tmpgfi); while ((tmpnode = gt_feature_node_iterator_next(tmpgfi))) { gft = gt_feature_node_get_type(tmpnode); if (strcmp(gft, (const char*) GT_ORF_TYPE) == 0) { continue; } /* curnode = gt_feature_node_iterator_next(gfi); */ } gt_feature_node_iterator_delete(tmpgfi); } } } } gt_feature_node_iterator_delete(gfi); return had_err; }
GtNodeVisitor* gt_snp_annotator_visitor_new(GtFeatureNode *gene, GtTransTable *trans_table, GtRegionMapping *rmap, GtError *err) { GtNodeVisitor *nv; GtSNPAnnotatorVisitor *sav; gt_assert(gene && gt_feature_node_get_type(gene) == gt_symbol(gt_ft_gene)); nv = gt_node_visitor_create(gt_snp_annotator_visitor_class()); sav = snp_annotator_visitor_cast(nv); sav->gene = (GtFeatureNode*) gt_genome_node_ref((GtGenomeNode*) gene); sav->rmap = gt_region_mapping_ref(rmap); sav->mRNA_type = gt_symbol(gt_ft_mRNA); sav->CDS_type = gt_symbol(gt_ft_CDS); sav->SNV_type = gt_symbol(gt_ft_SNV); sav->SNP_type = gt_symbol(gt_ft_SNP); sav->rnaseqs = gt_hashmap_new(GT_HASH_DIRECT, NULL, gt_free_func); if (trans_table) { sav->tt = trans_table; sav->own_tt = false; } else { sav->tt = gt_trans_table_new_standard(err); sav->own_tt = true; } if (!sav->tt || gt_snp_annotator_visitor_prepare_gene(sav, err) != 0) { gt_node_visitor_delete(nv); return NULL; } return nv; }
void gt_gff3_output_leading_str(GtFeatureNode *fn, GtStr *outstr) { GtGenomeNode *gn; gt_assert(fn && outstr); gn = (GtGenomeNode*) fn; gt_str_append_str(outstr, gt_genome_node_get_seqid(gn)); gt_str_append_char(outstr, '\t'); gt_str_append_cstr(outstr, gt_feature_node_get_source(fn)); gt_str_append_char(outstr, '\t'); gt_str_append_cstr(outstr, gt_feature_node_get_type(fn)); gt_str_append_char(outstr, '\t'); gt_str_append_uword(outstr, gt_genome_node_get_start(gn)); gt_str_append_char(outstr, '\t'); gt_str_append_uword(outstr, gt_genome_node_get_end(gn)); gt_str_append_char(outstr, '\t'); if (gt_feature_node_score_is_defined(fn)) { char buf[BUFSIZ]; (void) snprintf(buf, BUFSIZ, "%.3g", gt_feature_node_get_score(fn)); gt_str_append_cstr(outstr, buf); } else gt_str_append_char(outstr, '.'); gt_str_append_char(outstr, '\t'); gt_str_append_char(outstr, GT_STRAND_CHARS[gt_feature_node_get_strand(fn)]); gt_str_append_char(outstr, '\t'); gt_str_append_char(outstr, GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]); gt_str_append_char(outstr, '\t'); }
static void add_recursive(GtDiagram *d, GtFeatureNode *node, GtFeatureNode* parent, GtFeatureNode *original_node) { NodeInfoElement *ni; GtFeatureNode *rep = GT_UNDEF_REPR; gt_assert(d && node && original_node); if (!parent) return; ni = nodeinfo_get(d, node); if (gt_feature_node_is_multi(original_node)) { rep = gt_feature_node_get_multi_representative(original_node); } /* end of recursion, insert into target block */ if (parent == node) { GtBlock *block ; block = nodeinfo_find_block(ni, gt_feature_node_get_type(node), rep); if (!block) { block = gt_block_new_from_node(node); nodeinfo_add_block(ni, gt_feature_node_get_type(node), rep, block); } gt_block_insert_element(block, original_node); gt_log_log("add %s to target %s", gt_feature_node_get_type(original_node), gt_block_get_type(block)); } else { /* not at target type block yet, set up reverse entry and follow */ NodeInfoElement *parent_ni; /* set up reverse entry */ ni->parent = parent; parent_ni = gt_hashmap_get(d->nodeinfo, parent); if (parent_ni) { gt_log_log("recursion: %s -> %s", gt_feature_node_get_type(node), gt_feature_node_get_type(parent)); add_recursive(d, parent, parent_ni->parent, original_node); } } }
static int filter_stream_next(GtNodeStream *ns, GtGenomeNode **gn, GtError *error) { AgnFilterStream *stream; GtFeatureNode *fn; int had_err; gt_error_check(error); stream = filter_stream_cast(ns); if(gt_queue_size(stream->cache) > 0) { *gn = gt_queue_get(stream->cache); return 0; } while(1) { had_err = gt_node_stream_next(stream->in_stream, gn, error); if(had_err) return had_err; if(!*gn) return 0; fn = gt_feature_node_try_cast(*gn); if(!fn) return 0; GtFeatureNode *current; GtFeatureNodeIterator *iter = gt_feature_node_iterator_new(fn); for(current = gt_feature_node_iterator_next(iter); current != NULL; current = gt_feature_node_iterator_next(iter)) { const char *type = gt_feature_node_get_type(current); bool keepfeature = false; if(gt_hashmap_get(stream->typestokeep, type) != NULL) keepfeature = true; if(keepfeature) { gt_genome_node_ref((GtGenomeNode *)current); gt_queue_add(stream->cache, current); } } gt_feature_node_iterator_delete(iter); gt_genome_node_delete((GtGenomeNode *)fn); if(gt_queue_size(stream->cache) > 0) { *gn = gt_queue_get(stream->cache); return 0; } } return 0; }
static int feature_node_lua_get_type(lua_State *L) { GtGenomeNode **gn; GtFeatureNode *fn; gn = check_genome_node(L, 1); /* make sure we get a feature node */ fn = gt_feature_node_try_cast(*gn); luaL_argcheck(L, fn, 1, "not a feature node"); lua_pushstring(L, gt_feature_node_get_type(fn)); return 1; }
static int add_to_rep(GtDiagram *d, GtFeatureNode *node, GtFeatureNode* parent, GtError *err) { GtBlock *block = NULL; GtFeatureNode *rep = GT_UNDEF_REPR; NodeInfoElement *ni; gt_assert(d && node && gt_feature_node_is_multi(node)); rep = gt_feature_node_get_multi_representative(node); gt_log_log("adding %s to representative %p", gt_feature_node_get_type(node), rep); ni = nodeinfo_get(d, rep); block = nodeinfo_find_block(ni, gt_feature_node_get_type(node), rep); if (!block) { block = gt_block_new_from_node(parent); gt_block_set_type(block, gt_feature_node_get_type(node)); /* if parent is a pseudonode, then we have a multiline feature without a parent. we must not access the parent in this case! */ if (gt_feature_node_is_pseudo(parent)) { if (assign_block_caption(d, node, NULL, block, err) < 0) { gt_block_delete(block); return -1; } } else { if (assign_block_caption(d, node, parent, block, err) < 0) { gt_block_delete(block); return -1; } } nodeinfo_add_block(ni, gt_feature_node_get_type(node), rep, block); } gt_assert(block); gt_block_insert_element(block, node); return 0; }
GtBlock* gt_block_new_from_node(GtFeatureNode *node) { GtBlock *block; gt_assert(node); block = gt_block_new(); block->range = gt_genome_node_get_range((GtGenomeNode*) node); block->strand = gt_feature_node_get_strand(node); block->type = gt_feature_node_get_type(node); if (!gt_feature_node_is_pseudo(node)) { block->top_level_feature = (GtFeatureNode*) gt_genome_node_ref((GtGenomeNode*) node); } return block; }
void gt_gff3_output_leading(GtFeatureNode *fn, GtFile *outfp) { GtGenomeNode *gn; gt_assert(fn); gn = (GtGenomeNode*) fn; gt_file_xprintf(outfp, "%s\t%s\t%s\t"GT_WU"\t"GT_WU"\t", gt_str_get(gt_genome_node_get_seqid(gn)), gt_feature_node_get_source(fn), gt_feature_node_get_type(fn), gt_genome_node_get_start(gn), gt_genome_node_get_end(gn)); if (gt_feature_node_score_is_defined(fn)) gt_file_xprintf(outfp, "%.3g", gt_feature_node_get_score(fn)); else gt_file_xfputc('.', outfp); gt_file_xprintf(outfp, "\t%c\t%c\t", GT_STRAND_CHARS[gt_feature_node_get_strand(fn)], GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]); }
static GtStr* create_unique_id(GtGFF3Visitor *gff3_visitor, GtFeatureNode *fn) { const char *type; GtStr *id; gt_assert(gff3_visitor && fn); type = gt_feature_node_get_type(fn); /* increase id counter */ gt_string_distri_add(gff3_visitor->id_counter, type); /* build id string */ id = gt_str_new_cstr(type); gt_str_append_ulong(id, gt_string_distri_get(gff3_visitor->id_counter, type)); /* store (unique) id */ gt_hashmap_add(gff3_visitor->feature_node_to_unique_id_str, fn, id); return id; }
static int gt_seqpos_classifier_next_specified_ft( GtSeqposClassifier *seqpos_classifier, GtRange *range, bool *end_of_annotation, GtError *err) { int had_err = 0; GtFeatureNode *cfn; bool fni_exhausted = (seqpos_classifier->fni == NULL) ? true : false; gt_assert(seqpos_classifier != NULL); gt_assert(range != NULL); while (true) { if (fni_exhausted) { had_err = gt_seqpos_classifier_next_fn(seqpos_classifier, err); if (had_err != 0 || seqpos_classifier->fn == NULL) { *end_of_annotation = true; return had_err; } fni_exhausted = false; } gt_assert(seqpos_classifier->fni != NULL); cfn = gt_feature_node_iterator_next(seqpos_classifier->fni); if (cfn == NULL) { fni_exhausted = true; } else if (strcmp(gt_feature_node_get_type(cfn), seqpos_classifier->specified_ft) == 0) { seqpos_classifier->nof_specified_ft_found++; *range = gt_genome_node_get_range((GtGenomeNode*)cfn); gt_assert(range->start > 0); gt_assert(range->end > 0); range->start--; range->end--; *end_of_annotation = false; return had_err; } } }
static int assign_block_caption(GtDiagram *d, GtFeatureNode *node, GtFeatureNode *parent, GtBlock *block, GtError *err) { const char *nnid_p = NULL, *nnid_n = NULL; GtStr *caption = NULL; bool status = true; int rval; caption = gt_str_new(); rval = gt_style_get_str(d->style, gt_feature_node_get_type(node), "block_caption", caption, node, err); if (rval == GT_STYLE_QUERY_ERROR) { gt_str_delete(caption); return -1; } else if (rval == GT_STYLE_QUERY_NOT_SET) { nnid_p = get_node_name_or_id(parent); nnid_n = get_node_name_or_id(node); if ((nnid_p || nnid_n) && status) { if (parent) { if (nnid_p && gt_feature_node_has_children(parent)) gt_str_append_cstr(caption, nnid_p); else gt_str_append_cstr(caption, "-"); gt_str_append_cstr(caption, "/"); } if (nnid_n) gt_str_append_cstr(caption, nnid_n); } else { gt_str_delete(caption); caption = NULL; } } gt_block_set_caption(block, caption); return 0; }
static void orf_attach_results_to_gff3(GtFeatureNode *gf, GtRange orf_rng, unsigned int orf_frame, GtStrand strand, GT_UNUSED GtError *err) { GtGenomeNode *child; GtStr *tag; tag = gt_str_new_cstr(GT_ORF_FINDER_TAG); orf_rng.start++; orf_rng.end++; GtFeatureNodeIterator *gfi; GtFeatureNode *curnode = NULL, *parent_node = NULL; GtRange gfi_range; char frame_buf[3]; sprintf(frame_buf, "%d", orf_frame); gfi = gt_feature_node_iterator_new(gf); while ((curnode = gt_feature_node_iterator_next(gfi))) { if (strcmp(gt_feature_node_get_type(curnode), (const char*) GT_ORF_TYPE) != 0) { gfi_range = gt_genome_node_get_range((GtGenomeNode*) curnode); if (gt_range_contains(&gfi_range, &orf_rng)) { parent_node = curnode; } } } if (parent_node) { child = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) gf), GT_ORF_TYPE, orf_rng.start, orf_rng.end, strand); gt_feature_node_set_source((GtFeatureNode*) child, tag); gt_feature_node_set_attribute((GtFeatureNode*) child, "frame", frame_buf); gt_feature_node_add_child(parent_node,(GtFeatureNode*) child); } gt_str_delete(tag); gt_feature_node_iterator_delete(gfi); }
static int gtf_show_feature_node(GtFeatureNode *fn, void *data, GtError *err) { GtGTFVisitor *gtf_visitor = (GtGTFVisitor*) data; int had_err = 0; if (gt_feature_node_has_type(fn, gt_ft_gene)) { gtf_visitor->gene_id++; gtf_visitor->transcript_id = 0; had_err = gtf_show_transcript(fn, gtf_visitor, err); } else if (gt_feature_node_has_type(fn, gt_ft_mRNA)) { had_err = gtf_show_transcript(fn, gtf_visitor, err); } else if (!(gt_feature_node_has_type(fn, gt_ft_CDS) || gt_feature_node_has_type(fn, gt_ft_exon))) { gt_warning("skipping GFF3 feature of type \"%s\" (from line %u in file " "\"%s\")", gt_feature_node_get_type(fn), gt_genome_node_get_line_number((GtGenomeNode*) fn), gt_genome_node_get_filename((GtGenomeNode*) fn)); } return had_err; }
static int cluster_annotate_nodes(GtClusteredSet *cs, GtEncseq *encseq, const char *feature, GtArray *nodes, GtError *err) { GtFeatureNodeIterator *fni; GtFeatureNode *curnode = NULL, *tmp; GtClusteredSetIterator *csi = NULL; GtGenomeNode *gn; GtHashmap *desc2node; GtStr *seqid = NULL; int had_err = 0; unsigned long num_of_clusters, i, elm; const char *fnt = NULL; char buffer[BUFSIZ], *real_feature; gt_error_check(err); if ((strcmp(feature, "lLTR") == 0) || (strcmp(feature, "rLTR") == 0)) real_feature = gt_cstr_dup(gt_ft_long_terminal_repeat); else real_feature = gt_cstr_dup(feature); desc2node = gt_hashmap_new(GT_HASH_STRING, free_hash, NULL); for (i = 0; i < gt_array_size(nodes); i++) { gn = *(GtGenomeNode**) gt_array_get(nodes, i); if (gt_feature_node_try_cast(gn) == NULL) continue; fni = gt_feature_node_iterator_new((GtFeatureNode*) gn); while ((curnode = gt_feature_node_iterator_next(fni)) != NULL) { char header[BUFSIZ]; fnt = gt_feature_node_get_type(curnode); if (strcmp(fnt, gt_ft_repeat_region) == 0) { const char *rid; unsigned long id; seqid = gt_genome_node_get_seqid((GtGenomeNode*) curnode); rid = gt_feature_node_get_attribute(curnode, "ID"); (void) sscanf(rid, "repeat_region%lu", &id); (void) snprintf(buffer, BUFSIZ, "%s_%lu", gt_str_get(seqid), id); } else if (strcmp(fnt, gt_ft_protein_match) == 0) { GtRange range; const char *attr; attr = gt_feature_node_get_attribute(curnode, "name"); if (!attr) continue; if (strcmp(feature, attr) != 0) continue; range = gt_genome_node_get_range((GtGenomeNode*) curnode); if ((range.end - range.start + 1) < 10UL) continue; (void) snprintf(header, BUFSIZ, "%s_%lu_%lu", buffer, range.start, range.end); gt_hashmap_add(desc2node, (void*) gt_cstr_dup(header), (void*) curnode); } else if (strcmp(fnt, real_feature) == 0) { GtRange range; range = gt_genome_node_get_range((GtGenomeNode*) curnode); if ((range.end - range.start + 1) < 10UL) continue; (void) snprintf(header, BUFSIZ, "%s_%lu_%lu", buffer, range.start, range.end); gt_hashmap_add(desc2node, (void*) gt_cstr_dup(header), (void*) curnode); } } gt_feature_node_iterator_delete(fni); } gt_free(real_feature); num_of_clusters = gt_clustered_set_num_of_clusters(cs, err); for (i = 0; i < num_of_clusters; i++) { csi = gt_clustered_set_get_iterator(cs, i ,err); if (csi != NULL) { while (!had_err && (gt_clustered_set_iterator_next(csi, &elm, err) != GT_CLUSTERED_SET_ITERATOR_STATUS_END)) { char clid[BUFSIZ]; const char *encseqdesc; char *encseqid; unsigned long desclen; encseqdesc = gt_encseq_description(encseq, &desclen, elm); encseqid = gt_calloc((size_t) (desclen + 1), sizeof (char)); (void) strncpy(encseqid, encseqdesc, (size_t) desclen); encseqid[desclen] = '\0'; tmp = (GtFeatureNode*) gt_hashmap_get(desc2node, (void*) encseqid); (void) snprintf(clid, BUFSIZ, "%lu", i); gt_feature_node_set_attribute(tmp, "clid", clid); gt_free(encseqid); } } gt_clustered_set_iterator_delete(csi, err); csi = NULL; } gt_hashmap_delete(desc2node); return had_err; }
static int gt_extract_feature_sequence_generic(GtStr *sequence, GtGenomeNode *gn, const char *type, bool join, GtStr *seqid, GtStrArray *target_ids, unsigned int *out_phase_offset, GtRegionMapping *region_mapping, GtError *err) { GtFeatureNode *fn; GtRange range; unsigned int phase_offset = 0; char *outsequence; const char *target; int had_err = 0; gt_error_check(err); fn = gt_genome_node_cast(gt_feature_node_class(), gn); gt_assert(fn); if (seqid) gt_str_append_str(seqid, gt_genome_node_get_seqid(gn)); if (target_ids && (target = gt_feature_node_get_attribute(fn, GT_GFF_TARGET))) { had_err = gt_gff3_parser_parse_all_target_attributes(target, false, target_ids, NULL, NULL, "", 0, err); } if (!had_err) { if (join) { GtFeatureNodeIterator *fni; GtFeatureNode *child; bool reverse_strand = false, first_child = true, first_child_of_type_seen = false; GtPhase phase = GT_PHASE_UNDEFINED; /* in this case we have to traverse the children */ fni = gt_feature_node_iterator_new_direct(gt_feature_node_cast(gn)); while (!had_err && (child = gt_feature_node_iterator_next(fni))) { if (first_child) { if (target_ids && (target = gt_feature_node_get_attribute(child, GT_GFF_TARGET))) { gt_str_array_reset(target_ids); had_err = gt_gff3_parser_parse_all_target_attributes(target, false, target_ids, NULL, NULL, "", 0, err); } first_child = false; } if (!had_err) { if (extract_join_feature((GtGenomeNode*) child, type, region_mapping, sequence, &reverse_strand, &first_child_of_type_seen, &phase, err)) { had_err = -1; } if (phase != GT_PHASE_UNDEFINED) { phase_offset = (int) phase; } } } gt_feature_node_iterator_delete(fni); gt_assert(phase_offset <= (unsigned int) GT_PHASE_UNDEFINED); if (!had_err && gt_str_length(sequence)) { if (reverse_strand) { had_err = gt_reverse_complement(gt_str_get(sequence), gt_str_length(sequence), err); } } } else if (gt_feature_node_get_type(fn) == type) { GtPhase phase = gt_feature_node_get_phase(fn); gt_assert(!had_err); if (phase != GT_PHASE_UNDEFINED) phase_offset = (unsigned int) phase; /* otherwise we only have to look at this feature */ range = gt_genome_node_get_range(gn); gt_assert(range.start); /* 1-based coordinates */ had_err = gt_region_mapping_get_sequence(region_mapping, &outsequence, gt_genome_node_get_seqid(gn), range.start, range.end, err); if (!had_err) { gt_str_append_cstr_nt(sequence, outsequence, gt_range_length(&range)); gt_free(outsequence); if (gt_feature_node_get_strand(fn) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(gt_str_get(sequence), gt_str_length(sequence), err); } } } } if (out_phase_offset && phase_offset != GT_PHASE_UNDEFINED) { *out_phase_offset = phase_offset; } return had_err; }
static int run_orffinder(GtRegionMapping *rmap, GtFeatureNode *gf, unsigned long start, GT_UNUSED unsigned long end, unsigned int min, unsigned int max, bool all, GtError *err) { int had_err = 0, i; unsigned long sum; GtCodonIterator* ci = NULL; GtTranslator* translator = NULL; GtORFIterator* orfi = NULL; GtORFIteratorStatus state; GtRange orf_rng, tmp_orf_rng[3]; GtStr *seq; unsigned int orf_frame; /* forward strand */ seq = gt_str_new(); had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) gf, gt_feature_node_get_type(gf), false, NULL, NULL, rmap, err); ci = gt_codon_iterator_simple_new(gt_str_get(seq), gt_str_length(seq), err); gt_assert(ci); translator = gt_translator_new(ci); gt_assert(translator); orfi = gt_orf_iterator_new(ci, translator); gt_assert(orfi); for (i = 0; i < 3; i++) { tmp_orf_rng[i].start = GT_UNDEF_ULONG; tmp_orf_rng[i].end = GT_UNDEF_ULONG; } while ((state = gt_orf_iterator_next(orfi, &orf_rng, &orf_frame, err)) == GT_ORF_ITERATOR_OK) { if (all) { process_orf(orf_rng, orf_frame, GT_STRAND_FORWARD, gf, start, min, max, err); } else { if (gt_range_length(&orf_rng) > gt_range_length(&tmp_orf_rng[orf_frame])) { tmp_orf_rng[orf_frame].start = orf_rng.start; tmp_orf_rng[orf_frame].end = orf_rng.end; } } } if (state == GT_ORF_ITERATOR_ERROR) had_err = -1; if (!had_err) { if (!all) { for (i = 0; i < 3; i++) { if (tmp_orf_rng[i].start != GT_UNDEF_ULONG) { process_orf(tmp_orf_rng[i], (unsigned int) i, GT_STRAND_FORWARD, gf, start, min, max, err); } } } gt_codon_iterator_delete(ci); gt_translator_delete(translator); gt_orf_iterator_delete(orfi); orfi = NULL; ci = NULL; translator = NULL; for (i = 0; i < 3; i++) { tmp_orf_rng[i].start = GT_UNDEF_ULONG; tmp_orf_rng[i].end = GT_UNDEF_ULONG; } /* reverse strand */ if (!had_err) { GT_UNUSED int rval = 0; unsigned long length = gt_str_length(seq); char *strp = (char*) gt_str_get_mem(seq); rval = gt_reverse_complement(strp, gt_str_length(seq), err); gt_assert(!rval); /* XXX */ ci = gt_codon_iterator_simple_new(gt_str_get(seq), gt_str_length(seq), err); gt_assert(ci); translator = gt_translator_new(ci); gt_assert(translator); orfi = gt_orf_iterator_new(ci, translator); gt_assert(orfi); sum = start + length - 1; while ((state = gt_orf_iterator_next(orfi, &orf_rng, &orf_frame, err)) == GT_ORF_ITERATOR_OK) { if (all) { process_orf(orf_rng, orf_frame, GT_STRAND_REVERSE, gf, sum, min, max, err); } else { if (gt_range_length(&orf_rng) > gt_range_length(&tmp_orf_rng[orf_frame])) { tmp_orf_rng[orf_frame].start = orf_rng.start; tmp_orf_rng[orf_frame].end = orf_rng.end; } } } if (state == GT_ORF_ITERATOR_ERROR) had_err = -1; if (!had_err) { if (!all) { for (i = 0; i < 3; i++) { if (tmp_orf_rng[i].start != GT_UNDEF_ULONG) { process_orf(tmp_orf_rng[i], (unsigned int) i, GT_STRAND_REVERSE, gf, sum, min, max, err); } } } } } gt_str_delete(seq); gt_codon_iterator_delete(ci); gt_translator_delete(translator); gt_orf_iterator_delete(orfi); } return had_err; }
static int process_node(GtDiagram *d, GtFeatureNode *node, GtFeatureNode *parent, GtError *err) { GtRange elem_range; bool *collapse; GtShouldGroupByParent *group; const char *feature_type = NULL, *parent_gft = NULL; double tmp; GtStyleQueryStatus rval; GtUword max_show_width = GT_UNDEF_UWORD, par_max_show_width = GT_UNDEF_UWORD; gt_assert(d && node); gt_log_log(">> getting '%s'", gt_feature_node_get_type(node)); /* skip pseudonodes */ if (gt_feature_node_is_pseudo(node)) return 0; feature_type = gt_feature_node_get_type(node); gt_assert(feature_type); /* discard elements that do not overlap with visible range */ elem_range = gt_genome_node_get_range((GtGenomeNode*) node); if (!gt_range_overlap(&d->range, &elem_range)) return 0; /* get maximal view widths in nucleotides to show this type */ rval = gt_style_get_num(d->style, feature_type, "max_show_width", &tmp, NULL, err); switch (rval) { case GT_STYLE_QUERY_OK: max_show_width = tmp; break; case GT_STYLE_QUERY_ERROR: return -1; break; /* should never be reached */ default: /* do not change default value */ break; } /* for non-root nodes, get maximal view with to show parent */ if (parent) { if (!gt_feature_node_is_pseudo(parent)) { parent_gft = gt_feature_node_get_type(parent); rval = gt_style_get_num(d->style, parent_gft, "max_show_width", &tmp, NULL, err); switch (rval) { case GT_STYLE_QUERY_OK: par_max_show_width = tmp; break; case GT_STYLE_QUERY_ERROR: return -1; break; /* should never be reached */ default: /* do not change default value */ break; } } else par_max_show_width = GT_UNDEF_UWORD; } /* check if this type is to be displayed at all */ if (max_show_width != GT_UNDEF_UWORD && gt_range_length(&d->range) > max_show_width) { return 0; } /* disregard parent node if it is configured not to be shown */ if (parent && par_max_show_width != GT_UNDEF_UWORD && gt_range_length(&d->range) > par_max_show_width) { parent = NULL; } /* check if this is a collapsing type, cache result */ if ((collapse = (bool*) gt_hashmap_get(d->collapsingtypes, feature_type)) == NULL) { collapse = gt_malloc(sizeof (bool)); *collapse = false; if (gt_style_get_bool(d->style, feature_type, "collapse_to_parent", collapse, NULL, err) == GT_STYLE_QUERY_ERROR) { gt_free(collapse); return -1; } gt_hashmap_add(d->collapsingtypes, (void*) feature_type, collapse); } /* check if type should be grouped by parent, cache result */ if ((group = (GtShouldGroupByParent*) gt_hashmap_get(d->groupedtypes, feature_type)) == NULL) { bool tmp; group = gt_malloc(sizeof (GtShouldGroupByParent)); rval = gt_style_get_bool(d->style, feature_type, "group_by_parent", &tmp, NULL, err); switch (rval) { case GT_STYLE_QUERY_OK: if (tmp) *group = GT_GROUP_BY_PARENT; else *group = GT_DO_NOT_GROUP_BY_PARENT; break; case GT_STYLE_QUERY_NOT_SET: *group = GT_UNDEFINED_GROUPING; break; case GT_STYLE_QUERY_ERROR: gt_free(group); return -1; break; /* should never be reached */ } gt_hashmap_add(d->groupedtypes, (void*) feature_type, group); } /* decide where to place this feature: */ if (*collapse) { /* user has specified collapsing to parent for this type */ if (parent && !gt_feature_node_is_pseudo(parent)) { /* collapsing child nodes are added to upwards blocks, but never collapse into pseudo nodes */ add_recursive(d, node, parent, node); } else { /* if no parent or only pseudo-parent, do not collapse */ if (add_to_current(d, node, parent, err) < 0) { return -1; } } } else /* (!*collapse) */ { if (parent) { bool do_not_overlap = false; do_not_overlap = gt_feature_node_direct_children_do_not_overlap_st(parent, node); if (*group == GT_GROUP_BY_PARENT || (do_not_overlap && *group == GT_UNDEFINED_GROUPING)) { if (gt_feature_node_is_pseudo(parent) && gt_feature_node_is_multi(node)) { if (add_to_rep(d, node, parent, err) < 0) { return -1; } } else if (gt_feature_node_number_of_children(parent) > 1) { if (add_to_parent(d, node, parent, err) < 0) { return -1; } } else { if (add_to_current(d, node, parent, err) < 0) { return -1; } } } else { if (gt_feature_node_is_pseudo(parent) && gt_feature_node_is_multi(node)) { if (add_to_rep(d, node, parent, err) < 0) { return -1; } } else { if (add_to_current(d, node, parent, err) < 0) { return -1; } } } } else { /* root nodes always get their own block */ if (add_to_current(d, node, parent, err) < 0) { return -1; } } } /* we can now assume that this node (or its representative) has been processed into the reverse lookup structure */ #ifndef NDEBUG if (gt_feature_node_is_multi(node)) { GtFeatureNode *rep; rep = gt_feature_node_get_multi_representative((GtFeatureNode*) node); gt_assert(gt_hashmap_get(d->nodeinfo, rep)); } else gt_assert(gt_hashmap_get(d->nodeinfo, node)); #endif return 0; }
static int gt_sketch_runner(int argc, const char **argv, int parsed_args, void *tool_arguments, GT_UNUSED GtError *err) { GtSketchArguments *arguments = tool_arguments; GtNodeStream *in_stream = NULL, *add_introns_stream = NULL, *gff3_out_stream = NULL, *feature_stream = NULL, *sort_stream = NULL, *last_stream; GtFeatureIndex *features = NULL; const char *file; char *seqid = NULL; GtRange qry_range, sequence_region_range; GtArray *results = NULL; GtStyle *sty = NULL; GtStr *prog, *defaultstylefile = NULL; GtDiagram *d = NULL; GtLayout *l = NULL; GtImageInfo* ii = NULL; GtCanvas *canvas = NULL; GtUword height; bool has_seqid; int had_err = 0; gt_error_check(err); gt_assert(arguments); prog = gt_str_new(); gt_str_append_cstr_nt(prog, argv[0], gt_cstr_length_up_to_char(argv[0], ' ')); defaultstylefile = gt_get_gtdata_path(gt_str_get(prog), err); gt_str_delete(prog); if (!defaultstylefile) had_err = -1; if (!had_err) { gt_str_append_cstr(defaultstylefile, "/sketch/default.style"); } file = argv[parsed_args]; if (!had_err) { /* create feature index */ features = gt_feature_index_memory_new(); parsed_args++; /* create an input stream */ if (strcmp(gt_str_get(arguments->input), "gff") == 0) { in_stream = gt_gff3_in_stream_new_unsorted(argc - parsed_args, argv + parsed_args); if (arguments->verbose) gt_gff3_in_stream_show_progress_bar((GtGFF3InStream*) in_stream); } else if (strcmp(gt_str_get(arguments->input), "bed") == 0) { if (argc - parsed_args == 0) in_stream = gt_bed_in_stream_new(NULL); else in_stream = gt_bed_in_stream_new(argv[parsed_args]); } else if (strcmp(gt_str_get(arguments->input), "gtf") == 0) { if (argc - parsed_args == 0) in_stream = gt_gtf_in_stream_new(NULL); else in_stream = gt_gtf_in_stream_new(argv[parsed_args]); } last_stream = in_stream; /* create add introns stream if -addintrons was used */ if (arguments->addintrons) { sort_stream = gt_sort_stream_new(last_stream); add_introns_stream = gt_add_introns_stream_new(sort_stream); last_stream = add_introns_stream; } /* create gff3 output stream if -pipe was used */ if (arguments->pipe) { gff3_out_stream = gt_gff3_out_stream_new(last_stream, NULL); last_stream = gff3_out_stream; } /* create feature stream */ feature_stream = gt_feature_stream_new(last_stream, features); /* pull the features through the stream and free them afterwards */ had_err = gt_node_stream_pull(feature_stream, err); gt_node_stream_delete(feature_stream); gt_node_stream_delete(gff3_out_stream); gt_node_stream_delete(sort_stream); gt_node_stream_delete(add_introns_stream); gt_node_stream_delete(in_stream); } if (!had_err) { had_err = gt_feature_index_has_seqid(features, &has_seqid, gt_str_get(arguments->seqid), err); } /* if seqid is empty, take first one added to index */ if (!had_err && strcmp(gt_str_get(arguments->seqid),"") == 0) { seqid = gt_feature_index_get_first_seqid(features, err); if (seqid == NULL) { gt_error_set(err, "GFF input file must contain a sequence region!"); had_err = -1; } } else if (!had_err && !has_seqid) { gt_error_set(err, "sequence region '%s' does not exist in GFF input file", gt_str_get(arguments->seqid)); had_err = -1; } else if (!had_err) seqid = gt_str_get(arguments->seqid); results = gt_array_new(sizeof (GtGenomeNode*)); if (!had_err) { had_err = gt_feature_index_get_range_for_seqid(features, &sequence_region_range, seqid, err); } if (!had_err) { qry_range.start = (arguments->start == GT_UNDEF_UWORD ? sequence_region_range.start : arguments->start); qry_range.end = (arguments->end == GT_UNDEF_UWORD ? sequence_region_range.end : arguments->end); } if (!had_err) { if (arguments->verbose) fprintf(stderr, "# of results: "GT_WU"\n", gt_array_size(results)); /* find and load style file */ if (!(sty = gt_style_new(err))) had_err = -1; if (gt_str_length(arguments->stylefile) == 0) { gt_str_append_str(arguments->stylefile, defaultstylefile); } else { if (!had_err && gt_file_exists(gt_str_get(arguments->stylefile))) { if (arguments->unsafe) gt_style_unsafe_mode(sty); } else { had_err = -1; gt_error_set(err, "style file '%s' does not exist!", gt_str_get(arguments->stylefile)); } } if (!had_err) had_err = gt_style_load_file(sty, gt_str_get(arguments->stylefile), err); } if (!had_err) { /* create and write image file */ if (!(d = gt_diagram_new(features, seqid, &qry_range, sty, err))) had_err = -1; if (!had_err && arguments->flattenfiles) gt_diagram_set_track_selector_func(d, flattened_file_track_selector, NULL); if (had_err || !(l = gt_layout_new(d, arguments->width, sty, err))) had_err = -1; if (!had_err) had_err = gt_layout_get_height(l, &height, err); if (!had_err) { ii = gt_image_info_new(); if (strcmp(gt_str_get(arguments->format),"pdf")==0) { canvas = gt_canvas_cairo_file_new(sty, GT_GRAPHICS_PDF, arguments->width, height, ii, err); } else if (strcmp(gt_str_get(arguments->format),"ps")==0) { canvas = gt_canvas_cairo_file_new(sty, GT_GRAPHICS_PS, arguments->width, height, ii, err); } else if (strcmp(gt_str_get(arguments->format),"svg")==0) { canvas = gt_canvas_cairo_file_new(sty, GT_GRAPHICS_SVG, arguments->width, height, ii, err); } else { canvas = gt_canvas_cairo_file_new(sty, GT_GRAPHICS_PNG, arguments->width, height, ii, err); } if (!canvas) had_err = -1; if (!had_err) { had_err = gt_layout_sketch(l, canvas, err); } if (!had_err) { if (arguments->showrecmaps) { GtUword i; const GtRecMap *rm; for (i = 0; i < gt_image_info_num_of_rec_maps(ii) ;i++) { char buf[BUFSIZ]; rm = gt_image_info_get_rec_map(ii, i); (void) gt_rec_map_format_html_imagemap_coords(rm, buf, BUFSIZ); printf("%s, %s\n", buf, gt_feature_node_get_type(gt_rec_map_get_genome_feature(rm))); } } if (arguments->use_streams) { GtFile *outfile; GtStr *str = gt_str_new(); gt_canvas_cairo_file_to_stream((GtCanvasCairoFile*) canvas, str); outfile = gt_file_open(GT_FILE_MODE_UNCOMPRESSED, file, "w+", err); if (outfile) { gt_file_xwrite(outfile, gt_str_get_mem(str), gt_str_length(str)); gt_file_delete(outfile); } else { had_err = -1; } gt_str_delete(str); } else { had_err = gt_canvas_cairo_file_to_file((GtCanvasCairoFile*) canvas, file, err); } } } } /* free */ gt_free(seqid); gt_canvas_delete(canvas); gt_layout_delete(l); gt_image_info_delete(ii); gt_style_delete(sty); gt_diagram_delete(d); gt_array_delete(results); gt_str_delete(defaultstylefile); gt_feature_index_delete(features); return had_err; }
static int gt_ltrdigest_pdom_visitor_choose_strand(GtLTRdigestPdomVisitor *lv) { int had_err = 0; double log_eval_fwd = 0.0, log_eval_rev = 0.0; GtFeatureNodeIterator *fni; GtStrand strand; double score; bool seen_fwd = false, seen_rev = false; GtFeatureNode *curnode = NULL; GtUword i; GtArray *to_delete; fni = gt_feature_node_iterator_new(lv->ltr_retrotrans); while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) { if (strcmp(gt_feature_node_get_type(curnode), gt_ft_protein_match) == 0) { strand = gt_feature_node_get_strand(curnode); score = (double) gt_feature_node_get_score(curnode); if (strand == GT_STRAND_FORWARD) { log_eval_fwd += log(score); seen_fwd = true; } else if (strand == GT_STRAND_REVERSE) { log_eval_rev += log(score); seen_rev = true; } } } gt_feature_node_iterator_delete(fni); if (seen_rev && !seen_fwd) gt_feature_node_set_strand(lv->ltr_retrotrans, GT_STRAND_REVERSE); else if (!seen_rev && seen_fwd) gt_feature_node_set_strand(lv->ltr_retrotrans, GT_STRAND_FORWARD); else if (!seen_rev && !seen_fwd) return had_err; else { gt_assert(seen_rev && seen_fwd); if (gt_double_compare(log_eval_fwd, log_eval_rev) < 0) strand = GT_STRAND_FORWARD; else strand = GT_STRAND_REVERSE; gt_feature_node_set_strand(lv->ltr_retrotrans, strand); to_delete = gt_array_new(sizeof (GtFeatureNode*)); fni = gt_feature_node_iterator_new(lv->ltr_retrotrans); while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) { if (strcmp(gt_feature_node_get_type(curnode), gt_ft_protein_match) == 0) { if (strand != gt_feature_node_get_strand(curnode)) { gt_array_add(to_delete, curnode); } } } gt_feature_node_iterator_delete(fni); gt_assert(gt_array_size(to_delete) > 0); for (i = 0; i < gt_array_size(to_delete); i++) { gt_feature_node_remove_leaf(lv->ltr_retrotrans, *(GtFeatureNode**) gt_array_get(to_delete, i)); } gt_array_delete(to_delete); } return had_err; }
static int snp_annotator_classify_snp(GtSNPAnnotatorVisitor *sav, GtFeatureNode *mRNA, GtFeatureNode *snp, GtUword variant_pos, GtUword variant_idx, char variant_char, #ifndef NDEBUG GT_UNUSED char reference_char, #endif GT_UNUSED GtError *err) { int had_err = 0; char *mrnaseq; const char *variant_effect = NULL; gt_assert(mRNA && snp && sav); gt_log_log("processing variant char %c for SNP %s\n", variant_char, gt_feature_node_get_attribute(snp, "Dbxref")); mrnaseq = gt_hashmap_get(sav->rnaseqs, mRNA); gt_assert(mrnaseq); if (mrnaseq) { char codon[3], variant_codon[3]; GtStr *effect_string; char oldamino, newamino; GT_UNUSED GtUword mrnalen; GtUword startpos = variant_pos / GT_CODON_LENGTH, variantoffset = variant_pos % GT_CODON_LENGTH; mrnalen = strlen(mrnaseq); gt_assert(variant_pos < mrnalen); variant_codon[0] = codon[0] = mrnaseq[3*startpos]; variant_codon[1] = codon[1] = mrnaseq[3*startpos+1]; variant_codon[2] = codon[2] = mrnaseq[3*startpos+2]; variant_codon[variantoffset] = variant_char; #ifndef NDEBUG gt_assert(toupper(codon[variantoffset]) == toupper(reference_char)); #endif if (gt_trans_table_is_stop_codon(sav->tt, codon[0], codon[1], codon[2])) { if (gt_trans_table_is_stop_codon(sav->tt, variant_codon[0], variant_codon[1], variant_codon[2])) { variant_effect = gt_symbol(GT_SNP_SYNONYMOUS_STOP_EFFECT); } else { variant_effect = gt_symbol(GT_SNP_STOP_LOST_EFFECT); } } else { if (gt_trans_table_is_stop_codon(sav->tt, variant_codon[0], variant_codon[1], variant_codon[2])) { variant_effect = gt_symbol(GT_SNP_NONSENSE_EFFECT); } else { had_err = gt_trans_table_translate_codon(sav->tt, codon[0], codon[1], codon[2], &oldamino, err); if (!had_err) { had_err = gt_trans_table_translate_codon(sav->tt, variant_codon[0], variant_codon[1], variant_codon[2], &newamino, err); } if (!had_err) { if (newamino == oldamino) { variant_effect = gt_symbol(GT_SNP_SYNONYMOUS_AMINO_EFFECT); } else { variant_effect = gt_symbol(GT_SNP_MISSENSE_EFFECT); } } } } if (!had_err) { const char *var_attrib; gt_assert(variant_effect != NULL); if ((var_attrib = gt_feature_node_get_attribute(snp, GT_GVF_VARIANT_EFFECT))) { effect_string = gt_str_new_cstr(var_attrib); gt_str_append_cstr(effect_string, ","); gt_str_append_cstr(effect_string, variant_effect); } else { effect_string = gt_str_new_cstr(variant_effect); } gt_str_append_cstr(effect_string, " "); gt_str_append_ulong(effect_string, variant_idx); gt_str_append_cstr(effect_string, " "); gt_str_append_cstr(effect_string, gt_feature_node_get_type(mRNA)); gt_str_append_cstr(effect_string, " "); gt_str_append_cstr(effect_string, gt_feature_node_get_attribute(mRNA, GT_GFF_ID)); gt_feature_node_set_attribute(snp, GT_GVF_VARIANT_EFFECT, gt_str_get(effect_string)); gt_str_reset(effect_string); gt_str_delete(effect_string); } } return had_err; }
static int gt_snp_annotator_visitor_prepare_gene(GtSNPAnnotatorVisitor *sav, GtError *err) { GtFeatureNodeIterator *fni, *mrnafni; GtFeatureNode *curnode, *last_mRNA = NULL; GtStr *mrnaseq, *seqid; int had_err = 0; mrnaseq = gt_str_new(); seqid = gt_genome_node_get_seqid((GtGenomeNode*) sav->gene); fni = gt_feature_node_iterator_new(sav->gene); while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) { if (gt_feature_node_get_type(curnode) == sav->mRNA_type) { GtFeatureNode *curnode2; if (last_mRNA) { char *mrna_charseq = gt_calloc(gt_str_length(mrnaseq)+1, sizeof (char)); (void) strncpy(mrna_charseq, gt_str_get(mrnaseq), gt_str_length(mrnaseq)); if (gt_feature_node_get_strand(sav->gene) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(mrna_charseq, gt_str_length(mrnaseq), err); } if (!had_err) { gt_hashmap_add(sav->rnaseqs, last_mRNA, mrna_charseq); last_mRNA = curnode; gt_str_reset(mrnaseq); } } else last_mRNA = curnode; if (!had_err) { mrnafni = gt_feature_node_iterator_new(curnode); while (!had_err && (curnode2 = gt_feature_node_iterator_next(mrnafni))) { if (gt_feature_node_get_type(curnode2) == sav->CDS_type) { char *tmp; GtRange rng = gt_genome_node_get_range((GtGenomeNode*) curnode2); had_err = gt_region_mapping_get_sequence(sav->rmap, &tmp, seqid, rng.start, rng.end, err); if (!had_err) { gt_str_append_cstr_nt(mrnaseq, tmp, gt_range_length(&rng)); gt_free(tmp); } } } gt_feature_node_iterator_delete(mrnafni); } } } if (!had_err && last_mRNA) { char *mrna_charseq = gt_calloc(gt_str_length(mrnaseq)+1, sizeof (char)); (void) strncpy(mrna_charseq, gt_str_get(mrnaseq), gt_str_length(mrnaseq)); if (gt_feature_node_get_strand(sav->gene) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(mrna_charseq, gt_str_length(mrnaseq), err); } if (!had_err) { gt_hashmap_add(sav->rnaseqs, last_mRNA, mrna_charseq); } } gt_feature_node_iterator_delete(fni); gt_str_delete(mrnaseq); return had_err; }
static int snp_annotator_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn, GtError *err) { int had_err = 0; GtSNPAnnotatorVisitor *sav; GtFeatureNodeIterator *fni, *mrnafni; GtFeatureNode *curnode, *curnode2; GtRange snp_rng; gt_error_check(err); sav = snp_annotator_visitor_cast(nv); /* ignore non-nodes */ if (!fn) return 0; /* only process SNPs */ if (!(gt_feature_node_get_type(fn) == sav->SNV_type || gt_feature_node_get_type(fn) == sav->SNP_type)) { return 0; } fni = gt_feature_node_iterator_new_direct(sav->gene); snp_rng = gt_genome_node_get_range((GtGenomeNode*) fn); while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) { if (gt_feature_node_get_type(curnode) == sav->mRNA_type) { GtStrand mrna_strand = gt_feature_node_get_strand(curnode); #ifndef NDEBUG const char *refstr; #endif GtUword mrnasnppos = 0; mrnafni = gt_feature_node_iterator_new(curnode); while (!had_err && (curnode2 = gt_feature_node_iterator_next(mrnafni))) { if (gt_feature_node_get_type(curnode2) == sav->CDS_type) { GtRange cds_rng = gt_genome_node_get_range((GtGenomeNode*) curnode2); if (gt_range_overlap(&snp_rng, &cds_rng)) { char *mRNA, origchar; char *variantchars, *variantptr = NULL; GT_UNUSED char *refchars, *refptr = NULL; mRNA = (char*) gt_hashmap_get(sav->rnaseqs, curnode); gt_assert(mRNA); gt_assert(snp_rng.start >= cds_rng.start); mrnasnppos += (snp_rng.start - cds_rng.start); if (mrna_strand == GT_STRAND_REVERSE) mrnasnppos = strlen(mRNA) - mrnasnppos - 1; gt_assert(mrnasnppos < strlen(mRNA)); origchar = mRNA[mrnasnppos]; #ifndef NDEBUG refstr = refptr = gt_cstr_dup(gt_feature_node_get_attribute(fn, GT_GVF_REFERENCE_SEQ)); if (!had_err && refstr) { if (gt_feature_node_get_strand(curnode) == GT_STRAND_REVERSE) { int rval = gt_complement(&origchar, origchar, err); gt_assert(rval == 0); } gt_assert(toupper(origchar) == toupper(refstr[0])); } #endif variantchars = variantptr = gt_cstr_dup( gt_feature_node_get_attribute(fn, GT_GVF_VARIANT_SEQ)); if (!had_err && variantchars) { GtUword i = 0; while (!had_err && (*variantchars != ';' && *variantchars != '\0')) { if (*variantchars != ',' && *variantchars != origchar) { char variantchar = *variantchars; #ifndef NDEBUG char refchar = refstr ? refstr[0] : '-'; /* XXX */ if (!had_err && mrna_strand == GT_STRAND_REVERSE) had_err = gt_complement(&refchar, refchar, err); #endif if (!had_err && mrna_strand == GT_STRAND_REVERSE) had_err = gt_complement(&variantchar, variantchar, err); if (!had_err) { had_err = snp_annotator_classify_snp(sav, curnode, fn, mrnasnppos, i++, variantchar, #ifndef NDEBUG refchar, #endif err); } } else if (*variantchars == origchar) { i++; } variantchars++; } gt_free(variantptr); gt_free(refptr); } } else { mrnasnppos += gt_range_length(&cds_rng); } } } gt_feature_node_iterator_delete(mrnafni); } } gt_feature_node_iterator_delete(fni); return had_err; }
static int gt_ltrdigest_pdom_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn, GtError *err) { GtLTRdigestPdomVisitor *lv; GtFeatureNodeIterator *fni; GtFeatureNode *curnode = NULL; int had_err = 0; GtRange rng; GtUword i; lv = gt_ltrdigest_pdom_visitor_cast(nv); gt_assert(lv); gt_error_check(err); /* traverse annotation subgraph and find LTR element */ fni = gt_feature_node_iterator_new(fn); while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) { if (strcmp(gt_feature_node_get_type(curnode), lv->root_type) == 0) { lv->ltr_retrotrans = curnode; } } gt_feature_node_iterator_delete(fni); if (!had_err && lv->ltr_retrotrans != NULL) { GtCodonIterator *ci; GtTranslator *tr; GtTranslatorStatus status; GtUword seqlen; char translated, *rev_seq; #ifndef _WIN32 FILE *instream; GtHMMERParseStatus *pstatus; #endif unsigned int frame; GtStr *seq; seq = gt_str_new(); rng = gt_genome_node_get_range((GtGenomeNode*) lv->ltr_retrotrans); lv->leftLTR_5 = rng.start - 1; lv->rightLTR_3 = rng.end - 1; seqlen = gt_range_length(&rng); had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) lv->ltr_retrotrans, lv->root_type, false, NULL, NULL, lv->rmap, err); if (!had_err) { for (i = 0UL; i < 3UL; i++) { gt_str_reset(lv->fwd[i]); gt_str_reset(lv->rev[i]); } /* create translations */ ci = gt_codon_iterator_simple_new(gt_str_get(seq), seqlen, NULL); gt_assert(ci); tr = gt_translator_new(ci); status = gt_translator_next(tr, &translated, &frame, err); while (status == GT_TRANSLATOR_OK && translated) { gt_str_append_char(lv->fwd[frame], translated); status = gt_translator_next(tr, &translated, &frame, NULL); } if (status == GT_TRANSLATOR_ERROR) had_err = -1; if (!had_err) { rev_seq = gt_malloc((size_t) seqlen * sizeof (char)); strncpy(rev_seq, gt_str_get(seq), (size_t) seqlen * sizeof (char)); (void) gt_reverse_complement(rev_seq, seqlen, NULL); gt_codon_iterator_delete(ci); ci = gt_codon_iterator_simple_new(rev_seq, seqlen, NULL); gt_translator_set_codon_iterator(tr, ci); status = gt_translator_next(tr, &translated, &frame, err); while (status == GT_TRANSLATOR_OK && translated) { gt_str_append_char(lv->rev[frame], translated); status = gt_translator_next(tr, &translated, &frame, NULL); } if (status == GT_TRANSLATOR_ERROR) had_err = -1; gt_free(rev_seq); } gt_codon_iterator_delete(ci); gt_translator_delete(tr); } /* run HMMER and handle results */ if (!had_err) { #ifndef _WIN32 int pid, pc[2], cp[2]; GT_UNUSED int rval; (void) signal(SIGCHLD, SIG_IGN); /* XXX: for now, ignore child's exit status */ rval = pipe(pc); gt_assert(rval == 0); rval = pipe(cp); gt_assert(rval == 0); switch ((pid = (int) fork())) { case -1: perror("Can't fork"); exit(1); /* XXX: error handling */ case 0: /* child */ (void) close(1); /* close current stdout. */ rval = dup(cp[1]); /* make stdout go to write end of pipe. */ (void) close(0); /* close current stdin. */ rval = dup(pc[0]); /* make stdin come from read end of pipe. */ (void) close(pc[0]); (void) close(pc[1]); (void) close(cp[0]); (void) close(cp[1]); (void) execvp("hmmscan", lv->args); /* XXX: read path from env */ perror("couldn't execute hmmscan!"); exit(1); default: /* parent */ for (i = 0UL; i < 3UL; i++) { char buf[5]; GT_UNUSED ssize_t written; (void) sprintf(buf, ">"GT_WU"%c\n", i, '+'); written = write(pc[1], buf, 4 * sizeof (char)); written = write(pc[1], gt_str_get(lv->fwd[i]), (size_t) gt_str_length(lv->fwd[i]) * sizeof (char)); written = write(pc[1], "\n", 1 * sizeof (char)); (void) sprintf(buf, ">"GT_WU"%c\n", i, '-'); written = write(pc[1], buf, 4 * sizeof (char)); written = write(pc[1], gt_str_get(lv->rev[i]), (size_t) gt_str_length(lv->rev[i]) * sizeof (char)); written = write(pc[1], "\n", 1 * sizeof (char)); } (void) close(pc[0]); (void) close(pc[1]); (void) close(cp[1]); instream = fdopen(cp[0], "r"); pstatus = gt_hmmer_parse_status_new(); had_err = gt_ltrdigest_pdom_visitor_parse_output(lv, pstatus, instream, err); (void) fclose(instream); if (!had_err) had_err = gt_ltrdigest_pdom_visitor_process_hits(lv, pstatus, err); gt_hmmer_parse_status_delete(pstatus); } #else /* XXX */ gt_error_set(err, "HMMER call not implemented on Windows\n"); had_err = -1; #endif } gt_str_delete(seq); } if (!had_err) had_err = gt_ltrdigest_pdom_visitor_choose_strand(lv); return had_err; }
static int gt_ltr_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn, GT_UNUSED GtError *err) { GtLTRVisitor *lv; GtRange node_range; GtArray *pdomarr = NULL; const char *pfamname; const char *fnt; lv = gt_ltr_visitor_cast(nv); gt_assert(lv); gt_error_check(err); fnt = gt_feature_node_get_type(fn); if (strcmp(fnt, gt_ft_LTR_retrotransposon) == 0) { lv->element->mainnode = fn; } else if (strcmp(fnt, gt_ft_long_terminal_repeat) == 0) { if (lv->element->leftLTR == NULL) { node_range = gt_genome_node_get_range((GtGenomeNode*) fn); lv->element->leftLTR = fn; /* compensate for 1-based node coords */ lv->element->leftLTR_5 = node_range.start - 1; lv->element->leftLTR_3 = node_range.end - 1; } else { node_range = gt_genome_node_get_range((GtGenomeNode*) fn); lv->element->rightLTR = fn; /* compensate for 1-based node coords */ lv->element->rightLTR_5 = node_range.start - 1; lv->element->rightLTR_3 = node_range.end - 1; } } else if (strcmp(fnt, gt_ft_target_site_duplication) == 0) { if (lv->element->leftTSD == NULL) { lv->element->leftTSD = fn; } else { lv->element->rightTSD = fn; } } else if (strcmp(fnt, gt_ft_RR_tract) == 0) { if (lv->element->ppt == NULL) { lv->element->ppt = fn; } } else if (strcmp(fnt, gt_ft_primer_binding_site) == 0) { if (lv->element->pbs == NULL) { lv->element->pbs = fn; } } else if (strcmp(fnt, gt_ft_protein_match) == 0) { char buf[BUFSIZ]; if (!lv->element->pdoms) { lv->element->pdoms = gt_hashmap_new(GT_HASH_STRING, gt_free_func, (GtFree) gt_array_delete); } pfamname = gt_feature_node_get_attribute(fn, "name"); (void) snprintf(buf, BUFSIZ-1, "%s", pfamname); gt_cstr_rep(buf, '/', '_'); if (!(pdomarr = (GtArray*) gt_hashmap_get(lv->element->pdoms, buf))) { char *pfamcpy = gt_cstr_dup(buf); pdomarr = gt_array_new(sizeof (GtFeatureNode*)); gt_hashmap_add(lv->element->pdoms, pfamcpy, pdomarr); if (lv->element->pdomorder != NULL) gt_array_add(lv->element->pdomorder, pfamcpy); } gt_array_add(pdomarr, fn); } return 0; }
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data, GtError *err) { ConstructionInfo *cinfo = (ConstructionInfo*) data; GtArray *gt_genome_node_array = (GtArray*) value, *mRNAs = (GtArray*) cinfo->mRNAs; GtGenomeNode *mRNA_node, *first_node, *gn; const char *tname; GtStrand mRNA_strand; GtRange mRNA_range; GtStr *mRNA_seqid; GtUword i; int had_err = 0; gt_error_check(err); gt_assert(key && value && data); /* at least one node in array */ gt_assert(gt_array_size(gt_genome_node_array)); /* determine the range and the strand of the mRNA */ first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0); mRNA_range = gt_genome_node_get_range(first_node); mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node); mRNA_seqid = gt_genome_node_get_seqid(first_node); /* TODO: support discontinuous start/stop codons */ for (i = 0; !had_err && i < gt_array_size(gt_genome_node_array); i++) { gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i); if (gt_feature_node_get_attribute((GtFeatureNode*) gn, GTF_PARSER_STOP_CODON_FLAG)) { GtUword j; GtRange stop_codon_rng = gt_genome_node_get_range(gn); bool found_cds = false; for (j = 0; !had_err && j < gt_array_size(gt_genome_node_array); j++) { GtGenomeNode* gn2; GtRange this_rng; const char *this_type; gn2 = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, j); if (gn == gn2) continue; this_rng = gt_genome_node_get_range(gn2); this_type = gt_feature_node_get_type((GtFeatureNode*) gn2); if (this_type == gt_symbol(gt_ft_CDS)) { if (gt_range_contains(&this_rng, &stop_codon_rng)) { if (cinfo->tidy) { gt_warning("stop codon on line %u in file %s is contained in " "CDS in line %u", gt_genome_node_get_line_number(gn), gt_genome_node_get_filename(gn), gt_genome_node_get_line_number(gn2)); found_cds = true; } else { gt_error_set(err, "stop codon on line %u in file %s is " "contained in CDS in line %u", gt_genome_node_get_line_number(gn), gt_genome_node_get_filename(gn), gt_genome_node_get_line_number(gn2)); had_err = -1; } break; } if (this_rng.end + 1 == stop_codon_rng.start) { this_rng.end = stop_codon_rng.end; gt_genome_node_set_range(gn2, &this_rng); found_cds = true; break; } if (this_rng.start == stop_codon_rng.end + 1) { this_rng.start = stop_codon_rng.start; gt_genome_node_set_range(gn2, &this_rng); found_cds = true; break; } } } if (!found_cds) { if (!had_err) { if (cinfo->tidy) { gt_warning("found stop codon on line %u in file %s with no " "flanking CDS, ignoring it", gt_genome_node_get_line_number(gn), gt_genome_node_get_filename(gn)); } else { gt_error_set(err, "found stop codon on line %u in file %s with no " "flanking CDS", gt_genome_node_get_line_number(gn), gt_genome_node_get_filename(gn)); had_err = -1; break; } } } else { gt_array_rem(gt_genome_node_array, i); gt_genome_node_delete(gn); } } } for (i = 1; !had_err && i < gt_array_size(gt_genome_node_array); i++) { GtRange range; GtStrand strand; gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i); range = gt_genome_node_get_range(gn); mRNA_range = gt_range_join(&mRNA_range, &range); strand = gt_feature_node_get_strand((GtFeatureNode*) gn); if (strand != mRNA_strand) { gt_error_set(err, "feature %s on line %u has strand %c, but the " "parent transcript has strand %c", (const char*) key, gt_genome_node_get_line_number(gn), GT_STRAND_CHARS[strand], GT_STRAND_CHARS[mRNA_strand]); had_err = -1; break; } else { mRNA_strand = gt_strand_join(mRNA_strand, strand); } if (!had_err && gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) { gt_error_set(err, "The features on lines %u and %u refer to different " "genomic sequences (``seqname''), although they have the same " "gene IDs (``gene_id'') which must be globally unique", gt_genome_node_get_line_number(first_node), gt_genome_node_get_line_number(gn)); had_err = -1; break; } } if (!had_err) { mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start, mRNA_range.end, mRNA_strand); gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "ID", key); gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "transcript_id", key); if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping, (const char*) key)) && strlen(tname) > 0) { gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME, tname); } /* register children */ for (i = 0; i < gt_array_size(gt_genome_node_array); i++) { gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i); gt_feature_node_add_child((GtFeatureNode*) mRNA_node, (GtFeatureNode*) gt_genome_node_ref(gn)); } /* store the mRNA */ gt_array_add(mRNAs, mRNA_node); } return had_err; }