/* Returns the value of the GT_GFF_NAME attribute of <gn>; if that lookup
   yields NULL, the value of the GT_GFF_ID attribute is returned instead.
   Returns NULL if <gn> is NULL or if neither lookup yields a value. */
static const char* get_node_name_or_id(GtFeatureNode *gn)
{
  const char *label;
  if (gn == NULL)
    return NULL;
  label = gt_feature_node_get_attribute(gn, GT_GFF_NAME);
  if (label != NULL)
    return label;
  /* no name available -> fall back to the ID (which may be NULL as well) */
  return gt_feature_node_get_attribute(gn, GT_GFF_ID);
}
/* Links all CDS segments collected in v->cds into one multi-feature.
   Does nothing when fewer than two segments are present. The first array
   element becomes the representative; if it has no "ID" attribute, one of
   the form "CDS<counter>" is generated from v->cdscounter (which is then
   incremented). Segments that are already multi-features are left alone. */
static void infer_cds_visitor_check_cds_multi(AgnInferCDSVisitor *v)
{
  GtFeatureNode *representative;
  GtUword idx;

  /* zero or one segment: nothing to link */
  if(gt_array_size(v->cds) < 2)
    return;

  representative = *(GtFeatureNode **)gt_array_get(v->cds, 0);
  if(gt_feature_node_get_attribute(representative, "ID") == NULL)
  {
    char generated[64];
    sprintf(generated, "CDS%lu", v->cdscounter++);
    gt_feature_node_add_attribute(representative, "ID", generated);
  }
  gt_feature_node_make_multi_representative(representative);

  /* the representative itself is multi by now and is therefore skipped */
  for(idx = 0; idx < gt_array_size(v->cds); idx++)
  {
    GtFeatureNode *segment = *(GtFeatureNode **)gt_array_get(v->cds, idx);
    if(gt_feature_node_is_multi(segment))
      continue;
    gt_feature_node_set_multi_representative(segment, representative);
  }
}
static GtFeatureNode* find_root(const GtFeatureInfo *fi, const char *id) { const char *delim, *parents; GtFeatureNode *this_feature, *parent_pseudo_feature; gt_assert(fi && id); /* get feature */ delim = strchr(id, ';'); if (delim) { char *first_parent = gt_cstr_dup_nt(id, delim - id); this_feature = gt_hashmap_get(fi->id_to_genome_node, first_parent); parent_pseudo_feature = gt_hashmap_get(fi->id_to_pseudo_parent, first_parent); gt_free(first_parent); } else { this_feature = gt_hashmap_get(fi->id_to_genome_node, id); parent_pseudo_feature = gt_hashmap_get(fi->id_to_pseudo_parent, id); } gt_assert(this_feature); /* recursion */ parents = gt_feature_node_get_attribute(this_feature, GT_GFF_PARENT); if (parents) return find_root(fi, parents); else if (parent_pseudo_feature) return parent_pseudo_feature; return this_feature; }
/* Detaches <cds_feature> from the tree rooted at <fn> and re-attaches it
   below each of the parents reported by find_cds_parents(): the first parent
   receives <cds_feature> itself, every further parent receives a clone. The
   GT_GFF_PARENT attribute of each attached node is set to the GT_GFF_ID of
   the parent it is attached to. */
static void split_cds_feature(GtFeatureNode *cds_feature, GtFeatureNode *fn)
{
  GtArray *cds_parents;
  unsigned long idx;
  gt_assert(cds_feature && fn);

  cds_parents = find_cds_parents(cds_feature, fn);

  /* take the CDS out of its current position in the tree */
  gt_feature_node_remove_leaf(fn, cds_feature);

  for (idx = 0; idx < gt_array_size(cds_parents); idx++) {
    GtFeatureNode *parent = *(GtFeatureNode**) gt_array_get(cds_parents, idx),
                  *cds_copy;
    const char *parent_id = gt_feature_node_get_attribute(parent, GT_GFF_ID);

    if (idx == 0) {
      /* the first parent gets the original node */
      gt_feature_node_set_attribute(cds_feature, GT_GFF_PARENT, parent_id);
      gt_feature_node_add_child(parent, cds_feature);
      continue;
    }

    /* all other parents get their own copy */
    cds_copy = gt_feature_node_clone(cds_feature);
    gt_feature_node_set_attribute(cds_copy, GT_GFF_PARENT, parent_id);
    gt_feature_node_add_child(parent, cds_copy);
    /* NOTE(review): the original node is deleted once per additional parent;
       presumably this drops a reference rather than freeing it -- confirm
       against the reference counting rules of gt_genome_node_delete() */
    gt_genome_node_delete((GtGenomeNode*) cds_feature);
  }

  gt_array_delete(cds_parents);
}
/* Traversal callback: changes the seqid of <fn> to info->new_seqid, where
   <data> points to an M2IChangeSeqidInfo. If <fn> has a GT_GFF_TARGET
   attribute, the result of m2i_change_target_seqids() is returned; otherwise
   0 is returned. */
static int m2i_change_seqid(GtFeatureNode *fn, void *data, GtError *err)
{
  M2IChangeSeqidInfo *info = (M2IChangeSeqidInfo*) data;
  const char *target_attr;
  gt_error_check(err);
  gt_assert(fn && info);

  gt_genome_node_change_seqid((GtGenomeNode*) fn, info->new_seqid);

  target_attr = gt_feature_node_get_attribute(fn, GT_GFF_TARGET);
  if (target_attr == NULL)
    return 0;
  return m2i_change_target_seqids(fn, target_attr, info->region_mapping, err);
}
/* Replaces the pseudo parent stored in <fi> for the GT_GFF_ID of <child>
   with <new_pseudo_parent>. <child> must carry a GT_GFF_ID attribute and
   <new_pseudo_parent> must be a pseudo feature (both are asserted). */
void gt_feature_info_replace_pseudo_parent(GtFeatureInfo *fi,
                                           GtFeatureNode *child,
                                           GtFeatureNode *new_pseudo_parent)
{
  const char *child_id;
  gt_assert(fi && child && new_pseudo_parent);
  gt_assert(gt_feature_node_is_pseudo(new_pseudo_parent));

  child_id = gt_feature_node_get_attribute(child, GT_GFF_ID);
  gt_assert(child_id);

  /* drop the old mapping before registering the new one */
  gt_hashmap_remove(fi->id_to_pseudo_parent, child_id);
  gt_feature_info_add_pseudo_parent(fi, child_id, new_pseudo_parent);
}
/* Checks the stop codon of the mRNA currently held in v->mrna against its
   CDS segments (v->cds). Does nothing if there are no CDS segments.
   The expected stop codon range is the last 3 positions of the last CDS
   segment or, on the reverse strand, the first 3 positions of the first CDS
   segment.
   - more than one explicit stop codon in v->stops: a message is logged
   - exactly one: a message is logged if its range differs from the expected
   - none: a "stop_codon" feature covering the expected range is created,
     attached to the mRNA and added to v->stops
   NOTE(review): presumably v->cds is sorted by position -- confirm with the
   code that fills the array. */
static void infer_cds_visitor_check_stop(AgnInferCDSVisitor *v)
{
  if(gt_array_size(v->cds) == 0)
    return;

  const char *mrnaid = gt_feature_node_get_attribute(v->mrna, "ID");
  unsigned int ln = gt_genome_node_get_line_number((GtGenomeNode *)v->mrna);
  GtStrand strand = gt_feature_node_get_strand(v->mrna);

  /* determine where the stop codon should be according to the CDS */
  GtRange stoprange;
  GtUword threeprimeindex = gt_array_size(v->cds) - 1;
  GtGenomeNode **threeprimesegment = gt_array_get(v->cds, threeprimeindex);
  stoprange = gt_genome_node_get_range(*threeprimesegment);
  stoprange.start = stoprange.end - 2;
  if(strand == GT_STRAND_REVERSE)
  {
    threeprimesegment = gt_array_get(v->cds, 0);
    stoprange = gt_genome_node_get_range(*threeprimesegment);
    stoprange.end = stoprange.start + 2;
  }

  if(gt_array_size(v->stops) > 1)
  {
    /* report the number of stop codons (this used to print the number of
       start codons by mistake) */
    gt_logger_log(v->logger, "mRNA '%s' (line %u) has %lu stop codons", mrnaid,
                  ln, gt_array_size(v->stops));
  }
  else if(gt_array_size(v->stops) == 1)
  {
    GtGenomeNode **codon = gt_array_get(v->stops, 0);
    GtRange testrange = gt_genome_node_get_range(*codon);
    if(gt_range_compare(&stoprange, &testrange) != 0)
    {
      gt_logger_log(v->logger, "stop codon inferred from CDS [%lu, %lu] does "
                    "not match explicitly provided stop codon [%lu, %lu] for "
                    "mRNA '%s'", stoprange.start, stoprange.end,
                    testrange.start, testrange.end, mrnaid);
    }
  }
  else /* gt_array_size(v->stops) == 0 */
  {
    /* no explicit stop codon: create one from the inferred range */
    GtStr *seqid = gt_genome_node_get_seqid((GtGenomeNode *)v->mrna);
    GtGenomeNode *codonfeature = gt_feature_node_new(seqid, "stop_codon",
                                                     stoprange.start,
                                                     stoprange.end, strand);
    if(v->source)
      gt_feature_node_set_source((GtFeatureNode *)codonfeature, v->source);
    GtFeatureNode *cf = (GtFeatureNode *)codonfeature;
    gt_feature_node_add_child(v->mrna, cf);
    gt_array_add(v->stops, cf);
  }
}
static int create_block_features(GtBEDParser *bed_parser, GtFeatureNode *fn, GtUword block_count, GtSplitter *size_splitter, GtSplitter *start_splitter, GtIO *bed_file, GtError *err) { GtUword i; int had_err = 0; gt_assert(fn && block_count && size_splitter && start_splitter); gt_assert(gt_splitter_size(size_splitter) == block_count); gt_assert(gt_splitter_size(start_splitter) == block_count); for (i = 0; !had_err && i < block_count; i++) { GtUword block_size, block_start, start, end; GtGenomeNode *block; const char *name; if (gt_parse_uword(&block_size, gt_splitter_get_token(size_splitter, i))) { gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockSize '%s'", gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file), gt_splitter_get_token(size_splitter, i)); had_err = -1; } if (!had_err && gt_parse_uword(&block_start, gt_splitter_get_token(start_splitter, i))) { gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockStart " "'%s'", gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file), gt_splitter_get_token(start_splitter, i)); had_err = -1; } if (!had_err) { start = gt_genome_node_get_start((GtGenomeNode*) fn) + block_start; end = start + block_size - 1; block = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) fn), bed_parser->block_type ? bed_parser->block_type : BED_BLOCK_TYPE, start, end, gt_feature_node_get_strand(fn)); if ((name = gt_feature_node_get_attribute(fn, GT_GFF_NAME))) { gt_feature_node_add_attribute((GtFeatureNode*) block, GT_GFF_NAME, name); } gt_feature_node_set_score((GtFeatureNode*) block, gt_feature_node_get_score(fn)); gt_feature_node_set_strand((GtFeatureNode*) block, gt_feature_node_get_strand(fn)); gt_feature_node_add_child(fn, (GtFeatureNode*) block); } } return had_err; }
/* Attaches the selected PBS hit from <results> as a GT_PBS_TYPE child
   feature to element->mainnode.
   If *<canonical_strand> is GT_STRAND_UNKNOWN, the hit of rank 0 is used and
   its strand is stored in *<canonical_strand>. Otherwise the hits are
   inspected in rank order until one on the canonical strand is found; if
   there is none, nothing is attached.
   The new feature gets <tag> as source, the hit score, and the attributes
   "trna" (only if the hit has one), "trnaoffset", "pbsoffset" and "edist".
   The strand of element->mainnode is set to the strand of the hit. */
static void pbs_attach_results_to_gff3(GtPBSResults *results,
                                       GtLTRElement *element,
                                       GtStrand *canonical_strand,
                                       GtStr *tag)
{
  char numbuf[BUFSIZ];
  unsigned long next_rank = 0;
  GtFeatureNode *pbs_feature;
  GtRange coords;
  GtPBSHit *hit;

  hit = gt_pbs_results_get_ranked_hit(results, next_rank);
  next_rank++;

  if (*canonical_strand != GT_STRAND_UNKNOWN) {
    /* a strand constraint has to be satisfied -> find the best-scoring PBS
       on the given canonical strand */
    while (gt_pbs_hit_get_strand(hit) != *canonical_strand
             && next_rank < gt_pbs_results_get_number_of_hits(results)) {
      gt_log_log("dropping PBS because of nonconsistent strand: %s\n",
                 gt_feature_node_get_attribute(element->mainnode, "ID"));
      hit = gt_pbs_results_get_ranked_hit(results, next_rank);
      next_rank++;
    }
    /* no hit on the canonical strand -> do not report a PBS */
    if (gt_pbs_hit_get_strand(hit) != *canonical_strand)
      return;
  }
  else
    *canonical_strand = gt_pbs_hit_get_strand(hit);

  coords = gt_pbs_hit_get_coords(hit);
  /* GFF3 is 1-based */
  coords.start++;
  coords.end++;

  pbs_feature = (GtFeatureNode*)
                gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                           element->mainnode),
                                    GT_PBS_TYPE, coords.start, coords.end,
                                    gt_pbs_hit_get_strand(hit));
  gt_feature_node_set_source(pbs_feature, tag);
  gt_feature_node_set_score(pbs_feature, (float) gt_pbs_hit_get_score(hit));
  if (gt_pbs_hit_get_trna(hit) != NULL) {
    gt_feature_node_add_attribute(pbs_feature, "trna",
                                  gt_pbs_hit_get_trna(hit));
  }
  gt_feature_node_set_strand(element->mainnode, gt_pbs_hit_get_strand(hit));

  (void) snprintf(numbuf, BUFSIZ-1, "%lu", gt_pbs_hit_get_tstart(hit));
  gt_feature_node_add_attribute(pbs_feature, "trnaoffset", numbuf);
  (void) snprintf(numbuf, BUFSIZ-1, "%lu", gt_pbs_hit_get_offset(hit));
  gt_feature_node_add_attribute(pbs_feature, "pbsoffset", numbuf);
  (void) snprintf(numbuf, BUFSIZ-1, "%lu", gt_pbs_hit_get_edist(hit));
  gt_feature_node_add_attribute(pbs_feature, "edist", numbuf);

  gt_feature_node_add_child(element->mainnode, pbs_feature);
}
/* Lua binding: expects a genome node as argument 1 and an attribute name
   (string) as argument 2. Raises a Lua argument error if the node is not a
   feature node. Pushes the attribute value, or nil if the lookup yields
   NULL, and returns 1 (the number of pushed results). */
static int feature_node_lua_get_attribute(lua_State *L)
{
  GtGenomeNode **gn = check_genome_node(L, 1);
  const char *key = luaL_checkstring(L, 2);
  const char *value;
  /* make sure we get a feature node */
  GtFeatureNode *fn = gt_feature_node_try_cast(*gn);
  luaL_argcheck(L, fn, 1, "not a feature node");

  value = gt_feature_node_get_attribute(fn, key);
  if (value == NULL)
    lua_pushnil(L);
  else
    lua_pushstring(L, value);
  return 1;
}
/* Traversal callback (<data> is the GtGFF3Visitor): determines the unique ID
   string for <fn> and registers it for the direct children of <fn> via
   add_id().
   Features are only processed if they have children, are multi-features, or
   (with retain_ids set) carry an "ID" attribute; for all others 0 is
   returned immediately. For multi-features the ID of the representative is
   used (and created first if necessary). IDs are produced by
   make_id_unique() when retain_ids is set and by create_unique_id()
   otherwise. Returns the result of the traversal of the direct children. */
static int store_ids(GtFeatureNode *fn, void *data, GtError *err)
{
  GtGFF3Visitor *visitor = (GtGFF3Visitor*) data;
  AddIDInfo info;
  GtStr *id;
  gt_error_check(err);
  gt_assert(fn && visitor);

  /* features which need no ID are skipped */
  if (!gt_feature_node_has_children(fn) &&
      !gt_feature_node_is_multi(fn) &&
      !(visitor->retain_ids && gt_feature_node_get_attribute(fn, "ID"))) {
    return 0;
  }

  if (!gt_feature_node_is_multi(fn)) {
    id = visitor->retain_ids ? make_id_unique(visitor, fn)
                             : create_unique_id(visitor, fn);
  }
  else {
    GtFeatureNode *representative =
      gt_feature_node_get_multi_representative(fn);
    id = gt_hashmap_get(visitor->feature_node_to_unique_id_str,
                        representative);
    if (!id) {
      /* the representative does not have its own id yet -> create it */
      id = visitor->retain_ids
             ? make_id_unique(visitor, representative)
             : create_unique_id(visitor, representative);
    }
    /* store id for feature, if the feature was not the representative */
    if (representative != fn) {
      gt_hashmap_add(visitor->feature_node_to_unique_id_str, fn,
                     gt_str_ref(id));
    }
  }

  /* for each child -> store the parent feature in the hash map */
  info.gt_feature_node_to_id_array = visitor->feature_node_to_id_array;
  info.id = gt_str_get(id);
  return gt_feature_node_traverse_direct_children(fn, &info, add_id, err);
}
/* Visitor callback for feature nodes: calls gff3_version_string(), runs
   store_ids() over <fn> and its children and then shows them with
   gff3_show_feature_node(). Afterwards both per-feature hashmaps are reset
   and, if <fn> has children or (with retain_ids set) an "ID" attribute,
   GT_GFF_TERMINATOR followed by a newline is written to the output string
   (if one is set) or to the output file. Returns the traversal result. */
static int gff3_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                     GtError *err)
{
  GtGFF3Visitor *visitor;
  int had_err;
  gt_error_check(err);
  visitor = gff3_visitor_cast(nv);

  gff3_version_string(nv);

  /* first pass: determine the IDs */
  had_err = gt_feature_node_traverse_children(fn, visitor, store_ids, true,
                                              err);

  /* second pass: show the features */
  if (!had_err) {
    if (!gt_feature_node_is_tree(fn)) {
      /* got a DAG -> traverse in topologically sorted depth first fashion to
         make sure that the 'Parent' attributes are shown in correct order */
      had_err = gt_feature_node_traverse_children_top(fn, visitor,
                                                      gff3_show_feature_node,
                                                      err);
    }
    else {
      had_err = gt_feature_node_traverse_children(fn, visitor,
                                                  gff3_show_feature_node,
                                                  true, err);
    }
  }

  /* reset hashmaps */
  gt_hashmap_reset(visitor->feature_node_to_id_array);
  gt_hashmap_reset(visitor->feature_node_to_unique_id_str);

  /* a terminator is only needed if the feature has children (otherwise it is
     clear that the feature is complete, because no ID attribute has been
     shown) or if a retained ID has been shown */
  if (!gt_feature_node_has_children(fn) &&
      !(visitor->retain_ids && gt_feature_node_get_attribute(fn, "ID"))) {
    return had_err;
  }

  if (visitor->outstr) {
    gt_str_append_cstr(visitor->outstr, GT_GFF_TERMINATOR);
    gt_str_append_char(visitor->outstr, '\n');
  }
  else
    gt_file_xprintf(visitor->outfp, "%s\n", GT_GFF_TERMINATOR);

  return had_err;
}
static int gaeval_visitor_visit_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn, GtError *error) { AgnGaevalVisitor *v = gaeval_visitor_cast(nv); gt_error_check(error); GtFeatureNodeIterator *feats = gt_feature_node_iterator_new(fn); GtFeatureNode *tempfeat; for(tempfeat = gt_feature_node_iterator_next(feats); tempfeat != NULL; tempfeat = gt_feature_node_iterator_next(feats)) { if(agn_typecheck_mrna(tempfeat) == false) continue; double coverage = gaeval_visitor_calculate_coverage(v, tempfeat, error); char covstr[16]; sprintf(covstr, "%.3lf", coverage); gt_feature_node_add_attribute(tempfeat, "gaeval_coverage", covstr); double integrity_components[5]; double integrity = gaeval_visitor_calculate_integrity( v, tempfeat, coverage, integrity_components, error ); char intstr[16]; sprintf(intstr, "%.3lf", integrity); gt_feature_node_add_attribute(tempfeat, "gaeval_integrity", intstr); if(v->tsvout) { const char *mrnaid = gt_feature_node_get_attribute(tempfeat, "ID"); const char *mrnalabel = agn_feature_node_get_label(tempfeat); GtUword num_introns = agn_typecheck_count(tempfeat, agn_typecheck_intron); fprintf(v->tsvout, "%s\t%s\t%s\t%s\t%lu\t%.3lf\t%.3lf\t%.3lf\t%.3lf\n", mrnaid, mrnalabel, intstr, covstr, num_introns, integrity_components[0], integrity_components[1], integrity_components[2], integrity_components[3]); } } gt_feature_node_iterator_delete(feats); return 0; }
/* Adds <orphan> (which must have a GT_GFF_PARENT attribute) to the queue of
   orphans in <o>. If <orphan_id> is given and not yet known, it is recorded
   in o->orphan_ids. If <missing_parents> is given, each of its entries which
   is not yet known is recorded in o->missing_parents. */
void gt_orphanage_add(GtOrphanage *o, GtGenomeNode *orphan,
                      const char *orphan_id, GtStrArray *missing_parents)
{
  GtUword idx;
  gt_assert(o && orphan);
  gt_assert(gt_feature_node_get_attribute((GtFeatureNode*) orphan,
                                          GT_GFF_PARENT));

  gt_queue_add(o->orphans, orphan);

  if (orphan_id && !gt_cstr_table_get(o->orphan_ids, orphan_id))
    gt_cstr_table_add(o->orphan_ids, orphan_id);

  if (!missing_parents)
    return;

  for (idx = 0; idx < gt_str_array_size(missing_parents); idx++) {
    const char *parent_id = gt_str_array_get(missing_parents, idx);
    if (gt_cstr_table_get(o->missing_parents, parent_id))
      continue; /* already recorded */
    gt_cstr_table_add(o->missing_parents, parent_id);
  }
}
/* Returns true if <fn> has to be filtered out because of its target strand:
   that is the case if <targetstrand> is not GT_NUM_OF_STRAND_TYPES, <fn> has
   a GT_GFF_TARGET attribute with exactly one target, and the strand parsed
   from it is neither GT_NUM_OF_STRAND_TYPES nor equal to <targetstrand>.
   Returns false in all other cases. */
static bool filter_targetstrand(GtFeatureNode *fn, GtStrand targetstrand)
{
  unsigned long num_of_targets;
  GtStrand parsed_strand;
  GT_UNUSED int had_err;
  const char *target;
  gt_assert(fn);

  if (targetstrand == GT_NUM_OF_STRAND_TYPES)
    return false;

  target = gt_feature_node_get_attribute(fn, GT_GFF_TARGET);
  if (!target)
    return false;

  had_err = gt_gff3_parser_parse_target_attributes(target, &num_of_targets,
                                                   NULL, NULL, &parsed_strand,
                                                   "", 0, NULL);
  gt_assert(!had_err);

  return num_of_targets == 1 &&
         parsed_strand != GT_NUM_OF_STRAND_TYPES &&
         parsed_strand != targetstrand;
}
/* If <gn> is a feature node, iterates over it and its subfeatures and calls
   extracttarget_from_seqfiles() for every feature with a "Target" attribute.
   Stops at the first error and returns it; returns 0 otherwise (also if <gn>
   is not a feature node). */
static int extracttarget_from_node(GtGenomeNode *gn, GtStrArray *seqfiles,
                                   GtError *err)
{
  GtFeatureNodeIterator *fni;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(gn && seqfiles);

  if (!gt_genome_node_cast(gt_feature_node_class(), gn))
    return had_err;

  fni = gt_feature_node_iterator_new(gt_feature_node_cast(gn));
  while (!had_err) {
    const char *target;
    /* XXX remove cast */
    GtFeatureNode *child = (GtFeatureNode*)
                           gt_feature_node_iterator_next(fni);
    if (!child)
      break;
    target = gt_feature_node_get_attribute(child, "Target");
    if (target)
      had_err = extracttarget_from_seqfiles(target, seqfiles, err);
  }
  gt_feature_node_iterator_delete(fni);

  return had_err;
}
/* Creates the ID string for <fn> from its "ID" attribute. If that ID is
   already contained in gff3_visitor->used_ids, id_string_is_unique() is
   called with increasing counters (starting at 1) until it reports success,
   a warning is issued and the ID is replaced by the string it produced.
   The resulting ID is added to used_ids and stored for <fn> in
   feature_node_to_unique_id_str; it is also returned. */
static GtStr* make_id_unique(GtGFF3Visitor *gff3_visitor, GtFeatureNode *fn)
{
  GtUword counter = 1;
  GtStr *id = gt_str_new_cstr(gt_feature_node_get_attribute(fn, "ID"));

  if (gt_cstr_table_get(gff3_visitor->used_ids, gt_str_get(id))) {
    GtStr *candidate = gt_str_new();
    bool unique;
    do {
      unique = id_string_is_unique(id, candidate, gff3_visitor->used_ids,
                                   counter++);
    } while (!unique);
    gt_warning("feature ID \"%s\" not unique: changing to %s", gt_str_get(id),
               gt_str_get(candidate));
    gt_str_set(id, gt_str_get(candidate));
    gt_str_delete(candidate);
  }

  /* update table with the new id */
  gt_cstr_table_add(gff3_visitor->used_ids, gt_str_get(id));
  /* store (unique) id */
  gt_hashmap_add(gff3_visitor->feature_node_to_unique_id_str, fn, id);

  return id;
}
/* Decides what happens with <current_feature>, which must have a
   TARGET_STRING attribute.
   - If the attribute does not contain exactly one target, the feature is
     appended to <trees>.
   - Otherwise a key is built with build_key() and looked up in
     <target_to_elem>: if no element is stored under the key, the feature is
     included via include_feature(); if the stored feature has a lower score,
     it is replaced via replace_previous_elem(); otherwise <current_feature>
     is deleted. */
static void filter_targetbest(GtFeatureNode *current_feature,
                              GtDlist *trees, GtHashmap *target_to_elem)
{
  unsigned long num_of_targets;
  GtDlistelem *known_elem;
  GtStr *first_target_id, *key;
  const char *target;
  int had_err;
  gt_assert(current_feature && trees);

  target = gt_feature_node_get_attribute(current_feature, TARGET_STRING);
  gt_assert(target);
  first_target_id = gt_str_new();
  had_err = gt_gff3_parser_parse_target_attributes(target, &num_of_targets,
                                                   first_target_id, NULL, NULL,
                                                   "", 0, NULL);
  gt_assert(!had_err);

  if (num_of_targets != 1) {
    /* features without a single target are passed through */
    gt_dlist_add(trees, current_feature);
    gt_str_delete(first_target_id);
    return;
  }

  key = gt_str_new();
  build_key(key, current_feature, first_target_id);
  known_elem = gt_hashmap_get(target_to_elem, gt_str_get(key));
  if (known_elem == NULL) {
    /* element with this target_id not included yet -> include it */
    include_feature(trees, target_to_elem, current_feature, key);
  }
  else {
    /* element with this target_id included already -> compare them */
    GtFeatureNode *known_feature = gt_dlistelem_get_data(known_elem);
    if (gt_feature_node_get_score(current_feature) >
        gt_feature_node_get_score(known_feature)) {
      /* current feature is better -> replace previous feature */
      replace_previous_elem(known_elem, current_feature, trees,
                            target_to_elem, key);
    }
    else {
      /* current feature is not better -> remove it */
      gt_genome_node_delete((GtGenomeNode*) current_feature);
    }
  }
  gt_str_delete(key);
  gt_str_delete(first_target_id);
}
/* Creates a feature covering <range> and adds it as a child to <fn>. The new
   feature has the seqid, strand and score of <fn> and copies its "Name"
   attribute if present. Its type is bed_parser->thick_feature_type if set
   and BED_THICK_FEATURE_TYPE otherwise. */
static void construct_thick_feature(GtBEDParser *bed_parser,
                                    GtFeatureNode *fn, GtRange range)
{
  GtFeatureNode *thick;
  const char *parent_name, *thick_type;
  gt_assert(fn);

  thick_type = bed_parser->thick_feature_type
                 ? bed_parser->thick_feature_type
                 : BED_THICK_FEATURE_TYPE;
  thick = (GtFeatureNode*)
          gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) fn),
                              thick_type, range.start, range.end,
                              gt_feature_node_get_strand(fn));

  /* inherit name, score and strand from the parent */
  parent_name = gt_feature_node_get_attribute(fn, "Name");
  if (parent_name)
    gt_feature_node_add_attribute(thick, "Name", parent_name);
  gt_feature_node_set_score(thick, gt_feature_node_get_score(fn));
  gt_feature_node_set_strand(thick, gt_feature_node_get_strand(fn));

  gt_feature_node_add_child(fn, thick);
}
/* Stream callback. On the first call the complete input stream is consumed:
   feature nodes with a "Target" attribute go through filter_targetbest(),
   all other nodes are appended to tfs->trees unchanged. Afterwards (and on
   every later call) the next element of tfs->trees is stored in *<gn>, or
   NULL when the list is exhausted.
   Returns 0 on success and the error code of the input stream otherwise (in
   which case *<gn> is not written). */
static int targetbest_filter_stream_next(GtNodeStream *gs, GtGenomeNode **gn,
                                         GtError *err)
{
  GtTargetbestFilterStream *tfs;
  GtGenomeNode *node;
  int had_err = 0;
  gt_error_check(err);
  tfs = targetbest_filter_stream_cast(gs);

  if (!tfs->in_stream_processed) {
    for (;;) {
      had_err = gt_node_stream_next(tfs->in_stream, &node, err);
      if (had_err || !node)
        break;
      if (!gt_feature_node_try_cast(node) ||
          !gt_feature_node_get_attribute((GtFeatureNode*) node, "Target")) {
        /* not a candidate for filtering -> pass through */
        gt_dlist_add(tfs->trees, node);
        continue;
      }
      filter_targetbest((GtFeatureNode*) node, tfs->trees,
                        tfs->target_to_elem);
    }
    tfs->next = gt_dlist_first(tfs->trees);
    tfs->in_stream_processed = true;
  }

  if (had_err)
    return had_err;

  gt_assert(tfs->in_stream_processed);
  if (!tfs->next) {
    *gn = NULL;
    return 0;
  }
  *gn = gt_dlistelem_get_data(tfs->next);
  tfs->next = gt_dlistelem_next(tfs->next);
  return 0;
}
/* Attaches the selected PPT hit from <results> as a GT_PPT_TYPE child
   feature (with source <tag>) to element->mainnode.
   If *<canonical_strand> is GT_STRAND_UNKNOWN, the hit of rank 0 is used and
   its strand is stored in *<canonical_strand>. Otherwise the hits are
   inspected in rank order until one on the canonical strand is found; if
   there is none, nothing is attached.
   The strand of element->mainnode is set to the strand of the hit. */
static void ppt_attach_results_to_gff3(GtPPTResults *results,
                                       GtLTRElement *element,
                                       GtStrand *canonical_strand,
                                       GtStr *tag)
{
  unsigned long next_rank = 0;
  GtFeatureNode *ppt_feature;
  GtRange coords;
  GtPPTHit *hit;

  hit = gt_ppt_results_get_ranked_hit(results, next_rank);
  next_rank++;

  if (*canonical_strand != GT_STRAND_UNKNOWN) {
    /* find best-scoring PPT on the given canonical strand */
    while (gt_ppt_hit_get_strand(hit) != *canonical_strand
             && next_rank < gt_ppt_results_get_number_of_hits(results)) {
      gt_log_log("dropping PPT because of nonconsistent strand: %s\n",
                 gt_feature_node_get_attribute(element->mainnode, "ID"));
      hit = gt_ppt_results_get_ranked_hit(results, next_rank);
      next_rank++;
    }
    /* no hit on the canonical strand -> do not report a PPT */
    if (gt_ppt_hit_get_strand(hit) != *canonical_strand)
      return;
  }
  else
    *canonical_strand = gt_ppt_hit_get_strand(hit);

  coords = gt_ppt_hit_get_coords(hit);
  /* GFF3 is 1-based */
  coords.start++;
  coords.end++;

  ppt_feature = (GtFeatureNode*)
                gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                           element->mainnode),
                                    GT_PPT_TYPE, coords.start, coords.end,
                                    gt_ppt_hit_get_strand(hit));
  gt_feature_node_set_source(ppt_feature, tag);
  gt_feature_node_set_strand(element->mainnode, gt_ppt_hit_get_strand(hit));
  gt_feature_node_add_child(element->mainnode, ppt_feature);
}
static int gt_extract_feature_sequence_generic(GtStr *sequence, GtGenomeNode *gn, const char *type, bool join, GtStr *seqid, GtStrArray *target_ids, unsigned int *out_phase_offset, GtRegionMapping *region_mapping, GtError *err) { GtFeatureNode *fn; GtRange range; unsigned int phase_offset = 0; char *outsequence; const char *target; int had_err = 0; gt_error_check(err); fn = gt_genome_node_cast(gt_feature_node_class(), gn); gt_assert(fn); if (seqid) gt_str_append_str(seqid, gt_genome_node_get_seqid(gn)); if (target_ids && (target = gt_feature_node_get_attribute(fn, GT_GFF_TARGET))) { had_err = gt_gff3_parser_parse_all_target_attributes(target, false, target_ids, NULL, NULL, "", 0, err); } if (!had_err) { if (join) { GtFeatureNodeIterator *fni; GtFeatureNode *child; bool reverse_strand = false, first_child = true, first_child_of_type_seen = false; GtPhase phase = GT_PHASE_UNDEFINED; /* in this case we have to traverse the children */ fni = gt_feature_node_iterator_new_direct(gt_feature_node_cast(gn)); while (!had_err && (child = gt_feature_node_iterator_next(fni))) { if (first_child) { if (target_ids && (target = gt_feature_node_get_attribute(child, GT_GFF_TARGET))) { gt_str_array_reset(target_ids); had_err = gt_gff3_parser_parse_all_target_attributes(target, false, target_ids, NULL, NULL, "", 0, err); } first_child = false; } if (!had_err) { if (extract_join_feature((GtGenomeNode*) child, type, region_mapping, sequence, &reverse_strand, &first_child_of_type_seen, &phase, err)) { had_err = -1; } if (phase != GT_PHASE_UNDEFINED) { phase_offset = (int) phase; } } } gt_feature_node_iterator_delete(fni); gt_assert(phase_offset <= (unsigned int) GT_PHASE_UNDEFINED); if (!had_err && gt_str_length(sequence)) { if (reverse_strand) { had_err = gt_reverse_complement(gt_str_get(sequence), gt_str_length(sequence), err); } } } else if (gt_feature_node_get_type(fn) == type) { GtPhase phase = gt_feature_node_get_phase(fn); gt_assert(!had_err); if (phase != GT_PHASE_UNDEFINED) 
phase_offset = (unsigned int) phase; /* otherwise we only have to look at this feature */ range = gt_genome_node_get_range(gn); gt_assert(range.start); /* 1-based coordinates */ had_err = gt_region_mapping_get_sequence(region_mapping, &outsequence, gt_genome_node_get_seqid(gn), range.start, range.end, err); if (!had_err) { gt_str_append_cstr_nt(sequence, outsequence, gt_range_length(&range)); gt_free(outsequence); if (gt_feature_node_get_strand(fn) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(gt_str_get(sequence), gt_str_length(sequence), err); } } } } if (out_phase_offset && phase_offset != GT_PHASE_UNDEFINED) { *out_phase_offset = phase_offset; } return had_err; }
/* Visitor callback for feature nodes.
   - If the seqid of <fn> is contained in aiv->defined_seqids, <fn> is added
     to aiv->node_buffer.
   - If it is not and aiv->ensure_sorting is set, <err> is set and -1 is
     returned.
   - Otherwise <fn> is collected in an automatically created sequence region
     for its seqid (a warning is issued when the region is created). The
     range of that region is kept up to date: for features without the
     GT_GFF_IS_CIRCULAR attribute, the range of <fn> joined with the ranges
     of all its subfeatures is merged into it.
   Returns 0 unless the sorting check fails. */
static int add_ids_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                        GtError *err)
{
  GtAddIDsVisitor *aiv = add_ids_visitor_cast(nv);
  const char *seqid = gt_str_get(gt_genome_node_get_seqid((GtGenomeNode*) fn));
  AutomaticSequenceRegion *auto_sr;
  GtRange range;
  bool is_circular;

  if (aiv->ensure_sorting && !gt_cstr_table_get(aiv->defined_seqids, seqid)) {
    gt_error_set(err, "the file %s is not sorted (seqid \"%s\" on line %u has "
                 "not been previously introduced with a \"%s\" line)",
                 gt_genome_node_get_filename((GtGenomeNode*) fn), seqid,
                 gt_genome_node_get_line_number((GtGenomeNode*) fn),
                 GT_GFF_SEQUENCE_REGION);
    return -1;
  }

  /* the sequence region is known -> simply buffer the node */
  if (gt_cstr_table_get(aiv->defined_seqids, seqid)) {
    gt_queue_add(aiv->node_buffer, fn);
    return 0;
  }

  /* determine the range covered by the feature */
  range = gt_genome_node_get_range((GtGenomeNode*) fn);
  is_circular = gt_feature_node_get_attribute(fn, GT_GFF_IS_CIRCULAR)
                  ? true : false;
  if (!is_circular) {
    GtFeatureNodeIterator *fni = gt_feature_node_iterator_new(fn);
    GtFeatureNode *node;
    while ((node = gt_feature_node_iterator_next(fni)) != NULL) {
      GtRange node_range = gt_genome_node_get_range((GtGenomeNode*) node);
      range = gt_range_join(&range, &node_range);
    }
    gt_feature_node_iterator_delete(fni);
  }

  /* sequence region has not been previously introduced -> check if one has
     already been created automatically */
  auto_sr = gt_hashmap_get(aiv->undefined_sequence_regions, seqid);
  if (auto_sr) {
    if (auto_sr->is_circular) {
      gt_assert(!is_circular); /* XXX */
    }
    else if (is_circular) {
      gt_assert(!auto_sr->is_circular); /* XXX */
      auto_sr->is_circular = true;
      gt_genome_node_set_range(auto_sr->sequence_region, &range);
    }
    else {
      /* update the range of the sequence region */
      GtRange known_range = gt_genome_node_get_range(auto_sr->sequence_region),
              joined_range = gt_range_join(&range, &known_range);
      gt_genome_node_set_range(auto_sr->sequence_region, &joined_range);
    }
  }
  else {
    /* sequence region has not been created automatically -> do it now */
    GtStr *seqid_str;
    gt_warning("seqid \"%s\" on line %u in file \"%s\" has not been "
               "previously introduced with a \"%s\" line, create such a line "
               "automatically", seqid,
               gt_genome_node_get_line_number((GtGenomeNode*) fn),
               gt_genome_node_get_filename((GtGenomeNode*) fn),
               GT_GFF_SEQUENCE_REGION);
    auto_sr = automatic_sequence_region_new(is_circular);
    seqid_str = gt_genome_node_get_seqid((GtGenomeNode*) fn);
    auto_sr->sequence_region = gt_region_node_new(seqid_str, range.start,
                                                  range.end);
    gt_hashmap_add(aiv->undefined_sequence_regions, gt_str_get(seqid_str),
                   auto_sr);
  }

  gt_array_add(auto_sr->feature_nodes, fn);
  return 0;
}
static int snp_annotator_classify_snp(GtSNPAnnotatorVisitor *sav, GtFeatureNode *mRNA, GtFeatureNode *snp, GtUword variant_pos, GtUword variant_idx, char variant_char, #ifndef NDEBUG GT_UNUSED char reference_char, #endif GT_UNUSED GtError *err) { int had_err = 0; char *mrnaseq; const char *variant_effect = NULL; gt_assert(mRNA && snp && sav); gt_log_log("processing variant char %c for SNP %s\n", variant_char, gt_feature_node_get_attribute(snp, "Dbxref")); mrnaseq = gt_hashmap_get(sav->rnaseqs, mRNA); gt_assert(mrnaseq); if (mrnaseq) { char codon[3], variant_codon[3]; GtStr *effect_string; char oldamino, newamino; GT_UNUSED GtUword mrnalen; GtUword startpos = variant_pos / GT_CODON_LENGTH, variantoffset = variant_pos % GT_CODON_LENGTH; mrnalen = strlen(mrnaseq); gt_assert(variant_pos < mrnalen); variant_codon[0] = codon[0] = mrnaseq[3*startpos]; variant_codon[1] = codon[1] = mrnaseq[3*startpos+1]; variant_codon[2] = codon[2] = mrnaseq[3*startpos+2]; variant_codon[variantoffset] = variant_char; #ifndef NDEBUG gt_assert(toupper(codon[variantoffset]) == toupper(reference_char)); #endif if (gt_trans_table_is_stop_codon(sav->tt, codon[0], codon[1], codon[2])) { if (gt_trans_table_is_stop_codon(sav->tt, variant_codon[0], variant_codon[1], variant_codon[2])) { variant_effect = gt_symbol(GT_SNP_SYNONYMOUS_STOP_EFFECT); } else { variant_effect = gt_symbol(GT_SNP_STOP_LOST_EFFECT); } } else { if (gt_trans_table_is_stop_codon(sav->tt, variant_codon[0], variant_codon[1], variant_codon[2])) { variant_effect = gt_symbol(GT_SNP_NONSENSE_EFFECT); } else { had_err = gt_trans_table_translate_codon(sav->tt, codon[0], codon[1], codon[2], &oldamino, err); if (!had_err) { had_err = gt_trans_table_translate_codon(sav->tt, variant_codon[0], variant_codon[1], variant_codon[2], &newamino, err); } if (!had_err) { if (newamino == oldamino) { variant_effect = gt_symbol(GT_SNP_SYNONYMOUS_AMINO_EFFECT); } else { variant_effect = gt_symbol(GT_SNP_MISSENSE_EFFECT); } } } } if (!had_err) { const 
char *var_attrib; gt_assert(variant_effect != NULL); if ((var_attrib = gt_feature_node_get_attribute(snp, GT_GVF_VARIANT_EFFECT))) { effect_string = gt_str_new_cstr(var_attrib); gt_str_append_cstr(effect_string, ","); gt_str_append_cstr(effect_string, variant_effect); } else { effect_string = gt_str_new_cstr(variant_effect); } gt_str_append_cstr(effect_string, " "); gt_str_append_ulong(effect_string, variant_idx); gt_str_append_cstr(effect_string, " "); gt_str_append_cstr(effect_string, gt_feature_node_get_type(mRNA)); gt_str_append_cstr(effect_string, " "); gt_str_append_cstr(effect_string, gt_feature_node_get_attribute(mRNA, GT_GFF_ID)); gt_feature_node_set_attribute(snp, GT_GVF_VARIANT_EFFECT, gt_str_get(effect_string)); gt_str_reset(effect_string); gt_str_delete(effect_string); } } return had_err; }
/* Visitor callback: annotates the effect of a SNP/SNV feature <fn> on the
   mRNAs of sav->gene.
   Features whose type is neither sav->SNV_type nor sav->SNP_type (and NULL
   nodes) are ignored. For each direct child of sav->gene of type
   sav->mRNA_type, the CDS features below it are visited; for a CDS which
   overlaps the SNP, the position of the SNP in the mRNA sequence (taken from
   sav->rnaseqs) is computed and each variant character listed in the
   GT_GVF_VARIANT_SEQ attribute which differs from the original base is
   passed to snp_annotator_classify_snp(). On the reverse strand the variant
   characters are complemented first.
   Returns 0 on success and the first error code otherwise. */
static int snp_annotator_visitor_feature_node(GtNodeVisitor *nv,
                                              GtFeatureNode *fn,
                                              GtError *err)
{
  int had_err = 0;
  GtSNPAnnotatorVisitor *sav;
  GtFeatureNodeIterator *fni,
                        *mrnafni;
  GtFeatureNode *curnode,
                *curnode2;
  GtRange snp_rng;
  gt_error_check(err);
  sav = snp_annotator_visitor_cast(nv);

  /* ignore non-nodes */
  if (!fn) return 0;

  /* only process SNPs */
  if (!(gt_feature_node_get_type(fn) == sav->SNV_type ||
        gt_feature_node_get_type(fn) == sav->SNP_type)) {
    return 0;
  }

  fni = gt_feature_node_iterator_new_direct(sav->gene);
  snp_rng = gt_genome_node_get_range((GtGenomeNode*) fn);
  while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) {
    if (gt_feature_node_get_type(curnode) == sav->mRNA_type) {
      GtStrand mrna_strand = gt_feature_node_get_strand(curnode);
#ifndef NDEBUG
      const char *refstr;
#endif
      /* number of CDS bases seen before the SNP position */
      GtUword mrnasnppos = 0;
      mrnafni = gt_feature_node_iterator_new(curnode);
      while (!had_err && (curnode2 = gt_feature_node_iterator_next(mrnafni))) {
        if (gt_feature_node_get_type(curnode2) == sav->CDS_type) {
          GtRange cds_rng = gt_genome_node_get_range((GtGenomeNode*) curnode2);
          if (gt_range_overlap(&snp_rng, &cds_rng)) {
            char *mRNA,
                 origchar;
            char *variantchars, *variantptr = NULL;
            GT_UNUSED char *refptr = NULL;
            mRNA = (char*) gt_hashmap_get(sav->rnaseqs, curnode);
            gt_assert(mRNA);
            gt_assert(snp_rng.start >= cds_rng.start);
            mrnasnppos += (snp_rng.start - cds_rng.start);
            if (mrna_strand == GT_STRAND_REVERSE)
              mrnasnppos = strlen(mRNA) - mrnasnppos - 1;
            gt_assert(mrnasnppos < strlen(mRNA));
            origchar = mRNA[mrnasnppos];
#ifndef NDEBUG
            /* NOTE(review): on the reverse strand this block complements
               origchar, which is compared with the variant characters below;
               with NDEBUG defined the block is compiled out and origchar
               stays uncomplemented, so debug and release builds can behave
               differently -- confirm which behavior is intended */
            refstr = refptr = gt_cstr_dup(gt_feature_node_get_attribute(fn,
                                                        GT_GVF_REFERENCE_SEQ));
            if (!had_err && refstr) {
              if (gt_feature_node_get_strand(curnode) == GT_STRAND_REVERSE) {
                int rval = gt_complement(&origchar, origchar, err);
                gt_assert(rval == 0);
              }
              gt_assert(toupper(origchar) == toupper(refstr[0]));
            }
#endif
            variantchars = variantptr = gt_cstr_dup(
                         gt_feature_node_get_attribute(fn, GT_GVF_VARIANT_SEQ));
            if (!had_err && variantchars) {
              GtUword i = 0;

              /* walk over the variant list up to its end or the first ';' */
              while (!had_err &&
                       (*variantchars != ';' && *variantchars != '\0')) {
                if (*variantchars != ',' && *variantchars != origchar) {
                  char variantchar = *variantchars;
#ifndef NDEBUG
                  char refchar = refstr ? refstr[0] : '-';  /* XXX */
                  if (!had_err && mrna_strand == GT_STRAND_REVERSE)
                    had_err = gt_complement(&refchar, refchar, err);
#endif
                  if (!had_err && mrna_strand == GT_STRAND_REVERSE)
                    had_err = gt_complement(&variantchar, variantchar, err);
                  if (!had_err) {
                    had_err = snp_annotator_classify_snp(sav, curnode, fn,
                                                         mrnasnppos,
                                                         i++,
                                                         variantchar,
#ifndef NDEBUG
                                                         refchar,
#endif
                                                         err);
                  }
                } else if (*variantchars == origchar) {
                  i++;
                }
                variantchars++;
              }
            }
            /* release both copies in any case; they used to be freed only
               if the variant attribute was present, which leaked the copy of
               the reference attribute otherwise (NULL pointers are passed to
               gt_free() here just as before in builds with NDEBUG) */
            gt_free(variantptr);
            gt_free(refptr);
          } else {
            mrnasnppos += gt_range_length(&cds_rng);
          }
        }
      }
      gt_feature_node_iterator_delete(mrnafni);
    }
  }
  gt_feature_node_iterator_delete(fni);

  return had_err;
}
/* Pulls the next node from ls->in_stream and, if it is a feature node
   containing an LTR retrotransposon, writes the element to the output files
   of the stream:
   - one line with tab separated columns to ls->tabout_file (element and LTR
     coordinates, TSDs, PPT, PBS, protein domain order),
   - FASTA entries for PPT, PBS, 5' LTR, 3' LTR and the complete element to
     the respective output files,
   - protein domain output via write_pdom().
   The node is passed on unchanged in <*gn>. Non-feature nodes are passed
   through untouched. Returns 0 on success, a non-zero value (with <err> set)
   otherwise. */
int gt_ltrfileout_stream_next(GtNodeStream *ns, GtGenomeNode **gn,
                              GtError *err)
{
  GtLTRdigestFileOutStream *ls;
  GtFeatureNode *fn;
  GtRange lltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          rltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          ppt_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          pbs_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD};
  int had_err;
  GtUword i = 0;

  gt_error_check(err);
  ls = gt_ltrdigest_file_out_stream_cast(ns);

  /* initialize this element */
  memset(&ls->element, 0, sizeof (GtLTRElement));

  /* get annotations from parser */
  had_err = gt_node_stream_next(ls->in_stream, gn, err);
  if (!had_err && *gn) {
    GtFeatureNodeIterator* gni;
    GtFeatureNode *mygn;

    /* only process feature nodes (nothing has been allocated yet) */
    if (!(fn = gt_feature_node_try_cast(*gn)))
      return 0;

    ls->element.pdomorder = gt_array_new(sizeof (const char*));

    /* fill LTRElement structure from GFF3 subgraph */
    gni = gt_feature_node_iterator_new(fn);
    for (mygn = fn; mygn != NULL; mygn = gt_feature_node_iterator_next(gni))
      (void) gt_genome_node_accept((GtGenomeNode*) mygn,
                                   (GtNodeVisitor*) ls->lv, err);
    gt_feature_node_iterator_delete(gni);
  }

  if (!had_err && ls->element.mainnode != NULL) {
    char desc[GT_MAXFASTAHEADER];
    GtFeatureNode *ltr3, *ltr5;
    GtStr *sdesc, *sreg, *seq;

    /* find sequence in GtEncseq */
    sreg = gt_genome_node_get_seqid((GtGenomeNode*) ls->element.mainnode);
    sdesc = gt_str_new();
    had_err = gt_region_mapping_get_description(ls->rmap, sdesc, sreg, err);

    if (!had_err) {
      GtRange rng;
      /* seqid is the description cut to at most seqnamelen characters,
         with blanks replaced by underscores */
      ls->element.seqid = gt_calloc((size_t) ls->seqnamelen+1, sizeof (char));
      (void) snprintf(ls->element.seqid,
                      MIN((size_t) gt_str_length(sdesc),
                          (size_t) ls->seqnamelen)+1,
                      "%s", gt_str_get(sdesc));
      gt_cstr_rep(ls->element.seqid, ' ', '_');
      if (gt_str_length(sdesc) > (GtUword) ls->seqnamelen)
        ls->element.seqid[ls->seqnamelen] = '\0';

      (void) gt_ltrelement_format_description(&ls->element, ls->seqnamelen,
                                              desc,
                                              (size_t) (GT_MAXFASTAHEADER-1));

      /* output basic retrotransposon data */
      lltr_rng = gt_genome_node_get_range((GtGenomeNode*)
                                          ls->element.leftLTR);
      rltr_rng = gt_genome_node_get_range((GtGenomeNode*)
                                          ls->element.rightLTR);
      rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.mainnode);
      gt_file_xprintf(ls->tabout_file,
                      GT_WU"\t"GT_WU"\t"GT_WU"\t%s\t"GT_WU"\t"GT_WU"\t"GT_WU"\t"
                      GT_WU"\t"GT_WU"\t"GT_WU"\t",
                      rng.start,
                      rng.end,
                      gt_ltrelement_length(&ls->element),
                      ls->element.seqid,
                      lltr_rng.start,
                      lltr_rng.end,
                      gt_ltrelement_leftltrlen(&ls->element),
                      rltr_rng.start,
                      rltr_rng.end,
                      gt_ltrelement_rightltrlen(&ls->element));
    }
    /* the description is no longer needed; release it on the error path as
       well, where it used to leak */
    gt_str_delete(sdesc);

    seq = gt_str_new();

    /* output TSDs */
    if (!had_err && ls->element.leftTSD != NULL) {
      GtRange tsd_rng;
      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.leftTSD);
      had_err = gt_extract_feature_sequence(seq,
                                  (GtGenomeNode*) ls->element.leftTSD,
                                  gt_symbol(gt_ft_target_site_duplication),
                                  false, NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file,
                        ""GT_WU"\t"GT_WU"\t%s\t",
                        tsd_rng.start,
                        tsd_rng.end,
                        gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t");

    if (!had_err && ls->element.rightTSD != NULL) {
      GtRange tsd_rng;
      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.rightTSD);
      had_err = gt_extract_feature_sequence(seq,
                                  (GtGenomeNode*) ls->element.rightTSD,
                                  gt_symbol(gt_ft_target_site_duplication),
                                  false, NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file,
                        ""GT_WU"\t"GT_WU"\t%s\t",
                        tsd_rng.start,
                        tsd_rng.end,
                        gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t");

    /* output PPT */
    if (!had_err && ls->element.ppt != NULL) {
      GtStrand ppt_strand = gt_feature_node_get_strand(ls->element.ppt);
      ppt_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.ppt);
      had_err = gt_extract_feature_sequence(seq,
                                            (GtGenomeNode*) ls->element.ppt,
                                            gt_symbol(gt_ft_RR_tract), false,
                                            NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&ppt_rng),
                            GT_FSWIDTH, ls->pptout_file);
        /* last column: distance between the PPT and the adjacent LTR */
        gt_file_xprintf(ls->tabout_file,
                        ""GT_WU"\t"GT_WU"\t%s\t%c\t%d\t",
                        ppt_rng.start,
                        ppt_rng.end,
                        gt_str_get(seq),
                        GT_STRAND_CHARS[ppt_strand],
                        (ppt_strand == GT_STRAND_FORWARD ?
                            abs((int) (rltr_rng.start - ppt_rng.end)) :
                            abs((int) (lltr_rng.end - ppt_rng.start))));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t");

    /* output PBS */
    if (!had_err && ls->element.pbs != NULL) {
      GtStrand pbs_strand;
      pbs_strand = gt_feature_node_get_strand(ls->element.pbs);
      pbs_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.pbs);
      had_err = gt_extract_feature_sequence(seq,
                                          (GtGenomeNode*) ls->element.pbs,
                                          gt_symbol(gt_ft_primer_binding_site),
                                          false, NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&pbs_rng),
                            GT_FSWIDTH, ls->pbsout_file);
        gt_file_xprintf(ls->tabout_file,
                        ""GT_WU"\t"GT_WU"\t%c\t%s\t%s\t%s\t%s\t%s\t",
                        pbs_rng.start,
                        pbs_rng.end,
                        GT_STRAND_CHARS[pbs_strand],
                        gt_feature_node_get_attribute(ls->element.pbs, "trna"),
                        gt_str_get(seq),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "pbsoffset"),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "trnaoffset"),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "edist"));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t\t\t\t");

    /* output protein domains */
    if (!had_err && ls->element.pdoms != NULL) {
      GtStr *pdomorderstr = gt_str_new();
      for (i = 0; !had_err && i < gt_array_size(ls->element.pdomorder); i++) {
        const char* key = *(const char**) gt_array_get(ls->element.pdomorder,
                                                       i);
        GtArray *entry = (GtArray*) gt_hashmap_get(ls->element.pdoms, key);
        had_err = write_pdom(ls, entry, key, ls->rmap, desc, err);
      }

      /* the domain order is reported in the orientation of the element */
      if (GT_STRAND_REVERSE == gt_feature_node_get_strand(ls->element.mainnode))
        gt_array_reverse(ls->element.pdomorder);

      for (i = 0; !had_err && i < gt_array_size(ls->element.pdomorder); i++) {
        const char* name = *(const char**) gt_array_get(ls->element.pdomorder,
                                                        i);
        gt_str_append_cstr(pdomorderstr, name);
        if (i != gt_array_size(ls->element.pdomorder)-1)
          gt_str_append_cstr(pdomorderstr, "/");
      }
      gt_file_xprintf(ls->tabout_file, "%s", gt_str_get(pdomorderstr));
      gt_str_delete(pdomorderstr);
    }

    /* output LTRs (we just expect them to exist) */
    switch (gt_feature_node_get_strand(ls->element.mainnode)) {
      case GT_STRAND_REVERSE:
        ltr5 = ls->element.rightLTR;
        ltr3 = ls->element.leftLTR;
        break;
      case GT_STRAND_FORWARD:
      default:
        ltr5 = ls->element.leftLTR;
        ltr3 = ls->element.rightLTR;
        break;
    }

    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr5,
                                        gt_symbol(gt_ft_long_terminal_repeat),
                                        false, NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr5out_file);
      gt_str_reset(seq);
    }
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr3,
                                        gt_symbol(gt_ft_long_terminal_repeat),
                                        false, NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr3out_file);
      gt_str_reset(seq);
    }

    /* output complete oriented element */
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq,
                                         (GtGenomeNode*) ls->element.mainnode,
                                         gt_symbol(gt_ft_LTR_retrotransposon),
                                         false, NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->elemout_file);
      gt_str_reset(seq);
    }
    gt_file_xprintf(ls->tabout_file, "\n");
    gt_str_delete(seq);
  }

  /* release the per-element data and do not leave dangling pointers behind
     in the stream object */
  gt_hashmap_delete(ls->element.pdoms);
  ls->element.pdoms = NULL;
  gt_array_delete(ls->element.pdomorder);
  ls->element.pdomorder = NULL;
  gt_free(ls->element.seqid);
  ls->element.seqid = NULL;
  return had_err;
}
/* Feature node callback of the LTR visitor: stores <fn> in the matching slot
   of lv->element, depending on the feature type:
   - LTR_retrotransposon: main node,
   - long_terminal_repeat: left LTR on first occurrence, right LTR afterwards
     (together with the 0-based boundary coordinates),
   - target_site_duplication: left TSD first, right TSD afterwards,
   - RR_tract / primer_binding_site: only the first one seen is kept,
   - protein_match: appended to the per-domain array in lv->element->pdoms,
     keyed by the "name" attribute with '/' replaced by '_'.
   Other types are ignored. Always returns 0; <err> is never set. */
static int gt_ltr_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                       GT_UNUSED GtError *err)
{
  GtLTRVisitor *lv;
  GtRange node_range;
  GtArray *pdomarr = NULL;
  const char *pfamname;
  const char *fnt;
  lv = gt_ltr_visitor_cast(nv);
  gt_assert(lv);
  gt_error_check(err);

  fnt = gt_feature_node_get_type(fn);

  if (strcmp(fnt, gt_ft_LTR_retrotransposon) == 0) {
    lv->element->mainnode = fn;
  } else if (strcmp(fnt, gt_ft_long_terminal_repeat) == 0) {
    node_range = gt_genome_node_get_range((GtGenomeNode*) fn);
    if (lv->element->leftLTR == NULL) {
      lv->element->leftLTR = fn;
      /* compensate for 1-based node coords */
      lv->element->leftLTR_5 = node_range.start - 1;
      lv->element->leftLTR_3 = node_range.end - 1;
    } else {
      lv->element->rightLTR = fn;
      /* compensate for 1-based node coords */
      lv->element->rightLTR_5 = node_range.start - 1;
      lv->element->rightLTR_3 = node_range.end - 1;
    }
  } else if (strcmp(fnt, gt_ft_target_site_duplication) == 0) {
    if (lv->element->leftTSD == NULL) {
      lv->element->leftTSD = fn;
    } else {
      lv->element->rightTSD = fn;
    }
  } else if (strcmp(fnt, gt_ft_RR_tract) == 0) {
    if (lv->element->ppt == NULL) {
      lv->element->ppt = fn;
    }
  } else if (strcmp(fnt, gt_ft_primer_binding_site) == 0) {
    if (lv->element->pbs == NULL) {
      lv->element->pbs = fn;
    }
  } else if (strcmp(fnt, gt_ft_protein_match) == 0) {
    char buf[BUFSIZ];
    if (!lv->element->pdoms) {
      lv->element->pdoms = gt_hashmap_new(GT_HASH_STRING, gt_free_func,
                                          (GtFree) gt_array_delete);
    }
    pfamname = gt_feature_node_get_attribute(fn, "name");
    /* A protein match without a name cannot be keyed; passing NULL to the
       "%s" conversion below would be undefined behaviour, so skip the node.
       No error is reported because the caller ignores the return value. */
    if (pfamname == NULL)
      return 0;
    (void) snprintf(buf, BUFSIZ-1, "%s", pfamname);
    gt_cstr_rep(buf, '/', '_');
    if (!(pdomarr = (GtArray*) gt_hashmap_get(lv->element->pdoms, buf))) {
      /* first hit of this domain: the hashmap takes ownership of the key
         copy, pdomorder only borrows the pointer */
      char *pfamcpy = gt_cstr_dup(buf);
      pdomarr = gt_array_new(sizeof (GtFeatureNode*));
      gt_hashmap_add(lv->element->pdoms, pfamcpy, pdomarr);
      if (lv->element->pdomorder != NULL)
        gt_array_add(lv->element->pdomorder, pfamcpy);
    }
    gt_array_add(pdomarr, fn);
  }
  return 0;
}
int gt_gtf_parser_parse(GtGTFParser *parser, GtQueue *genome_nodes, GtStr *filenamestr, GtFile *fpin, bool be_tolerant, GtError *err) { GtStr *seqid_str, *source_str, *line_buffer; char *line; size_t line_length; GtUword i, line_number = 0; GtGenomeNode *gn; GtRange range; GtPhase phase_value; GtStrand gt_strand_value; GtSplitter *splitter, *attribute_splitter; float score_value; char *seqname, *source, *feature, *start, *end, *score, *strand, *frame, *attributes, *token, *gene_id, *gene_name = NULL, *transcript_id, *transcript_name = NULL, **tokens; GtHashmap *transcript_id_hash; /* map from transcript id to array of genome nodes */ GtArray *gt_genome_node_array; ConstructionInfo cinfo; GTF_feature_type gtf_feature_type; GT_UNUSED bool gff_type_is_valid = false; const char *type = NULL; const char *filename; bool score_is_defined; int had_err = 0; gt_assert(parser && genome_nodes); gt_error_check(err); filename = gt_str_get(filenamestr); /* alloc */ line_buffer = gt_str_new(); splitter = gt_splitter_new(), attribute_splitter = gt_splitter_new(); #define HANDLE_ERROR \ if (had_err) { \ if (be_tolerant) { \ fprintf(stderr, "skipping line: %s\n", gt_error_get(err)); \ gt_error_unset(err); \ gt_str_reset(line_buffer); \ had_err = 0; \ continue; \ } \ else { \ had_err = -1; \ break; \ } \ } while (gt_str_read_next_line_generic(line_buffer, fpin) != EOF) { line = gt_str_get(line_buffer); line_length = gt_str_length(line_buffer); line_number++; gene_name = gene_id = transcript_id = transcript_name = NULL; had_err = 0; if (line_length == 0) { gt_warning("skipping blank line " GT_WU " in file \"%s\"", line_number, filename); } else if (line[0] == '#') { /* storing comment */ if (line_length >= 2 && line[1] == '#') gn = gt_comment_node_new(line+2); /* store '##' line as '#' line */ else gn = gt_comment_node_new(line+1); gt_genome_node_set_origin(gn, filenamestr, line_number); gt_queue_add(genome_nodes, gn); } else { bool stop_codon = false; char *tokendup, *attrkey; 
GtStrArray *attrkeys, *attrvals; /* process tab delimited GTF line */ gt_splitter_reset(splitter); gt_splitter_split(splitter, line, line_length, '\t'); if (gt_splitter_size(splitter) != 9UL) { gt_error_set(err, "line " GT_WU " in file \"%s\" contains " GT_WU " tab (\\t) " "separated fields instead of 9", line_number, filename, gt_splitter_size(splitter)); had_err = -1; break; } tokens = gt_splitter_get_tokens(splitter); seqname = tokens[0]; source = tokens[1]; feature = tokens[2]; start = tokens[3]; end = tokens[4]; score = tokens[5]; strand = tokens[6]; frame = tokens[7]; attributes = tokens[8]; /* parse feature */ if (GTF_feature_type_get(>f_feature_type, feature) == -1) { /* we skip unknown features */ fprintf(stderr, "skipping line " GT_WU " in file \"%s\": unknown " "feature: \"%s\"\n", line_number, filename, feature); gt_str_reset(line_buffer); continue; } /* translate into GFF3 feature type */ switch (gtf_feature_type) { case GTF_stop_codon: stop_codon = true; case GTF_CDS: gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker, gt_ft_CDS); type = gt_ft_CDS; break; case GTF_exon: gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker, gt_ft_exon); type = gt_ft_exon; break; case GTF_start_codon: /* we can skip the start codons, they are part of the CDS anyway */ gt_str_reset(line_buffer); continue; } gt_assert(gff_type_is_valid); /* parse the range */ had_err = gt_parse_range(&range, start, end, line_number, filename, err); HANDLE_ERROR; /* process seqname (we have to do it here because we need the range) */ gt_region_node_builder_add_region(parser->region_node_builder, seqname, range); /* parse the score */ had_err = gt_parse_score(&score_is_defined, &score_value, score, line_number, filename, err); HANDLE_ERROR; /* parse the strand */ had_err = gt_parse_strand(>_strand_value, strand, line_number, filename, err); HANDLE_ERROR; /* parse the frame */ had_err = gt_parse_phase(&phase_value, frame, line_number, filename, err); HANDLE_ERROR; 
/* parse the attributes */ attrkeys = gt_str_array_new(); attrvals = gt_str_array_new(); gt_splitter_reset(attribute_splitter); gene_id = NULL; transcript_id = NULL; gt_splitter_split(attribute_splitter, attributes, strlen(attributes), ';'); for (i = 0; i < gt_splitter_size(attribute_splitter); i++) { token = gt_splitter_get_token(attribute_splitter, i); /* skip leading blanks */ while (*token == ' ') token++; tokendup = gt_cstr_dup(token); attrkey = strtok(tokendup, " "); if (attrkey) { char *attrval = strtok(NULL, " "); if (attrval == NULL || strcmp(attrval, "") == 0 || strcmp(attrval, "\"\"") == 0) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU " in file \"%s\"", attrkey,line_number,filename); had_err = -1; } HANDLE_ERROR; if (*attrval == '"') attrval++; if (attrval[strlen(attrval)-1] == '"') attrval[strlen(attrval)-1] = '\0'; gt_assert(attrkey && strlen(attrkey) > 0); gt_assert(attrval && strlen(attrval) > 0); gt_str_array_add_cstr(attrkeys, attrkey); gt_str_array_add_cstr(attrvals, attrval); } gt_free(tokendup); /* look for the two mandatory attributes */ if (strncmp(token, GENE_ID_ATTRIBUTE, strlen(GENE_ID_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(GENE_ID_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", GENE_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; gene_id = token + strlen(GENE_ID_ATTRIBUTE) + 1; if (*gene_id == '"') gene_id++; if (gene_id[strlen(gene_id)-1] == '"') gene_id[strlen(gene_id)-1] = '\0'; } else if (strncmp(token, TRANSCRIPT_ID_ATTRIBUTE, strlen(TRANSCRIPT_ID_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(TRANSCRIPT_ID_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; transcript_id = token + strlen(TRANSCRIPT_ID_ATTRIBUTE) + 1; if (*transcript_id == '"') transcript_id++; if 
(transcript_id[strlen(transcript_id)-1] == '"') transcript_id[strlen(transcript_id)-1] = '\0'; } else if (strncmp(token, GENE_NAME_ATTRIBUTE, strlen(GENE_NAME_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(GENE_NAME_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", GENE_NAME_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; gene_name = token + strlen(GENE_NAME_ATTRIBUTE) + 1; /* for output we want to strip quotes */ if (*gene_name == '"') gene_name++; if (gene_name[strlen(gene_name)-1] == '"') gene_name[strlen(gene_name)-1] = '\0'; } else if (strncmp(token, TRANSCRIPT_NAME_ATTRIBUTE, strlen(TRANSCRIPT_NAME_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(TRANSCRIPT_NAME_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", TRANSCRIPT_NAME_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; transcript_name = token + strlen(TRANSCRIPT_NAME_ATTRIBUTE) + 1; /* for output we want to strip quotes */ if (*transcript_name == '"') transcript_name++; if (transcript_name[strlen(transcript_name)-1] == '"') transcript_name[strlen(transcript_name)-1] = '\0'; } } /* check for the mandatory attributes */ if (!gene_id) { gt_error_set(err, "missing attribute \"%s\" on line " GT_WU " in file \"%s\"", GENE_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; if (!transcript_id) { gt_error_set(err, "missing attribute \"%s\" on line " GT_WU " in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; /* process the mandatory attributes */ if (!(transcript_id_hash = gt_hashmap_get(parser->gene_id_hash, gene_id))) { transcript_id_hash = gt_hashmap_new(GT_HASH_STRING, gt_free_func, (GtFree) gt_array_delete); gt_hashmap_add(parser->gene_id_hash, gt_cstr_dup(gene_id), transcript_id_hash); } gt_assert(transcript_id_hash); if (!(gt_genome_node_array = gt_hashmap_get(transcript_id_hash, transcript_id))) { 
gt_genome_node_array = gt_array_new(sizeof (GtGenomeNode*)); gt_hashmap_add(transcript_id_hash, gt_cstr_dup(transcript_id), gt_genome_node_array); } gt_assert(gt_genome_node_array); /* save optional gene_name and transcript_name attributes */ if (transcript_name && strlen(transcript_name) > 0 && !gt_hashmap_get(parser->transcript_id_to_name_mapping, transcript_id)) { gt_hashmap_add(parser->transcript_id_to_name_mapping, gt_cstr_dup(transcript_id), gt_cstr_dup(transcript_name)); } if (gene_name && strlen(gene_name) > 0 && !gt_hashmap_get(parser->gene_id_to_name_mapping, gene_id)) { gt_hashmap_add(parser->gene_id_to_name_mapping, gt_cstr_dup(gene_id), gt_cstr_dup(gene_name)); } /* get seqid */ seqid_str = gt_hashmap_get(parser->seqid_to_str_mapping, seqname); if (!seqid_str) { seqid_str = gt_str_new_cstr(seqname); gt_hashmap_add(parser->seqid_to_str_mapping, gt_str_get(seqid_str), seqid_str); } gt_assert(seqid_str); /* construct the new feature */ gn = gt_feature_node_new(seqid_str, type, range.start, range.end, gt_strand_value); gt_genome_node_set_origin(gn, filenamestr, line_number); if (stop_codon) { gt_feature_node_add_attribute((GtFeatureNode*) gn, GTF_PARSER_STOP_CODON_FLAG, "true"); } for (i = 0; i < gt_str_array_size(attrkeys); i++) { GtFeatureNode *fn = (GtFeatureNode *)gn; const char *key = gt_str_array_get(attrkeys, i); const char *val = gt_str_array_get(attrvals, i); /* Not a comprehensive solution to ensure correct encoding, just bare minimum required to get Cufflinks output parsed */ if (strcmp(val, "=") == 0) val = "%26"; if (gt_feature_node_get_attribute(fn, key) != NULL) { const char *oldval = gt_feature_node_get_attribute(fn, key); GtStr *newval = gt_str_new_cstr(oldval); gt_str_append_char(newval, ','); gt_str_append_cstr(newval, val); gt_feature_node_set_attribute(fn, key, gt_str_get(newval)); gt_str_delete(newval); } else gt_feature_node_add_attribute(fn, key, val); } gt_str_array_delete(attrkeys); gt_str_array_delete(attrvals); /* set source */ 
source_str = gt_hashmap_get(parser->source_to_str_mapping, source); if (!source_str) { source_str = gt_str_new_cstr(source); gt_hashmap_add(parser->source_to_str_mapping, gt_str_get(source_str), source_str); } gt_assert(source_str); gt_feature_node_set_source((GtFeatureNode*) gn, source_str); if (score_is_defined) gt_feature_node_set_score((GtFeatureNode*) gn, score_value); if (phase_value != GT_PHASE_UNDEFINED) gt_feature_node_set_phase((GtFeatureNode*) gn, phase_value); gt_array_add(gt_genome_node_array, gn); } gt_str_reset(line_buffer); } /* process all region nodes */ if (!had_err) gt_region_node_builder_build(parser->region_node_builder, genome_nodes); /* process all feature nodes */ cinfo.genome_nodes = genome_nodes; cinfo.tidy = be_tolerant; cinfo.gene_id_to_name_mapping = parser->gene_id_to_name_mapping; cinfo.transcript_id_to_name_mapping = parser->transcript_id_to_name_mapping; if (!had_err) { had_err = gt_hashmap_foreach(parser->gene_id_hash, construct_genes, &cinfo, err); } gt_hashmap_foreach(parser->gene_id_hash, delete_genes, NULL, err); /* free */ gt_splitter_delete(splitter); gt_splitter_delete(attribute_splitter); gt_str_delete(line_buffer); return had_err; }
// Pulls the next node from the input stream and passes it on in *gn.
// If the node is a feature node of type feature_type_CpGI whose sequence id
// has the form "Chr<number>" and which carries a "sumcg" attribute, its score
// is set to the value computed by CpGI_score_stream_score_island().
// All other nodes are passed through unchanged.
// Returns 0 on success, or the error code of the input stream (in that case
// *gn is NULL).
static int CpGI_score_stream_next(GtNodeStream * ns, GtGenomeNode ** gn, GtError * err)
{
    GtGenomeNode * cur_node = NULL;
    int err_num;
    CpGI_score_stream * score_stream;
    unsigned long island_start;
    unsigned long island_end;
    float island_score;
    int chromosome_num;
    GtStr * seqID_gtstr;
    const char * seqID_str;
    const char * num_cg_str;
    unsigned long num_cg = 0;

    *gn = NULL;
    score_stream = CpGI_score_stream_cast(ns);

    // fetch the next node; an upstream failure is reported to the caller
    // instead of being turned into a silent end of stream
    err_num = gt_node_stream_next(score_stream->in_stream, &cur_node, err);
    if (err_num || cur_node == NULL)
        return err_num;

    *gn = cur_node;

    // try casting as a feature node so we can test type
    if (!gt_genome_node_try_cast(gt_feature_node_class(), cur_node))
        return 0;

    // we found a feature node, only CpG islands get a score
    if (!gt_feature_node_has_type(cur_node, feature_type_CpGI))
        return 0;
#if DEBUG_SCORE
    printf("found CpGI\n");
#endif

    island_start = gt_genome_node_get_start(cur_node);
    island_end = gt_genome_node_get_end(cur_node);

    // the chromosome number is taken from the sequence id; without one the
    // island cannot be scored
    seqID_gtstr = gt_genome_node_get_seqid(cur_node);
    seqID_str = gt_str_get(seqID_gtstr);
    if (sscanf(seqID_str, "Chr%d", &chromosome_num) != 1)
        return 0;

    num_cg_str = gt_feature_node_get_attribute(cur_node, "sumcg");
    if (!num_cg_str)
        return 0;
    // num_cg is an unsigned long, so it has to be read with %lu;
    // it stays 0 if the attribute is not a number
    sscanf(num_cg_str, "%lu", &num_cg);

    // now figure out the score
    island_score = CpGI_score_stream_score_island(score_stream,
                                                  chromosome_num,
                                                  num_cg,
                                                  island_start,
                                                  island_end);

    // save the score into the node
    gt_feature_node_set_score(cur_node, island_score);
    return 0;
}
// Pulls the next node from the input stream and passes it on in *gn.
// If the node is a gene (or a pseudo node containing a gene) that has a
// "Name" attribute and a sequence id of the form "Chr<number>", the name
// returned by CpGIOverlap_stream_find_gene_overlap() for its TSS is stored
// in the gene's "cpgi_at_tss" attribute. The TSS is the start coordinate for
// forward strand genes and the end coordinate otherwise.
// All other nodes are passed through unchanged.
// Returns 0 on success, or the error code of the input stream (in that case
// *gn is NULL).
static int CpGIOverlap_stream_next(GtNodeStream * ns, GtGenomeNode ** gn, GtError * err)
{
    GtGenomeNode * cur_node = NULL;
    GtGenomeNode * next_node;
    GtFeatureNodeIterator * iter;
    int err_num;
    CpGIOverlap_stream * context;
    const char * gene_name = NULL;
    const char * overlap_name = NULL;
    int chr_num;
    unsigned int TSS;

    *gn = NULL;
    context = CpGIOverlap_stream_cast(ns);

    // fetch the next node; an upstream failure is reported to the caller
    // instead of being turned into a silent end of stream
    err_num = gt_node_stream_next(context->in_stream, &cur_node, err);
    if (err_num || cur_node == NULL)
        return err_num;

    *gn = cur_node;

    // try casting as a feature node so we can test type
    if (!gt_genome_node_try_cast(gt_feature_node_class(), cur_node))
        return 0;

    // first check if it is a pseudo node, if so find the gene in it if available
    if (gt_feature_node_is_pseudo(cur_node))
    {
        iter = gt_feature_node_iterator_new(cur_node);
        if (iter == NULL)
            return 0;
        // advance to the first gene below the pseudo node
        next_node = gt_feature_node_iterator_next(iter);
        while (next_node != NULL &&
               !gt_feature_node_has_type(next_node, feature_type_gene))
        {
            next_node = gt_feature_node_iterator_next(iter);
        }
        gt_feature_node_iterator_delete(iter);
        cur_node = next_node;
        if (cur_node == NULL)
            return 0;
    }

    if (!gt_feature_node_has_type(cur_node, feature_type_gene))
        return 0;

    // genes without a name are passed through unannotated
    gene_name = gt_feature_node_get_attribute(cur_node, "Name");
    if (gene_name == NULL)
        return 0;

    if (1 != sscanf(gt_str_get(gt_genome_node_get_seqid(cur_node)), "Chr%d", &chr_num))
        return 0;

    TSS = (gt_feature_node_get_strand(cur_node) == GT_STRAND_FORWARD)
              ? gt_genome_node_get_start(cur_node)
              : gt_genome_node_get_end(cur_node);

    // now figure out the overlapping gene
    overlap_name = CpGIOverlap_stream_find_gene_overlap(context, TSS, chr_num);
    if (!overlap_name)
        return 0;

    // save the score into the node
    gt_feature_node_set_attribute(cur_node, "cpgi_at_tss", overlap_name);
    return 0;
}