/* Creates a SNP annotator visitor for <gene>, which must be a feature node
   whose type equals gt_symbol(gt_ft_gene). The visitor takes its own
   references on <gene> and <rmap>. If <trans_table> is NULL, a standard
   translation table is created and flagged as owned by the visitor (own_tt);
   otherwise the given table is used and flagged as not owned.
   Returns NULL (after deleting the half-built visitor) if no translation
   table is available or if gt_snp_annotator_visitor_prepare_gene() reports a
   non-zero status; <err> is passed on to both. */
GtNodeVisitor* gt_snp_annotator_visitor_new(GtFeatureNode *gene,
                                            GtTransTable *trans_table,
                                            GtRegionMapping *rmap,
                                            GtError *err)
{
  GtNodeVisitor *visitor;
  GtSNPAnnotatorVisitor *self;
  gt_assert(gene && gt_feature_node_get_type(gene) == gt_symbol(gt_ft_gene));

  visitor = gt_node_visitor_create(gt_snp_annotator_visitor_class());
  self = snp_annotator_visitor_cast(visitor);

  /* referenced inputs */
  self->gene = (GtFeatureNode*) gt_genome_node_ref((GtGenomeNode*) gene);
  self->rmap = gt_region_mapping_ref(rmap);

  /* feature type symbols, looked up once */
  self->mRNA_type = gt_symbol(gt_ft_mRNA);
  self->CDS_type = gt_symbol(gt_ft_CDS);
  self->SNV_type = gt_symbol(gt_ft_SNV);
  self->SNP_type = gt_symbol(gt_ft_SNP);

  /* values stored in this map are released with gt_free_func */
  self->rnaseqs = gt_hashmap_new(GT_HASH_DIRECT, NULL, gt_free_func);

  /* use the caller's translation table or fall back to a standard one */
  if (trans_table != NULL) {
    self->own_tt = false;
    self->tt = trans_table;
  }
  else {
    self->own_tt = true;
    self->tt = gt_trans_table_new_standard(err);
  }

  if (self->tt && gt_snp_annotator_visitor_prepare_gene(self, err) == 0)
    return visitor;

  gt_node_visitor_delete(visitor);
  return NULL;
}
/* Test worker: NUMBER_OF_SYMBOLS times, formats a random number drawn with
   gt_rand_max(MAX_SYMBOL) as a string, passes it to gt_symbol() and asserts
   that a further gt_symbol() lookup yields a string with equal content.
   <data> is unused; always returns NULL. */
static void* test_symbol(GT_UNUSED void *data)
{
  GtUword round = 0;
  GtStr *candidate = gt_str_new();

  while (round < NUMBER_OF_SYMBOLS) {
    gt_str_reset(candidate);
    gt_str_append_ulong(candidate, gt_rand_max(MAX_SYMBOL));
    /* this call is made unconditionally, also when assertions are disabled */
    gt_symbol(gt_str_get(candidate));
    gt_assert(strcmp(gt_symbol(gt_str_get(candidate)),
                     gt_str_get(candidate)) == 0);
    round++;
  }

  gt_str_delete(candidate);
  return NULL;
}
/* Creates a new feature node located on <seqid> with feature type <type>,
   range [<start>, <end>] and strand <strand>; requires <start> <= <end>.
   A new reference on <seqid> is taken and the type is stored as the pointer
   returned by gt_symbol(<type>). Source, attributes, children, observer and
   representative start out as NULL, the score as GT_UNDEF_FLOAT, the phase as
   GT_PHASE_UNDEFINED. The node is returned as a GtGenomeNode. */
GtGenomeNode* gt_feature_node_new(GtStr *seqid, const char *type,
                                  GtUword start, GtUword end, GtStrand strand)
{
  GtGenomeNode *node;
  GtFeatureNode *feature;
  gt_assert(seqid && type);
  gt_assert(start <= end);

  node = gt_genome_node_create(gt_feature_node_class());
  feature = gt_feature_node_cast(node);

  /* identity and location */
  feature->seqid = gt_str_ref(seqid);
  feature->source = NULL;
  feature->type = gt_symbol(type);
  feature->score = GT_UNDEF_FLOAT;
  feature->range.start = start;
  feature->range.end = end;

  /* optional parts are empty; the children list is created on demand */
  feature->representative = NULL;
  feature->attributes = NULL;
  feature->children = NULL;
  feature->observer = NULL;

  /* start from an all-zero bit field that only carries the strand ... */
  feature->bit_field = strand << STRAND_OFFSET;
  /* ... then let the helpers fill in phase, transcript feature type and tree
     status; the DFS status is set to DFS_WHITE already */
  gt_feature_node_set_phase(feature, GT_PHASE_UNDEFINED);
  set_transcriptfeaturetype(feature, TRANSCRIPT_FEATURE_TYPE_UNDETERMINED);
  set_tree_status(&feature->bit_field, IS_TREE);

  /* NOTE(review): cleared a second time, as in the original code; this looks
     redundant unless one of the helpers above touches the field -- confirm */
  feature->representative = NULL;

  return node;
}
/* Adds the term described by the OBO <stanza> to <type_graph>, which must not
   be marked ready yet. The stanza must contain exactly one "id" and one
   "name" entry and the id must not be present in the node map.
   A new type node is created and registered in the name->id, id->name and
   id->node maps as well as in the node array. Every "is_a" entry is stored
   as an is_a edge of the node. Every "relationship" entry starting with
   PART_OF, MEMBER_OF or INTEGRAL_PART_OF is stored as a part_of edge; other
   relationship entries are ignored. For all edges only the target up to the
   first blank or newline is used. */
void gt_type_graph_add_stanza(GtTypeGraph *type_graph,
                              const GtOBOStanza *stanza)
{
  /* relationship kinds which are all recorded as part_of edges, in the order
     in which they are tried (the first matching prefix wins) */
  const char *part_of_kinds[] = { PART_OF, MEMBER_OF, INTEGRAL_PART_OF };
  const GtUword nof_kinds = sizeof (part_of_kinds) / sizeof (part_of_kinds[0]);
  const char *term_id, *term_name;
  GtUword entry, nof_entries;
  GtTypeNode *node;
  GtStr *target;
  gt_assert(type_graph && stanza && !type_graph->ready);
  gt_assert(gt_obo_stanza_size(stanza, "id") == 1);
  gt_assert(gt_obo_stanza_size(stanza, "name") == 1);

  term_id = gt_symbol(gt_obo_stanza_get_value(stanza, "id", 0));
  term_name = gt_symbol(gt_obo_stanza_get_value(stanza, "name", 0));
  gt_assert(term_id);
  gt_assert(term_name);
  gt_assert(!gt_hashmap_get(type_graph->nodemap, term_id));

  /* the new node is numbered with its index in the node array */
  node = gt_type_node_new(gt_array_size(type_graph->nodes), term_id);
  gt_hashmap_add(type_graph->name2id, (char*) term_name, (char*) term_id);
  gt_hashmap_add(type_graph->id2name, (char*) term_id, (char*) term_name);
  gt_hashmap_add(type_graph->nodemap, (char*) term_id, node);
  gt_array_add(type_graph->nodes, node);

  target = gt_str_new();

  /* store is_a entries in node, if necessary */
  nof_entries = gt_obo_stanza_size(stanza, "is_a");
  for (entry = 0; entry < nof_entries; entry++) {
    const char *is_a = gt_obo_stanza_get_value(stanza, "is_a", entry);
    gt_str_reset(target);
    gt_str_append_cstr_nt(target, is_a, strcspn(is_a, " \n"));
    gt_type_node_is_a_add(node, gt_symbol(gt_str_get(target)));
  }

  /* store part_of entries in node, if necessary */
  nof_entries = gt_obo_stanza_size(stanza, "relationship");
  for (entry = 0; entry < nof_entries; entry++) {
    const char *rel = gt_obo_stanza_get_value(stanza, "relationship", entry);
    GtUword kind;
    gt_str_reset(target);
    for (kind = 0; kind < nof_kinds; kind++) {
      const size_t kind_len = strlen(part_of_kinds[kind]);
      if (strncmp(rel, part_of_kinds[kind], kind_len) == 0) {
        /* the target follows the kind and one separating character */
        const char *rel_target = rel + kind_len + 1;
        gt_str_append_cstr_nt(target, rel_target,
                              strcspn(rel_target, " \n"));
        gt_type_node_part_of_add(node, gt_symbol(gt_str_get(target)));
        break;
      }
    }
  }

  gt_str_delete(target);
}
/* Checks a metadata <value> obtained for the entry <key>. Returns true if the
   value is usable. Returns false if <value> is NULL (the getter failed;
   presumably it has set <err> itself, as gt_script_filter_get_string() does)
   or if it is the "undefined" symbol, in which case <err> is set here. */
static bool script_filter_metadata_found(const char *value, const char *key,
                                         GtError *err)
{
  if (value == NULL)
    return false;
  if (value == gt_symbol("undefined")) {
    gt_error_set(err, "metadata '%s' not found", key);
    return false;
  }
  return true;
}

/* Validates <script_filter>: the metadata entries name, description,
   short_descr, author, email and version must be available and a global
   'filter' must be defined in the script. Returns true if the script is
   valid; otherwise returns false and <err> describes the first problem.
   Validation stops at the first failing getter, so that no further getter is
   called with an error object that is already set. The 'filter' global which
   is pushed for inspection is always popped again. */
bool gt_script_filter_validate(GtScriptFilter *script_filter, GtError *err)
{
  bool filter_defined;
  gt_assert(script_filter);
  gt_error_check(err);

  if (!script_filter_metadata_found(
         gt_script_filter_get_name(script_filter, err), "name", err))
    return false;
  if (!script_filter_metadata_found(
         gt_script_filter_get_description(script_filter, err),
         "description", err))
    return false;
  if (!script_filter_metadata_found(
         gt_script_filter_get_short_description(script_filter, err),
         "short_descr", err))
    return false;
  if (!script_filter_metadata_found(
         gt_script_filter_get_author(script_filter, err), "author", err))
    return false;
  if (!script_filter_metadata_found(
         gt_script_filter_get_email(script_filter, err), "email", err))
    return false;
  if (!script_filter_metadata_found(
         gt_script_filter_get_version(script_filter, err), "version", err))
    return false;

  /* the value is only inspected, so remove it from the stack in both cases */
  lua_getglobal(script_filter->L, "filter");
  filter_defined = !lua_isnil(script_filter->L, -1);
  lua_pop(script_filter->L, 1);
  if (!filter_defined) {
    gt_error_set(err, "function 'filter' is not defined");
    return false;
  }
  return true;
}
/* TODO: caching */
/* Retrieves the string stored under the global <name> in the Lua state of
   <script_filter>. If the global is a function, it is called without
   arguments and its single result is used instead.
   Returns gt_symbol("undefined") if the global is nil. Returns NULL and sets
   <err> if the function call fails or if the value is not a string.
   Otherwise the string is returned as the pointer delivered by gt_symbol()
   (assumed to stay valid independently of the Lua state -- the "undefined"
   case already relies on this) and the value is popped, so the Lua stack is
   left exactly as it was found on every path.
   NOTE(review): a script value which literally equals "undefined" is now
   indistinguishable from a missing entry. */
static const char *gt_script_filter_get_string(GtScriptFilter *script_filter,
                                               const char *name,
                                               GtError *err)
{
  const char *result;
#ifndef NDEBUG
  int stack_size;
#endif
  gt_assert(script_filter && name);
  gt_error_check(err);

#ifndef NDEBUG
  stack_size = lua_gettop(script_filter->L);
#endif
  lua_getglobal(script_filter->L, name);
  if (lua_isnil(script_filter->L, -1)) {
    lua_pop(script_filter->L, 1);
    gt_assert(lua_gettop(script_filter->L) == stack_size);
    return gt_symbol("undefined");
  }

  /* execute callback if function is given */
  if (lua_isfunction(script_filter->L, -1)) {
    int num_of_args = 0;
    if (lua_pcall(script_filter->L, num_of_args, 1, 0) != 0) {
      /* the error message is on top of the stack */
      gt_error_set(err, "%s", lua_tostring(script_filter->L, -1));
      lua_pop(script_filter->L, 1);
      gt_assert(lua_gettop(script_filter->L) == stack_size);
      return NULL;
    }
  }

  if (lua_isnil(script_filter->L, -1) || !lua_isstring(script_filter->L, -1)) {
    lua_pop(script_filter->L, 1);
    gt_assert(lua_gettop(script_filter->L) == stack_size);
    gt_error_set(err, "script filter '%s': '%s' must return a string",
                 gt_str_get(script_filter->filename), name);
    return NULL;
  }

  /* retrieve string: copy it out of the Lua state before popping the value,
     because the Lua-owned buffer is only guaranteed to live while the value
     is on the stack */
  result = gt_symbol(lua_tostring(script_filter->L, -1));
  lua_pop(script_filter->L, 1);
  gt_assert(lua_gettop(script_filter->L) == stack_size);
  return result;
}
/* Creates a visitor which stores the given extraction settings: the region
   mapping <rm> (mandatory; stored as given, no additional reference is taken
   here), the feature <type> (stored as the pointer returned by gt_symbol()),
   the flags <join>, <translate>, <seqid> and <target>, the output line
   <width> and the output file <outfp>. The FASTA sequence counter starts
   at 0. */
GtNodeVisitor* gt_extract_feature_visitor_new(GtRegionMapping *rm,
                                              const char *type, bool join,
                                              bool translate, bool seqid,
                                              bool target, GtUword width,
                                              GtFile *outfp)
{
  GtNodeVisitor *visitor;
  GtExtractFeatureVisitor *self;
  gt_assert(rm);

  visitor = gt_node_visitor_create(gt_extract_feature_visitor_class());
  self = gt_extract_feature_visitor_cast(visitor);

  /* what to extract ... */
  self->type = gt_symbol(type);
  self->region_mapping = rm;

  /* ... how to process it ... */
  self->join = join;
  self->translate = translate;
  self->seqid = seqid;
  self->target = target;

  /* ... and where/how to write it */
  self->fastaseq_counter = 0;
  self->width = width;
  self->outfp = outfp;

  return visitor;
}
static int snp_annotator_classify_snp(GtSNPAnnotatorVisitor *sav, GtFeatureNode *mRNA, GtFeatureNode *snp, GtUword variant_pos, GtUword variant_idx, char variant_char, #ifndef NDEBUG GT_UNUSED char reference_char, #endif GT_UNUSED GtError *err) { int had_err = 0; char *mrnaseq; const char *variant_effect = NULL; gt_assert(mRNA && snp && sav); gt_log_log("processing variant char %c for SNP %s\n", variant_char, gt_feature_node_get_attribute(snp, "Dbxref")); mrnaseq = gt_hashmap_get(sav->rnaseqs, mRNA); gt_assert(mrnaseq); if (mrnaseq) { char codon[3], variant_codon[3]; GtStr *effect_string; char oldamino, newamino; GT_UNUSED GtUword mrnalen; GtUword startpos = variant_pos / GT_CODON_LENGTH, variantoffset = variant_pos % GT_CODON_LENGTH; mrnalen = strlen(mrnaseq); gt_assert(variant_pos < mrnalen); variant_codon[0] = codon[0] = mrnaseq[3*startpos]; variant_codon[1] = codon[1] = mrnaseq[3*startpos+1]; variant_codon[2] = codon[2] = mrnaseq[3*startpos+2]; variant_codon[variantoffset] = variant_char; #ifndef NDEBUG gt_assert(toupper(codon[variantoffset]) == toupper(reference_char)); #endif if (gt_trans_table_is_stop_codon(sav->tt, codon[0], codon[1], codon[2])) { if (gt_trans_table_is_stop_codon(sav->tt, variant_codon[0], variant_codon[1], variant_codon[2])) { variant_effect = gt_symbol(GT_SNP_SYNONYMOUS_STOP_EFFECT); } else { variant_effect = gt_symbol(GT_SNP_STOP_LOST_EFFECT); } } else { if (gt_trans_table_is_stop_codon(sav->tt, variant_codon[0], variant_codon[1], variant_codon[2])) { variant_effect = gt_symbol(GT_SNP_NONSENSE_EFFECT); } else { had_err = gt_trans_table_translate_codon(sav->tt, codon[0], codon[1], codon[2], &oldamino, err); if (!had_err) { had_err = gt_trans_table_translate_codon(sav->tt, variant_codon[0], variant_codon[1], variant_codon[2], &newamino, err); } if (!had_err) { if (newamino == oldamino) { variant_effect = gt_symbol(GT_SNP_SYNONYMOUS_AMINO_EFFECT); } else { variant_effect = gt_symbol(GT_SNP_MISSENSE_EFFECT); } } } } if (!had_err) { const 
char *var_attrib; gt_assert(variant_effect != NULL); if ((var_attrib = gt_feature_node_get_attribute(snp, GT_GVF_VARIANT_EFFECT))) { effect_string = gt_str_new_cstr(var_attrib); gt_str_append_cstr(effect_string, ","); gt_str_append_cstr(effect_string, variant_effect); } else { effect_string = gt_str_new_cstr(variant_effect); } gt_str_append_cstr(effect_string, " "); gt_str_append_ulong(effect_string, variant_idx); gt_str_append_cstr(effect_string, " "); gt_str_append_cstr(effect_string, gt_feature_node_get_type(mRNA)); gt_str_append_cstr(effect_string, " "); gt_str_append_cstr(effect_string, gt_feature_node_get_attribute(mRNA, GT_GFF_ID)); gt_feature_node_set_attribute(snp, GT_GVF_VARIANT_EFFECT, gt_str_get(effect_string)); gt_str_reset(effect_string); gt_str_delete(effect_string); } } return had_err; }
/* Fetches the next node from the input stream into <gn> and, if it is a
   feature node, lets the visitor ls->lv collect the parts of an LTR element
   from the node and the nodes delivered by its feature node iterator.
   If a main node was found, one line describing the element (coordinates,
   LTRs, TSDs, PPT, PBS and protein domain order) is written to the tabular
   output file and the corresponding sequences are written to the PPT, PBS,
   protein domain, 5' LTR, 3' LTR and complete element FASTA files.
   Nodes which are not feature nodes are passed through untouched.
   Returns 0 on success and a non-zero value (with <err> set by the failing
   callee) otherwise. The per-element data is released before returning.
   The description string is released on the error path as well (it used to
   leak when the description lookup failed). */
int gt_ltrfileout_stream_next(GtNodeStream *ns, GtGenomeNode **gn,
                              GtError *err)
{
  GtLTRdigestFileOutStream *ls;
  GtFeatureNode *fn;
  GtRange lltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          rltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          ppt_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          pbs_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD};
  int had_err;
  GtUword i=0;

  gt_error_check(err);
  ls = gt_ltrdigest_file_out_stream_cast(ns);

  /* initialize this element */
  memset(&ls->element, 0, sizeof (GtLTRElement));

  /* get annotations from parser */
  had_err = gt_node_stream_next(ls->in_stream, gn, err);
  if (!had_err && *gn) {
    GtFeatureNodeIterator* gni;
    GtFeatureNode *mygn;

    /* only process feature nodes */
    if (!(fn = gt_feature_node_try_cast(*gn)))
      return 0;

    ls->element.pdomorder = gt_array_new(sizeof (const char*));

    /* fill LTRElement structure from GFF3 subgraph */
    gni = gt_feature_node_iterator_new(fn);
    for (mygn = fn; mygn != NULL; mygn = gt_feature_node_iterator_next(gni))
      (void) gt_genome_node_accept((GtGenomeNode*) mygn,
                                   (GtNodeVisitor*) ls->lv, err);
    gt_feature_node_iterator_delete(gni);
  }

  if (!had_err && ls->element.mainnode != NULL) {
    char desc[GT_MAXFASTAHEADER];
    GtFeatureNode *ltr3, *ltr5;
    GtStr *sdesc, *sreg, *seq;

    /* find sequence in GtEncseq */
    sreg = gt_genome_node_get_seqid((GtGenomeNode*) ls->element.mainnode);

    sdesc = gt_str_new();
    had_err = gt_region_mapping_get_description(ls->rmap, sdesc, sreg, err);

    if (!had_err) {
      GtRange rng;
      /* keep at most seqnamelen characters of the description, with blanks
         replaced by underscores */
      ls->element.seqid = gt_calloc((size_t) ls->seqnamelen+1, sizeof (char));
      (void) snprintf(ls->element.seqid,
                      MIN((size_t) gt_str_length(sdesc),
                          (size_t) ls->seqnamelen)+1,
                      "%s", gt_str_get(sdesc));
      gt_cstr_rep(ls->element.seqid, ' ', '_');
      if (gt_str_length(sdesc) > (GtUword) ls->seqnamelen)
        ls->element.seqid[ls->seqnamelen] = '\0';

      (void) gt_ltrelement_format_description(&ls->element, ls->seqnamelen,
                                              desc,
                                              (size_t) (GT_MAXFASTAHEADER-1));

      /* output basic retrotransposon data */
      lltr_rng = gt_genome_node_get_range((GtGenomeNode*)
                                          ls->element.leftLTR);
      rltr_rng = gt_genome_node_get_range((GtGenomeNode*)
                                          ls->element.rightLTR);
      rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.mainnode);
      gt_file_xprintf(ls->tabout_file,
                      GT_WU"\t"GT_WU"\t"GT_WU"\t%s\t"GT_WU"\t"GT_WU"\t"GT_WU"\t"
                      GT_WU"\t"GT_WU"\t"GT_WU"\t",
                      rng.start,
                      rng.end,
                      gt_ltrelement_length(&ls->element),
                      ls->element.seqid,
                      lltr_rng.start,
                      lltr_rng.end,
                      gt_ltrelement_leftltrlen(&ls->element),
                      rltr_rng.start,
                      rltr_rng.end,
                      gt_ltrelement_rightltrlen(&ls->element));
    }
    /* the description is not needed any more, whether or not the lookup
       succeeded */
    gt_str_delete(sdesc);

    seq = gt_str_new();

    /* output TSDs */
    if (!had_err && ls->element.leftTSD != NULL) {
      GtRange tsd_rng;
      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.leftTSD);
      had_err = gt_extract_feature_sequence(seq,
                                    (GtGenomeNode*) ls->element.leftTSD,
                                    gt_symbol(gt_ft_target_site_duplication),
                                    false, NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file,
                        ""GT_WU"\t"GT_WU"\t%s\t",
                        tsd_rng.start,
                        tsd_rng.end,
                        gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t");

    if (!had_err && ls->element.rightTSD != NULL) {
      GtRange tsd_rng;
      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.rightTSD);
      had_err = gt_extract_feature_sequence(seq,
                                    (GtGenomeNode*) ls->element.rightTSD,
                                    gt_symbol(gt_ft_target_site_duplication),
                                    false, NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file,
                        ""GT_WU"\t"GT_WU"\t%s\t",
                        tsd_rng.start,
                        tsd_rng.end,
                        gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t");

    /* output PPT */
    if (!had_err && ls->element.ppt != NULL) {
      GtStrand ppt_strand = gt_feature_node_get_strand(ls->element.ppt);
      ppt_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.ppt);
      had_err = gt_extract_feature_sequence(seq,
                                           (GtGenomeNode*) ls->element.ppt,
                                           gt_symbol(gt_ft_RR_tract), false,
                                           NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&ppt_rng),
                            GT_FSWIDTH, ls->pptout_file);
        /* the last column is the distance between the PPT and the LTR
           adjacent to it on its strand */
        gt_file_xprintf(ls->tabout_file,
                        ""GT_WU"\t"GT_WU"\t%s\t%c\t%d\t",
                        ppt_rng.start,
                        ppt_rng.end,
                        gt_str_get(seq),
                        GT_STRAND_CHARS[ppt_strand],
                        (ppt_strand == GT_STRAND_FORWARD ?
                            abs((int) (rltr_rng.start - ppt_rng.end)) :
                            abs((int) (lltr_rng.end - ppt_rng.start))));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t");

    /* output PBS */
    if (!had_err && ls->element.pbs != NULL) {
      GtStrand pbs_strand;
      pbs_strand = gt_feature_node_get_strand(ls->element.pbs);
      pbs_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.pbs);
      had_err = gt_extract_feature_sequence(seq,
                                          (GtGenomeNode*) ls->element.pbs,
                                          gt_symbol(gt_ft_primer_binding_site),
                                          false, NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&pbs_rng),
                            GT_FSWIDTH, ls->pbsout_file);
        gt_file_xprintf(ls->tabout_file,
                        ""GT_WU"\t"GT_WU"\t%c\t%s\t%s\t%s\t%s\t%s\t",
                        pbs_rng.start,
                        pbs_rng.end,
                        GT_STRAND_CHARS[pbs_strand],
                        gt_feature_node_get_attribute(ls->element.pbs, "trna"),
                        gt_str_get(seq),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "pbsoffset"),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "trnaoffset"),
                        gt_feature_node_get_attribute(ls->element.pbs,
                                                      "edist"));
      }
      gt_str_reset(seq);
    } else
      gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t\t\t\t");

    /* output protein domains */
    if (!had_err && ls->element.pdoms != NULL) {
      GtStr *pdomorderstr = gt_str_new();
      for (i=0; !had_err && i<gt_array_size(ls->element.pdomorder); i++) {
        const char* key = *(const char**) gt_array_get(ls->element.pdomorder,
                                                       i);
        GtArray *entry = (GtArray*) gt_hashmap_get(ls->element.pdoms, key);
        had_err = write_pdom(ls, entry, key, ls->rmap, desc, err);
      }

      /* list the domain names in the orientation of the element */
      if (GT_STRAND_REVERSE == gt_feature_node_get_strand(
                                                        ls->element.mainnode))
        gt_array_reverse(ls->element.pdomorder);

      for (i=0 ;!had_err && i<gt_array_size(ls->element.pdomorder); i++) {
        const char* name = *(const char**) gt_array_get(ls->element.pdomorder,
                                                        i);
        gt_str_append_cstr(pdomorderstr, name);
        if (i != gt_array_size(ls->element.pdomorder)-1)
          gt_str_append_cstr(pdomorderstr, "/");
      }
      gt_file_xprintf(ls->tabout_file, "%s", gt_str_get(pdomorderstr));
      gt_str_delete(pdomorderstr);
    }

    /* output LTRs (we just expect them to exist) */
    switch (gt_feature_node_get_strand(ls->element.mainnode)) {
      case GT_STRAND_REVERSE:
        ltr5 = ls->element.rightLTR;
        ltr3 = ls->element.leftLTR;
        break;
      case GT_STRAND_FORWARD:
      default:
        ltr5 = ls->element.leftLTR;
        ltr3 = ls->element.rightLTR;
        break;
    }

    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr5,
                                          gt_symbol(gt_ft_long_terminal_repeat),
                                          false, NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr5out_file);
      gt_str_reset(seq);
    }
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr3,
                                          gt_symbol(gt_ft_long_terminal_repeat),
                                          false, NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr3out_file);
      gt_str_reset(seq);
    }

    /* output complete oriented element */
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq,
                                         (GtGenomeNode*) ls->element.mainnode,
                                         gt_symbol(gt_ft_LTR_retrotransposon),
                                         false, NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc,gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->elemout_file);
      gt_str_reset(seq);
    }
    gt_file_xprintf(ls->tabout_file, "\n");
    gt_str_delete(seq);
  }

  /* release the per-element data collected for this node */
  gt_hashmap_delete(ls->element.pdoms);
  gt_array_delete(ls->element.pdomorder);
  gt_free(ls->element.seqid);
  return had_err;
}
/* Writes the protein domain hits <pdoms> (all belonging to the domain named
   <pdomname>) of the candidate described by <desc>.
   The output files "<prefix>_pdom_<name>.fas", and -- if the respective
   options are set in <ls> -- "<prefix>_pdom_<name>.ali" and
   "<prefix>_pdom_<name>_aa.fas" are opened on first use and remembered in the
   corresponding hashmaps of <ls> under a copy of <pdomname>.
   If there is more than one hit, the hits are expected to be sorted
   (asserted); they are reversed in place if the first hit lies on the reverse
   strand. The sequence extracted for every hit is collected and written as
   one FASTA entry; alignments and amino acid sequences attached to the hits
   as user data are written as requested and released afterwards.
   Returns 0 on success and -1 if a sequence could not be extracted (<err> is
   then set by gt_extract_feature_sequence()). */
static int write_pdom(GtLTRdigestFileOutStream *ls, GtArray *pdoms,
                      const char *pdomname, GT_UNUSED GtRegionMapping *rmap,
                      char *desc, GtError *err)
{
  int had_err = 0;
  GtFile *nt_out = NULL,
         *ali_out = NULL,
         *aa_out = NULL;
  GtUword idx = 0,
          total_length = 0;
  GtStr *nt_seq,
        *aa_seq;
  gt_error_check(err);

  nt_seq = gt_str_new();
  aa_seq = gt_str_new();

  /* get protein domain output file, opening it if this is the first hit for
     this domain */
  nt_out = (GtFile*) gt_hashmap_get(ls->pdomout_files, pdomname);
  if (nt_out == NULL) {
    char path[GT_MAXFILENAMELEN];
    (void) snprintf(path, (size_t) (GT_MAXFILENAMELEN-1),
                    "%s_pdom_%s.fas", ls->fileprefix, pdomname);
    nt_out = gt_file_xopen(path, "w+");
    gt_hashmap_add(ls->pdomout_files, gt_cstr_dup(pdomname), nt_out);
  }

  /* get protein alignment output file */
  if (ls->write_pdom_alignments) {
    ali_out = (GtFile*) gt_hashmap_get(ls->pdomali_files, pdomname);
    if (ali_out == NULL) {
      char path[GT_MAXFILENAMELEN];
      (void) snprintf(path, (size_t) (GT_MAXFILENAMELEN-1),
                      "%s_pdom_%s.ali", ls->fileprefix, pdomname);
      ali_out = gt_file_xopen(path, "w+");
      gt_hashmap_add(ls->pdomali_files, gt_cstr_dup(pdomname), ali_out);
    }
  }

  /* get amino acid sequence output file */
  if (ls->write_pdom_aaseqs) {
    aa_out = (GtFile*) gt_hashmap_get(ls->pdomaa_files, pdomname);
    if (aa_out == NULL) {
      char path[GT_MAXFILENAMELEN];
      (void) snprintf(path, (size_t) (GT_MAXFILENAMELEN-1),
                      "%s_pdom_%s_aa.fas", ls->fileprefix, pdomname);
      aa_out = gt_file_xopen(path, "w+");
      gt_hashmap_add(ls->pdomaa_files, gt_cstr_dup(pdomname), aa_out);
    }
  }

  /* several hits: check the ordering and flip it for reverse strand hits */
  if (gt_array_size(pdoms) > 1UL) {
    for (idx = 1UL; idx < gt_array_size(pdoms); idx++) {
      gt_assert(gt_genome_node_cmp(*(GtGenomeNode**)gt_array_get(pdoms, idx),
                                *(GtGenomeNode**)gt_array_get(pdoms, idx-1))
                >= 0);
    }
    if (gt_feature_node_get_strand(*(GtFeatureNode**) gt_array_get(pdoms, 0UL))
          == GT_STRAND_REVERSE) {
      gt_array_reverse(pdoms);
    }
  }

  /* output protein domain data */
  for (idx = 0; idx < gt_array_size(pdoms); idx++) {
    GtFeatureNode *hit = *(GtFeatureNode**) gt_array_get(pdoms, idx);
    GtStr *hit_ali = gt_genome_node_get_user_data((GtGenomeNode*) hit,
                                                  "pdom_alignment"),
          *hit_aaseq = gt_genome_node_get_user_data((GtGenomeNode*) hit,
                                                    "pdom_aaseq");
    GtRange hit_rng = gt_genome_node_get_range((GtGenomeNode*) hit);

    if (gt_extract_feature_sequence(nt_seq, (GtGenomeNode*) hit,
                                    gt_symbol(gt_ft_protein_match), false,
                                    NULL, NULL, rmap, err)) {
      had_err = -1;
      break;
    }

    if (ls->write_pdom_alignments && hit_ali) {
      char header[BUFSIZ];
      /* write away alignment */
      (void) snprintf(header, BUFSIZ-1, "Protein domain alignment in translated "
                                        "sequence for candidate\n'%s':\n\n",
                                        desc);
      gt_file_xwrite(ali_out, header, (size_t) strlen(header) * sizeof (char));
      gt_file_xwrite(ali_out, gt_str_get(hit_ali),
                     (size_t) gt_str_length(hit_ali) * sizeof (char));
      gt_file_xwrite(ali_out, "---\n\n", 5 * sizeof (char));
    }
    if (ls->write_pdom_aaseqs && hit_aaseq) {
      /* append amino acid sequence */
      gt_str_append_str(aa_seq, hit_aaseq);
    }
    gt_genome_node_release_user_data((GtGenomeNode*) hit, "pdom_alignment");
    gt_genome_node_release_user_data((GtGenomeNode*) hit, "pdom_aaseq");
    total_length += gt_range_length(&hit_rng);
  }

  if (!had_err) {
    gt_fasta_show_entry(desc, gt_str_get(nt_seq), total_length, GT_FSWIDTH,
                        nt_out);
    if (ls->write_pdom_aaseqs) {
      gt_fasta_show_entry(desc, gt_str_get(aa_seq), gt_str_length(aa_seq),
                          GT_FSWIDTH, aa_out);
    }
  }

  gt_str_delete(nt_seq);
  gt_str_delete(aa_seq);
  return had_err;
}
/* Creates a visitor which searches protein domains with hmmscan.
   First "hmmscan -h" is run through system() to check that the program can
   be executed; if it cannot, <err> is set and NULL is returned (on Windows
   NULL is always returned, hmmscan support is not implemented there).
   The visitor stores <eval_cutoff>, <cutoff>, <chain_max_gap_length> and
   <rmap> (as given, no additional reference is taken here), uses the tag
   "GenomeTools" and the root type gt_ft_LTR_retrotransposon, and prepares
   one empty string per reading frame and strand.
   Finally the hmmscan command line
     hmmscan --cpu <gt_jobs> <cutoff option> <model file> -
   is assembled, stored and split at blanks into an argument vector. */
GtNodeVisitor* gt_ltrdigest_pdom_visitor_new(GtPdomModelSet *model,
                                             double eval_cutoff,
                                             unsigned int chain_max_gap_length,
                                             GtPdomCutoff cutoff,
                                             GtRegionMapping *rmap,
                                             GtError *err)
{
  GtNodeVisitor *visitor;
  GtLTRdigestPdomVisitor *self;
  GtStr *cmdline;
  int frame, status;
  gt_assert(model && rmap);

  /* make sure hmmscan can be run at all */
  status = system("hmmscan -h > /dev/null");
  if (status == -1) {
    gt_error_set(err, "error executing system(hmmscan)");
    return NULL;
  }
#ifndef _WIN32
  if (WEXITSTATUS(status) != 0) {
    gt_error_set(err, "cannot find the hmmscan executable in PATH");
    return NULL;
  }
#else
  /* XXX */
  gt_error_set(err, "hmmscan for Windows not implemented");
  return NULL;
#endif

  visitor = gt_node_visitor_create(gt_ltrdigest_pdom_visitor_class());
  self = gt_ltrdigest_pdom_visitor_cast(visitor);
  self->eval_cutoff = eval_cutoff;
  self->cutoff = cutoff;
  self->chain_max_gap_length = chain_max_gap_length;
  self->rmap = rmap;
  self->output_all_chains = false;
  self->tag = gt_str_new_cstr("GenomeTools");
  self->root_type = gt_symbol(gt_ft_LTR_retrotransposon);

  /* translation buffers for the three frames of both strands */
  for (frame = 0; frame < 3; frame++) {
    self->fwd[frame] = gt_str_new();
    self->rev[frame] = gt_str_new();
  }

  /* assemble the hmmscan command line */
  cmdline = gt_str_new_cstr("hmmscan --cpu ");
  gt_str_append_uint(cmdline, gt_jobs);
  gt_str_append_cstr(cmdline, " ");
  if (cutoff == GT_PHMM_CUTOFF_GA) {
    gt_str_append_cstr(cmdline, "--cut_ga");
  }
  else if (cutoff == GT_PHMM_CUTOFF_TC) {
    gt_str_append_cstr(cmdline, "--cut_tc");
  }
  else if (cutoff == GT_PHMM_CUTOFF_NONE) {
    gt_str_append_cstr(cmdline, "--domE ");
    gt_str_append_double(cmdline, eval_cutoff, 50);
  }
  gt_str_append_cstr(cmdline, " ");
  gt_str_append_cstr(cmdline, gt_pdom_model_set_get_filename(model));
  /* a trailing "-" is appended as the last argument */
  gt_str_append_cstr(cmdline, " -");

  self->cmdline = cmdline;
  self->args = gt_cstr_split(gt_str_get(self->cmdline), ' ');
  gt_log_log("HMMER cmdline: %s", gt_str_get(cmdline));

  return visitor;
}
/* Sets the root feature type of <lv> to <type>. The type is stored as the
   pointer returned by gt_symbol(), not as the pointer passed in. Both
   arguments must not be NULL. */
void gt_ltrdigest_pdom_visitor_set_root_type(GtLTRdigestPdomVisitor *lv,
                                             const char *type)
{
  const char *type_symbol;
  gt_assert(lv && type);
  type_symbol = gt_symbol(type);
  lv->root_type = type_symbol;
}
/* Searches the nodes delivered by a feature node iterator over <fn> for a
   feature of the configured root type (lv->root_type; the last match wins).
   If one is found, its sequence is extracted and translated in all three
   frames of both strands, the translations are piped as FASTA entries named
   "<frame>+" / "<frame>-" into a forked hmmscan process (started with the
   argument vector lv->args), and the hmmscan output is parsed and processed.
   Finally gt_ltrdigest_pdom_visitor_choose_strand() is called.
   Returns 0 on success, a non-zero value otherwise.

   Changes with respect to the previous version:
   - the root type set via the constructor or
     gt_ltrdigest_pdom_visitor_set_root_type() is honoured instead of the
     hard-coded gt_ft_LTR_retrotransposon (identical for the default);
   - lv->ltr_retrotrans is cleared before the search, so an element found
     while visiting an earlier node is not processed again for a node
     without a matching element;
   - the parent closes the read end of the pipe to the child, which used to
     leak one file descriptor per processed element. */
static int gt_ltrdigest_pdom_visitor_feature_node(GtNodeVisitor *nv,
                                                  GtFeatureNode *fn,
                                                  GtError *err)
{
  GtLTRdigestPdomVisitor *lv;
  GtFeatureNodeIterator *fni;
  GtFeatureNode *curnode = NULL;
  int had_err = 0;
  GtRange rng;
  unsigned long i;
  lv = gt_ltrdigest_pdom_visitor_cast(nv);
  gt_assert(lv);
  gt_error_check(err);

  /* forget the element found during a previous visit */
  lv->ltr_retrotrans = NULL;

  /* traverse annotation subgraph and find LTR element */
  fni = gt_feature_node_iterator_new(fn);
  while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) {
    if (strcmp(gt_feature_node_get_type(curnode), lv->root_type) == 0) {
      lv->ltr_retrotrans = curnode;
    }
  }
  gt_feature_node_iterator_delete(fni);

  if (!had_err && lv->ltr_retrotrans != NULL) {
    GtCodonIterator *ci;
    GtTranslator *tr;
    GtTranslatorStatus status;
    unsigned long seqlen;
    char translated, *rev_seq;
    FILE *instream;
    GtHMMERParseStatus *pstatus;
    unsigned int frame;
    GtStr *seq;

    seq = gt_str_new();
    rng = gt_genome_node_get_range((GtGenomeNode*) lv->ltr_retrotrans);
    lv->leftLTR_5 = rng.start - 1;
    lv->rightLTR_3 = rng.end - 1;
    seqlen = gt_range_length(&rng);

    had_err = gt_extract_feature_sequence(seq,
                                          (GtGenomeNode*) lv->ltr_retrotrans,
                                          lv->root_type,
                                          false, NULL, NULL, lv->rmap, err);

    if (!had_err) {
      for (i = 0UL; i < 3UL; i++) {
        gt_str_reset(lv->fwd[i]);
        gt_str_reset(lv->rev[i]);
      }

      /* create translations */
      ci = gt_codon_iterator_simple_new(gt_str_get(seq), seqlen, NULL);
      gt_assert(ci);
      tr = gt_translator_new(ci);
      status = gt_translator_next(tr, &translated, &frame, err);
      while (status == GT_TRANSLATOR_OK && translated) {
        gt_str_append_char(lv->fwd[frame], translated);
        status = gt_translator_next(tr, &translated, &frame, NULL);
      }
      if (status == GT_TRANSLATOR_ERROR) had_err = -1;
      if (!had_err) {
        /* translate the reverse complement with a fresh codon iterator */
        rev_seq = gt_malloc((size_t) seqlen * sizeof (char));
        strncpy(rev_seq, gt_str_get(seq), (size_t) seqlen * sizeof (char));
        (void) gt_reverse_complement(rev_seq, seqlen, NULL);
        gt_codon_iterator_delete(ci);
        ci = gt_codon_iterator_simple_new(rev_seq, seqlen, NULL);
        gt_translator_set_codon_iterator(tr, ci);
        status = gt_translator_next(tr, &translated, &frame, err);
        while (status == GT_TRANSLATOR_OK && translated) {
          gt_str_append_char(lv->rev[frame], translated);
          status = gt_translator_next(tr, &translated, &frame, NULL);
        }
        if (status == GT_TRANSLATOR_ERROR) had_err = -1;
        gt_free(rev_seq);
      }
      gt_codon_iterator_delete(ci);
      gt_translator_delete(tr);
    }

    /* run HMMER and handle results */
    if (!had_err) {
      /* pc: parent -> child (hmmscan stdin), cp: child -> parent (stdout) */
      int pid, pc[2], cp[2];
      GT_UNUSED int rval;

      (void) signal(SIGCHLD, SIG_IGN); /* XXX: for now, ignore child's
                                               exit status */
      rval = pipe(pc);
      gt_assert(rval == 0);
      rval = pipe(cp);
      gt_assert(rval == 0);

      switch ((pid = (int) fork())) {
        case -1:
          perror("Can't fork");
          exit(1);   /* XXX: error handling */
        case 0:    /* child */
          (void) close(1);    /* close current stdout. */
          rval = dup(cp[1]);  /* make stdout go to write end of pipe. */
          (void) close(0);    /* close current stdin. */
          rval = dup(pc[0]);  /* make stdin come from read end of pipe. */
          (void) close(pc[1]);
          (void) close(cp[0]);
          (void) execvp("hmmscan", lv->args); /* XXX: read path from env */
          perror("couldn't execute hmmscan!");
          exit(1);
        default:   /* parent */
          for (i = 0UL; i < 3UL; i++) {
            char buf[5];
            GT_UNUSED ssize_t written;
            /* header ">i+\n" resp. ">i-\n" is exactly four characters */
            (void) sprintf(buf, ">%lu%c\n", i, '+');
            written = write(pc[1], buf, 4 * sizeof (char));
            written = write(pc[1], gt_str_get(lv->fwd[i]),
                            (size_t) gt_str_length(lv->fwd[i]) * sizeof (char));
            written = write(pc[1], "\n", 1 * sizeof (char));
            (void) sprintf(buf, ">%lu%c\n", i, '-');
            written = write(pc[1], buf, 4 * sizeof (char));
            written = write(pc[1], gt_str_get(lv->rev[i]),
                            (size_t) gt_str_length(lv->rev[i]) * sizeof (char));
            written = write(pc[1], "\n", 1 * sizeof (char));
          }
          (void) close(pc[1]);
          /* the parent never reads from this pipe; it is closed only after
             all input has been written, so that the behaviour of the writes
             above is unchanged */
          (void) close(pc[0]);
          (void) close(cp[1]);
          instream = fdopen(cp[0], "r");
          pstatus = gt_hmmer_parse_status_new();
          had_err = gt_ltrdigest_pdom_visitor_parse_output(lv, pstatus,
                                                           instream, err);
          (void) fclose(instream);
          if (!had_err)
            had_err = gt_ltrdigest_pdom_visitor_process_hits(lv, pstatus, err);
          gt_hmmer_parse_status_delete(pstatus);
      }
    }
    gt_str_delete(seq);
  }
  if (!had_err)
    had_err = gt_ltrdigest_pdom_visitor_choose_strand(lv);
  return had_err;
}
/* Hashmap iteration callback which builds one mRNA feature for the
   transcript id <key> from the feature nodes collected in the array <value>
   and appends it to cinfo->mRNAs (<data> is the ConstructionInfo).

   1. Nodes flagged with GTF_PARSER_STOP_CODON_FLAG are merged into a directly
      adjacent CDS of the same array (the CDS range is extended) and then
      removed from the array and deleted. A stop codon contained in a CDS is
      an error, or in tidy mode a warning after which the stop codon is
      dropped. A stop codon without flanking CDS is an error, or in tidy mode
      a warning after which the node is kept.
   2. The mRNA range is the join of the ranges of all remaining nodes; all
      of them must agree in strand and sequence id.
   3. The mRNA gets the attributes ID and transcript_id (both <key>) and, if
      a non-empty name is registered for the transcript, GT_GFF_NAME. All
      remaining nodes become its children (with a new reference each).

   Returns 0 on success, -1 with <err> set otherwise.

   Changes with respect to the previous version:
   - after removing a stop codon node the index is not advanced, so the node
     which moved into the freed slot is examined as well;
   - range, strand and sequence id are initialised from the first node which
     is left after step 1. Before, they were taken from the first node of the
     unprocessed array: if that node was a removed stop codon, a deleted node
     was referenced and the node which became the new first element was
     neither joined into the range nor checked. */
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);

  /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* TODO: support discontinuous start/stop codons */
  i = 0;
  while (!had_err && i < gt_array_size(gt_genome_node_array)) {
    GtUword j;
    GtRange stop_codon_rng;
    bool found_cds = false;

    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    if (!gt_feature_node_get_attribute((GtFeatureNode*) gn,
                                       GTF_PARSER_STOP_CODON_FLAG)) {
      i++;
      continue;
    }

    /* look for a CDS which contains or directly flanks the stop codon */
    stop_codon_rng = gt_genome_node_get_range(gn);
    for (j = 0; !had_err && j < gt_array_size(gt_genome_node_array); j++) {
      GtGenomeNode* gn2;
      GtRange this_rng;
      const char *this_type;
      gn2 = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, j);
      if (gn == gn2) continue;
      this_rng = gt_genome_node_get_range(gn2);
      this_type = gt_feature_node_get_type((GtFeatureNode*) gn2);
      if (this_type == gt_symbol(gt_ft_CDS)) {
        if (gt_range_contains(&this_rng, &stop_codon_rng)) {
          if (cinfo->tidy) {
            gt_warning("stop codon on line %u in file %s is contained in "
                       "CDS in line %u",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn),
                       gt_genome_node_get_line_number(gn2));
            found_cds = true;
          } else {
            gt_error_set(err, "stop codon on line %u in file %s is "
                              "contained in CDS in line %u",
                         gt_genome_node_get_line_number(gn),
                         gt_genome_node_get_filename(gn),
                         gt_genome_node_get_line_number(gn2));
            had_err = -1;
          }
          break;
        }
        /* stop codon directly behind the CDS: extend the CDS end */
        if (this_rng.end + 1 == stop_codon_rng.start) {
          this_rng.end = stop_codon_rng.end;
          gt_genome_node_set_range(gn2, &this_rng);
          found_cds = true;
          break;
        }
        /* stop codon directly in front of the CDS: extend the CDS start */
        if (this_rng.start == stop_codon_rng.end + 1) {
          this_rng.start = stop_codon_rng.start;
          gt_genome_node_set_range(gn2, &this_rng);
          found_cds = true;
          break;
        }
      }
    }

    if (found_cds) {
      /* the stop codon is covered by a CDS now; the next node moves into
         slot i, so i must not be advanced */
      gt_array_rem(gt_genome_node_array, i);
      gt_genome_node_delete(gn);
      continue;
    }
    if (!had_err) {
      if (cinfo->tidy) {
        gt_warning("found stop codon on line %u in file %s with no "
                   "flanking CDS, ignoring it",
                   gt_genome_node_get_line_number(gn),
                   gt_genome_node_get_filename(gn));
      } else {
        gt_error_set(err, "found stop codon on line %u in file %s with no "
                          "flanking CDS",
                     gt_genome_node_get_line_number(gn),
                     gt_genome_node_get_filename(gn));
        had_err = -1;
        break;
      }
    }
    i++;
  }

  if (!had_err) {
    /* a stop codon is only removed if another node (a CDS) is present */
    gt_assert(gt_array_size(gt_genome_node_array));

    /* determine the range and the strand of the mRNA */
    first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
    mRNA_range = gt_genome_node_get_range(first_node);
    mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
    mRNA_seqid = gt_genome_node_get_seqid(first_node);

    for (i = 1; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
      GtRange range;
      GtStrand strand;
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      range = gt_genome_node_get_range(gn);
      mRNA_range = gt_range_join(&mRNA_range, &range);
      strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
      if (strand != mRNA_strand) {
        gt_error_set(err, "feature %s on line %u has strand %c, but the "
                          "parent transcript has strand %c",
                     (const char*) key,
                     gt_genome_node_get_line_number(gn),
                     GT_STRAND_CHARS[strand],
                     GT_STRAND_CHARS[mRNA_strand]);
        had_err = -1;
        break;
      } else {
        mRNA_strand = gt_strand_join(mRNA_strand, strand);
      }
      if (!had_err && gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
        gt_error_set(err, "The features on lines %u and %u refer to different "
                  "genomic sequences (``seqname''), although they have the same "
                  "gene IDs (``gene_id'') which must be globally unique",
                  gt_genome_node_get_line_number(first_node),
                  gt_genome_node_get_line_number(gn));
        had_err = -1;
        break;
      }
    }

    if (!had_err) {
      mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                      mRNA_range.end, mRNA_strand);
      gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "ID", key);
      gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node),
                                    "transcript_id", key);

      if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                                (const char*) key)) && strlen(tname) > 0) {
        gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                      tname);
      }

      /* register children */
      for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
        gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
        gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                  (GtFeatureNode*) gt_genome_node_ref(gn));
      }

      /* store the mRNA */
      gt_array_add(mRNAs, mRNA_node);
    }
  }

  return had_err;
}
static int snp_annotator_stream_next(GtNodeStream *ns, GtGenomeNode **gn, GtError *err) { GtSNPAnnotatorStream *sas; int had_err = 0; bool complete_cluster = false; GtGenomeNode *mygn = NULL; GtFeatureNode *fn = NULL; const char *snv_type = gt_symbol(gt_ft_SNV), *snp_type = gt_symbol(gt_ft_SNP), *gene_type = gt_symbol(gt_ft_gene); gt_error_check(err); sas = gt_snp_annotator_stream_cast(ns); /* if there are still SNPs left in the buffer, output them */ if (gt_queue_size(sas->outqueue) > 0) { *gn = (GtGenomeNode*) gt_queue_get(sas->outqueue); return had_err; } else complete_cluster = false; while (!had_err && !complete_cluster) { had_err = gt_node_stream_next(sas->merge_stream, &mygn, err); /* stop if stream is at the end */ if (had_err || !mygn) break; /* process all feature nodes */ if ((fn = gt_feature_node_try_cast(mygn))) { GtGenomeNode *addgn; const char *type = gt_feature_node_get_type(fn); GtRange new_rng = gt_genome_node_get_range(mygn); if (type == snv_type || type == snp_type) { /* -----> this is a SNP <----- */ if (gt_range_overlap(&new_rng, &sas->cur_gene_range)) { /* it falls into the currently observed range */ gt_queue_add(sas->snps, gt_genome_node_ref((GtGenomeNode*) fn)); } else { /* SNP outside a gene, this cluster is done add to out queue and start serving */ gt_assert(gt_queue_size(sas->outqueue) == 0); had_err = snp_annotator_stream_process_current_gene(sas, err); gt_queue_add(sas->outqueue, mygn); if (gt_queue_size(sas->outqueue) > 0) { *gn = (GtGenomeNode*) gt_queue_get(sas->outqueue); complete_cluster = true; } } } else if (type == gene_type) { /* -----> this is a gene <----- */ if (gt_array_size(sas->cur_gene_set) == 0UL) { /* new overlapping gene cluster */ addgn = gt_genome_node_ref(mygn); gt_array_add(sas->cur_gene_set, addgn); sas->cur_gene_range = gt_genome_node_get_range(mygn); } else { if (gt_range_overlap(&new_rng, &sas->cur_gene_range)) { /* gene overlaps with current one, add to cluster */ addgn = gt_genome_node_ref(mygn); 
gt_array_add(sas->cur_gene_set, addgn); sas->cur_gene_range = gt_range_join(&sas->cur_gene_range, &new_rng); } else { /* finish current cluster and start a new one */ had_err = snp_annotator_stream_process_current_gene(sas, err); if (!had_err) { addgn = gt_genome_node_ref(mygn); gt_array_add(sas->cur_gene_set, addgn); sas->cur_gene_range = gt_genome_node_get_range(mygn); } if (gt_queue_size(sas->outqueue) > 0) { *gn = (GtGenomeNode*) gt_queue_get(sas->outqueue); complete_cluster = true; } } } /* from now on, genes are kept in gene cluster arrays only */ gt_genome_node_delete(mygn); } } else { /* meta node */ had_err = snp_annotator_stream_process_current_gene(sas, err); if (!had_err) { gt_queue_add(sas->outqueue, mygn); } if (gt_queue_size(sas->outqueue) > 0) { *gn = (GtGenomeNode*) gt_queue_get(sas->outqueue); complete_cluster = true; } } } return had_err; }