/* Checks the length of <orf_rng> against [<min>, <max>] and, if it fits,
   shifts the ORF by <offset> according to <strand> and hands the shifted
   range to orf_attach_results_to_gff3(). ORFs outside the length window are
   ignored. A strand other than forward/reverse terminates the program with
   GT_EXIT_PROGRAMMING_ERROR. */
static void process_orf(GtRange orf_rng, unsigned int orf_frame,
                        GtStrand strand, GtFeatureNode *gf,
                        unsigned long offset, unsigned int min,
                        unsigned int max, GT_UNUSED GtError *err)
{
  GtRange shifted = orf_rng;

  gt_assert(gf);

  /* length filter */
  if (gt_range_length(&orf_rng) < min || gt_range_length(&orf_rng) > max)
    return;

  if (strand == GT_STRAND_FORWARD) {
    shifted.start = orf_rng.start + offset;
    shifted.end = orf_rng.end + offset;
  }
  else if (strand == GT_STRAND_REVERSE) {
    /* start and end swap roles: both are subtracted from <offset> */
    shifted.start = offset - orf_rng.end;
    shifted.end = offset - orf_rng.start;
  }
  else {
    exit(GT_EXIT_PROGRAMMING_ERROR);
  }

  orf_attach_results_to_gff3(gf, shifted, orf_frame, strand, err);
}
/* Comparison function ordering ranges by length, longest first:
   returns -1 if <range_a> is longer than <range_b>, 1 if it is shorter and
   0 if both have the same length. */
int gt_range_compare_by_length_ptr(const GtRange *range_a,
                                   const GtRange *range_b)
{
  GtUword len_a, len_b;

  gt_assert(range_a && range_b);
  len_a = gt_range_length(range_a);
  len_b = gt_range_length(range_b);

  if (len_a > len_b)
    return -1;
  if (len_a < len_b)
    return 1;
  return 0;
}
/* Returns the length of sequence number <seq_num> in <seq_con>, computed as
   the length of the range reported by gth_seq_con_get_range(). */
unsigned long gth_seq_con_get_length(GthSeqCon *seq_con,
                                     unsigned long seq_num)
{
  GtRange seq_range;

  gt_assert(seq_con);
  seq_range = gth_seq_con_get_range(seq_con, seq_num);
  return gt_range_length(&seq_range);
}
/* Returns the length of the range reaching from the start of the first
   element of <ranges> to the end of its last element.
   NOTE(review): presumably expects <ranges> to be non-empty and sorted --
   confirm with callers. */
GtUword gt_ranges_spanned_length(const GtArray *ranges)
{
  const GtRange *first, *last;
  GtRange span;

  gt_assert(ranges);
  first = gt_array_get_first(ranges);
  last = gt_array_get_last(ranges);
  span.start = first->start;
  span.end = last->end;
  return gt_range_length(&span);
}
/* Determines whether block captions should be shown for feature type <gft>
   in diagram <d> and stores the answer in <result>.

   The decision is cached per type in d->caption_display_status. On a cache
   miss it is computed as follows:
   - the style option "format"/"show_block_captions" (default: true) is read;
   - if captions are enabled, the type's "max_capt_show_width" is queried:
     when it is not set, captions are shown; otherwise they are shown only if
     the length of the diagram range does not exceed that width.

   Returns 0 on success and -1 if a style query failed (in which case nothing
   is cached and <result> is left untouched). */
static int get_caption_display_status(GtDiagram *d, const char *gft,
                                      bool *result, GtError *err)
{
  bool *status;
  gt_assert(d && gft);

  status = (bool*) gt_hashmap_get(d->caption_display_status, gft);
  if (!status) {
    GtUword threshold = GT_UNDEF_UWORD;
    double tmp = GT_UNDEF_DOUBLE;

    status = gt_malloc(sizeof (bool));
    *status = true;
    if (gt_style_get_bool(d->style, "format", "show_block_captions", status,
                          NULL, err) == GT_STYLE_QUERY_ERROR) {
      gt_free(status);
      return -1;
    }
    if (*status) {
      GtStyleQueryStatus rval;
      rval = gt_style_get_num(d->style, gft, "max_capt_show_width", &tmp,
                              NULL, err);
      switch (rval) {
        case GT_STYLE_QUERY_ERROR:
          gt_free(status);
          return -1;
        case GT_STYLE_QUERY_NOT_SET:
          /* no width limit configured for this type: always show */
          *status = true;
          break;
        default:
          gt_assert(tmp != GT_UNDEF_DOUBLE);
          threshold = tmp;
          gt_assert(tmp != GT_UNDEF_UWORD);
          *status = (gt_range_length(&d->range) <= threshold);
          break;
      }
      /* Each case above sets *status itself. The former unconditional
         re-computation at this point overwrote those results and was only
         harmless as long as the "unset" threshold compared greater than or
         equal to every possible range length. */
    }
    /* the map keeps the allocated flag for subsequent lookups */
    gt_hashmap_add(d->caption_display_status, (void*) gft, status);
  }
  *result = *status;
  return 0;
}
/* Updates the per-type counters and length/score distributions of <sv> for
   feature node <fn>. Exactly one of the type branches below is taken; nodes
   of any other type leave <sv> unchanged. Distributions are only filled if
   they have been allocated (non-NULL). */
static void compute_type_statistics(GtFeatureNode *fn, GtStatVisitor *sv)
{
  GtRange span;

  gt_assert(fn && sv);

  if (gt_feature_node_has_type(fn, gt_ft_gene)) {
    sv->number_of_genes++;
    if (gt_feature_node_has_CDS(fn))
      sv->number_of_protein_coding_genes++;
    if (sv->gene_length_distribution) {
      span = gt_genome_node_get_range((GtGenomeNode*) fn);
      gt_disc_distri_add(sv->gene_length_distribution,
                         gt_range_length(&span));
    }
    if (sv->gene_score_distribution) {
      /* scores are scaled by 100 before being added to the distribution */
      gt_disc_distri_add(sv->gene_score_distribution,
                         gt_feature_node_get_score(fn) * 100.0);
    }
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_mRNA)) {
    sv->number_of_mRNAs++;
    if (gt_feature_node_has_CDS(fn))
      sv->number_of_protein_coding_mRNAs++;
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_exon)) {
    sv->number_of_exons++;
    if (sv->exon_length_distribution) {
      span = gt_genome_node_get_range((GtGenomeNode*) fn);
      gt_disc_distri_add(sv->exon_length_distribution,
                         gt_range_length(&span));
    }
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_CDS)) {
    sv->number_of_CDSs++;
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_intron)) {
    /* introns are not counted, only their lengths are recorded */
    if (sv->intron_length_distribution) {
      span = gt_genome_node_get_range((GtGenomeNode*) fn);
      gt_disc_distri_add(sv->intron_length_distribution,
                         gt_range_length(&span));
    }
    return;
  }

  if (gt_feature_node_has_type(fn, gt_ft_LTR_retrotransposon))
    sv->number_of_LTR_retrotransposons++;
}
/* Region node callback of the stat visitor: counts the sequence region and
   adds the length of its range to the running total. Always returns 0. */
static int stat_visitor_region_node(GtNodeVisitor *nv, GtRegionNode *rn,
                                    GT_UNUSED GtError *err)
{
  GtStatVisitor *stats;
  GtRange region;

  gt_error_check(err);
  stats = stat_visitor_cast(nv);
  region = gt_genome_node_get_range((GtGenomeNode*) rn);

  stats->number_of_sequence_regions++;
  stats->total_length_of_sequence_regions += gt_range_length(&region);
  return 0;
}
/* Builds an edge table connecting matches whose computed gap size does not
   exceed <max_gap_size> and clusters <matches> into <cs> via
   gt_cluster_matches().

   The number of elements in <cs> must equal the number of matches;
   otherwise <err> is set and -1 is returned. Returns 0 on success and -1 if
   clustering failed. */
GT_UNUSED static int gt_cluster_matches_gap(GtArray *matches,
                                            GtClusteredSet *cs,
                                            unsigned long max_gap_size,
                                            GtError *err)
{
  GtMatchReference *mref;
  GtMatchEdgeTable matchedgetab;
  GtMatchEdge matchedge;
  GtMatch *match;
  GtRange range;
  unsigned long i, j, gap_size = 0, length = 0;
  unsigned long num_of_matches, num_of_elements;
  int had_err = 0;

  num_of_matches = gt_array_size(matches);
  num_of_elements = gt_clustered_set_num_of_elements(cs, err);
  if (num_of_elements != num_of_matches) {
    had_err = -1;
    gt_error_set(err, "number of matches (%lu) unequals number of elements in "
                      "clustered set (%lu)", num_of_matches, num_of_elements);
  }

  if (!had_err) {
    matchedgetab.edges = gt_array_new(sizeof (GtMatchEdge));
    matchedgetab.num_of_edges = 0;
    mref = gt_mirror_and_sort_matches(matches);

    /* <mref> holds 2 * num_of_matches entries. The bound is written as
       "i + 1 < ..." so that it cannot wrap around for an empty match list
       (2 * 0 - 1 would be the largest unsigned value). */
    for (i = 0; i + 1 < 2 * num_of_matches; i++) {
      /* match and length depend on i only, so look them up once per row */
      match = *(GtMatch**) gt_array_get(matches, mref[i].matchnum);
      gt_match_get_range_seq1(match, &range);
      length = gt_range_length(&range);

      for (j = i + 1; j < (2 * num_of_matches); j++) {
        /* NOTE(review): the length of match i is added to, not subtracted
           from, the start position difference -- confirm this is the
           intended gap definition */
        gap_size = mref[j].startpos - mref[i].startpos + length;
        /* entries are sorted, so later ones cannot be closer */
        if (gap_size > max_gap_size)
          break;
        if (mref[i].matchnum != mref[j].matchnum) {
          STORECLUSTEREDGEG(mref[i].matchnum, mref[j].matchnum, gap_size);
        }
      }
    }

    if (gt_cluster_matches(cs, &matchedgetab, err) != 0)
      had_err = -1;
    gt_array_delete(matchedgetab.edges);
    gt_free(mref);
  }
  return had_err;
}
/* Traversal callback; <data> is the GtStatVisitor. For an exon the exon
   counter used for the distribution is incremented, for a CDS the length of
   its range is added to the accumulated CDS length. Other types are ignored.
   Always returns 0. */
static int add_exon_or_cds_number(GtFeatureNode *fn, void *data,
                                  GT_UNUSED GtError *err)
{
  GtStatVisitor *stats = (GtStatVisitor*) data;

  gt_error_check(err);
  gt_assert(stats && fn);

  if (gt_feature_node_has_type(fn, gt_ft_exon)) {
    stats->exon_number_for_distri++;
    return 0;
  }

  if (gt_feature_node_has_type(fn, gt_ft_CDS)) {
    GtRange cds_range = gt_genome_node_get_range((GtGenomeNode*) fn);
    stats->cds_length_for_distri += gt_range_length(&cds_range);
  }
  return 0;
}
/* Dispatches to the class-specific get_features_for_range() implementation
   of <feature_index> for <seqid> and <range>, passing <results> through.
   The call is bracketed by acquiring the index's lock for reading and
   releasing it afterwards. Returns the implementation's return value. */
int gt_feature_index_get_features_for_range(GtFeatureIndex *feature_index,
                                            GtArray *results,
                                            const char *seqid,
                                            const GtRange *range,
                                            GtError *err)
{
  int had_err;

  gt_assert(feature_index && feature_index->c_class && results && seqid &&
            range);
  gt_assert(gt_range_length(range) > 0);

  gt_rwlock_rdlock(feature_index->pvt->lock);
  had_err = feature_index->c_class->get_features_for_range(feature_index,
                                                           results, seqid,
                                                           range, err);
  gt_rwlock_unlock(feature_index->pvt->lock);

  return had_err;
}
/* Returns the fraction of the combined exon length of mRNA <genemodel> that
   is covered by the ranges in <exon_coverage> (sum of the range lengths
   divided by the combined exon length). */
static double gaeval_visitor_coverage_resolve(GtFeatureNode *genemodel,
                                              GtArray *exon_coverage)
{
  GtUword total_exon_length, covered_length = 0, idx = 0;

  agn_assert(genemodel && exon_coverage);
  agn_assert(gt_feature_node_has_type(genemodel, "mRNA"));

  total_exon_length =
    agn_typecheck_feature_combined_length(genemodel, agn_typecheck_exon);

  while (idx < gt_array_size(exon_coverage)) {
    GtRange *segment = gt_array_get(exon_coverage, idx);
    covered_length += gt_range_length(segment);
    idx++;
  }

  agn_assert(covered_length <= total_exon_length);
  return (double) covered_length / (double) total_exon_length;
}
GtRange gt_pbs_hit_get_coords(const GtPBSHit *h) { GtRange rng; gt_assert(h && h->end >= h->start); rng.start = h->start; rng.end = h->end; switch (h->strand) { case GT_STRAND_FORWARD: default: rng.start = h->res->elem->leftLTR_3 + 1 - h->res->opts->radius + rng.start; rng.end = rng.start + (h->end - h->start); break; case GT_STRAND_REVERSE: rng.end = h->res->elem->rightLTR_5 - 1 + h->res->opts->radius - rng.start; rng.start = rng.end - (h->end - h->start); break; } gt_assert(gt_range_length(&rng) == (h->end - h->start + 1)); return rng; }
/* Helper for joined sequence extraction: if <gn> is a feature of type
   <type>, its sequence is fetched via <region_mapping> and appended to
   <sequence>. Nodes of other types are skipped and 0 is returned.

   Bookkeeping for the caller:
   - <reverse_strand> is set to true once a matching reverse-strand feature
     is seen; for such a feature <phase> is set to its phase.
   - On other strands <phase> is set to the phase of the first matching
     feature only (tracked via <first_child_of_type_seen>); for every later
     one it is set to GT_PHASE_UNDEFINED.

   Returns the error code of the sequence lookup (0 on success). */
static int extract_join_feature(GtGenomeNode *gn, const char *type,
                                GtRegionMapping *region_mapping,
                                GtStr *sequence, bool *reverse_strand,
                                bool *first_child_of_type_seen,
                                GtPhase *phase, GtError *err)
{
  GtFeatureNode *fn;
  GtRange range;
  char *outsequence;
  int had_err;

  gt_error_check(err);
  fn = gt_feature_node_cast(gn);
  gt_assert(fn);

  if (!gt_feature_node_has_type(fn, type))
    return 0;

  if (gt_feature_node_get_strand(fn) == GT_STRAND_REVERSE) {
    *reverse_strand = true;
    *phase = gt_feature_node_get_phase(fn);
  }
  else if (*first_child_of_type_seen) {
    *phase = GT_PHASE_UNDEFINED;
  }
  else {
    *first_child_of_type_seen = true;
    *phase = gt_feature_node_get_phase(fn);
  }

  range = gt_genome_node_get_range(gn);
  had_err = gt_region_mapping_get_sequence(region_mapping, &outsequence,
                                           gt_genome_node_get_seqid(gn),
                                           range.start, range.end, err);
  if (had_err)
    return had_err;

  gt_str_append_cstr_nt(sequence, outsequence, gt_range_length(&range));
  gt_free(outsequence);
  return 0;
}
/* Converts position <pos> into a coordinate relative to <viewrange>: the
   signed distance from the start of the view range divided by the length of
   the view range. Positions left of the view range give negative values. */
double gt_coords_convert_point(GtRange viewrange, GtWord pos)
{
  GtWord distance = (GtWord) pos - (GtWord) viewrange.start;
  double width = (double) gt_range_length(&viewrange);

  return (double) distance / width;
}
/* Appends the sequence belonging to feature node <gn> to <sequence>, using
   <region_mapping> to fetch the underlying sequence.

   - If <seqid> is non-NULL, the sequence ID of <gn> is appended to it.
   - If <target_ids> is non-NULL and the node carries a Target attribute, the
     attribute is parsed into <target_ids>.
   - With <join> set, the direct children of <gn> are traversed and every
     child of type <type> contributes its sequence (see
     extract_join_feature()). If the first child has a Target attribute,
     <target_ids> is reset and refilled from it. If any matching child was on
     the reverse strand, the content of <sequence> is reverse-complemented.
   - Without <join>, the sequence is only extracted if the node's type
     pointer is identical to <type> (pointer comparison -- presumably both
     are expected to be interned symbols; TODO confirm). For a reverse-strand
     feature the whole content of <sequence> is reverse-complemented.
   - If <out_phase_offset> is non-NULL and the phase offset collected along
     the way differs from GT_PHASE_UNDEFINED, it is stored there. The offset
     starts out as 0 and is only overwritten by a defined phase.

   Returns 0 on success and a non-zero value on error. */
static int gt_extract_feature_sequence_generic(GtStr *sequence, GtGenomeNode *gn, const char *type, bool join, GtStr *seqid, GtStrArray *target_ids, unsigned int *out_phase_offset, GtRegionMapping *region_mapping, GtError *err) { GtFeatureNode *fn; GtRange range; unsigned int phase_offset = 0; char *outsequence; const char *target; int had_err = 0; gt_error_check(err); fn = gt_genome_node_cast(gt_feature_node_class(), gn); gt_assert(fn); if (seqid) gt_str_append_str(seqid, gt_genome_node_get_seqid(gn)); if (target_ids && (target = gt_feature_node_get_attribute(fn, GT_GFF_TARGET))) { had_err = gt_gff3_parser_parse_all_target_attributes(target, false, target_ids, NULL, NULL, "", 0, err); } if (!had_err) { if (join) { GtFeatureNodeIterator *fni; GtFeatureNode *child; bool reverse_strand = false, first_child = true, first_child_of_type_seen = false; GtPhase phase = GT_PHASE_UNDEFINED; /* in this case we have to traverse the children */ fni = gt_feature_node_iterator_new_direct(gt_feature_node_cast(gn)); while (!had_err && (child = gt_feature_node_iterator_next(fni))) { if (first_child) { if (target_ids && (target = gt_feature_node_get_attribute(child, GT_GFF_TARGET))) { gt_str_array_reset(target_ids); had_err = gt_gff3_parser_parse_all_target_attributes(target, false, target_ids, NULL, NULL, "", 0, err); } first_child = false; } if (!had_err) { if (extract_join_feature((GtGenomeNode*) child, type, region_mapping, sequence, &reverse_strand, &first_child_of_type_seen, &phase, err)) { had_err = -1; } if (phase != GT_PHASE_UNDEFINED) { phase_offset = (int) phase; } } } gt_feature_node_iterator_delete(fni); gt_assert(phase_offset <= (unsigned int) GT_PHASE_UNDEFINED); if (!had_err && gt_str_length(sequence)) { if (reverse_strand) { had_err = gt_reverse_complement(gt_str_get(sequence), gt_str_length(sequence), err); } } } else if (gt_feature_node_get_type(fn) == type) { GtPhase phase = gt_feature_node_get_phase(fn); gt_assert(!had_err); if (phase != GT_PHASE_UNDEFINED) 
phase_offset = (unsigned int) phase; /* otherwise we only have to look at this feature */ range = gt_genome_node_get_range(gn); gt_assert(range.start); /* 1-based coordinates */ had_err = gt_region_mapping_get_sequence(region_mapping, &outsequence, gt_genome_node_get_seqid(gn), range.start, range.end, err); if (!had_err) { gt_str_append_cstr_nt(sequence, outsequence, gt_range_length(&range)); gt_free(outsequence); if (gt_feature_node_get_strand(fn) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(gt_str_get(sequence), gt_str_length(sequence), err); } } } } if (out_phase_offset && phase_offset != GT_PHASE_UNDEFINED) { *out_phase_offset = phase_offset; } return had_err; }
/* Feature node callback of the select visitor. Decides whether <fn> passes
   all configured filters; passing nodes are appended to the visitor's node
   buffer, all others are deleted. Always returns 0.

   Filter order: seqid, source, gene-specific limits (length, number, score,
   feature number), then contained/overlapping range, strand, target strand,
   CDS presence and minimum average splice site probability. Evaluation stops
   at the first filter that rejects the node. */
static int select_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                       GT_UNUSED GtError *err)
{
  GtSelectVisitor *fv;
  bool selected, filter_node = false;

  gt_error_check(err);
  fv = select_visitor_cast(nv);
  fv->current_feature++;

  /* no seqid was specified or seqids are equal */
  selected = !gt_str_length(fv->seqid) ||
             !gt_str_cmp(fv->seqid,
                         gt_genome_node_get_seqid((GtGenomeNode*) fn));
  if (selected) {
    /* no source was specified or sources are equal */
    selected = !gt_str_length(fv->source) ||
               !strcmp(gt_str_get(fv->source),
                       gt_feature_node_get_source(fn));
  }

  if (!selected) {
    filter_node = true;
  }
  else {
    GtRange range = gt_genome_node_get_range((GtGenomeNode*) fn);
    /* enforce maximum gene length */
    /* XXX: we (spuriously) assume that genes are always root nodes */
    if (fn && gt_feature_node_has_type(fn, gt_ft_gene)) {
      filter_node =
        (fv->max_gene_length != GT_UNDEF_ULONG &&
         gt_range_length(&range) > fv->max_gene_length) ||
        (fv->max_gene_num != GT_UNDEF_ULONG &&
         fv->gene_num >= fv->max_gene_num) ||
        (fv->min_gene_score != GT_UNDEF_DOUBLE &&
         gt_feature_node_get_score(fn) < fv->min_gene_score) ||
        (fv->max_gene_score != GT_UNDEF_DOUBLE &&
         gt_feature_node_get_score(fn) > fv->max_gene_score) ||
        (fv->feature_num != GT_UNDEF_ULONG &&
         fv->feature_num != fv->current_feature);
      if (!filter_node)
        fv->gene_num++; /* gene passed filter */
    }
  }

  /* remaining filters; short-circuiting skips them once a node is rejected */
  filter_node = filter_node ||
                filter_contain_range(fn, fv->contain_range) ||
                filter_overlap_range(fn, fv->overlap_range) ||
                filter_strand(fn, fv->strand) ||
                filter_targetstrand(fn, fv->targetstrand) ||
                filter_has_CDS(fn, fv->has_CDS) ||
                filter_min_average_ssp(fn, fv->min_average_splice_site_prob);

  if (filter_node)
    gt_genome_node_delete((GtGenomeNode*) fn);
  else
    gt_queue_add(fv->node_buffer, fn);
  return 0;
}
static int gt_snp_annotator_visitor_prepare_gene(GtSNPAnnotatorVisitor *sav, GtError *err) { GtFeatureNodeIterator *fni, *mrnafni; GtFeatureNode *curnode, *last_mRNA = NULL; GtStr *mrnaseq, *seqid; int had_err = 0; mrnaseq = gt_str_new(); seqid = gt_genome_node_get_seqid((GtGenomeNode*) sav->gene); fni = gt_feature_node_iterator_new(sav->gene); while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) { if (gt_feature_node_get_type(curnode) == sav->mRNA_type) { GtFeatureNode *curnode2; if (last_mRNA) { char *mrna_charseq = gt_calloc(gt_str_length(mrnaseq)+1, sizeof (char)); (void) strncpy(mrna_charseq, gt_str_get(mrnaseq), gt_str_length(mrnaseq)); if (gt_feature_node_get_strand(sav->gene) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(mrna_charseq, gt_str_length(mrnaseq), err); } if (!had_err) { gt_hashmap_add(sav->rnaseqs, last_mRNA, mrna_charseq); last_mRNA = curnode; gt_str_reset(mrnaseq); } } else last_mRNA = curnode; if (!had_err) { mrnafni = gt_feature_node_iterator_new(curnode); while (!had_err && (curnode2 = gt_feature_node_iterator_next(mrnafni))) { if (gt_feature_node_get_type(curnode2) == sav->CDS_type) { char *tmp; GtRange rng = gt_genome_node_get_range((GtGenomeNode*) curnode2); had_err = gt_region_mapping_get_sequence(sav->rmap, &tmp, seqid, rng.start, rng.end, err); if (!had_err) { gt_str_append_cstr_nt(mrnaseq, tmp, gt_range_length(&rng)); gt_free(tmp); } } } gt_feature_node_iterator_delete(mrnafni); } } } if (!had_err && last_mRNA) { char *mrna_charseq = gt_calloc(gt_str_length(mrnaseq)+1, sizeof (char)); (void) strncpy(mrna_charseq, gt_str_get(mrnaseq), gt_str_length(mrnaseq)); if (gt_feature_node_get_strand(sav->gene) == GT_STRAND_REVERSE) { had_err = gt_reverse_complement(mrna_charseq, gt_str_length(mrnaseq), err); } if (!had_err) { gt_hashmap_add(sav->rnaseqs, last_mRNA, mrna_charseq); } } gt_feature_node_iterator_delete(fni); gt_str_delete(mrnaseq); return had_err; }
/* Feature node callback of the SNP annotator visitor.

   Nodes that are NULL or whose type is neither the visitor's SNV type nor
   its SNP type are ignored (0 is returned). For a SNP/SNV, every mRNA that
   is a direct child of sav->gene is examined: its CDS features are walked,
   and the lengths of CDSs not overlapping the SNP are summed up in
   <mrnasnppos>. For a CDS overlapping the SNP, the offset of the SNP within
   that CDS is added, giving the SNP position in the cached mRNA sequence
   (looked up in sav->rnaseqs; mirrored for reverse-strand mRNAs). Each
   variant character from the GVF variant attribute that differs from the
   original character is complemented on the reverse strand and passed to
   snp_annotator_classify_snp().

   In builds without NDEBUG, the reference attribute is additionally checked
   against the mRNA sequence and the reference character is passed along.

   Returns 0 on success and a non-zero value if complementing or classifying
   failed.

   NOTE(review): <variantptr> and <refptr> are only freed inside the
   "variantchars" branch, so <refptr> appears to leak when the variant
   attribute is missing -- confirm.
   NOTE(review): <mrnasnppos> keeps being modified for CDSs visited after the
   overlapping one -- verify this is intended. */
static int snp_annotator_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn, GtError *err) { int had_err = 0; GtSNPAnnotatorVisitor *sav; GtFeatureNodeIterator *fni, *mrnafni; GtFeatureNode *curnode, *curnode2; GtRange snp_rng; gt_error_check(err); sav = snp_annotator_visitor_cast(nv); /* ignore non-nodes */ if (!fn) return 0; /* only process SNPs */ if (!(gt_feature_node_get_type(fn) == sav->SNV_type || gt_feature_node_get_type(fn) == sav->SNP_type)) { return 0; } fni = gt_feature_node_iterator_new_direct(sav->gene); snp_rng = gt_genome_node_get_range((GtGenomeNode*) fn); while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) { if (gt_feature_node_get_type(curnode) == sav->mRNA_type) { GtStrand mrna_strand = gt_feature_node_get_strand(curnode); #ifndef NDEBUG const char *refstr; #endif GtUword mrnasnppos = 0; mrnafni = gt_feature_node_iterator_new(curnode); while (!had_err && (curnode2 = gt_feature_node_iterator_next(mrnafni))) { if (gt_feature_node_get_type(curnode2) == sav->CDS_type) { GtRange cds_rng = gt_genome_node_get_range((GtGenomeNode*) curnode2); if (gt_range_overlap(&snp_rng, &cds_rng)) { char *mRNA, origchar; char *variantchars, *variantptr = NULL; GT_UNUSED char *refchars, *refptr = NULL; mRNA = (char*) gt_hashmap_get(sav->rnaseqs, curnode); gt_assert(mRNA); gt_assert(snp_rng.start >= cds_rng.start); mrnasnppos += (snp_rng.start - cds_rng.start); if (mrna_strand == GT_STRAND_REVERSE) mrnasnppos = strlen(mRNA) - mrnasnppos - 1; gt_assert(mrnasnppos < strlen(mRNA)); origchar = mRNA[mrnasnppos]; #ifndef NDEBUG refstr = refptr = gt_cstr_dup(gt_feature_node_get_attribute(fn, GT_GVF_REFERENCE_SEQ)); if (!had_err && refstr) { if (gt_feature_node_get_strand(curnode) == GT_STRAND_REVERSE) { int rval = gt_complement(&origchar, origchar, err); gt_assert(rval == 0); } gt_assert(toupper(origchar) == toupper(refstr[0])); } #endif variantchars = variantptr = gt_cstr_dup( gt_feature_node_get_attribute(fn, GT_GVF_VARIANT_SEQ)); if (!had_err && 
variantchars) { GtUword i = 0; while (!had_err && (*variantchars != ';' && *variantchars != '\0')) { if (*variantchars != ',' && *variantchars != origchar) { char variantchar = *variantchars; #ifndef NDEBUG char refchar = refstr ? refstr[0] : '-'; /* XXX */ if (!had_err && mrna_strand == GT_STRAND_REVERSE) had_err = gt_complement(&refchar, refchar, err); #endif if (!had_err && mrna_strand == GT_STRAND_REVERSE) had_err = gt_complement(&variantchar, variantchar, err); if (!had_err) { had_err = snp_annotator_classify_snp(sav, curnode, fn, mrnasnppos, i++, variantchar, #ifndef NDEBUG refchar, #endif err); } } else if (*variantchars == origchar) { i++; } variantchars++; } gt_free(variantptr); gt_free(refptr); } } else { mrnasnppos += gt_range_length(&cds_rng); } } } gt_feature_node_iterator_delete(mrnafni); } } gt_feature_node_iterator_delete(fni); return had_err; }
/* next() implementation of the LTRdigest file output stream.

   Pulls the next node from the input stream into <gn>. Nodes that are not
   feature nodes are passed through untouched (early return 0). For a
   feature node, the stream's visitor is applied to the node and its
   subgraph to fill ls->element. If that yields a main node, the following
   is written:
   - one tab-separated line to the tabular output file: element, LTR, TSD,
     PPT and PBS coordinates and sequences plus the protein domain order
     (empty columns are written for missing TSD/PPT/PBS features);
   - FASTA entries for the PPT, the PBS, each protein domain (via
     write_pdom()), the 5' and 3' LTR and the complete element to their
     respective output files. Which LTR counts as 5'/3' depends on the
     strand of the main node.

   The per-element data (pdoms map, pdomorder array, seqid string) is
   released before returning. Returns 0 on success, otherwise the error code
   of the failed step.

   NOTE(review): the return value of gt_genome_node_accept() is discarded
   "(void)", so visitor errors do not show up in the return value --
   confirm that this is intended. */
int gt_ltrfileout_stream_next(GtNodeStream *ns, GtGenomeNode **gn, GtError *err) { GtLTRdigestFileOutStream *ls; GtFeatureNode *fn; GtRange lltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD}, rltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD}, ppt_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD}, pbs_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD}; int had_err; GtUword i=0; gt_error_check(err); ls = gt_ltrdigest_file_out_stream_cast(ns); /* initialize this element */ memset(&ls->element, 0, sizeof (GtLTRElement)); /* get annotations from parser */ had_err = gt_node_stream_next(ls->in_stream, gn, err); if (!had_err && *gn) { GtFeatureNodeIterator* gni; GtFeatureNode *mygn; /* only process feature nodes */ if (!(fn = gt_feature_node_try_cast(*gn))) return 0; ls->element.pdomorder = gt_array_new(sizeof (const char*)); /* fill LTRElement structure from GFF3 subgraph */ gni = gt_feature_node_iterator_new(fn); for (mygn = fn; mygn != NULL; mygn = gt_feature_node_iterator_next(gni)) (void) gt_genome_node_accept((GtGenomeNode*) mygn, (GtNodeVisitor*) ls->lv, err); gt_feature_node_iterator_delete(gni); } if (!had_err && ls->element.mainnode != NULL) { char desc[GT_MAXFASTAHEADER]; GtFeatureNode *ltr3, *ltr5; GtStr *sdesc, *sreg, *seq; /* find sequence in GtEncseq */ sreg = gt_genome_node_get_seqid((GtGenomeNode*) ls->element.mainnode); sdesc = gt_str_new(); had_err = gt_region_mapping_get_description(ls->rmap, sdesc, sreg, err); if (!had_err) { GtRange rng; ls->element.seqid = gt_calloc((size_t) ls->seqnamelen+1, sizeof (char)); (void) snprintf(ls->element.seqid, MIN((size_t) gt_str_length(sdesc), (size_t) ls->seqnamelen)+1, "%s", gt_str_get(sdesc)); gt_cstr_rep(ls->element.seqid, ' ', '_'); if (gt_str_length(sdesc) > (GtUword) ls->seqnamelen) ls->element.seqid[ls->seqnamelen] = '\0'; (void) gt_ltrelement_format_description(&ls->element, ls->seqnamelen, desc, (size_t) (GT_MAXFASTAHEADER-1)); gt_str_delete(sdesc); /* output basic retrotransposon data */ lltr_rng = gt_genome_node_get_range((GtGenomeNode*) 
ls->element.leftLTR); rltr_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.rightLTR); rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.mainnode); gt_file_xprintf(ls->tabout_file, GT_WU"\t"GT_WU"\t"GT_WU"\t%s\t"GT_WU"\t"GT_WU"\t"GT_WU"\t" GT_WU"\t"GT_WU"\t"GT_WU"\t", rng.start, rng.end, gt_ltrelement_length(&ls->element), ls->element.seqid, lltr_rng.start, lltr_rng.end, gt_ltrelement_leftltrlen(&ls->element), rltr_rng.start, rltr_rng.end, gt_ltrelement_rightltrlen(&ls->element)); } seq = gt_str_new(); /* output TSDs */ if (!had_err && ls->element.leftTSD != NULL) { GtRange tsd_rng; tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.leftTSD); had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ls->element.leftTSD, gt_symbol(gt_ft_target_site_duplication), false, NULL, NULL, ls->rmap, err); if (!had_err) { gt_file_xprintf(ls->tabout_file, ""GT_WU"\t"GT_WU"\t%s\t", tsd_rng.start, tsd_rng.end, gt_str_get(seq)); } gt_str_reset(seq); } else gt_file_xprintf(ls->tabout_file, "\t\t\t"); if (!had_err && ls->element.rightTSD != NULL) { GtRange tsd_rng; tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.rightTSD); had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ls->element.rightTSD, gt_symbol(gt_ft_target_site_duplication), false, NULL, NULL, ls->rmap, err); if (!had_err) { gt_file_xprintf(ls->tabout_file, ""GT_WU"\t"GT_WU"\t%s\t", tsd_rng.start, tsd_rng.end, gt_str_get(seq)); } gt_str_reset(seq); } else gt_file_xprintf(ls->tabout_file, "\t\t\t"); /* output PPT */ if (!had_err && ls->element.ppt != NULL) { GtStrand ppt_strand = gt_feature_node_get_strand(ls->element.ppt); ppt_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.ppt); had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ls->element.ppt, gt_symbol(gt_ft_RR_tract), false, NULL, NULL, ls->rmap, err); if (!had_err) { gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&ppt_rng), GT_FSWIDTH, ls->pptout_file); 
gt_file_xprintf(ls->tabout_file, ""GT_WU"\t"GT_WU"\t%s\t%c\t%d\t", ppt_rng.start, ppt_rng.end, gt_str_get(seq), GT_STRAND_CHARS[ppt_strand], (ppt_strand == GT_STRAND_FORWARD ? abs((int) (rltr_rng.start - ppt_rng.end)) : abs((int) (lltr_rng.end - ppt_rng.start)))); } gt_str_reset(seq); } else gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t"); /* output PBS */ if (!had_err && ls->element.pbs != NULL) { GtStrand pbs_strand; pbs_strand = gt_feature_node_get_strand(ls->element.pbs); pbs_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.pbs); had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ls->element.pbs, gt_symbol(gt_ft_primer_binding_site), false, NULL, NULL, ls->rmap, err); if (!had_err) { gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&pbs_rng), GT_FSWIDTH, ls->pbsout_file); gt_file_xprintf(ls->tabout_file, ""GT_WU"\t"GT_WU"\t%c\t%s\t%s\t%s\t%s\t%s\t", pbs_rng.start, pbs_rng.end, GT_STRAND_CHARS[pbs_strand], gt_feature_node_get_attribute(ls->element.pbs, "trna"), gt_str_get(seq), gt_feature_node_get_attribute(ls->element.pbs, "pbsoffset"), gt_feature_node_get_attribute(ls->element.pbs, "trnaoffset"), gt_feature_node_get_attribute(ls->element.pbs, "edist")); } gt_str_reset(seq); } else gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t\t\t\t"); /* output protein domains */ if (!had_err && ls->element.pdoms != NULL) { GtStr *pdomorderstr = gt_str_new(); for (i=0; !had_err && i<gt_array_size(ls->element.pdomorder); i++) { const char* key = *(const char**) gt_array_get(ls->element.pdomorder, i); GtArray *entry = (GtArray*) gt_hashmap_get(ls->element.pdoms, key); had_err = write_pdom(ls, entry, key, ls->rmap, desc, err); } if (GT_STRAND_REVERSE == gt_feature_node_get_strand(ls->element.mainnode)) gt_array_reverse(ls->element.pdomorder); for (i=0 ;!had_err && i<gt_array_size(ls->element.pdomorder); i++) { const char* name = *(const char**) gt_array_get(ls->element.pdomorder, i); gt_str_append_cstr(pdomorderstr, name); if (i != 
gt_array_size(ls->element.pdomorder)-1) gt_str_append_cstr(pdomorderstr, "/"); } gt_file_xprintf(ls->tabout_file, "%s", gt_str_get(pdomorderstr)); gt_str_delete(pdomorderstr); } /* output LTRs (we just expect them to exist) */ switch (gt_feature_node_get_strand(ls->element.mainnode)) { case GT_STRAND_REVERSE: ltr5 = ls->element.rightLTR; ltr3 = ls->element.leftLTR; break; case GT_STRAND_FORWARD: default: ltr5 = ls->element.leftLTR; ltr3 = ls->element.rightLTR; break; } if (!had_err) { had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr5, gt_symbol(gt_ft_long_terminal_repeat), false, NULL, NULL, ls->rmap, err); } if (!had_err) { gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq), GT_FSWIDTH, ls->ltr5out_file); gt_str_reset(seq); } if (!had_err) { had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr3, gt_symbol(gt_ft_long_terminal_repeat), false, NULL, NULL, ls->rmap, err); } if (!had_err) { gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq), GT_FSWIDTH, ls->ltr3out_file); gt_str_reset(seq); } /* output complete oriented element */ if (!had_err) { had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ls->element.mainnode, gt_symbol(gt_ft_LTR_retrotransposon), false, NULL, NULL, ls->rmap, err); } if (!had_err) { gt_fasta_show_entry(desc,gt_str_get(seq), gt_str_length(seq), GT_FSWIDTH, ls->elemout_file); gt_str_reset(seq); } gt_file_xprintf(ls->tabout_file, "\n"); gt_str_delete(seq); } gt_hashmap_delete(ls->element.pdoms); gt_array_delete(ls->element.pdomorder); gt_free(ls->element.seqid); return had_err; }
/* Writes the protein domain hits <pdoms> named <pdomname> of the current
   candidate to per-domain output files.

   Output files are opened lazily (mode "w+") and cached per domain name in
   the stream's hash maps: "<prefix>_pdom_<name>.fas" for the nucleotide
   sequence and, if enabled in <ls>, "<prefix>_pdom_<name>.ali" for
   alignments and "<prefix>_pdom_<name>_aa.fas" for amino acid sequences.

   If there is more than one hit, the hits are asserted to be in
   gt_genome_node_cmp() order and the array is reversed in place when the
   first hit is on the reverse strand. The sequences of all hits are
   concatenated and written as one FASTA entry with header <desc>;
   alignments and amino acid sequences are taken from the "pdom_alignment" /
   "pdom_aaseq" user data of each hit, which is released afterwards.

   Returns 0 on success and -1 if extracting a hit's sequence failed (in
   which case no FASTA entry is written).

   NOTE(review): <rmap> is marked GT_UNUSED but is passed on to
   gt_extract_feature_sequence().
   NOTE(review): when sequence extraction fails the loop is left before the
   user data of that hit is released -- confirm whether it is freed
   elsewhere. */
static int write_pdom(GtLTRdigestFileOutStream *ls, GtArray *pdoms, const char *pdomname, GT_UNUSED GtRegionMapping *rmap, char *desc, GtError *err) { int had_err = 0; GtFile *seqfile = NULL, *alifile = NULL, *aafile = NULL; GtUword i = 0, seq_length = 0; GtStr *pdom_seq, *pdom_aaseq; gt_error_check(err); pdom_seq = gt_str_new(); pdom_aaseq = gt_str_new(); /* get protein domain output file */ seqfile = (GtFile*) gt_hashmap_get(ls->pdomout_files, pdomname); if (seqfile == NULL) { /* no file opened for this domain yet, do it */ char buffer[GT_MAXFILENAMELEN]; (void) snprintf(buffer, (size_t) (GT_MAXFILENAMELEN-1), "%s_pdom_%s.fas", ls->fileprefix, pdomname); seqfile = gt_file_xopen(buffer, "w+"); gt_hashmap_add(ls->pdomout_files, gt_cstr_dup(pdomname), seqfile); } /* get protein alignment output file */ if (ls->write_pdom_alignments) { alifile = (GtFile*) gt_hashmap_get(ls->pdomali_files, pdomname); if (alifile == NULL) { /* no file opened for this domain yet, do it */ char buffer[GT_MAXFILENAMELEN]; (void) snprintf(buffer, (size_t) (GT_MAXFILENAMELEN-1), "%s_pdom_%s.ali", ls->fileprefix, pdomname); alifile = gt_file_xopen(buffer, "w+"); gt_hashmap_add(ls->pdomali_files, gt_cstr_dup(pdomname), alifile); } } /* get amino acid sequence output file */ if (ls->write_pdom_aaseqs) { aafile = (GtFile*) gt_hashmap_get(ls->pdomaa_files, pdomname); if (aafile == NULL) { /* no file opened for this domain yet, do it */ char buffer[GT_MAXFILENAMELEN]; (void) snprintf(buffer, (size_t) (GT_MAXFILENAMELEN-1), "%s_pdom_%s_aa.fas", ls->fileprefix, pdomname); aafile = gt_file_xopen(buffer, "w+"); gt_hashmap_add(ls->pdomaa_files, gt_cstr_dup(pdomname), aafile); } } if (gt_array_size(pdoms) > 1UL) { for (i=1UL; i<gt_array_size(pdoms); i++) { gt_assert(gt_genome_node_cmp(*(GtGenomeNode**)gt_array_get(pdoms, i), *(GtGenomeNode**)gt_array_get(pdoms, i-1)) >= 0); } if (gt_feature_node_get_strand(*(GtFeatureNode**) gt_array_get(pdoms, 0UL)) == GT_STRAND_REVERSE) { gt_array_reverse(pdoms); } } 
/* output protein domain data */ for (i=0;i<gt_array_size(pdoms);i++) { GtRange pdom_rng; GtStr *ali, *aaseq; GtFeatureNode *fn; int rval; fn = *(GtFeatureNode**) gt_array_get(pdoms, i); ali = gt_genome_node_get_user_data((GtGenomeNode*) fn, "pdom_alignment"); aaseq = gt_genome_node_get_user_data((GtGenomeNode*) fn, "pdom_aaseq"); pdom_rng = gt_genome_node_get_range((GtGenomeNode*) fn); rval = gt_extract_feature_sequence(pdom_seq, (GtGenomeNode*) fn, gt_symbol(gt_ft_protein_match), false, NULL, NULL, rmap, err); if (rval) { had_err = -1; break; } if (ls->write_pdom_alignments && ali) { char buf[BUFSIZ]; /* write away alignment */ (void) snprintf(buf, BUFSIZ-1, "Protein domain alignment in translated " "sequence for candidate\n'%s':\n\n", desc); gt_file_xwrite(alifile, buf, (size_t) strlen(buf) * sizeof (char)); gt_file_xwrite(alifile, gt_str_get(ali), (size_t) gt_str_length(ali) * sizeof (char)); gt_file_xwrite(alifile, "---\n\n", 5 * sizeof (char)); } if (ls->write_pdom_aaseqs && aaseq) { /* append amino acid sequence */ gt_str_append_str(pdom_aaseq, aaseq); } gt_genome_node_release_user_data((GtGenomeNode*) fn, "pdom_alignment"); gt_genome_node_release_user_data((GtGenomeNode*) fn, "pdom_aaseq"); seq_length += gt_range_length(&pdom_rng); } if (!had_err) { gt_fasta_show_entry(desc, gt_str_get(pdom_seq), seq_length, GT_FSWIDTH, seqfile); if (ls->write_pdom_aaseqs) { gt_fasta_show_entry(desc, gt_str_get(pdom_aaseq), gt_str_length(pdom_aaseq), GT_FSWIDTH, aafile); } } gt_str_delete(pdom_seq); gt_str_delete(pdom_aaseq); return had_err; }
/* Decides how feature node <node> (with optional parent <parent>) is placed
   in diagram <d>.

   Steps:
   1. Pseudo nodes and nodes not overlapping the diagram range are skipped.
   2. The style option "max_show_width" is read for the node's type and, for
      a non-pseudo parent, for the parent's type. If the diagram range is
      longer than the node's limit, the node is skipped; if it is longer
      than the parent's limit, the parent is disregarded (treated as NULL).
   3. The per-type settings "collapse_to_parent" and "group_by_parent" are
      looked up and cached in d->collapsingtypes / d->groupedtypes.
   4. Placement:
      - collapsing types with a non-pseudo parent go through
        add_recursive(), otherwise add_to_current();
      - non-collapsing types with a parent use add_to_rep() for multi-
        features below a pseudo parent; otherwise add_to_parent() if
        grouping applies (explicitly requested, or undefined while the
        parent's direct children of this kind do not overlap) and the
        parent has more than one child; add_to_current() in all remaining
        cases;
      - root nodes always go through add_to_current().

   Returns 0 on success (including skipped nodes) and -1 if a style query or
   one of the add_* helpers failed. */
static int process_node(GtDiagram *d, GtFeatureNode *node, GtFeatureNode *parent, GtError *err) { GtRange elem_range; bool *collapse; GtShouldGroupByParent *group; const char *feature_type = NULL, *parent_gft = NULL; double tmp; GtStyleQueryStatus rval; GtUword max_show_width = GT_UNDEF_UWORD, par_max_show_width = GT_UNDEF_UWORD; gt_assert(d && node); gt_log_log(">> getting '%s'", gt_feature_node_get_type(node)); /* skip pseudonodes */ if (gt_feature_node_is_pseudo(node)) return 0; feature_type = gt_feature_node_get_type(node); gt_assert(feature_type); /* discard elements that do not overlap with visible range */ elem_range = gt_genome_node_get_range((GtGenomeNode*) node); if (!gt_range_overlap(&d->range, &elem_range)) return 0; /* get maximal view widths in nucleotides to show this type */ rval = gt_style_get_num(d->style, feature_type, "max_show_width", &tmp, NULL, err); switch (rval) { case GT_STYLE_QUERY_OK: max_show_width = tmp; break; case GT_STYLE_QUERY_ERROR: return -1; break; /* should never be reached */ default: /* do not change default value */ break; } /* for non-root nodes, get maximal view with to show parent */ if (parent) { if (!gt_feature_node_is_pseudo(parent)) { parent_gft = gt_feature_node_get_type(parent); rval = gt_style_get_num(d->style, parent_gft, "max_show_width", &tmp, NULL, err); switch (rval) { case GT_STYLE_QUERY_OK: par_max_show_width = tmp; break; case GT_STYLE_QUERY_ERROR: return -1; break; /* should never be reached */ default: /* do not change default value */ break; } } else par_max_show_width = GT_UNDEF_UWORD; } /* check if this type is to be displayed at all */ if (max_show_width != GT_UNDEF_UWORD && gt_range_length(&d->range) > max_show_width) { return 0; } /* disregard parent node if it is configured not to be shown */ if (parent && par_max_show_width != GT_UNDEF_UWORD && gt_range_length(&d->range) > par_max_show_width) { parent = NULL; } /* check if this is a collapsing type, cache result */ if ((collapse = (bool*) 
gt_hashmap_get(d->collapsingtypes, feature_type)) == NULL) { collapse = gt_malloc(sizeof (bool)); *collapse = false; if (gt_style_get_bool(d->style, feature_type, "collapse_to_parent", collapse, NULL, err) == GT_STYLE_QUERY_ERROR) { gt_free(collapse); return -1; } gt_hashmap_add(d->collapsingtypes, (void*) feature_type, collapse); } /* check if type should be grouped by parent, cache result */ if ((group = (GtShouldGroupByParent*) gt_hashmap_get(d->groupedtypes, feature_type)) == NULL) { bool tmp; group = gt_malloc(sizeof (GtShouldGroupByParent)); rval = gt_style_get_bool(d->style, feature_type, "group_by_parent", &tmp, NULL, err); switch (rval) { case GT_STYLE_QUERY_OK: if (tmp) *group = GT_GROUP_BY_PARENT; else *group = GT_DO_NOT_GROUP_BY_PARENT; break; case GT_STYLE_QUERY_NOT_SET: *group = GT_UNDEFINED_GROUPING; break; case GT_STYLE_QUERY_ERROR: gt_free(group); return -1; break; /* should never be reached */ } gt_hashmap_add(d->groupedtypes, (void*) feature_type, group); } /* decide where to place this feature: */ if (*collapse) { /* user has specified collapsing to parent for this type */ if (parent && !gt_feature_node_is_pseudo(parent)) { /* collapsing child nodes are added to upwards blocks, but never collapse into pseudo nodes */ add_recursive(d, node, parent, node); } else { /* if no parent or only pseudo-parent, do not collapse */ if (add_to_current(d, node, parent, err) < 0) { return -1; } } } else /* (!*collapse) */ { if (parent) { bool do_not_overlap = false; do_not_overlap = gt_feature_node_direct_children_do_not_overlap_st(parent, node); if (*group == GT_GROUP_BY_PARENT || (do_not_overlap && *group == GT_UNDEFINED_GROUPING)) { if (gt_feature_node_is_pseudo(parent) && gt_feature_node_is_multi(node)) { if (add_to_rep(d, node, parent, err) < 0) { return -1; } } else if (gt_feature_node_number_of_children(parent) > 1) { if (add_to_parent(d, node, parent, err) < 0) { return -1; } } else { if (add_to_current(d, node, parent, err) < 0) { return -1; } } } 
else { if (gt_feature_node_is_pseudo(parent) && gt_feature_node_is_multi(node)) { if (add_to_rep(d, node, parent, err) < 0) { return -1; } } else { if (add_to_current(d, node, parent, err) < 0) { return -1; } } } } else { /* root nodes always get their own block */ if (add_to_current(d, node, parent, err) < 0) { return -1; } } } /* we can now assume that this node (or its representative) has been processed into the reverse lookup structure */ #ifndef NDEBUG if (gt_feature_node_is_multi(node)) { GtFeatureNode *rep; rep = gt_feature_node_get_multi_representative((GtFeatureNode*) node); gt_assert(gt_hashmap_get(d->nodeinfo, rep)); } else gt_assert(gt_hashmap_get(d->nodeinfo, node)); #endif return 0; }
static int calc_spliced_alignments(GthSACollection *sa_collection, GthChainCollection *chain_collection, GthCallInfo *call_info, GthInput *input, GthStat *stat, GtUword gen_file_num, GtUword ref_file_num, bool directmatches, GthMatchInfo *match_info, GthDNACompletePathMatrixJT dna_complete_path_matrix_jt, GthProteinCompletePathMatrixJT protein_complete_path_matrix_jt) { const unsigned char *ref_seq_tran, *ref_seq_orig, *ref_seq_tran_rc = NULL, *ref_seq_orig_rc = NULL; GtUword chainctr, gen_offset = GT_UNDEF_UWORD, gen_total_length, ref_total_length; GtFile *outfp = call_info->out->outfp; GtRange gen_seq_bounds, gen_seq_bounds_rc; bool refseqisdna; GthChain *chain; GtRange range; GthSA *saA; int rval; gt_assert(sa_collection && chain_collection); refseqisdna = gth_input_ref_file_is_dna(input, ref_file_num); for (chainctr = 0; chainctr < gth_chain_collection_size(chain_collection); chainctr++) { chain = gth_chain_collection_get(chain_collection, chainctr); if (++match_info->call_number > call_info->firstalshown && call_info->firstalshown > 0) { if (!(call_info->out->xmlout || call_info->out->gff3out)) gt_file_xfputc('\n', outfp); else if (call_info->out->xmlout) gt_file_xprintf(outfp, "<!--\n"); if (!call_info->out->gff3out) { gt_file_xprintf(outfp, "Maximal matching %s count (%u) reached.\n", refseqisdna ? 
"EST" : "protein", call_info->firstalshown); gt_file_xprintf(outfp, "Only the first %u matches will be " "displayed.\n", call_info->firstalshown); } if (!(call_info->out->xmlout || call_info->out->gff3out)) gt_file_xfputc('\n', outfp); else if (call_info->out->xmlout) gt_file_xprintf(outfp, "-->\n"); match_info->max_call_number_reached = true; break; /* break out of loop */ } /* compute considered genomic regions if not set by -frompos */ if (!gth_input_use_substring_spec(input)) { gen_seq_bounds = gth_input_get_genomic_range(input, chain->gen_file_num, chain->gen_seq_num); gen_total_length = gt_range_length(&gen_seq_bounds); gen_offset = gen_seq_bounds.start; gen_seq_bounds_rc = gen_seq_bounds; } else { /* genomic multiseq contains exactly one sequence */ gt_assert(gth_input_num_of_gen_seqs(input, chain->gen_file_num) == 1); gen_total_length = gth_input_genomic_file_total_length(input, chain ->gen_file_num); gen_seq_bounds.start = gth_input_genomic_substring_from(input); gen_seq_bounds.end = gth_input_genomic_substring_to(input); gen_offset = 0; gen_seq_bounds_rc.start = gen_total_length - 1 - gen_seq_bounds.end; gen_seq_bounds_rc.end = gen_total_length - 1 - gen_seq_bounds.start; } /* "retrieving" the reference sequence */ range = gth_input_get_reference_range(input, chain->ref_file_num, chain->ref_seq_num); ref_seq_tran = gth_input_current_ref_seq_tran(input) + range.start; ref_seq_orig = gth_input_current_ref_seq_orig(input) + range.start; if (refseqisdna) { ref_seq_tran_rc = gth_input_current_ref_seq_tran_rc(input) + range.start; ref_seq_orig_rc = gth_input_current_ref_seq_orig_rc(input) + range.start; } ref_total_length = range.end - range.start + 1; /* check if protein sequences have a stop amino acid */ if (!refseqisdna && !match_info->stop_amino_acid_warning && ref_seq_orig[ref_total_length - 1] != GT_STOP_AMINO) { GtStr *ref_id = gt_str_new(); gth_input_save_ref_id(input, ref_id, chain->ref_file_num, chain->ref_seq_num); gt_warning("protein sequence '%s' 
(#" GT_WU " in file %s) does not end " "with a stop amino acid ('%c'). If it is not a protein " "fragment you should add a stop amino acid to improve the " "prediction. For example with `gt seqtransform " "-addstopaminos` (see http://genometools.org for details).", gt_str_get(ref_id), chain->ref_seq_num, gth_input_get_reference_filename(input, chain->ref_file_num), GT_STOP_AMINO); match_info->stop_amino_acid_warning = true; gt_str_delete(ref_id); } /* allocating space for alignment */ saA = gth_sa_new_and_set(directmatches, true, input, chain->gen_file_num, chain->gen_seq_num, chain->ref_file_num, chain->ref_seq_num, match_info->call_number, gen_total_length, gen_offset, ref_total_length); /* extend the DP borders to the left and to the right */ gth_chain_extend_borders(chain, &gen_seq_bounds, &gen_seq_bounds_rc, gen_total_length, gen_offset); /* From here on the dp positions always refer to the forward strand of the genomic DNA. */ /* call the Dynamic Programming */ if (refseqisdna) { rval = call_dna_DP(directmatches, call_info, input, stat, sa_collection, saA, gen_file_num, ref_file_num, gen_total_length, gen_offset, &gen_seq_bounds, &gen_seq_bounds_rc, ref_total_length, range.start, chainctr, gth_chain_collection_size(chain_collection), match_info, ref_seq_tran, ref_seq_orig, ref_seq_tran_rc, ref_seq_orig_rc, chain, dna_complete_path_matrix_jt, protein_complete_path_matrix_jt); } else { rval = call_protein_DP(directmatches, call_info, input, stat, sa_collection, saA, gen_file_num, ref_file_num, gen_total_length, gen_offset, &gen_seq_bounds, &gen_seq_bounds_rc, ref_total_length, range.start, chainctr, gth_chain_collection_size(chain_collection), match_info, ref_seq_tran, ref_seq_orig, chain, dna_complete_path_matrix_jt, protein_complete_path_matrix_jt); } /* check return value */ if (rval == GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED) { /* statistics bookkeeping */ gth_stat_increment_numoffailedDPparameterallocations(stat); 
gth_stat_increment_numofundeterminedSAs(stat); /* free space */ gth_sa_delete(saA); match_info->call_number--; continue; /* continue with the next DP range */ } else if (rval) return -1; } if (!call_info->out->xmlout && !call_info->out->gff3out && !directmatches && !match_info->significant_match_found && match_info->call_number <= call_info->firstalshown) { show_no_match_line(gth_input_get_alphatype(input, ref_file_num), outfp); } return 0; }
/* Feature node callback of the LTRdigest protein domain visitor.

   Walks the subgraph of <fn> and remembers the last node whose type equals
   lv->root_type in lv->ltr_retrotrans. If such a node is set, its sequence
   is extracted via lv->rmap, translated in all three frames of both strands
   into lv->fwd[] / lv->rev[], and the translations are piped as FASTA
   records (headers ">0+", ">0-", ...) into a forked `hmmscan` process
   started with lv->args. The output of that process is parsed and the hits
   are processed. Finally the strand is chosen via
   gt_ltrdigest_pdom_visitor_choose_strand().

   Returns 0 on success, a non-zero value (with <err> set by the failing
   step) otherwise. On Windows the HMMER call is not implemented and an
   error is returned. */
static int gt_ltrdigest_pdom_visitor_feature_node(GtNodeVisitor *nv,
                                                  GtFeatureNode *fn,
                                                  GtError *err)
{
  GtLTRdigestPdomVisitor *lv;
  GtFeatureNodeIterator *fni;
  GtFeatureNode *curnode = NULL;
  int had_err = 0;
  GtRange rng;
  GtUword i;
  lv = gt_ltrdigest_pdom_visitor_cast(nv);
  gt_assert(lv);
  gt_error_check(err);

  /* traverse annotation subgraph and find LTR element */
  fni = gt_feature_node_iterator_new(fn);
  while (!had_err && (curnode = gt_feature_node_iterator_next(fni))) {
    if (strcmp(gt_feature_node_get_type(curnode), lv->root_type) == 0) {
      lv->ltr_retrotrans = curnode;
    }
  }
  gt_feature_node_iterator_delete(fni);

  if (!had_err && lv->ltr_retrotrans != NULL) {
    GtCodonIterator *ci;
    GtTranslator *tr;
    GtTranslatorStatus status;
    GtUword seqlen;
    char translated, *rev_seq;
#ifndef _WIN32
    FILE *instream;
    GtHMMERParseStatus *pstatus;
#endif
    unsigned int frame;
    GtStr *seq;

    seq = gt_str_new();
    rng = gt_genome_node_get_range((GtGenomeNode*) lv->ltr_retrotrans);
    lv->leftLTR_5 = rng.start - 1;
    lv->rightLTR_3 = rng.end - 1;
    seqlen = gt_range_length(&rng);

    had_err = gt_extract_feature_sequence(seq,
                                          (GtGenomeNode*) lv->ltr_retrotrans,
                                          lv->root_type,
                                          false, NULL, NULL, lv->rmap, err);

    if (!had_err) {
      for (i = 0UL; i < 3UL; i++) {
        gt_str_reset(lv->fwd[i]);
        gt_str_reset(lv->rev[i]);
      }

      /* create translations of the forward strand */
      ci = gt_codon_iterator_simple_new(gt_str_get(seq), seqlen, NULL);
      gt_assert(ci);
      tr = gt_translator_new(ci);
      status = gt_translator_next(tr, &translated, &frame, err);
      while (status == GT_TRANSLATOR_OK && translated) {
        gt_str_append_char(lv->fwd[frame], translated);
        status = gt_translator_next(tr, &translated, &frame, NULL);
      }
      if (status == GT_TRANSLATOR_ERROR)
        had_err = -1;

      if (!had_err) {
        /* translate the reverse complement from a private copy; the buffer
           is not NUL-terminated, its length is always passed explicitly */
        rev_seq = gt_malloc((size_t) seqlen * sizeof (char));
        strncpy(rev_seq, gt_str_get(seq), (size_t) seqlen * sizeof (char));
        (void) gt_reverse_complement(rev_seq, seqlen, NULL);
        gt_codon_iterator_delete(ci);
        ci = gt_codon_iterator_simple_new(rev_seq, seqlen, NULL);
        gt_translator_set_codon_iterator(tr, ci);
        status = gt_translator_next(tr, &translated, &frame, err);
        while (status == GT_TRANSLATOR_OK && translated) {
          gt_str_append_char(lv->rev[frame], translated);
          status = gt_translator_next(tr, &translated, &frame, NULL);
        }
        if (status == GT_TRANSLATOR_ERROR)
          had_err = -1;
        gt_free(rev_seq);
      }
      gt_codon_iterator_delete(ci);
      gt_translator_delete(tr);
    }

    /* run HMMER and handle results */
    if (!had_err) {
#ifndef _WIN32
      int pid, pc[2], cp[2];
      GT_UNUSED int rval;

      (void) signal(SIGCHLD, SIG_IGN); /* XXX: for now, ignore child's
                                               exit status */
      /* pc: parent -> child (hmmscan stdin),
         cp: child -> parent (hmmscan stdout) */
      rval = pipe(pc);
      gt_assert(rval == 0);
      rval = pipe(cp);
      gt_assert(rval == 0);

      switch ((pid = (int) fork())) {
        case -1:
          perror("Can't fork");
          exit(1);   /* XXX: error handling */
        case 0:    /* child */
          (void) close(1);    /* close current stdout. */
          rval = dup(cp[1]);  /* make stdout go to write end of pipe. */
          (void) close(0);    /* close current stdin. */
          rval = dup(pc[0]);  /* make stdin come from read end of pipe. */
          (void) close(pc[0]);
          (void) close(pc[1]);
          (void) close(cp[0]);
          (void) close(cp[1]);
          (void) execvp("hmmscan", lv->args); /* XXX: read path from env */
          perror("couldn't execute hmmscan!");
          /* leave the forked child without running the parent's atexit
             handlers or flushing its inherited stdio buffers again */
          _exit(1);
        default:   /* parent */
          for (i = 0UL; i < 3UL; i++) {
            char buf[5];
            GT_UNUSED ssize_t written;
            /* header is exactly 4 bytes: '>', frame digit, strand, '\n' */
            (void) snprintf(buf, sizeof buf, ">"GT_WU"%c\n", i, '+');
            written = write(pc[1], buf, 4 * sizeof (char));
            written = write(pc[1], gt_str_get(lv->fwd[i]),
                            (size_t) gt_str_length(lv->fwd[i])
                              * sizeof (char));
            written = write(pc[1], "\n", 1 * sizeof (char));
            (void) snprintf(buf, sizeof buf, ">"GT_WU"%c\n", i, '-');
            written = write(pc[1], buf, 4 * sizeof (char));
            written = write(pc[1], gt_str_get(lv->rev[i]),
                            (size_t) gt_str_length(lv->rev[i])
                              * sizeof (char));
            written = write(pc[1], "\n", 1 * sizeof (char));
          }
          /* closing the write end signals end of input to the child */
          (void) close(pc[0]);
          (void) close(pc[1]);
          (void) close(cp[1]);

          instream = fdopen(cp[0], "r");
          if (instream == NULL) {
            /* do not hand a NULL stream to the parser or to fclose() */
            (void) close(cp[0]);
            gt_error_set(err, "could not open stream to read hmmscan output");
            had_err = -1;
          }
          else {
            pstatus = gt_hmmer_parse_status_new();
            had_err = gt_ltrdigest_pdom_visitor_parse_output(lv, pstatus,
                                                             instream, err);
            (void) fclose(instream);
            if (!had_err)
              had_err = gt_ltrdigest_pdom_visitor_process_hits(lv, pstatus,
                                                               err);
            gt_hmmer_parse_status_delete(pstatus);
          }
          break;
      }
#else
      /* XXX */
      gt_error_set(err, "HMMER call not implemented on Windows\n");
      had_err = -1;
#endif
    }
    gt_str_delete(seq);
  }
  if (!had_err)
    had_err = gt_ltrdigest_pdom_visitor_choose_strand(lv);
  return had_err;
}
/* Renders a ruler with dynamic scale labeling and optional grid.

   The major tick distance is the largest power of ten below the length of
   <viewrange> (but never less than 1), minor ticks are drawn at a tenth of
   that distance if this is at least 1. The style options "show_grid",
   "ruler_font_size", "unit", "ruler_left_text" and "ruler_right_text" from
   the "format" section are honoured; the left/right texts default to
   FIVE_PRIME_STRING and THREE_PRIME_STRING if not set.

   Returns 0 on success and -1 if a style query failed (<err> is set by the
   style query in that case). */
int gt_canvas_cairo_draw_ruler(GtCanvas *canvas, GtRange viewrange,
                               GtError *err)
{
  double step, minorstep, vmajor, vminor, theight;
  long base_length, tick;
  GtColor rulercol, gridcol;
  GtStr *left_str, *right_str, *unit;
  char str[BUFSIZ];
  GtStyleQueryStatus rval;
  bool showgrid = true;

  /* validate before the first dereference of <canvas> */
  gt_assert(canvas);
  theight = gt_graphics_get_text_height(canvas->pvt->g);

  if (gt_style_get_bool(canvas->pvt->sty, "format", "show_grid", &showgrid,
                        NULL, err) == GT_STYLE_QUERY_ERROR) {
    return -1;
  }
  if (gt_style_get_num(canvas->pvt->sty, "format", "ruler_font_size",
                       &theight, NULL, err) == GT_STYLE_QUERY_ERROR) {
    return -1;
  }

  /* get unit value from style, default: empty */
  unit = gt_str_new();
  if (gt_style_get_str(canvas->pvt->sty, "format", "unit", unit, NULL,
                       err) == GT_STYLE_QUERY_ERROR) {
    gt_str_delete(unit);
    return -1;
  }

  /* get additional description texts from style */
  left_str = gt_str_new();
  rval = gt_style_get_str(canvas->pvt->sty, "format", "ruler_left_text",
                          left_str, NULL, err);
  switch (rval) {
    case GT_STYLE_QUERY_NOT_SET:
      gt_str_append_cstr(left_str, FIVE_PRIME_STRING);
      break;
    case GT_STYLE_QUERY_ERROR:
      gt_str_delete(unit);
      gt_str_delete(left_str);
      return -1;
      break; /* shouldn't reach this */
    default:
      break;
  }
  right_str = gt_str_new();
  rval = gt_style_get_str(canvas->pvt->sty, "format", "ruler_right_text",
                          right_str, NULL, err);
  switch (rval) {
    case GT_STYLE_QUERY_NOT_SET:
      gt_str_append_cstr(right_str, THREE_PRIME_STRING);
      break;
    case GT_STYLE_QUERY_ERROR:
      gt_str_delete(unit);
      gt_str_delete(left_str);
      gt_str_delete(right_str);
      return -1;
      break; /* shouldn't reach this */
    default:
      break;
  }

  /* reset font to default */
  gt_graphics_set_font(canvas->pvt->g, "Sans", SLANT_NORMAL, WEIGHT_NORMAL,
                       theight);

  rulercol.red = rulercol.green = rulercol.blue = RULER_GREY;
  rulercol.alpha = 1.0;
  gridcol.red = gridcol.green = gridcol.blue = GRID_GREY;
  gridcol.alpha = 1.0;

  /* determine range and step of the scale */
  base_length = gt_range_length(&viewrange);

  /* determine tick steps */
  step = pow(10,ceil(log10(base_length))-1);
  /* A range of length 1 yields a step of 0.1. As <tick> is an integer,
     adding such a step would never advance it and the loop below would not
     terminate, so use at least one position per major tick. */
  if (step < 1.0)
    step = 1.0;
  minorstep = step/10.0;

  /* calculate starting positions */
  vminor = (double) (floor(viewrange.start / minorstep))*minorstep;
  vmajor = (double) (floor(viewrange.start / step))*step;

  /* draw major ticks */
  for (tick = vmajor; tick <= viewrange.end; tick += step) {
    double drawtick = (gt_coords_convert_point(viewrange, tick)
                       * (canvas->pvt->width-2*canvas->pvt->margins))
                      + canvas->pvt->margins;
    /* the first tick position may lie left of the visible range */
    if (tick < viewrange.start)
      continue;
    gt_graphics_draw_vertical_line(canvas->pvt->g, drawtick,
                                   canvas->pvt->y + 30, rulercol, 10, 1.0);
    gt_format_ruler_label(str, tick, gt_str_get(unit), BUFSIZ);
    gt_graphics_draw_text_centered(canvas->pvt->g, drawtick,
                                   canvas->pvt->y + 20, str);
  }

  /* draw minor ticks, only if they fall on whole positions */
  if (minorstep >= 1) {
    for (tick = vminor; tick <= viewrange.end; tick += minorstep) {
      double drawtick;
      if (tick < viewrange.start)
        continue;
      drawtick = (gt_coords_convert_point(viewrange, tick)
                  * (canvas->pvt->width-2*canvas->pvt->margins))
                 + canvas->pvt->margins;
      if (showgrid) {
        gt_graphics_draw_vertical_line(canvas->pvt->g, drawtick,
                                       canvas->pvt->y + 40, gridcol,
                                       canvas->pvt->height - 40 - 15, 1.0);
      }
      gt_graphics_draw_vertical_line(canvas->pvt->g, drawtick,
                                     canvas->pvt->y + 35, rulercol, 5, 1.0);
    }
  }

  /* draw ruler line */
  gt_graphics_draw_horizontal_line(canvas->pvt->g, canvas->pvt->margins,
                                   canvas->pvt->y + 40, rulercol,
                                   canvas->pvt->width
                                     - 2 * canvas->pvt->margins,
                                   1.25);

  /* draw the description texts left and right of the ruler line */
  gt_graphics_draw_text_right(canvas->pvt->g, canvas->pvt->margins - 10,
                              canvas->pvt->y + 39 + (theight/2),
                              gt_str_get(left_str));
  gt_graphics_draw_text(canvas->pvt->g,
                        canvas->pvt->width - canvas->pvt->margins + 10,
                        canvas->pvt->y + 39 + (theight/2),
                        gt_str_get(right_str));

  gt_str_delete(unit);
  gt_str_delete(left_str);
  gt_str_delete(right_str);
  return 0;
}
static int run_orffinder(GtRegionMapping *rmap, GtFeatureNode *gf, unsigned long start, GT_UNUSED unsigned long end, unsigned int min, unsigned int max, bool all, GtError *err) { int had_err = 0, i; unsigned long sum; GtCodonIterator* ci = NULL; GtTranslator* translator = NULL; GtORFIterator* orfi = NULL; GtORFIteratorStatus state; GtRange orf_rng, tmp_orf_rng[3]; GtStr *seq; unsigned int orf_frame; /* forward strand */ seq = gt_str_new(); had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) gf, gt_feature_node_get_type(gf), false, NULL, NULL, rmap, err); ci = gt_codon_iterator_simple_new(gt_str_get(seq), gt_str_length(seq), err); gt_assert(ci); translator = gt_translator_new(ci); gt_assert(translator); orfi = gt_orf_iterator_new(ci, translator); gt_assert(orfi); for (i = 0; i < 3; i++) { tmp_orf_rng[i].start = GT_UNDEF_ULONG; tmp_orf_rng[i].end = GT_UNDEF_ULONG; } while ((state = gt_orf_iterator_next(orfi, &orf_rng, &orf_frame, err)) == GT_ORF_ITERATOR_OK) { if (all) { process_orf(orf_rng, orf_frame, GT_STRAND_FORWARD, gf, start, min, max, err); } else { if (gt_range_length(&orf_rng) > gt_range_length(&tmp_orf_rng[orf_frame])) { tmp_orf_rng[orf_frame].start = orf_rng.start; tmp_orf_rng[orf_frame].end = orf_rng.end; } } } if (state == GT_ORF_ITERATOR_ERROR) had_err = -1; if (!had_err) { if (!all) { for (i = 0; i < 3; i++) { if (tmp_orf_rng[i].start != GT_UNDEF_ULONG) { process_orf(tmp_orf_rng[i], (unsigned int) i, GT_STRAND_FORWARD, gf, start, min, max, err); } } } gt_codon_iterator_delete(ci); gt_translator_delete(translator); gt_orf_iterator_delete(orfi); orfi = NULL; ci = NULL; translator = NULL; for (i = 0; i < 3; i++) { tmp_orf_rng[i].start = GT_UNDEF_ULONG; tmp_orf_rng[i].end = GT_UNDEF_ULONG; } /* reverse strand */ if (!had_err) { GT_UNUSED int rval = 0; unsigned long length = gt_str_length(seq); char *strp = (char*) gt_str_get_mem(seq); rval = gt_reverse_complement(strp, gt_str_length(seq), err); gt_assert(!rval); /* XXX */ ci = 
gt_codon_iterator_simple_new(gt_str_get(seq), gt_str_length(seq), err); gt_assert(ci); translator = gt_translator_new(ci); gt_assert(translator); orfi = gt_orf_iterator_new(ci, translator); gt_assert(orfi); sum = start + length - 1; while ((state = gt_orf_iterator_next(orfi, &orf_rng, &orf_frame, err)) == GT_ORF_ITERATOR_OK) { if (all) { process_orf(orf_rng, orf_frame, GT_STRAND_REVERSE, gf, sum, min, max, err); } else { if (gt_range_length(&orf_rng) > gt_range_length(&tmp_orf_rng[orf_frame])) { tmp_orf_rng[orf_frame].start = orf_rng.start; tmp_orf_rng[orf_frame].end = orf_rng.end; } } } if (state == GT_ORF_ITERATOR_ERROR) had_err = -1; if (!had_err) { if (!all) { for (i = 0; i < 3; i++) { if (tmp_orf_rng[i].start != GT_UNDEF_ULONG) { process_orf(tmp_orf_rng[i], (unsigned int) i, GT_STRAND_REVERSE, gf, sum, min, max, err); } } } } } gt_str_delete(seq); gt_codon_iterator_delete(ci); gt_translator_delete(translator); gt_orf_iterator_delete(orfi); } return had_err; }
static int cluster_sequences(GtArray *matches, GtClusteredSet *cs, GtHashmap *seqdesc2seqnum, unsigned int psmall, unsigned int plarge, GtEncseq *encseq, GtError *err) { GtMatch *match; GtMatchEdgeTable matchedgetab; GtMatchEdge matchedge; GtRange rng_seq1, rng_seq2; int had_err = 0; unsigned long i, lsmall, llarge, matchlen1, matchlen2, num_of_seq, seqnum1 = 0, seqnum2 = 0; const char *seqid; num_of_seq = gt_encseq_num_of_sequences(encseq); gt_assert(matches && cs && seqdesc2seqnum && encseq); if (gt_clustered_set_num_of_elements(cs, err) != num_of_seq) { had_err = -1; gt_error_set(err, "number of sequences (%lu) unequals number of elements in" " clustered set (%lu)", num_of_seq, gt_clustered_set_num_of_elements(cs, err)); } if (!had_err) { matchedgetab.edges = gt_array_new(sizeof (GtMatchEdge)); matchedgetab.num_of_edges = 0; for (i = 0; i < gt_array_size(matches); i++) { match = *(GtMatch**) gt_array_get(matches, i); gt_match_get_range_seq1(match, &rng_seq1); gt_match_get_range_seq2(match, &rng_seq2); matchlen1 = gt_range_length(&rng_seq1); matchlen2 = gt_range_length(&rng_seq2); seqid = gt_match_get_seqid1(match); if (gt_hashmap_get(seqdesc2seqnum, (void*) seqid) != NULL) seqnum1 = ((unsigned long) gt_hashmap_get(seqdesc2seqnum, seqid)) - 1; else { had_err = -1; gt_error_set(err, "key %s not found", seqid); } seqid = gt_match_get_seqid2(match); if (!had_err && gt_hashmap_get(seqdesc2seqnum, (void*) seqid) != NULL) seqnum2 = ((unsigned long) gt_hashmap_get(seqdesc2seqnum, seqid)) - 1; else { had_err = -1; gt_error_set(err, "key %s not found", seqid); } if (!had_err) { if (gt_encseq_seqlength(encseq, seqnum1) > gt_encseq_seqlength(encseq, seqnum2)) { llarge = gt_encseq_seqlength(encseq, seqnum1); lsmall = gt_encseq_seqlength(encseq, seqnum2); } else { lsmall = gt_encseq_seqlength(encseq, seqnum1); llarge = gt_encseq_seqlength(encseq, seqnum2); } if (((llarge * plarge)/100 <= matchlen1) && ((lsmall * psmall)/100 <= matchlen1) && ((llarge * plarge)/100 <= 
matchlen2) && ((lsmall * psmall)/100 <= matchlen2)) { if (seqnum1 != seqnum2) { matchedge.matchnum0 = seqnum1; matchedge.matchnum1 = seqnum2; gt_array_add(matchedgetab.edges, matchedge); matchedgetab.num_of_edges++; } } } } } if (!had_err) if (gt_cluster_matches(cs, &matchedgetab, err) != 0) had_err = -1; if (!had_err) gt_array_delete(matchedgetab.edges); return had_err; }