/* Writes one delimiter line to <outfp>: DELIMITERLINELENGTH copies of
   SA_DELIMITERLINECHAR terminated by a newline. */
static void showdelimiterline(GtFile *outfp)
{
  GtUword remaining = DELIMITERLINELENGTH;

  while (remaining > 0) {
    gt_file_xfputc(SA_DELIMITERLINECHAR, outfp);
    remaining--;
  }
  gt_file_xfputc('\n', outfp);
}
/* Prints the reference sequence header of the spliced alignment <sa> to
   <outfp>, followed by an empty line. DNA references are announced as
   "EST Sequence" (including the strand), protein references as
   "Protein Sequence". The sequence number is appended if <showseqnums> is
   set. The reference file number of <sa> must be defined. */
static void showgthreferenceinformation(GthSA *sa, GthInput *input,
                                        bool showseqnums, GtFile *outfp)
{
  const GtUword ref_file_num = gth_sa_ref_file_num(sa);

  gt_assert(ref_file_num != GT_UNDEF_UWORD);

  switch (gth_sa_alphatype(sa)) {
    case DNA_ALPHA:
      gt_file_xprintf(outfp,
                      "EST Sequence: file=%s, strand=%c, description=",
                      gth_input_get_reference_filename(input, ref_file_num),
                      gth_sa_ref_strand_char(sa));
      break;
    case PROTEIN_ALPHA:
      gt_file_xprintf(outfp, "Protein Sequence: file=%s, description=",
                      gth_input_get_reference_filename(input, ref_file_num));
      break;
    default:
      gt_assert(0);
  }

  gth_sa_echo_reference_description(sa, input, outfp);

  if (showseqnums) {
    gt_file_xprintf(outfp, ", seqnum=" GT_WU "", gth_sa_ref_seq_num(sa));
  }

  /* terminate the line and add an empty one */
  gt_file_xfputc('\n', outfp);
  gt_file_xfputc('\n', outfp);
}
/* Writes the GFF3 line for the feature node <fn> to the output file of the
   GFF3 visitor passed via <data>. The attribute column is assembled from
   (in this order) the unique ID registered for <fn>, its parent IDs and all
   remaining attributes; a single '.' is written if there are none.
   Always returns 0. */
static int gff3_show_feature_node(GtFeatureNode *fn, void *data,
                                  GT_UNUSED GtError *err)
{
  GtGFF3Visitor *gff3_visitor = (GtGFF3Visitor*) data;
  ShowAttributeInfo info;
  GtArray *parents = NULL;
  GtFile *outfp;
  GtStr *unique_id;
  GtUword idx, num_parents;
  bool part_shown = false;

  gt_error_check(err);
  gt_assert(fn && gff3_visitor);
  outfp = gff3_visitor->outfp;

  /* columns 1 to 8 */
  gt_gff3_output_leading(fn, outfp);

  /* ID attribute (only if a unique id has been stored for this node) */
  unique_id = gt_hashmap_get(gff3_visitor->feature_node_to_unique_id_str, fn);
  if (unique_id != NULL) {
    gt_file_xprintf(outfp, "%s=%s", GT_GFF_ID, gt_str_get(unique_id));
    part_shown = true;
  }

  /* Parent attribute: comma separated list of parent ids */
  parents = gt_hashmap_get(gff3_visitor->feature_node_to_id_array, fn);
  num_parents = gt_array_size(parents);
  if (num_parents > 0) {
    if (part_shown)
      gt_file_xfputc(';', outfp);
    gt_file_xprintf(outfp, "%s=", GT_GFF_PARENT);
    for (idx = 0; idx < num_parents; idx++) {
      if (idx > 0)
        gt_file_xfputc(',', outfp);
      gt_file_xprintf(outfp, "%s", *(char**) gt_array_get(parents, idx));
    }
    part_shown = true;
  }

  /* all other attributes are emitted by the show_attribute() callback */
  info.attribute_shown = &part_shown;
  info.outfp = outfp;
  gt_feature_node_foreach_attribute(fn, show_attribute, &info);

  /* an empty attribute column is represented by a dot */
  if (!part_shown)
    gt_file_xfputc('.', outfp);

  gt_file_xfputc('\n', outfp);

  return 0;
}
/* XML variant of the "classic" GeneSeqer2 MATCH line: writes a <MATCH_line>
   element for <sa> to <outfp>, indented by <indentlevel>. The child elements
   (alignment score, cumulative length of scored exons and coverage) are
   indented one level deeper. The reference strand is only part of the
   opening tag for DNA references. */
static void xml_showmatchline(GthSA *sa, unsigned int indentlevel,
                              GtFile *outfp)
{
  const unsigned int childlevel = indentlevel + 1;

  /* opening tag */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<MATCH_line gen_id=\"%s\" gen_strand=\"%c\" ",
                  gth_sa_gen_id(sa), gth_sa_gen_strand_char(sa));
  if (gth_sa_alphatype(sa) != DNA_ALPHA) {
    gt_file_xprintf(outfp, "ref_id=\"%s\">\n", gth_sa_ref_id(sa));
  }
  else {
    gt_file_xprintf(outfp, "ref_id=\"%s\" ref_strand=\"%c\">\n",
                    gth_sa_ref_id(sa), gth_sa_ref_strand_char(sa));
  }

  /* child elements */
  gth_indent(outfp, childlevel);
  gt_file_xprintf(outfp,
                  "<total_alignment_score>%.3f</total_alignment_score>\n",
                  gth_sa_score(sa));

  gth_indent(outfp, childlevel);
  gt_file_xprintf(outfp, "<cumulative_length_of_scored_exons>%lu"
                  "</cumulative_length_of_scored_exons>\n",
                  gth_sa_cumlen_scored_exons(sa));

  gth_indent(outfp, childlevel);
  gt_file_xprintf(outfp, "<coverage percentage=\"%.3f\" high_type=\"",
                  gth_sa_coverage(sa));
  gt_file_xfputc(gth_sa_coverage_char(sa), outfp);
  gt_file_xprintf(outfp, "\"/>\n");

  /* closing tag */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</MATCH_line>\n");
}
/* Writes the first <cols> symbols of <alignmentline> to <outfp>. Abstract
   gap and intron symbols are replaced by their concrete counterparts, all
   other symbols are written unchanged. No newline is appended. */
static void showconcreteline(const unsigned char *alignmentline,
                             unsigned long cols, GtFile *outfp)
{
  unsigned long col = 0;

  while (col < cols) {
    const unsigned char symbol = alignmentline[col];

    if (symbol == ABSTRACTGAPSYMBOL)
      gt_file_xfputc(CONCRETEGAPSYMBOL, outfp);
    else if (symbol == ABSTRACTINTRONSYMBOL)
      gt_file_xfputc(CONCRETEINTRONSYMBOL, outfp);
    else
      gt_file_xfputc(symbol, outfp);
    col++;
  }
}
/* Shows the predicted gene location <pgl> with number <pglnum> on
   <out>->outfp, either as XML element <predicted_gene_location> (if
   <out>->xmlout is set) or as plain text "PGL" line. Afterwards every
   alternative gene structure stored in <pgl> is shown via show_ags().
   Must not be called for GFF3 output. */
static void show_pgl(GthPGL *pgl, GtUword pglnum, GtUword translationtable,
                     GthInput *input, unsigned int indentlevel,
                     GthOutput *out)
{
  GtFile *outfp = out->outfp;
  GtUword agsnum = 0;

  gt_assert(!out->gff3out);

  if (!out->xmlout) {
    /* plain text header line */
    gt_file_xprintf(outfp, "PGL %3" GT_WUS " (%c strand): " GT_WU " " GT_WU,
                    pglnum + OUTPUTOFFSET,
                    SHOWSTRAND(gth_pgl_is_forward(pgl)),
                    SHOWGENPOS(gth_pgl_is_forward(pgl),
                               gth_pgl_total_length(pgl),
                               gth_pgl_genomic_offset(pgl),
                               pgl->maxrange.start),
                    SHOWGENPOS(gth_pgl_is_forward(pgl),
                               gth_pgl_total_length(pgl),
                               gth_pgl_genomic_offset(pgl),
                               pgl->maxrange.end));
    if (out->pglgentemplate) {
      gt_file_xprintf(outfp, " (genomic template '%s')",
                      gth_pgl_gen_id(pgl));
    }
    gt_file_xfputc('\n', outfp);
  }
  else {
    /* opening tag; everything below it is indented one level deeper */
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<predicted_gene_location>\n");
    indentlevel++;
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<PGL_line PGL_serial=\"" GT_WU "\" "
                    "PGL_strand=\"%c\" PGL_start=\"" GT_WU "\" PGL_stop=\""
                    GT_WU "\"/>\n",
                    pglnum + OUTPUTOFFSET,
                    SHOWSTRAND(gth_pgl_is_forward(pgl)),
                    SHOWGENPOS(gth_pgl_is_forward(pgl),
                               gth_pgl_total_length(pgl),
                               gth_pgl_genomic_offset(pgl),
                               pgl->maxrange.start),
                    SHOWGENPOS(gth_pgl_is_forward(pgl),
                               gth_pgl_total_length(pgl),
                               gth_pgl_genomic_offset(pgl),
                               pgl->maxrange.end));
  }

  /* the alternative gene structures of this location */
  while (agsnum < gt_array_size(pgl->assemblies)) {
    show_ags(gth_pgl_get_ags(pgl, agsnum), pglnum, agsnum, translationtable,
             input, indentlevel, out);
    agsnum++;
  }

  if (out->xmlout) {
    /* closing tag on the original indentation level */
    indentlevel--;
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "</predicted_gene_location>\n");
  }
}
/* Writes one FASTQ section to <outfp>: a header line made of <separator>
   and the optional <description>, followed by the first <buffer_length>
   characters of <buffer>. If <width> is non-zero, a line break is inserted
   after every <width> characters. The section always ends with a newline. */
static void gt_fastq_show_buffer(char separator, const char *description,
                                 const char *buffer, GtUword buffer_length,
                                 GtUword width, GtFile *outfp)
{
  GtUword pos, column = 0;

  /* header line */
  gt_file_xfputc(separator, outfp);
  if (description != NULL)
    gt_file_xfputs(description, outfp);
  gt_file_xfputc('\n', outfp);

  /* payload, wrapped to <width> columns if requested */
  for (pos = 0; pos < buffer_length; pos++) {
    if (width != 0 && column == width) {
      gt_file_xfputc('\n', outfp);
      column = 0;
    }
    gt_file_xfputc(buffer[pos], outfp);
    column++;
  }
  gt_file_xfputc('\n', outfp);
}
/* Preface of the text PGL visitor: writes a delimiter line made of
   DELIMITERLINELENGTH times PGLS_DELIMITERCHAR, an empty line and the
   headline stating the number of predicted gene locations
   <num_of_pgls>. */
static void txt_pgl_visitor_preface(GthPGLVisitor *pgl_visitor,
                                    GtUword num_of_pgls)
{
  GthTxtPGLVisitor *visitor = txt_pgl_visitor_cast(pgl_visitor);
  GtFile *outfp = visitor->out->outfp;
  GtUword remaining;

  for (remaining = DELIMITERLINELENGTH; remaining > 0; remaining--) {
    gt_file_xfputc(PGLS_DELIMITERCHAR, outfp);
  }
  gt_file_xprintf(outfp, "\n\n");
  gt_file_xprintf(outfp, "Predicted gene locations (" GT_WU "):\n\n\n",
                  num_of_pgls);
}
/* Writes all ranges stored in <ranges> to <outfp> on a single line, each
   one formatted as "(start,end)", and terminates the line with a newline. */
void gt_ranges_show(GtArray *ranges, GtFile *outfp)
{
  GtUword idx = 0;

  gt_assert(ranges);

  while (idx < gt_array_size(ranges)) {
    const GtRange *current = gt_array_get(ranges, idx);
    gt_file_xprintf(outfp, "(" GT_WU "," GT_WU ")", current->start,
                    current->end);
    idx++;
  }
  gt_file_xfputc('\n', outfp);
}
/* Writes the genomic borders of all exons of <sa> to <outfp> on a single
   line, each exon formatted as "(left,right)", followed by a newline. */
void gth_sa_show_exons(const GthSA *sa, GtFile *outfp)
{
  GtUword exonnum = 0;

  gt_assert(sa);

  while (exonnum < gt_array_size(sa->exons)) {
    const Exoninfo *exon = (Exoninfo*) gt_array_get(sa->exons, exonnum);
    gt_file_xprintf(outfp, "(" GT_WU "," GT_WU ")",
                    exon->leftgenomicexonborder,
                    exon->rightgenomicexonborder);
    exonnum++;
  }
  gt_file_xfputc('\n', outfp);
}
/* Prints the genomic template header of the spliced alignment <sa> to
   <outfp>, followed by an empty line. The line contains the genomic file
   name, the strand, the shown DP range and the sequence description. The
   sequence number is appended if <showseqnums> is set. The genomic file
   number of <sa> must be defined. */
static void showgthgenomicinformation(GthSA *sa, GthInput *input,
                                      bool showseqnums, GtFile *outfp)
{
  const GtUword gen_file_num = gth_sa_gen_file_num(sa);

  gt_assert(gen_file_num != GT_UNDEF_UWORD);

  gt_file_xprintf(outfp,
                  "Genomic Template: file=%s, strand=%c, from=" GT_WU ", "
                  "to=" GT_WU ", description=",
                  gth_input_get_genomic_filename(input, gen_file_num),
                  gth_sa_gen_strand_char(sa),
                  gth_sa_gen_dp_start_show(sa),
                  gth_sa_gen_dp_end_show(sa));

  gth_sa_echo_genomic_description(sa, input, outfp);

  if (showseqnums) {
    gt_file_xprintf(outfp, ", seqnum=" GT_WU "", gth_sa_gen_seq_num(sa));
  }

  /* terminate the line and add an empty one */
  gt_file_xfputc('\n', outfp);
  gt_file_xfputc('\n', outfp);
}
/* Writes a FASTA entry to <outfp>: the header line (GT_FASTA_SEPARATOR plus
   the optional <description>) followed by the first <sequence_length>
   characters of <sequence> and, if <suffix> is not NULL, the complete
   <suffix>. Sequence and suffix are treated as one contiguous string which
   is wrapped after every <width> characters if <width> is non-zero. The
   entry always ends with a newline. */
void gt_fasta_show_entry_with_suffix(const char *description,
                                     const char *sequence,
                                     GtUword sequence_length,
                                     const char *suffix, GtUword width,
                                     GtFile *outfp)
{
  GtUword pos, total_length, suffix_length, column = 0;

  gt_assert(sequence);

  /* header line */
  gt_file_xfputc(GT_FASTA_SEPARATOR, outfp);
  if (description)
    gt_file_xfputs(description, outfp);
  gt_file_xfputc('\n', outfp);

  suffix_length = 0;
  if (suffix)
    suffix_length = strlen(suffix);
  total_length = sequence_length + suffix_length;

  for (pos = 0; pos < total_length; pos++) {
    if (width && column == width) {
      gt_file_xfputc('\n', outfp);
      column = 0;
    }
    /* positions beyond the sequence are taken from the suffix */
    if (pos >= sequence_length)
      gt_file_xfputc(suffix[pos - sequence_length], outfp);
    else
      gt_file_xfputc(sequence[pos], outfp);
    column++;
  }
  gt_file_xfputc('\n', outfp);
}
/* Attribute callback: writes "<attr_name>=<attr_value>" to the file stored
   in the ShowAttributeInfo passed via <data>. The ID and Parent attributes
   are skipped. A ';' is written first if an attribute has been shown
   before; otherwise the shared "attribute shown" flag is set. */
static void show_attribute(const char *attr_name, const char *attr_value,
                           void *data)
{
  ShowAttributeInfo *info = (ShowAttributeInfo*) data;

  gt_assert(attr_name && attr_value && info);

  if (strcmp(attr_name, GT_GFF_ID) == 0 ||
      strcmp(attr_name, GT_GFF_PARENT) == 0) {
    return;
  }

  if (!*info->attribute_shown)
    *info->attribute_shown = true;
  else
    gt_file_xfputc(';', info->outfp);

  gt_file_xprintf(info->outfp, "%s=%s", attr_name, attr_value);
}
/* Writes one "PGS" line per spliced alignment stored (as GthSA pointer) in
   <alignments> to <outfp>. Each line lists the genomic exon borders as
   comma separated "left right" pairs, followed by a tab, the reference id
   and the reference strand character. An empty line closes the list. */
static void outputPGSlines(GtArray *alignments, GtFile *outfp)
{
  GtUword alignmentnum, exonnum;

  for (alignmentnum = 0; alignmentnum < gt_array_size(alignments);
       alignmentnum++) {
    GthSA *sa = *(GthSA**) gt_array_get(alignments, alignmentnum);

    gt_file_xprintf(outfp, " PGS (");
    exonnum = 0;
    while (exonnum < gth_sa_num_of_exons(sa)) {
      if (exonnum != 0)
        gt_file_xfputc(',', outfp);
      gt_file_xprintf(outfp, GT_WU " " GT_WU,
                      gth_sa_left_genomic_exon_border(sa, exonnum),
                      gth_sa_right_genomic_exon_border(sa, exonnum));
      exonnum++;
    }
    gt_file_xprintf(outfp, ")\t%s%c\n", gth_sa_ref_id(sa),
                    gth_sa_ref_strand_char(sa));
  }
  gt_file_xfputc('\n', outfp);
}
/* Writes the "AGS" line of the alternative gene structure <ags> with number
   <agsnum> to <outfp>: the exon ranges are listed as comma separated
   "start end" pairs of shown genomic positions. */
static void outputAGSline(const GthAGS *ags, GtUword agsnum, GtFile *outfp)
{
  GtUword exonnum = 0;

  gt_file_xprintf(outfp, "AGS-" GT_WU " (", agsnum + OUTPUTOFFSET);

  while (exonnum < gth_ags_num_of_exons(ags)) {
    GthExonAGS *current = gth_ags_get_exon(ags, exonnum);

    if (exonnum != 0)
      gt_file_xfputc(',', outfp);
    gt_file_xprintf(outfp, GT_WU " " GT_WU,
                    SHOWGENPOSAGS(current->range.start),
                    SHOWGENPOSAGS(current->range.end));
    exonnum++;
  }

  gt_file_xprintf(outfp, ")\n");
}
/* Writes the exon and intron lines of the alternative gene structure <ags>
   to <outfp>, followed by an empty line. Every exon except the first one is
   preceded by the line of the intron separating it from its predecessor;
   the intron line also shows the donor and acceptor site probabilities.
   Genomic positions are printed with a minimum width of
   <widthforgenpos>. */
static void output_exon_intron_lines(const GthAGS *ags, int widthforgenpos,
                                     GtFile *outfp)
{
  GtUword exonnum, exonstart, exonend,
          intronstart = GT_UNDEF_UWORD, /* set after the first exon */
          intronend;
  GthFlt donorprob, acceptorprob;
  GthDbl score;

  for (exonnum = 0; exonnum < gt_array_size(ags->exons); exonnum++) {
    const GthExonAGS *current = (GthExonAGS*) gt_array_get(ags->exons,
                                                           exonnum);
    exonstart = current->range.start;
    exonend = current->range.end;
    score = current->score;

    if (exonnum > 0) {
      /* the intron between the previous and the current exon */
      const GthSpliceSiteProb *probs =
        (GthSpliceSiteProb*) gt_array_get(ags->splicesiteprobs, exonnum - 1);
      intronend = exonstart - 1;
      donorprob = probs->donorsiteprob;
      acceptorprob = probs->acceptorsiteprob;

      gt_file_xprintf(outfp, " Intron %2" GT_WUS " %*" GT_WUS " %*" GT_WUS
                      " (%4" GT_WUS " n); " "Pd: %5.3f Pa: %5.3f\n",
                      exonnum - 1 + OUTPUTOFFSET,
                      widthforgenpos, SHOWGENPOSAGS(intronstart),
                      widthforgenpos, SHOWGENPOSAGS(intronend),
                      intronend - intronstart + 1,
                      donorprob, acceptorprob);
    }

    /* the next intron starts right behind the current exon */
    intronstart = exonend + 1;

    gt_file_xprintf(outfp, " Exon %2" GT_WUS " %*" GT_WUS " %*" GT_WUS
                    " (%4" GT_WUS " n); score: %5.3f\n",
                    exonnum + OUTPUTOFFSET,
                    widthforgenpos, SHOWGENPOSAGS(exonstart),
                    widthforgenpos, SHOWGENPOSAGS(exonend),
                    exonend - exonstart + 1,
                    score);
  }
  gt_file_xfputc('\n', outfp);
}
/* Writes the "SCR" (score) line of the alternative gene structure <ags> to
   <outfp>, followed by an empty line. For every exon except the last one
   the exon score and the donor/acceptor site probabilities of the
   following intron are shown; the last exon is shown with its score only.
   <ags> must contain at least one exon. */
static void outputSCRline(const GthAGS *ags, GtFile *outfp)
{
  GthSpliceSiteProb *splicesiteprob;
  GtUword i, num_of_exons;

  gt_assert(ags);
  num_of_exons = gt_array_size(ags->exons);
  /* without this precondition <num_of_exons - 1> would wrap around and the
     loop below would read far beyond the end of both arrays */
  gt_assert(num_of_exons > 0);

  gt_file_xprintf(outfp, "SCR (");
  for (i = 0; i < num_of_exons - 1; i++) {
    splicesiteprob = (GthSpliceSiteProb*)
                     gt_array_get(ags->splicesiteprobs, i);
    gt_file_xprintf(outfp, "e %5.3f d %5.3f a %5.3f,",
                    ((GthExonAGS*) gt_array_get(ags->exons, i))->score,
                    splicesiteprob->donorsiteprob,
                    splicesiteprob->acceptorsiteprob);
  }
  /* here i == num_of_exons - 1, i.e., the index of the last exon */
  gt_file_xprintf(outfp, "e %5.3f)\n",
                  ((GthExonAGS*) gt_array_get(ags->exons, i))->score);
  gt_file_xfputc('\n', outfp);
}
/* Writes the first eight tab separated GFF3 columns of the feature node
   <fn> to <outfp> (sequence id, source, type, start, end, score, strand and
   phase), including the tab which precedes the attribute column. An
   undefined score is shown as '.'. */
void gt_gff3_output_leading(GtFeatureNode *fn, GtFile *outfp)
{
  GtGenomeNode *node;

  gt_assert(fn);
  node = (GtGenomeNode*) fn;

  /* seqid, source, type, start, end */
  gt_file_xprintf(outfp, "%s\t%s\t%s\t" GT_WU "\t" GT_WU "\t",
                  gt_str_get(gt_genome_node_get_seqid(node)),
                  gt_feature_node_get_source(fn),
                  gt_feature_node_get_type(fn),
                  gt_genome_node_get_start(node),
                  gt_genome_node_get_end(node));

  /* score */
  if (!gt_feature_node_score_is_defined(fn))
    gt_file_xfputc('.', outfp);
  else
    gt_file_xprintf(outfp, "%.3g", gt_feature_node_get_score(fn));

  /* strand, phase */
  gt_file_xprintf(outfp, "\t%c\t%c\t",
                  GT_STRAND_CHARS[gt_feature_node_get_strand(fn)],
                  GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]);
}
/* Prints the "classic" GeneSeqer2 PGS line of the spliced alignment <sa> to
   <outfp>: the genomic and reference ids (each followed by its strand
   character), a tab and the parenthesized, comma separated list of genomic
   exon borders. The line is followed by an empty line. */
static void showpgsline(GthSA *sa, GtFile *outfp)
{
  GtUword exonnum, numofexons;

  gt_assert(sa);
  numofexons = gth_sa_num_of_exons(sa);

  gt_file_xprintf(outfp, "PGS_%s%c_%s%c\t(",
                  gth_sa_gen_id(sa), gth_sa_gen_strand_char(sa),
                  gth_sa_ref_id(sa), gth_sa_ref_strand_char(sa));

  for (exonnum = 0; exonnum < numofexons; exonnum++) {
    gt_file_xprintf(outfp, "" GT_WU " " GT_WU "",
                    gth_sa_left_genomic_exon_border(sa, exonnum),
                    gth_sa_right_genomic_exon_border(sa, exonnum));
    /* exons are separated by commas, the last one closes the list */
    if (exonnum + 1 < numofexons)
      gt_file_xfputc(',', outfp);
    else
      gt_file_xprintf(outfp, ")\n\n");
  }
}
/* Shows the header of the spliced alignment <sa> on <outfp>: the headline
   "Predicted gene structure" (with the shown gDNA segment if <gs2out> is
   set), one line per exon and intron, the PPA line (DNA references only),
   the MATCH line and the PGS line. Genomic positions are printed with a
   minimum width of <widthforgenpos>. Introns which are not longer than
   <minintronlength> are marked with two question marks. */
static void showalignmentheader(GthSA *sa, bool gs2out, int widthforgenpos,
                                GtUword minintronlength, GtFile *outfp)
{
  GtUword exonnum, intronnum, refleft, refright, refexonlength;
  GthDbl score, donorscore, acceptorscore;
  GthFlt donorprob, acceptorprob;
  Exoninfo *exon;
  Introninfo *intron;

  /* headline */
  gt_file_xprintf(outfp, "Predicted gene structure");
  if (!gs2out)
    gt_file_xprintf(outfp, ":\n");
  else {
    gt_file_xprintf(outfp, " (within gDNA segment " GT_WU " to " GT_WU "):\n",
                    gth_sa_gen_dp_start_show(sa), gth_sa_gen_dp_end_show(sa));
  }
  gt_file_xfputc('\n', outfp);

  for (exonnum = 0; exonnum < gth_sa_num_of_exons(sa); exonnum++) {
    exon = gth_sa_get_exon(sa, exonnum);
    refleft = exon->leftreferenceexonborder;
    refright = exon->rightreferenceexonborder;
    refexonlength = refright - refleft + 1;
    score = exon->exonscore;

    if (exonnum > 0) {
      /* the intron preceding the current exon */
      intronnum = exonnum - 1;
      intron = gth_sa_get_intron(sa, intronnum);
      donorprob = intron->donorsiteprobability;
      donorscore = intron->donorsitescore;
      acceptorprob = intron->acceptorsiteprobability;
      acceptorscore = intron->acceptorsitescore;

      gt_file_xprintf(outfp, " Intron %2" GT_WUS " %*" GT_WUS " %*" GT_WUS
                      " (%4" GT_WUS " n); ",
                      intronnum + OUTPUTOFFSET,
                      widthforgenpos, gth_sa_left_intron_border(sa, intronnum),
                      widthforgenpos,
                      gth_sa_right_intron_border(sa, intronnum),
                      gth_sa_intron_length(sa, intronnum));

      /* donor site: the score is only shown for DNA references */
      gt_file_xprintf(outfp, "Pd: %5.3f ", donorprob);
      if (gth_sa_alphatype(sa) != DNA_ALPHA)
        gt_file_xprintf(outfp, " ");
      else if (donorscore == 0.0)
        gt_file_xprintf(outfp, "(s: 0), ");
      else
        gt_file_xprintf(outfp, "(s: %4.2f), ", donorscore);

      /* acceptor site: the score is only shown for DNA references */
      gt_file_xprintf(outfp, "Pa: %5.3f ", acceptorprob);
      if (gth_sa_alphatype(sa) == DNA_ALPHA) {
        if (acceptorscore != 0.0)
          gt_file_xprintf(outfp, "(s: %4.2f)", acceptorscore);
        else
          gt_file_xprintf(outfp, "(s: 0)");
      }

      /* introns which are not longer than the minimum intron length are
         marked with two question marks at the end of the line */
      if (gth_sa_intron_length(sa, intronnum) <= minintronlength)
        gt_file_xprintf(outfp, " ??");
      gt_file_xfputc('\n', outfp);
    }

    gt_file_xprintf(outfp, " Exon %2" GT_WUS " %*" GT_WUS " %*" GT_WUS
                    " (%4" GT_WUS " n); %s %6" GT_WUS " %6" GT_WUS " (%4"
                    GT_WUS " %s); " "score: %5.3f\n",
                    exonnum + OUTPUTOFFSET,
                    widthforgenpos, gth_sa_left_genomic_exon_border(sa, exonnum),
                    widthforgenpos,
                    gth_sa_right_genomic_exon_border(sa, exonnum),
                    gth_sa_genomic_exon_length(sa, exonnum),
                    gth_sa_alphastring(sa),
                    refleft + OUTPUTOFFSET,
                    refright + OUTPUTOFFSET,
                    refexonlength,
                    gth_sa_alphatype(sa) == DNA_ALPHA ? "n" : "aa",
                    score);
  }

  /* PPA line (shown if a poly-A tail was determined; DNA only) */
  if (gth_sa_alphatype(sa) == DNA_ALPHA)
    showppaline(sa, outfp);
  gt_file_xfputc('\n', outfp);

  /* MATCH line */
  showmatchline(sa, outfp);

  /* PGS line */
  showpgsline(sa, outfp);
}
/* Shows the statistics collected in <stat> on <outfp>: the enabled
   distributions (each one separated from the previous one by a line
   containing only COMMENTCHAR), the general statistics, the memory
   statistics, the finishing date and, if any of the corresponding counters
   is non-zero, a list of important messages. If <xmlout> is set, the
   complete output is wrapped into an XML comment. <show_full_stats> is
   passed on to the general and memory statistics output. */
void gth_stat_show(GthStat *stat, bool show_full_stats, bool xmlout,
                   GtFile *outfp)
{
  bool need_separator = false, /* true as soon as a distribution was shown */
       has_messages;
  char *timestring;

  gt_assert(stat);

  /* begin XML comment */
  if (xmlout)
    gt_file_xprintf(outfp, "<!--\n");

  /* exon length distribution */
  if (stat->exondistri) {
    gt_file_xprintf(outfp, "%c length distribution of all exons:\n",
                    COMMENTCHAR);
    gt_disc_distri_show(stat->exondistribution, outfp);
    need_separator = true;
  }

  /* intron length distribution */
  if (stat->introndistri) {
    if (need_separator)
      gt_file_xprintf(outfp, "%c\n", COMMENTCHAR);
    gt_file_xprintf(outfp, "%c length distribution of all introns:\n",
                    COMMENTCHAR);
    gt_disc_distri_show(stat->introndistribution, outfp);
    need_separator = true;
  }

  /* match number distribution */
  if (stat->matchnumdistri) {
    if (need_separator)
      gt_file_xprintf(outfp, "%c\n", COMMENTCHAR);
    gt_file_xprintf(outfp, "%c distribution of match numbers (per genomic "
                    "file, per reference sequence:\n", COMMENTCHAR);
    gt_disc_distri_show(stat->matchnumdistribution, outfp);
    need_separator = true;
  }

  /* reference sequence coverage distribution */
  if (stat->refseqcovdistri) {
    if (need_separator)
      gt_file_xprintf(outfp, "%c\n", COMMENTCHAR);
    gt_file_xprintf(outfp, "%c reference sequence coverage distribution (of "
                    "global chains):\n", COMMENTCHAR);
    gt_disc_distri_show(stat->refseqcoveragedistribution, outfp);
    need_separator = true;
  }

  /* spliced alignment statistics */
  if (stat->sa_stats) {
    if (need_separator) {
      gt_file_xprintf(outfp, "%c\n", COMMENTCHAR);
    }
    INFOCHAR;
    gt_file_xprintf(outfp,
                    "spliced alignment alignment score distribution:\n");
    gt_disc_distri_show(stat->sa_alignment_score_distribution, outfp);
    INFOCHAR;
    gt_file_xfputc('\n', outfp);
    INFOCHAR;
    gt_file_xprintf(outfp, "spliced alignment coverage distribution:\n");
    gt_disc_distri_show(stat->sa_coverage_distribution, outfp);
  }

  /* general statistics */
  outputgeneralstatistics(stat, show_full_stats, outfp);
  INFOCHAR;
  gt_file_xfputc('\n', outfp);

  /* memory statistics */
  outputmemorystatistics(stat, show_full_stats, outfp);

  /* finishing date */
  INFOCHAR;
  gt_file_xfputc('\n', outfp);
  INFOCHAR;
  timestring = gth_get_time();
  gt_file_xprintf(outfp, "date finished: %s\n", timestring);
  gt_free(timestring);

  /* important messages (only shown if at least one counter is non-zero) */
  has_messages = stat->numofremovedzerobaseexons ||
                 stat->numofautointroncutoutcalls ||
                 stat->numofunsuccessfulintroncutoutDPs ||
                 stat->numoffailedDPparameterallocations ||
                 stat->numoffailedmatrixallocations ||
                 stat->numofundeterminedSAs ||
                 stat->numoffilteredpolyAtailmatches;
  if (has_messages) {
    gt_file_xprintf(outfp, "%c\n", COMMENTCHAR);
    gt_file_xprintf(outfp, "%c important messages:\n", COMMENTCHAR);

    if (stat->numofremovedzerobaseexons > 0) {
      gt_file_xprintf(outfp, "%c %lu removed zero base exons\n",
                      COMMENTCHAR, stat->numofremovedzerobaseexons);
    }
    if (stat->numofautointroncutoutcalls > 0) {
      gt_file_xprintf(outfp, "%c %lu times the intron cutout technique was "
                      "used automatically\n", COMMENTCHAR,
                      stat->numofautointroncutoutcalls);
    }
    if (stat->numofunsuccessfulintroncutoutDPs > 0) {
      gt_file_xprintf(outfp, "%c %lu unsuccessful DP calls using intron "
                      "cutout technique\n", COMMENTCHAR,
                      stat->numofunsuccessfulintroncutoutDPs);
    }
    if (stat->numoffailedDPparameterallocations > 0) {
      gt_file_xprintf(outfp, "%c %lu DP parameter allocations failed\n",
                      COMMENTCHAR, stat->numoffailedDPparameterallocations);
    }
    if (stat->numoffailedmatrixallocations > 0) {
      gt_file_xprintf(outfp, "%c %lu matrix allocations failed\n",
                      COMMENTCHAR, stat->numoffailedmatrixallocations);
    }
    if (stat->numofundeterminedSAs > 0) {
      gt_file_xprintf(outfp, "%c %lu undetermined spliced alignments\n",
                      COMMENTCHAR, stat->numofundeterminedSAs);
    }
    if (stat->numoffilteredpolyAtailmatches > 0) {
      gt_file_xprintf(outfp,
                      "%c %lu matches containing a poly(A) tail filtered\n",
                      COMMENTCHAR, stat->numoffilteredpolyAtailmatches);
    }
  }

  /* end XML comment */
  if (xmlout)
    gt_file_xprintf(outfp, "-->\n");
}
/* Runner of the condenseq extract tool. Loads the condenseq archive named
   by <argv>[<parsed_args>] and writes the requested part of it to
   <arguments>->outfp:
   - if a position range is given (or sequence numbers are given together
     with the mode "concat"), the decoded characters of that range are
     written as one string, wrapped after <arguments>->width characters if
     the width is non-zero;
   - otherwise the selected sequences (all sequences if nothing was
     selected) are written one by one via gt_fasta_show_entry_nt().
   Note that <arguments>->seqrange and <arguments>->range are modified while
   the request is normalized.
   Returns 0 on success and -1 on failure; for the range checks below <err>
   is set explicitly. */
static int gt_condenseq_extract_runner(GT_UNUSED int argc, const char **argv,
                                       int parsed_args, void *tool_arguments,
                                       GtError *err)
{
  int had_err = 0;
  GtCondenserExtractArguments *arguments = tool_arguments;
  GtCondenseq *condenseq = NULL;
  GtLogger *logger = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  /* load the archive; a NULL result is treated as failure */
  if (!had_err) {
    condenseq = gt_condenseq_new_from_file(argv[parsed_args], logger, err);
    if (condenseq == NULL) {
      had_err = -1;
    }
  }

  if (!had_err) {
    const char *buffer = NULL;
    const char *desc = NULL;
    /* <rend> starts as the total length and <send> as the number of
       sequences; both are later overwritten with the end of the requested
       position range and sequence range, respectively */
    GtUword desclen, seqlen,
            rend = gt_condenseq_total_length(condenseq),
            send = gt_condenseq_num_of_sequences(condenseq);
    bool concat = strcmp(gt_str_get(arguments->mode), "concat") == 0;

    /* single sequence to extract = range of length 1 */
    if (arguments->seq != GT_UNDEF_UWORD) {
      arguments->seqrange.start = arguments->seqrange.end = arguments->seq;
    }

    /* no range given at all: extract all seqs */
    if (arguments->range.start == GT_UNDEF_UWORD &&
        arguments->seqrange.start == GT_UNDEF_UWORD) {
      arguments->seqrange.start = 0;
      arguments->seqrange.end = send - 1;
    }

    /* if seqs are specified, and concat is given, switch to posrange:
       the position range spans from the first position of the first
       selected sequence to the last position of the last selected one */
    if (concat && arguments->seqrange.start != GT_UNDEF_UWORD) {
      if (arguments->seqrange.end >= send) {
        had_err = -1;
        gt_error_set(err, "range end " GT_WU " excedes number of sequences "
                     GT_WU " (ranges are zero based sequence ids)",
                     arguments->seqrange.end, send);
      }
      else {
        arguments->range.start =
          gt_condenseq_seqstartpos(condenseq, arguments->seqrange.start);
        arguments->range.end =
          gt_condenseq_seqstartpos(condenseq, arguments->seqrange.end) +
          gt_condenseq_seqlength(condenseq, arguments->seqrange.end) - 1;
      }
    }

    /* extract sequence region */
    if (!had_err && arguments->range.start != GT_UNDEF_UWORD) {
      const GtUword maxbuffsize = ((GtUword) 1) << 17; /* 2^17 = 131072 */
      GtUword clen, rstart,
              current_length = 0, /* characters on the current output line */
              i;
      /* only the first character of the separator string is used */
      const char sepchar = gt_str_get(arguments->sepchar)[0];

      /* at this point <rend> still holds the total length */
      if (arguments->range.end >= rend) {
        had_err = -1;
        gt_error_set(err, "range end " GT_WU " excedes length of sequence "
                     GT_WU " (ranges are zero based positions)",
                     arguments->range.end, rend);
      }
      if (!had_err) {
        rstart = arguments->range.start;
        rend = arguments->range.end;
        /* nextlength = gt_condenseq_seqlength(condenseq, seqnum); */
        /* seqstart = gt_condenseq_seqstartpos(condenseq, seqnum); */
        /* gt_assert(rstart >= seqstart); */
        /* nextlength -= rstart - seqstart; [> handle first seq <] */

        /* the range is decoded and written chunk by chunk */
        while (rstart <= rend) {
          GtRange cur_range;
          if (rend - rstart > maxbuffsize) {
            /* more than <maxbuffsize> characters left: the chunk ends at
               the position directly before the start of the sequence
               containing position <rstart> + <maxbuffsize>.
               NOTE(review): the assertion below requires that position to
               lie behind <rstart>; presumably this fails if a single
               sequence is longer than <maxbuffsize> -- confirm */
            GtUword seqnum = gt_condenseq_pos2seqnum(condenseq,
                                                     rstart + maxbuffsize),
                    closest_sep = gt_condenseq_seqstartpos(condenseq,
                                                           seqnum) - 1;
            gt_assert(closest_sep > rstart);
            clen = closest_sep - rstart + 1;
          }
          else
            clen = rend - rstart + 1;
          cur_range.start = rstart;
          cur_range.end = rstart + clen - 1;
          buffer = gt_condenseq_extract_decoded_range(condenseq, cur_range,
                                                      sepchar);
          gt_assert(buffer != NULL);
          /* <current_length> is not reset between chunks, hence the line
             wrapping continues across chunk borders */
          for (i = 0; i < clen; i++, current_length++) {
            if (arguments->width && current_length == arguments->width) {
              gt_file_xfputc('\n', arguments->outfp);
              current_length = 0;
            }
            gt_file_xfputc(buffer[i], arguments->outfp);
          }
          rstart += clen;
        }
        gt_file_xfputc('\n', arguments->outfp);
      }
    }
    else if (!had_err) {
      /* extract seqwise and always fasta */
      GtUword seqnum,
              sstart = arguments->seqrange.start;

      if (arguments->seqrange.end >= send) {
        had_err = -1;
        gt_error_set(err, "range end " GT_WU " excedes number of sequences "
                     GT_WU " (ranges are zero based sequence ids)",
                     arguments->seqrange.end, send);
      }
      /* from here on <send> is the number of the last sequence to show */
      send = arguments->seqrange.end;
      for (seqnum = sstart; !had_err && seqnum <= send; ++seqnum) {
        buffer = gt_condenseq_extract_decoded(condenseq, &seqlen, seqnum);
        desc = gt_condenseq_description(condenseq, &desclen, seqnum);
        gt_fasta_show_entry_nt(desc, desclen, buffer, seqlen,
                               arguments->width, arguments->outfp);
      }
    }
  }

  gt_condenseq_delete(condenseq);
  gt_logger_delete(logger);
  return had_err;
}
/* Runner of the convertseq tool. Reads all sequences from the files given
   in <argv>[<parsed_args>..<argc>-1] and, unless <arguments>->showseq is
   set, writes them in FASTA format to <arguments>->outfp.
   - With <arguments>->revcomp the reverse complement of each sequence is
     shown.
   - With <arguments>->reduce_wc_dna every run of characters other than
     a/c/g/t/u (in any case) is reduced to a single 'N' or 'n', depending
     on the case of the first character of the run.
   - With <arguments>->reduce_wc_prot every run of the characters X/B/Z
     (x/b/z) is reduced to a single 'N' ('n').
   - With <arguments>->fastawidth > 0 a line break is inserted after every
     <fastawidth> emitted characters.
   - With <arguments>->showflv the file length values are reported on
     stderr.
   Returns 0 on success and -1 on failure. */
static int gt_convertseq_runner(int argc, const char **argv, int parsed_args,
                                void *tool_arguments, GtError *err)
{
  GtConvertseqArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtFilelengthvalues *flv;
  GtSeqIterator *seqit;
  GtSequenceBuffer *sb = NULL;
  GtStrArray *files;
  const GtUchar *sequence;
  char *desc;
  GtUword len, j, pos;
  off_t totalsize;

  gt_error_check(err);
  gt_assert(arguments != NULL);

  /* collect the input files */
  files = gt_str_array_new();
  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(files, argv[i]);
  }
  totalsize = gt_files_estimate_total_size(files);

  flv = gt_calloc((size_t) gt_str_array_size(files),
                  sizeof (GtFilelengthvalues));

  sb = gt_sequence_buffer_new_guess_type(files, err);
  if (!sb) {
    had_err = -1;
  }

  if (!had_err) {
    gt_sequence_buffer_set_filelengthtab(sb, flv);
    /* read input using seqiterator */
    seqit = gt_seq_iterator_sequence_buffer_new_with_buffer(sb);
    if (arguments->verbose) {
      gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                         (GtUint64) totalsize),
                           (GtUint64) totalsize);
    }
    while (true) {
      GtUchar *seq = NULL;
      desc = NULL;
      j = 0UL; /* number of characters emitted on the current line */
      had_err = gt_seq_iterator_next(seqit, &sequence, &len, &desc, err);
      /* the loop only continues while the iterator returns 1 */
      if (had_err != 1)
        break;
      if (arguments->revcomp) {
        /* the reverse complement is computed on a private copy */
        GtUchar *newseq = gt_calloc((size_t) len+1, sizeof (GtUchar));
        memcpy(newseq, sequence, (size_t) len*sizeof (GtUchar));
        had_err = gt_reverse_complement((char*) newseq, len, err);
        if (had_err) {
          gt_free(newseq); /* do not leak the copy on failure */
          break;
        }
        seq = newseq;
      }
      else
        seq = (GtUchar*) sequence;

      if (!arguments->showseq) {
        bool in_wildcard = false;
        gt_file_xprintf(arguments->outfp, ">%s\n", desc);
        for (pos = 0; pos < len; pos++) {
          /* the character to emit for <seq[pos]>; a negative value means
             that the character is swallowed (continued wildcard run) */
          int outchar = -1;

          if (arguments->reduce_wc_dna) {
            switch (seq[pos]) {
              case 'a': case 'A':
              case 'c': case 'C':
              case 'g': case 'G':
              case 't': case 'u':
              case 'T': case 'U':
                in_wildcard = false;
                outchar = (int) seq[pos];
                break;
              default:
                if (!in_wildcard) {
                  in_wildcard = true;
                  outchar = isupper((int) seq[pos]) ? (int) 'N' : (int) 'n';
                }
            }
          }
          else if (arguments->reduce_wc_prot) {
            switch (seq[pos]) {
              case 'X':
              case 'B':
              case 'Z':
                if (!in_wildcard) {
                  in_wildcard = true;
                  outchar = (int) 'N';
                }
                break;
              case 'x':
              case 'b':
              case 'z':
                if (!in_wildcard) {
                  in_wildcard = true;
                  outchar = (int) 'n';
                }
                break;
              default:
                in_wildcard = false;
                outchar = (int) seq[pos];
            }
          }
          else
            outchar = (int) seq[pos];

          /* nothing emitted: the line must not be wrapped (again) */
          if (outchar < 0)
            continue;

          gt_file_xfputc(outchar, arguments->outfp);
          if (arguments->fastawidth > 0 && ++j == arguments->fastawidth) {
            j = 0;
            gt_file_xprintf(arguments->outfp, "\n");
          }
        }
        /* terminate an unfinished line; decided by the number of emitted
           characters, which differs from <len> if wildcards were reduced */
        if (arguments->fastawidth == 0 || j != 0)
          gt_file_xprintf(arguments->outfp, "\n");
      }
      if (arguments->revcomp) {
        gt_free(seq);
      }
    }
    if (arguments->showflv) {
      for (j = 0; j < gt_str_array_size(files); j++) {
        fprintf(stderr, "file " GT_WU " (%s): " GT_WU "/" GT_WU "\n",
                j,
                gt_str_array_get(files, j),
                (GtUword) flv[j].length,
                (GtUword) flv[j].effectivelength);
      }
    }
    if (arguments->verbose) {
      gt_progressbar_stop();
    }
    gt_sequence_buffer_delete(sb);
    gt_seq_iterator_delete(seqit);
  }

  gt_str_array_delete(files);
  gt_free(flv);

  return had_err;
}
/* Shows the translation of the spliced sequence <splicedseq> in all three
   frames (<frame0_in>, <frame1_in>, <frame2_in>) on <out>->outfp. The output
   lines are built by createoutputlines(). For XML output they are written
   (with all newlines removed) as children of a <translation> element,
   otherwise they are passed to showoutputlines(). */
static void showtranslation(GthSplicedSeq *splicedseq, char *frame0_in,
                            char *frame1_in, char *frame2_in, GtArray *exons,
                            bool gen_strand_forward,
                            unsigned long gen_total_length,
                            unsigned long gen_offset,
                            unsigned int indentlevel, GthOutput *out)
{
  const char *opentags[] = { "<gDNA_template>", "<first_frame>",
                             "<second_frame>", "<third_frame>" },
             *closetags[] = { "</gDNA_template>\n", "</first_frame>\n",
                              "</second_frame>\n", "</third_frame>\n" };
  char *dotline, *template_out, *frame0_out, *frame1_out, *frame2_out,
       *xmllines[4];
  unsigned long pos, linenum,
                exonseparatorwidth = strlen(EXONSEPARATORSTRING),
                /* spliced sequence + exon separators + line breaks */
                outlen = splicedseq->splicedseqlen +
                         ((gt_array_size(exons) - 1) * exonseparatorwidth) +
                         (splicedseq->splicedseqlen / TRANSLATIONLINEWIDTH);
  GtFile *outfp = out->outfp;

  dotline      = gt_malloc(sizeof (unsigned char) * outlen);
  template_out = gt_malloc(sizeof (unsigned char) * outlen);
  frame0_out   = gt_malloc(sizeof (unsigned char) * outlen);
  frame1_out   = gt_malloc(sizeof (unsigned char) * outlen);
  frame2_out   = gt_malloc(sizeof (unsigned char) * outlen);

  createoutputlines(dotline, template_out, frame0_out, frame1_out, frame2_out,
                    (char*) splicedseq->splicedseq, frame0_in, frame1_in,
                    frame2_in, splicedseq, exonseparatorwidth, outlen,
                    out->gs2out);

  if (!out->xmlout) {
    showoutputlines(dotline, template_out, frame0_out, frame1_out, frame2_out,
                    outlen, gen_strand_forward, gen_total_length, gen_offset,
                    splicedseq->positionmapping, out);
  }
  else {
    xmllines[0] = template_out;
    xmllines[1] = frame0_out;
    xmllines[2] = frame1_out;
    xmllines[3] = frame2_out;

    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<translation>\n");

    /* one child element per line; newlines in the lines are skipped */
    for (linenum = 0; linenum < 4; linenum++) {
      gth_indent(outfp, indentlevel + 1);
      gt_file_xprintf(outfp, "%s", opentags[linenum]);
      for (pos = 0; pos < outlen; pos++) {
        if (xmllines[linenum][pos] != '\n')
          gt_file_xfputc(xmllines[linenum][pos], outfp);
      }
      gt_file_xprintf(outfp, "%s", closetags[linenum]);
    }

    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "</translation>\n");
  }

  gt_free(dotline);
  gt_free(template_out);
  gt_free(frame0_out);
  gt_free(frame1_out);
  gt_free(frame2_out);
}
static int calc_spliced_alignments(GthSACollection *sa_collection, GthChainCollection *chain_collection, GthCallInfo *call_info, GthInput *input, GthStat *stat, GtUword gen_file_num, GtUword ref_file_num, bool directmatches, GthMatchInfo *match_info, GthDNACompletePathMatrixJT dna_complete_path_matrix_jt, GthProteinCompletePathMatrixJT protein_complete_path_matrix_jt) { const unsigned char *ref_seq_tran, *ref_seq_orig, *ref_seq_tran_rc = NULL, *ref_seq_orig_rc = NULL; GtUword chainctr, gen_offset = GT_UNDEF_UWORD, gen_total_length, ref_total_length; GtFile *outfp = call_info->out->outfp; GtRange gen_seq_bounds, gen_seq_bounds_rc; bool refseqisdna; GthChain *chain; GtRange range; GthSA *saA; int rval; gt_assert(sa_collection && chain_collection); refseqisdna = gth_input_ref_file_is_dna(input, ref_file_num); for (chainctr = 0; chainctr < gth_chain_collection_size(chain_collection); chainctr++) { chain = gth_chain_collection_get(chain_collection, chainctr); if (++match_info->call_number > call_info->firstalshown && call_info->firstalshown > 0) { if (!(call_info->out->xmlout || call_info->out->gff3out)) gt_file_xfputc('\n', outfp); else if (call_info->out->xmlout) gt_file_xprintf(outfp, "<!--\n"); if (!call_info->out->gff3out) { gt_file_xprintf(outfp, "Maximal matching %s count (%u) reached.\n", refseqisdna ? 
"EST" : "protein", call_info->firstalshown); gt_file_xprintf(outfp, "Only the first %u matches will be " "displayed.\n", call_info->firstalshown); } if (!(call_info->out->xmlout || call_info->out->gff3out)) gt_file_xfputc('\n', outfp); else if (call_info->out->xmlout) gt_file_xprintf(outfp, "-->\n"); match_info->max_call_number_reached = true; break; /* break out of loop */ } /* compute considered genomic regions if not set by -frompos */ if (!gth_input_use_substring_spec(input)) { gen_seq_bounds = gth_input_get_genomic_range(input, chain->gen_file_num, chain->gen_seq_num); gen_total_length = gt_range_length(&gen_seq_bounds); gen_offset = gen_seq_bounds.start; gen_seq_bounds_rc = gen_seq_bounds; } else { /* genomic multiseq contains exactly one sequence */ gt_assert(gth_input_num_of_gen_seqs(input, chain->gen_file_num) == 1); gen_total_length = gth_input_genomic_file_total_length(input, chain ->gen_file_num); gen_seq_bounds.start = gth_input_genomic_substring_from(input); gen_seq_bounds.end = gth_input_genomic_substring_to(input); gen_offset = 0; gen_seq_bounds_rc.start = gen_total_length - 1 - gen_seq_bounds.end; gen_seq_bounds_rc.end = gen_total_length - 1 - gen_seq_bounds.start; } /* "retrieving" the reference sequence */ range = gth_input_get_reference_range(input, chain->ref_file_num, chain->ref_seq_num); ref_seq_tran = gth_input_current_ref_seq_tran(input) + range.start; ref_seq_orig = gth_input_current_ref_seq_orig(input) + range.start; if (refseqisdna) { ref_seq_tran_rc = gth_input_current_ref_seq_tran_rc(input) + range.start; ref_seq_orig_rc = gth_input_current_ref_seq_orig_rc(input) + range.start; } ref_total_length = range.end - range.start + 1; /* check if protein sequences have a stop amino acid */ if (!refseqisdna && !match_info->stop_amino_acid_warning && ref_seq_orig[ref_total_length - 1] != GT_STOP_AMINO) { GtStr *ref_id = gt_str_new(); gth_input_save_ref_id(input, ref_id, chain->ref_file_num, chain->ref_seq_num); gt_warning("protein sequence '%s' 
(#" GT_WU " in file %s) does not end " "with a stop amino acid ('%c'). If it is not a protein " "fragment you should add a stop amino acid to improve the " "prediction. For example with `gt seqtransform " "-addstopaminos` (see http://genometools.org for details).", gt_str_get(ref_id), chain->ref_seq_num, gth_input_get_reference_filename(input, chain->ref_file_num), GT_STOP_AMINO); match_info->stop_amino_acid_warning = true; gt_str_delete(ref_id); } /* allocating space for alignment */ saA = gth_sa_new_and_set(directmatches, true, input, chain->gen_file_num, chain->gen_seq_num, chain->ref_file_num, chain->ref_seq_num, match_info->call_number, gen_total_length, gen_offset, ref_total_length); /* extend the DP borders to the left and to the right */ gth_chain_extend_borders(chain, &gen_seq_bounds, &gen_seq_bounds_rc, gen_total_length, gen_offset); /* From here on the dp positions always refer to the forward strand of the genomic DNA. */ /* call the Dynamic Programming */ if (refseqisdna) { rval = call_dna_DP(directmatches, call_info, input, stat, sa_collection, saA, gen_file_num, ref_file_num, gen_total_length, gen_offset, &gen_seq_bounds, &gen_seq_bounds_rc, ref_total_length, range.start, chainctr, gth_chain_collection_size(chain_collection), match_info, ref_seq_tran, ref_seq_orig, ref_seq_tran_rc, ref_seq_orig_rc, chain, dna_complete_path_matrix_jt, protein_complete_path_matrix_jt); } else { rval = call_protein_DP(directmatches, call_info, input, stat, sa_collection, saA, gen_file_num, ref_file_num, gen_total_length, gen_offset, &gen_seq_bounds, &gen_seq_bounds_rc, ref_total_length, range.start, chainctr, gth_chain_collection_size(chain_collection), match_info, ref_seq_tran, ref_seq_orig, chain, dna_complete_path_matrix_jt, protein_complete_path_matrix_jt); } /* check return value */ if (rval == GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED) { /* statistics bookkeeping */ gth_stat_increment_numoffailedDPparameterallocations(stat); 
gth_stat_increment_numofundeterminedSAs(stat); /* free space */ gth_sa_delete(saA); match_info->call_number--; continue; /* continue with the next DP range */ } else if (rval) return -1; } if (!call_info->out->xmlout && !call_info->out->gff3out && !directmatches && !match_info->significant_match_found && match_info->call_number <= call_info->firstalshown) { show_no_match_line(gth_input_get_alphatype(input, ref_file_num), outfp); } return 0; }