/* Append <sa> to the PGL stored at position <currentPGLindex> of <pgls>.
   Before the SA is added, the PGL's maxrange is widened (if necessary) so
   that it reaches from the left genomic border of the first exon of <sa> to
   the right genomic border of its last exon.
   <currentPGLindex> must be defined (i.e., differ from GT_UNDEF_UWORD). */
static void storeSAincurrentPGL(GtArray *pgls, GtUword currentPGLindex,
                                GthSA *sa)
{
  GthPGL *pgl;
  GtUword sa_start, sa_end;

  gt_assert(currentPGLindex != GT_UNDEF_UWORD);
  pgl = *(GthPGL**) gt_array_get(pgls, currentPGLindex);

  /* genomic span covered by the SA: first exon start .. last exon end */
  sa_start = gth_sa_get_exon(sa, 0)->leftgenomicexonborder;
  sa_end = gth_sa_get_exon(sa, gth_sa_num_of_exons(sa) - 1)
             ->rightgenomicexonborder;

  /* grow maxrange in either direction, never shrink it */
  if (pgl->maxrange.start > sa_start)
    pgl->maxrange.start = sa_start;
  if (pgl->maxrange.end < sa_end)
    pgl->maxrange.end = sa_end;

  gth_pgl_add_sa(pgl, sa);
}
/* Add the genomic length of every exon of <sa> to <exondistribution>.
   The length is computed as right border - left border + 1. */
static void addSAtoexondistribution(GtDiscDistri *exondistribution, GthSA *sa)
{
  unsigned long exonidx;

  for (exonidx = 0; exonidx < gth_sa_num_of_exons(sa); exonidx++) {
    Exoninfo *exon = gth_sa_get_exon(sa, exonidx);
    gt_disc_distri_add(exondistribution,
                       exon->rightgenomicexonborder
                       - exon->leftgenomicexonborder + 1);
  }
}
/* Write a <supporting_evidence> XML element to <outfp>, indented by
   <indentlevel>. It contains one <PGS_line> element per spliced alignment in
   <alignments>. Each <PGS_line> holds the genomic exon coordinates of the
   alignment, followed by a <referenceDNA> element (id and strand) if the
   alignment's alphabet type is DNA_ALPHA, or a <referenceProtein> element
   (id only) otherwise.
   The exon borders are printed with GT_WU (and the counters are GtUword),
   matching how the plain-text PGS output in this file prints the very same
   accessors; "%lu" is only correct where GtUword happens to be
   unsigned long. */
static void xml_outputPGSlines(GtArray *alignments, unsigned int indentlevel,
                               GtFile *outfp)
{
  GtUword i, j;
  GthSA *sa;

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<supporting_evidence xmlns=\""
                  "http://www.genomethreader.org/"
                  "GTH_output/PGL_module/predicted_gene_location/"
                  "AGS_information/supporting_evidence/\">\n");
  indentlevel++;

  for (i = 0; i < gt_array_size(alignments); i++) {
    sa = *(GthSA**) gt_array_get(alignments, i);

    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<PGS_line>\n");
    indentlevel++;

    /* genomic exon coordinates */
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<gDNA_exon_coordinates>\n");
    indentlevel++;
    for (j = 0; j < gth_sa_num_of_exons(sa); j++) {
      gth_indent(outfp, indentlevel);
      gt_file_xprintf(outfp,
                      "<exon start=\"" GT_WU "\" stop=\"" GT_WU "\"/>\n",
                      gth_sa_left_genomic_exon_border(sa, j),
                      gth_sa_right_genomic_exon_border(sa, j));
    }
    indentlevel--;
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "</gDNA_exon_coordinates>\n");

    /* reference sequence; the strand is only shown for DNA references */
    gth_indent(outfp, indentlevel);
    if (gth_sa_alphatype(sa) == DNA_ALPHA) {
      gt_file_xprintf(outfp, "<referenceDNA id=\"%s\" strand=\"%c\"/>\n",
                      gth_sa_ref_id(sa), gth_sa_ref_strand_char(sa));
    }
    else {
      gt_file_xprintf(outfp, "<referenceProtein id=\"%s\"/>\n",
                      gth_sa_ref_id(sa));
    }

    indentlevel--;
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "</PGS_line>\n");
  }

  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</supporting_evidence>\n");
}
/* Create a new PGL for <sa>, append it to <pgls> and store its array position
   in <*currentPGLindex>.
   The PGL is created with the genomic strand orientation of <sa>; its
   maxrange is initialized to the span from the left genomic border of the
   first exon to the right genomic border of the last exon of <sa>. */
static void storeSAinnewPGL(GtArray *pgls, GtUword *currentPGLindex,
                            GthSA *sa)
{
  GthPGL *newpgl = gth_pgl_new(gth_sa_gen_strand_forward(sa));

  newpgl->maxrange.start = gth_sa_get_exon(sa, 0)->leftgenomicexonborder;
  newpgl->maxrange.end = gth_sa_get_exon(sa, gth_sa_num_of_exons(sa) - 1)
                           ->rightgenomicexonborder;
  gth_pgl_add_sa(newpgl, sa);

  gt_array_add(pgls, newpgl);

  /* the PGL just added is the last array element */
  *currentPGLindex = gt_array_size(pgls) - 1;
}
/* Print the "classic" GeneSeqer2 PGS line for <sa> to <outfp>: the genomic
   and reference ids, each followed by its strand character, then the
   comma-separated genomic exon borders enclosed in parentheses and an empty
   line. */
static void showpgsline(GthSA *sa, GtFile *outfp)
{
  GtUword exonidx, numofexons;

  gt_assert(sa);
  numofexons = gth_sa_num_of_exons(sa);

  gt_file_xprintf(outfp, "PGS_%s%c_%s%c\t(",
                  gth_sa_gen_id(sa), gth_sa_gen_strand_char(sa),
                  gth_sa_ref_id(sa), gth_sa_ref_strand_char(sa));

  for (exonidx = 0; exonidx < numofexons; exonidx++) {
    /* exons are separated by commas */
    if (exonidx > 0)
      gt_file_xfputc(',', outfp);
    gt_file_xprintf(outfp, GT_WU " " GT_WU,
                    gth_sa_left_genomic_exon_border(sa, exonidx),
                    gth_sa_right_genomic_exon_border(sa, exonidx));
  }

  /* the line is only closed if at least one exon has been shown */
  if (numofexons > 0)
    gt_file_xprintf(outfp, ")\n\n");
}
/* Print one PGS line per spliced alignment in <alignments> to <outfp>: the
   comma-separated genomic exon borders in parentheses, a tab, and the
   reference id directly followed by the reference strand character.
   A single newline is written after the last line. */
static void outputPGSlines(GtArray *alignments, GtFile *outfp)
{
  GtUword saidx, exonidx;

  for (saidx = 0; saidx < gt_array_size(alignments); saidx++) {
    GthSA *sa = *(GthSA**) gt_array_get(alignments, saidx);

    gt_file_xprintf(outfp, " PGS (");
    for (exonidx = 0; exonidx < gth_sa_num_of_exons(sa); exonidx++) {
      /* separator goes in front of every exon but the first */
      if (exonidx != 0)
        gt_file_xfputc(',', outfp);
      gt_file_xprintf(outfp, GT_WU " " GT_WU,
                      gth_sa_left_genomic_exon_border(sa, exonidx),
                      gth_sa_right_genomic_exon_border(sa, exonidx));
    }
    gt_file_xprintf(outfp, ")\t%s%c\n", gth_sa_ref_id(sa),
                    gth_sa_ref_strand_char(sa));
  }

  gt_file_xfputc('\n', outfp);
}
/* Print the PGS line of <sa> as a <PGS_line> XML element to <outfp>, indented
   by <indentlevel>. The element contains
   - a <gDNA> element with the genomic id and strand,
   - an <rDNA> element (id and strand) if the alphabet type of <sa> is
     DNA_ALPHA, an <rProt> element (id only) otherwise, and
   - a <gDNA_exon_coordinates> element with one <exon> element per exon.
   The exon borders are printed with GT_WU (and the counters are GtUword),
   matching the plain-text showpgsline(); "%lu" is only correct where GtUword
   happens to be unsigned long. */
static void xml_showpgsline(GthSA *sa, unsigned int indentlevel,
                            GtFile *outfp)
{
  GtUword i, numofexons;

  gt_assert(sa);
  numofexons = gth_sa_num_of_exons(sa);

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<PGS_line>\n");
  indentlevel++;

  /* genomic sequence */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<gDNA gen_id=\"%s\" gen_strand=\"%c\"/>\n",
                  gth_sa_gen_id(sa), gth_sa_gen_strand_char(sa));

  /* reference sequence; the strand is only shown for DNA references */
  gth_indent(outfp, indentlevel);
  if (gth_sa_alphatype(sa) == DNA_ALPHA) {
    gt_file_xprintf(outfp, "<rDNA rDNA_id=\"%s\" rDNA_strand=\"%c\"/>\n",
                    gth_sa_ref_id(sa), gth_sa_ref_strand_char(sa));
  }
  else {
    gt_file_xprintf(outfp, "<rProt rProt_id=\"%s\"/>\n", gth_sa_ref_id(sa));
  }

  /* genomic exon coordinates */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<gDNA_exon_coordinates>\n");
  indentlevel++;
  for (i = 0; i < numofexons; i++) {
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp,
                    "<exon e_start=\"" GT_WU "\" e_stop=\"" GT_WU "\"/>\n",
                    gth_sa_left_genomic_exon_border(sa, i),
                    gth_sa_right_genomic_exon_border(sa, i));
  }
  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</gDNA_exon_coordinates>\n");

  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</PGS_line>\n");
}
/* Write one <exoninfo> XML element per exon of <sa> to <outfp>.
   The <exoninfo> tags are indented by <indentlevel>, the contained fields
   (genomic and reference borders, exon score) by one level more. The borders
   are printed exactly as stored in the Exoninfo; the score is printed with
   PRECISION decimal places. */
static void showexons(GthSA *sa, unsigned int indentlevel, GtFile *outfp)
{
  const unsigned int fieldlevel = indentlevel + 1;
  GtUword exonidx;

  for (exonidx = 0; exonidx < gth_sa_num_of_exons(sa); exonidx++) {
    Exoninfo *exon = gth_sa_get_exon(sa, exonidx);

    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<exoninfo>\n");

    gth_indent(outfp, fieldlevel);
    gt_file_xprintf(outfp,
                    "<leftgenomicexonborder>" GT_WU
                    "</leftgenomicexonborder>\n",
                    exon->leftgenomicexonborder);

    gth_indent(outfp, fieldlevel);
    gt_file_xprintf(outfp,
                    "<rightgenomicexonborder>" GT_WU
                    "</rightgenomicexonborder>\n",
                    exon->rightgenomicexonborder);

    gth_indent(outfp, fieldlevel);
    gt_file_xprintf(outfp,
                    "<leftreferenceexonborder>" GT_WU
                    "</leftreferenceexonborder>\n",
                    exon->leftreferenceexonborder);

    gth_indent(outfp, fieldlevel);
    gt_file_xprintf(outfp,
                    "<rightreferenceexonborder>" GT_WU
                    "</rightreferenceexonborder>\n",
                    exon->rightreferenceexonborder);

    gth_indent(outfp, fieldlevel);
    gt_file_xprintf(outfp, "<exonscore>%.*f</exonscore>\n", PRECISION,
                    exon->exonscore);

    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "</exoninfo>\n");
  }
}
void gth_compute_scores(GthSA *sa, bool proteineop, GthDPParam *dp_param, void *dp_options_est, const unsigned char *gen_seq_tran, const unsigned char *ref_seq_tran, const unsigned char *ref_seq_orig, const GtTransTable *transtable, unsigned long gen_dp_start, unsigned long scoreminexonlen, bool introncutout, bool gs2out, GthSplicedSeq *spliced_seq, unsigned long ref_dp_length, GtAlphabet *gen_alphabet, GtAlphabet *ref_alphabet, GthDPScoresProtein *dp_scores_protein) { Traversealignmentfunctions travfunctions; Traversealignmentstate travstate; Computebordersandscoresdata data; GthFlt score, coverageofgenomicsegment, coverageofreferencesegment; gt_assert(!gth_sa_num_of_exons(sa)); gt_assert(!gth_sa_num_of_introns(sa)); travfunctions.processmismatch = computescoresprocmismatch; travfunctions.processdeletion = computescoresprocdeletion; travfunctions.processinsertion = computebordersandscoresprocinsertion; travfunctions.processmatch = computebordersandscoresprocmatch; travfunctions.processintron = computebordersandscoresprocintron; travfunctions.breakcondition = NULL; /* additional functions for protein edit operations */ travfunctions.processintron_with_1_base_left = computebordersandscoresprocintron; travfunctions.processintron_with_2_bases_left = computebordersandscoresprocintron; travfunctions.processmismatch_with_1_gap = computescoresprocmismatchordeletionwithgap; travfunctions.processmismatch_with_2_gaps = computescoresprocmismatchordeletionwithgap; travfunctions.processdeletion_with_1_gap = computescoresprocmismatchordeletionwithgap; travfunctions.processdeletion_with_2_gaps = computescoresprocmismatchordeletionwithgap; travstate.proteineop = proteineop; travstate.processing_intron_with_1_base_left = false; travstate.processing_intron_with_2_bases_left = false; travstate.alignment = gth_sa_get_editoperations(sa); travstate.alignmentlength = gth_sa_get_editoperations_length(sa); travstate.eopptr = travstate.alignment + travstate.alignmentlength - 1; 
travstate.genomicptr = gth_sa_genomiccutoff_start(sa); travstate.referenceptr = gth_sa_referencecutoff_start(sa); if (travstate.alignmentlength <= 0) { /* in this case the alignmentscore is set to 0, which leads to discarding this alignment later */ gth_sa_set_score(sa, 0.0); return; } /* editoperations contain no zero base exons */ gt_assert(gth_sa_contains_no_zero_base_exons(sa)); /* editoperations contain no leading or terminal introns or insertions */ gt_assert(containsnoleadingorterminalintronsorinsertions(travstate.alignment, travstate .alignmentlength, proteineop)); /* sum of edit operations equals referencelength */ gt_assert(gt_eops_equal_referencelength(travstate.alignment, travstate.alignmentlength, ref_dp_length - gth_sa_referencecutoff_start(sa) - gth_sa_referencecutoff_end(sa), proteineop)); data.proteineop = proteineop; data.newexon = true; data.newintron = true; data.firstexon = true; data.introncutout = introncutout; data.gs2out = gs2out; data.spliced_seq = spliced_seq; data.singleexonweight = (GthFlt) 0.0; data.maxsingleexonweight = (GthFlt) 0.0; data.overallexonweight = (GthFlt) 0.0; data.maxoverallexonweight = (GthFlt) 0.0; data.cumulativelengthofscoredexons = 0; data.exon.leftgenomicexonborder = GT_UNDEF_ULONG; data.exon.rightgenomicexonborder = GT_UNDEF_ULONG; data.exon.leftreferenceexonborder = GT_UNDEF_ULONG; data.exon.rightreferenceexonborder = GT_UNDEF_ULONG; data.exon.exonscore = GTH_UNDEF_GTHDBL; data.intron.donorsiteprobability = GTH_UNDEF_GTHFLT; data.intron.acceptorsiteprobability = GTH_UNDEF_GTHFLT; data.intron.donorsitescore = GTH_UNDEF_GTHDBL; data.intron.acceptorsitescore = GTH_UNDEF_GTHDBL; data.sa = sa; data.dp_param = dp_param; data.dp_options_est = dp_options_est; data.gen_seq_tran = gen_seq_tran; data.ref_seq_tran = ref_seq_tran; data.ref_seq_orig = ref_seq_orig; data.transtable = transtable; data.gen_dp_start = gen_dp_start; data.scoreminexonlen = scoreminexonlen; data.ref_dp_length = ref_dp_length; data.gen_alphabet = 
gen_alphabet; data.gen_alphabet_characters = gen_alphabet ? gt_alphabet_characters(gen_alphabet) : NULL; data.dp_scores_protein = dp_scores_protein; gthtraversealignment(true, &travstate, proteineop, &data, &travfunctions); /* this is for saving the last exon */ evalnewintronifpossible(proteineop, &data.newexon, &data.newintron, true, data.introncutout, data.gs2out, data.spliced_seq, &data.exon, &data.intron, &data.singleexonweight, &data.maxsingleexonweight, &data.overallexonweight, &data.maxoverallexonweight, &data.cumulativelengthofscoredexons, sa, &travstate, gen_alphabet, data.dp_param, data.dp_options_est, data.gen_seq_tran, data.ref_seq_tran, data.gen_dp_start, data.scoreminexonlen); /* saving the scores for the whole alignment */ if (data.maxoverallexonweight > 0.0) { score = data.overallexonweight / data.maxoverallexonweight; /* XXX: the way the alignmentscore is computed, it is possible to get a score > 1.0. Since we don't want this, we cap it */ if (score > 1.0) score = 1.0; } else score = 0.0; gth_sa_set_score(sa, score); gth_sa_set_cumlen_scored_exons(sa, data.cumulativelengthofscoredexons); /* fraction of the gen_dp_length which is scored/weighted */ coverageofgenomicsegment = (GthFlt) data.cumulativelengthofscoredexons / (GthFlt) gth_sa_gen_dp_length(sa); /* coverage of genomic segment is valid value */ gt_assert(coverageofgenomicsegment >= 0.0 && coverageofgenomicsegment <= 1.0); /* fraction of the referencelength which is scored/weighted */ coverageofreferencesegment = (GthFlt) data.cumulativelengthofscoredexons / (GthFlt) ((proteineop ? 
GT_CODON_LENGTH : 1) * gth_sa_ref_total_length(sa)); if (coverageofgenomicsegment > coverageofreferencesegment) { gth_sa_set_coverage(sa, coverageofgenomicsegment); gth_sa_set_highest_cov(sa, true); } else { gth_sa_set_coverage(sa, coverageofreferencesegment); gth_sa_set_highest_cov(sa, false); } /* test the assumption that the coverage is never larger then the default */ gt_assert(gth_sa_coverage(sa) <= GTH_DEFAULT_MAX_COVERAGE); /* compute poly(A) tail position */ gth_sa_calc_polyAtailpos(sa, ref_seq_tran, ref_alphabet); /* determined exons are forward and consecutive */ gt_assert(gth_sa_exons_are_forward_and_consecutive(sa)); }
/* Print the XML version of the alignment header of <sa> to <outfp>, starting
   at <indentlevel>.
   The opening <predicted_gene_structure> tag is written, followed by an
   <exon-intron_info> element which contains, in order, one <exon> element
   per exon and, in front of every exon but the first, an <intron> element
   for the preceding intron. Donor and acceptor scores are only shown if the
   alphabet type of <sa> is DNA_ALPHA. An intron whose length is less than or
   equal to <minintronlength> gets an additional
   <shorter_than_min_intron_len/> tag.
   Afterwards the PPA line (DNA_ALPHA only), the MATCH line and the PGS line
   are shown. NOTE: the <predicted_gene_structure> element is not closed in
   this function.
   Positions, lengths and serial numbers are GtUword values and are printed
   with GT_WU, matching the plain-text showalignmentheader(); "%lu" is only
   correct where GtUword happens to be unsigned long. */
static void xml_showalignmentheader(GthSA *sa, unsigned long minintronlength,
                                    unsigned int indentlevel, GtFile *outfp)
{
  GtUword i, leftreferenceexonborder, rightreferenceexonborder,
          referenceexonlength;
  GthDbl exonscore, donorsitescore, acceptorsitescore;
  GthFlt donorsiteprobability, acceptorsiteprobability;
  Exoninfo *exoninfo;
  Introninfo *introninfo;

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<predicted_gene_structure>\n");
  indentlevel++;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<exon-intron_info>\n");
  indentlevel++;

  for (i = 0; i < gth_sa_num_of_exons(sa); i++) {
    exoninfo = gth_sa_get_exon(sa, i);
    leftreferenceexonborder  = exoninfo->leftreferenceexonborder;
    rightreferenceexonborder = exoninfo->rightreferenceexonborder;
    referenceexonlength      = rightreferenceexonborder
                               - leftreferenceexonborder + 1;
    exonscore                = exoninfo->exonscore;

    /* every exon but the first is preceded by the intron in front of it */
    if (i > 0) {
      introninfo = gth_sa_get_intron(sa, i-1);
      donorsiteprobability    = introninfo->donorsiteprobability;
      donorsitescore          = introninfo->donorsitescore;
      acceptorsiteprobability = introninfo->acceptorsiteprobability;
      acceptorsitescore       = introninfo->acceptorsitescore;

      gth_indent(outfp, indentlevel);
      gt_file_xprintf(outfp, "<intron i_serial=\"" GT_WU "\">\n",
                      i - 1 + OUTPUTOFFSET);
      indentlevel++;
      gth_indent(outfp, indentlevel);
      gt_file_xprintf(outfp,
                      "<gDNA_intron_boundary i_start=\"" GT_WU "\" i_stop=\""
                      GT_WU "\" i_length=\"" GT_WU "\">\n",
                      gth_sa_left_intron_border(sa, i-1),
                      gth_sa_right_intron_border(sa, i-1),
                      gth_sa_intron_length(sa, i-1));
      indentlevel++;

      /* splice site scores are only shown for DNA references */
      gth_indent(outfp, indentlevel);
      gt_file_xprintf(outfp, "<donor d_prob=\"%.3f\"", donorsiteprobability);
      if (gth_sa_alphatype(sa) == DNA_ALPHA)
        gt_file_xprintf(outfp, " d_score=\"%.2f\"", donorsitescore);
      gt_file_xprintf(outfp, "/>\n");

      gth_indent(outfp, indentlevel);
      gt_file_xprintf(outfp, "<acceptor a_prob=\"%.3f\"",
                      acceptorsiteprobability);
      if (gth_sa_alphatype(sa) == DNA_ALPHA)
        gt_file_xprintf(outfp, " a_score=\"%.2f\"", acceptorsitescore);
      gt_file_xprintf(outfp, "/>\n");

      indentlevel--;
      gth_indent(outfp, indentlevel);
      gt_file_xprintf(outfp, "</gDNA_intron_boundary>\n");

      /* if the intron is shorter or equal than the minimal intron length an
         additional tag is shown */
      if (gth_sa_intron_length(sa, i-1) <= minintronlength) {
        gth_indent(outfp, indentlevel);
        gt_file_xprintf(outfp, "<shorter_than_min_intron_len/>\n");
      }

      indentlevel--;
      gth_indent(outfp, indentlevel);
      gt_file_xprintf(outfp, "</intron>\n");
    }

    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<exon e_serial=\"" GT_WU "\">\n",
                    i + OUTPUTOFFSET);
    indentlevel++;
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp,
                    "<gDNA_exon_boundary g_start=\"" GT_WU "\" g_stop=\""
                    GT_WU "\" g_length=\"" GT_WU "\"/>\n",
                    gth_sa_left_genomic_exon_border(sa, i),
                    gth_sa_right_genomic_exon_border(sa, i),
                    gth_sa_genomic_exon_length(sa, i));
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp,
                    "<reference_exon_boundary r_type=\"%s\" r_start=\"" GT_WU
                    "\" r_stop=\"" GT_WU "\" r_length=\"" GT_WU
                    "\" r_score=\"%5.3f\"/>\n",
                    gth_sa_alphastring(sa),
                    leftreferenceexonborder + OUTPUTOFFSET,
                    rightreferenceexonborder + OUTPUTOFFSET,
                    referenceexonlength,
                    exonscore);
    indentlevel--;
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "</exon>\n");
  }

  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</exon-intron_info>\n");

  /* showing PPA line (if a poly-A tail was determined) */
  if (gth_sa_alphatype(sa) == DNA_ALPHA)
    xml_showppaline(sa, indentlevel, outfp);

  /* showing MATCH line */
  xml_showmatchline(sa, indentlevel, outfp);

  /* showing PGS line */
  xml_showpgsline(sa, indentlevel, outfp);
}
/* Print the plain-text alignment header of <sa> to <outfp>.
   After the "Predicted gene structure" headline (which includes the shown
   gDNA segment borders if <gs2out> is set), one "Exon" line is printed per
   exon; every exon but the first is preceded by an "Intron" line for the
   intron in front of it. Genomic positions are printed with a field width of
   <widthforgenpos>. Splice site scores are only shown if the alphabet type
   of <sa> is DNA_ALPHA. An intron whose length is less than or equal to
   <minintronlength> is marked with two question marks.
   Afterwards the PPA line (DNA_ALPHA only), an empty line, the MATCH line
   and the PGS line are shown. */
static void showalignmentheader(GthSA *sa, bool gs2out, int widthforgenpos,
                                GtUword minintronlength, GtFile *outfp)
{
  GtUword exonidx;

  /* headline */
  gt_file_xprintf(outfp, "Predicted gene structure");
  if (!gs2out)
    gt_file_xprintf(outfp, ":\n");
  else {
    gt_file_xprintf(outfp, " (within gDNA segment " GT_WU " to " GT_WU "):\n",
                    gth_sa_gen_dp_start_show(sa), gth_sa_gen_dp_end_show(sa));
  }
  gt_file_xfputc('\n', outfp);

  for (exonidx = 0; exonidx < gth_sa_num_of_exons(sa); exonidx++) {
    Exoninfo *exon = gth_sa_get_exon(sa, exonidx);
    GtUword refleft = exon->leftreferenceexonborder,
            refright = exon->rightreferenceexonborder,
            reflength = refright - refleft + 1;
    GthDbl exonscore = exon->exonscore;

    /* the intron in front of this exon (there is none for the first exon) */
    if (exonidx > 0) {
      GtUword intronidx = exonidx - 1;
      Introninfo *intron = gth_sa_get_intron(sa, intronidx);
      GthFlt donorprob = intron->donorsiteprobability,
             acceptorprob = intron->acceptorsiteprobability;
      GthDbl donorscore = intron->donorsitescore,
             acceptorscore = intron->acceptorsitescore;

      gt_file_xprintf(outfp,
                      " Intron %2" GT_WUS " %*" GT_WUS " %*" GT_WUS
                      " (%4" GT_WUS " n); ",
                      intronidx + OUTPUTOFFSET,
                      widthforgenpos, gth_sa_left_intron_border(sa, intronidx),
                      widthforgenpos,
                      gth_sa_right_intron_border(sa, intronidx),
                      gth_sa_intron_length(sa, intronidx));

      /* donor site */
      gt_file_xprintf(outfp, "Pd: %5.3f ", donorprob);
      if (gth_sa_alphatype(sa) != DNA_ALPHA)
        gt_file_xprintf(outfp, " ");
      else if (donorscore == 0.0)
        gt_file_xprintf(outfp, "(s: 0), ");
      else
        gt_file_xprintf(outfp, "(s: %4.2f), ", donorscore);

      /* acceptor site */
      gt_file_xprintf(outfp, "Pa: %5.3f ", acceptorprob);
      if (gth_sa_alphatype(sa) == DNA_ALPHA) {
        if (acceptorscore == 0.0)
          gt_file_xprintf(outfp, "(s: 0)");
        else
          gt_file_xprintf(outfp, "(s: %4.2f)", acceptorscore);
      }

      /* an intron which is not longer than the minimum intron length is
         marked with two question marks at the end of the line */
      if (gth_sa_intron_length(sa, intronidx) <= minintronlength)
        gt_file_xprintf(outfp, " ??");
      gt_file_xfputc('\n', outfp);
    }

    /* the exon itself */
    gt_file_xprintf(outfp,
                    " Exon %2" GT_WUS " %*" GT_WUS " %*" GT_WUS
                    " (%4" GT_WUS " n); %s %6" GT_WUS " %6" GT_WUS
                    " (%4" GT_WUS " %s); score: %5.3f\n",
                    exonidx + OUTPUTOFFSET,
                    widthforgenpos,
                    gth_sa_left_genomic_exon_border(sa, exonidx),
                    widthforgenpos,
                    gth_sa_right_genomic_exon_border(sa, exonidx),
                    gth_sa_genomic_exon_length(sa, exonidx),
                    gth_sa_alphastring(sa),
                    refleft + OUTPUTOFFSET,
                    refright + OUTPUTOFFSET,
                    reflength,
                    gth_sa_alphatype(sa) == DNA_ALPHA ? "n" : "aa",
                    exonscore);
  }

  /* PPA line (only for DNA references) */
  if (gth_sa_alphatype(sa) == DNA_ALPHA)
    showppaline(sa, outfp);
  gt_file_xfputc('\n', outfp);

  /* MATCH line */
  showmatchline(sa, outfp);

  /* PGS line */
  showpgsline(sa, outfp);
}