/*
 * Prepare <sa> for a new alignment computation.
 *
 * Stores <gen_dp_start> and <gen_dp_length> in the backtrace path of <sa>,
 * puts score, coverage and cumulative scored exon length back to zero (with
 * the "highest coverage" flag set to true), resets the edit operations of the
 * backtrace path, stores <ref_alphatype> in it and finally resets the exon
 * and intron arrays.
 *
 * NOTE(review): the statement order is kept as is on purpose; whether
 * gth_backtrace_path_reset() touches the DP start/length set before it is not
 * visible from this function.
 */
void gth_sa_set(GthSA *sa, GthAlphatype ref_alphatype,
                GtUword gen_dp_start, GtUword gen_dp_length)
{
  /* genomic DP range of the backtrace path */
  gth_backtrace_path_set_gen_dp_start(sa->backtrace_path, gen_dp_start);
  gth_backtrace_path_set_gen_dp_length(sa->backtrace_path, gen_dp_length);

  /* scalar alignment statistics */
  gth_sa_set_score(sa, 0.0);
  gth_sa_set_coverage(sa, 0.0);
  gth_sa_set_highest_cov(sa, true);
  gth_sa_set_cumlen_scored_exons(sa, 0);

  /* edit operations and alphabet type of the reference */
  gth_backtrace_path_reset(sa->backtrace_path);
  gth_backtrace_path_set_alphatype(sa->backtrace_path, ref_alphatype);

  /* exon and intron information */
  gt_array_reset(sa->exons);
  gt_array_reset(sa->introns);
}
void gth_compute_scores(GthSA *sa, bool proteineop, GthDPParam *dp_param, void *dp_options_est, const unsigned char *gen_seq_tran, const unsigned char *ref_seq_tran, const unsigned char *ref_seq_orig, const GtTransTable *transtable, unsigned long gen_dp_start, unsigned long scoreminexonlen, bool introncutout, bool gs2out, GthSplicedSeq *spliced_seq, unsigned long ref_dp_length, GtAlphabet *gen_alphabet, GtAlphabet *ref_alphabet, GthDPScoresProtein *dp_scores_protein) { Traversealignmentfunctions travfunctions; Traversealignmentstate travstate; Computebordersandscoresdata data; GthFlt score, coverageofgenomicsegment, coverageofreferencesegment; gt_assert(!gth_sa_num_of_exons(sa)); gt_assert(!gth_sa_num_of_introns(sa)); travfunctions.processmismatch = computescoresprocmismatch; travfunctions.processdeletion = computescoresprocdeletion; travfunctions.processinsertion = computebordersandscoresprocinsertion; travfunctions.processmatch = computebordersandscoresprocmatch; travfunctions.processintron = computebordersandscoresprocintron; travfunctions.breakcondition = NULL; /* additional functions for protein edit operations */ travfunctions.processintron_with_1_base_left = computebordersandscoresprocintron; travfunctions.processintron_with_2_bases_left = computebordersandscoresprocintron; travfunctions.processmismatch_with_1_gap = computescoresprocmismatchordeletionwithgap; travfunctions.processmismatch_with_2_gaps = computescoresprocmismatchordeletionwithgap; travfunctions.processdeletion_with_1_gap = computescoresprocmismatchordeletionwithgap; travfunctions.processdeletion_with_2_gaps = computescoresprocmismatchordeletionwithgap; travstate.proteineop = proteineop; travstate.processing_intron_with_1_base_left = false; travstate.processing_intron_with_2_bases_left = false; travstate.alignment = gth_sa_get_editoperations(sa); travstate.alignmentlength = gth_sa_get_editoperations_length(sa); travstate.eopptr = travstate.alignment + travstate.alignmentlength - 1; 
travstate.genomicptr = gth_sa_genomiccutoff_start(sa); travstate.referenceptr = gth_sa_referencecutoff_start(sa); if (travstate.alignmentlength <= 0) { /* in this case the alignmentscore is set to 0, which leads to discarding this alignment later */ gth_sa_set_score(sa, 0.0); return; } /* editoperations contain no zero base exons */ gt_assert(gth_sa_contains_no_zero_base_exons(sa)); /* editoperations contain no leading or terminal introns or insertions */ gt_assert(containsnoleadingorterminalintronsorinsertions(travstate.alignment, travstate .alignmentlength, proteineop)); /* sum of edit operations equals referencelength */ gt_assert(gt_eops_equal_referencelength(travstate.alignment, travstate.alignmentlength, ref_dp_length - gth_sa_referencecutoff_start(sa) - gth_sa_referencecutoff_end(sa), proteineop)); data.proteineop = proteineop; data.newexon = true; data.newintron = true; data.firstexon = true; data.introncutout = introncutout; data.gs2out = gs2out; data.spliced_seq = spliced_seq; data.singleexonweight = (GthFlt) 0.0; data.maxsingleexonweight = (GthFlt) 0.0; data.overallexonweight = (GthFlt) 0.0; data.maxoverallexonweight = (GthFlt) 0.0; data.cumulativelengthofscoredexons = 0; data.exon.leftgenomicexonborder = GT_UNDEF_ULONG; data.exon.rightgenomicexonborder = GT_UNDEF_ULONG; data.exon.leftreferenceexonborder = GT_UNDEF_ULONG; data.exon.rightreferenceexonborder = GT_UNDEF_ULONG; data.exon.exonscore = GTH_UNDEF_GTHDBL; data.intron.donorsiteprobability = GTH_UNDEF_GTHFLT; data.intron.acceptorsiteprobability = GTH_UNDEF_GTHFLT; data.intron.donorsitescore = GTH_UNDEF_GTHDBL; data.intron.acceptorsitescore = GTH_UNDEF_GTHDBL; data.sa = sa; data.dp_param = dp_param; data.dp_options_est = dp_options_est; data.gen_seq_tran = gen_seq_tran; data.ref_seq_tran = ref_seq_tran; data.ref_seq_orig = ref_seq_orig; data.transtable = transtable; data.gen_dp_start = gen_dp_start; data.scoreminexonlen = scoreminexonlen; data.ref_dp_length = ref_dp_length; data.gen_alphabet = 
gen_alphabet; data.gen_alphabet_characters = gen_alphabet ? gt_alphabet_characters(gen_alphabet) : NULL; data.dp_scores_protein = dp_scores_protein; gthtraversealignment(true, &travstate, proteineop, &data, &travfunctions); /* this is for saving the last exon */ evalnewintronifpossible(proteineop, &data.newexon, &data.newintron, true, data.introncutout, data.gs2out, data.spliced_seq, &data.exon, &data.intron, &data.singleexonweight, &data.maxsingleexonweight, &data.overallexonweight, &data.maxoverallexonweight, &data.cumulativelengthofscoredexons, sa, &travstate, gen_alphabet, data.dp_param, data.dp_options_est, data.gen_seq_tran, data.ref_seq_tran, data.gen_dp_start, data.scoreminexonlen); /* saving the scores for the whole alignment */ if (data.maxoverallexonweight > 0.0) { score = data.overallexonweight / data.maxoverallexonweight; /* XXX: the way the alignmentscore is computed, it is possible to get a score > 1.0. Since we don't want this, we cap it */ if (score > 1.0) score = 1.0; } else score = 0.0; gth_sa_set_score(sa, score); gth_sa_set_cumlen_scored_exons(sa, data.cumulativelengthofscoredexons); /* fraction of the gen_dp_length which is scored/weighted */ coverageofgenomicsegment = (GthFlt) data.cumulativelengthofscoredexons / (GthFlt) gth_sa_gen_dp_length(sa); /* coverage of genomic segment is valid value */ gt_assert(coverageofgenomicsegment >= 0.0 && coverageofgenomicsegment <= 1.0); /* fraction of the referencelength which is scored/weighted */ coverageofreferencesegment = (GthFlt) data.cumulativelengthofscoredexons / (GthFlt) ((proteineop ? 
GT_CODON_LENGTH : 1) * gth_sa_ref_total_length(sa)); if (coverageofgenomicsegment > coverageofreferencesegment) { gth_sa_set_coverage(sa, coverageofgenomicsegment); gth_sa_set_highest_cov(sa, true); } else { gth_sa_set_coverage(sa, coverageofreferencesegment); gth_sa_set_highest_cov(sa, false); } /* test the assumption that the coverage is never larger then the default */ gt_assert(gth_sa_coverage(sa) <= GTH_DEFAULT_MAX_COVERAGE); /* compute poly(A) tail position */ gth_sa_calc_polyAtailpos(sa, ref_seq_tran, ref_alphabet); /* determined exons are forward and consecutive */ gt_assert(gth_sa_exons_are_forward_and_consecutive(sa)); }
static void end_element_handler(void *info, const XML_Char *name) { Parseinfo *parseinfo = (Parseinfo*) info; GthSA *sa = parseinfo->currentSA; GtUword datalength; double retdouble; GtWord ret; char *data; /* save data and data length */ data = gt_str_get(parseinfo->databuf); datalength = gt_str_length(parseinfo->databuf); /* perform actions depending on end tag */ if (strcmp(name, SPLICEDALIGNMENT_TAG) == 0) { /* before we store the spliced alignment we have to reverse its edit operations */ gt_assert(sa && gth_sa_backtrace_path(sa)); gth_backtrace_path_reverse(gth_sa_backtrace_path(sa)); /* ensure that before an intron which is not in phase the edit operation has length 1 (only for protein spliced alignments) */ gth_backtrace_path_ensure_length_1_before_introns( gth_sa_backtrace_path(sa)); if (parseinfo->saprocessfunc(parseinfo->data , sa, parseinfo->outputfilename, parseinfo->err)) { /* XXX */ fprintf(stderr, "error: %s\n", gt_error_get(parseinfo->err)); exit(EXIT_FAILURE); } /* reset current spliced alignment */ parseinfo->currentSA = NULL; } else if (strcmp(name, REFERENCEALPHATYPE_TAG) == 0) { if (strcmp(data, "DNA_ALPHA") == 0) gth_sa_set_alphatype(sa, DNA_ALPHA); else if (strcmp(data, "PROTEIN_ALPHA") == 0) { gth_sa_set_alphatype(sa, PROTEIN_ALPHA); } else { ILLEGAL_DATA; } } else if (strcmp(name, DNA_EOP_TYPE_TAG) == 0) { if (strcmp(data, "match") == 0) parseinfo->eoptype = EOP_TYPE_MATCH; else if (strcmp(data, "deletion") == 0) parseinfo->eoptype = EOP_TYPE_DELETION; else if (strcmp(data, "insertion") == 0) parseinfo->eoptype = EOP_TYPE_INSERTION; else if (strcmp(data, "mismatch") == 0) parseinfo->eoptype = EOP_TYPE_MISMATCH; else if (strcmp(data, "intron") == 0) parseinfo->eoptype = EOP_TYPE_INTRON; else { ILLEGAL_DATA; } } else if (strcmp(name, DNA_EOP_LENGTH_TAG) == 0) { SCANUINT; gth_backtrace_path_add_eop(gth_sa_backtrace_path(sa), parseinfo->eoptype, ret); } else if (strcmp(name, PROTEIN_EOP_TYPE_TAG) == 0) { if (strcmp(data, "match") == 0) 
parseinfo->eoptype = EOP_TYPE_MATCH; else if (strcmp(data, "deletion") == 0) parseinfo->eoptype = EOP_TYPE_DELETION; else if (strcmp(data, "insertion") == 0) parseinfo->eoptype = EOP_TYPE_INSERTION; else if (strcmp(data, "mismatch") == 0) parseinfo->eoptype = EOP_TYPE_MISMATCH; else if (strcmp(data, "intron") == 0) parseinfo->eoptype = EOP_TYPE_INTRON; else if (strcmp(data, "mismatch_with_1_gap") == 0) parseinfo->eoptype = EOP_TYPE_MISMATCH_WITH_1_GAP; else if (strcmp(data, "mismatch_with_2_gaps") == 0) parseinfo->eoptype = EOP_TYPE_MISMATCH_WITH_2_GAPS; else if (strcmp(data, "deletion_with_1_gap") == 0) parseinfo->eoptype = EOP_TYPE_DELETION_WITH_1_GAP; else if (strcmp(data, "deletion_with_2_gaps") == 0) parseinfo->eoptype = EOP_TYPE_DELETION_WITH_2_GAPS; else if (strcmp(data, "intron_with_1_base_left") == 0) parseinfo->eoptype = EOP_TYPE_INTRON_WITH_1_BASE_LEFT; else if (strcmp(data, "intron_with_2_bases_left") == 0) parseinfo->eoptype = EOP_TYPE_INTRON_WITH_2_BASES_LEFT; else { ILLEGAL_DATA; } } else if (strcmp(name, PROTEIN_EOP_LENGTH_TAG) == 0) { SCANUINT; gth_backtrace_path_add_eop(gth_sa_backtrace_path(sa), parseinfo->eoptype, ret); } else if (strcmp(name, INDELCOUNT_TAG) == 0) { SCANUINT; /* ignore indelcount, gets recomputed anyway */ } else if (strcmp(name, GENOMICLENGTHDP_TAG) == 0) { SCANUINT; gth_sa_set_gen_dp_length(sa, ret); } else if (strcmp(name, GENOMICLENGTHTOTAL_TAG) == 0) { SCANUINT; gth_sa_set_gen_total_length(sa, ret); } else if (strcmp(name, GENOMICOFFSET_TAG) == 0) { SCANUINT; gth_sa_set_gen_offset(sa, ret); } else if (strcmp(name, REFERENCELENGTH_TAG) == 0) { SCANUINT; gth_sa_set_ref_total_length(sa, ret); } else if (strcmp(name, DPSTARTPOS_TAG) == 0) { SCANUINT; gth_sa_set_gen_dp_start(sa, ret); } else if (strcmp(name, DPENDPOS_TAG) == 0) { SCANUINT; /* ignore DP end pos, gets recomputed from gen_dp_length anyway */ gt_assert(gth_sa_gen_dp_end(sa) == ret); } else if (strcmp(name, GENOMICFILENAME_TAG) == 0) { /* save genomic file name */ 
gt_str_append_cstr_nt(parseinfo->genomicfilename, data, datalength); } else if (strcmp(name, GENOMICFILEHASH_TAG) == 0) { gth_sa_set_gen_file_num(sa, process_file(parseinfo->input, gt_str_get(parseinfo->genomicfilename), data, false, UNDEF_ALPHA)); /* reset genomic filename */ gt_str_reset(parseinfo->genomicfilename); } else if (strcmp(name, GENOMICSEQNUM_TAG) == 0) { SCANUINT; gth_sa_set_gen_seq_num(sa, ret); } else if (strcmp(name, REFERENCEFILENAME_TAG) == 0) { /* save reference file name */ gt_str_append_cstr_nt(parseinfo->referencefilename, data, datalength); } else if (strcmp(name, REFERENCEFILEHASH_TAG) == 0) { gth_sa_set_ref_file_num(sa, process_file(parseinfo->input, gt_str_get(parseinfo->referencefilename), data, true, gth_sa_alphatype(sa))); /* reset reference filename */ gt_str_reset(parseinfo->referencefilename); } else if (strcmp(name, REFERENCESEQNUM_TAG) == 0) { SCANUINT; gth_sa_set_ref_seq_num(sa, ret); } else if (strcmp(name, GENOMICID_TAG) == 0) gth_sa_set_gen_id(sa, data); else if (strcmp(name, REFERENCEID_TAG) == 0) gth_sa_set_ref_id(sa, data); else if (strcmp(name, GENOMICSTRANDISFORWARD_TAG) == 0) gth_sa_set_gen_strand(sa, parse_boolean(data, parseinfo)); else if (strcmp(name, REFERENCESTRANDISFORWARD_TAG) == 0) gth_sa_set_ref_strand(sa, parse_boolean(data, parseinfo)); else if (strcmp(name, GENOMICCUTOFF_TAG) == 0) { SCANUINT; parseinfo->cutoffs.genomiccutoff = ret; } else if (strcmp(name, REFERENCECUTOFF_TAG) == 0) { SCANUINT; parseinfo->cutoffs.referencecutoff = ret; } else if (strcmp(name, EOPCUTOFF_TAG) == 0) { SCANUINT; parseinfo->cutoffs.eopcutoff = ret; } else if (strcmp(name, CUTOFFSSTART_TAG) == 0) gth_sa_set_cutoffs_start(sa, &parseinfo->cutoffs); else if (strcmp(name, CUTOFFSEND_TAG) == 0) gth_sa_set_cutoffs_end(sa, &parseinfo->cutoffs); else if (strcmp(name, LEFTGENOMICEXONBORDER_TAG) == 0) { SCANUINT; parseinfo->exoninfo.leftgenomicexonborder = ret; } else if (strcmp(name, RIGHTGENOMICEXONBORDER_TAG) == 0) { SCANUINT; 
parseinfo->exoninfo.rightgenomicexonborder = ret; } else if (strcmp(name, LEFTREFERENCEEXONBORDER_TAG) == 0) { SCANUINT; parseinfo->exoninfo.leftreferenceexonborder = ret; } else if (strcmp(name, RIGHTREFERENCEEXONBORDER_TAG) == 0) { SCANUINT; parseinfo->exoninfo.rightreferenceexonborder = ret; } else if (strcmp(name, EXONSCORE_TAG) == 0) { SCANDOUBLE; parseinfo->exoninfo.exonscore = retdouble; } else if (strcmp(name, EXONINFO_TAG) == 0) gth_sa_add_exon(sa, &parseinfo->exoninfo); else if (strcmp(name, DONORSITEPROBABILITY_TAG) == 0) { SCANDOUBLE; parseinfo->introninfo.donorsiteprobability = (GthFlt) retdouble; } else if (strcmp(name, ACCEPTORSITEPROBABILITY_TAG) == 0) { SCANDOUBLE; parseinfo->introninfo.acceptorsiteprobability = (GthFlt) retdouble; } else if (strcmp(name, DONORSITESCORE_TAG) == 0) { SCANDOUBLE; parseinfo->introninfo.donorsitescore = retdouble; } else if (strcmp(name, ACCEPTORSITESCORE_TAG) == 0) { SCANDOUBLE; parseinfo->introninfo.acceptorsitescore = retdouble; } else if (strcmp(name, INTRONINFO_TAG) == 0) gth_sa_add_intron(sa, &parseinfo->introninfo); else if (strcmp(name, POLYASTART_TAG) == 0) { SCANUINT; gth_sa_set_polyAtail_start(sa, ret); } else if (strcmp(name, POLYAEND_TAG) == 0) { SCANUINT; gth_sa_set_polyAtail_stop(sa, ret); } else if (strcmp(name, ALIGNMENTSCORE_TAG) == 0) { SCANDOUBLE; gth_sa_set_score(sa, retdouble); } else if (strcmp(name, COVERAGE_TAG) == 0) { SCANDOUBLE; gth_sa_set_coverage(sa, retdouble); } else if (strcmp(name, COVERAGEOFGENOMICSEGMENTISHIGHEST_TAG) == 0) { gth_sa_set_highest_cov(sa, parse_boolean(data, parseinfo)); } else if (strcmp(name, CUMULATIVELENGTHOFSCOREDEXONS_TAG) == 0) { SCANUINT; gth_sa_set_cumlen_scored_exons(sa, ret); } }