/* Writes the XML counterpart of the "classic" GeneSeqer2 MATCH line for <sa>
   to <outfp>: a <MATCH_line> element (indented by <indentlevel>) carrying the
   genomic/reference ids and strands as attributes, with the total alignment
   score, the cumulative length of scored exons and the coverage as child
   elements one level deeper. */
static void xml_showmatchline(GthSA *sa, unsigned int indentlevel,
                              GtFile *outfp)
{
  const unsigned int childlevel = indentlevel + 1;

  /* opening tag; the reference strand attribute is only emitted for DNA
     references */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<MATCH_line gen_id=\"%s\" gen_strand=\"%c\" ",
                  gth_sa_gen_id(sa),
                  gth_sa_gen_strand_char(sa));
  if (gth_sa_alphatype(sa) != DNA_ALPHA)
    gt_file_xprintf(outfp, "ref_id=\"%s\">\n", gth_sa_ref_id(sa));
  else {
    gt_file_xprintf(outfp, "ref_id=\"%s\" ref_strand=\"%c\">\n",
                    gth_sa_ref_id(sa),
                    gth_sa_ref_strand_char(sa));
  }

  /* child elements */
  gth_indent(outfp, childlevel);
  gt_file_xprintf(outfp,
                  "<total_alignment_score>%.3f</total_alignment_score>\n",
                  gth_sa_score(sa));

  gth_indent(outfp, childlevel);
  gt_file_xprintf(outfp,
                  "<cumulative_length_of_scored_exons>%lu"
                  "</cumulative_length_of_scored_exons>\n",
                  gth_sa_cumlen_scored_exons(sa));

  gth_indent(outfp, childlevel);
  gt_file_xprintf(outfp, "<coverage percentage=\"%.3f\" high_type=\"",
                  gth_sa_coverage(sa));
  gt_file_xfputc(gth_sa_coverage_char(sa), outfp);
  gt_file_xprintf(outfp, "\"/>\n");

  /* closing tag, back on the level of the opening tag */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</MATCH_line>\n");
}
/* Returns true if <sa> has to be filtered out (i.e., discarded) according to
   <sa_filter>, false if it passes. An alignment is filtered out as soon as
   its score or its coverage lies outside of the [min, max] interval stored in
   the filter. */
bool gth_sa_filter_filter_sa(const GthSAFilter *sa_filter, GthSA *sa)
{
  gt_assert(sa_filter && sa);

  /* sanity checks: score and coverage must lie within the default bounds */
  gt_assert(gth_sa_score(sa) >= GTH_DEFAULT_MIN_ALIGNMENTSCORE);
  gt_assert(gth_sa_score(sa) <= GTH_DEFAULT_MAX_ALIGNMENTSCORE);
  gt_assert(gth_sa_coverage(sa) >= GTH_DEFAULT_MIN_COVERAGE);
  gt_assert(gth_sa_coverage(sa) <= GTH_DEFAULT_MAX_COVERAGE);

  /* alignment score out of range? */
  if (gth_sa_score(sa) < sa_filter->min_alignmentscore)
    return true;
  if (gth_sa_score(sa) > sa_filter->max_alignmentscore)
    return true;

  /* coverage out of range? */
  if (gth_sa_coverage(sa) < sa_filter->min_coverage)
    return true;
  if (gth_sa_coverage(sa) > sa_filter->max_coverage)
    return true;

  /* everything within bounds -> keep the alignment */
  return false;
}
/* Prints the "classic" GeneSeqer2 MATCH line for <sa> to <outfp>: a single
   tab-separated line consisting of the tag MATCH, genomic id + strand char,
   reference id + strand char, alignment score, cumulative length of scored
   exons, coverage and coverage char. */
static void showmatchline(GthSA *sa, GtFile *outfp)
{
  gt_file_xprintf(outfp,
                  "MATCH\t"
                  "%s%c\t"          /* genomic id and strand */
                  "%s%c\t"          /* reference id and strand */
                  "%5.3f\t"         /* alignment score */
                  GT_WU"\t"         /* cumulative length of scored exons */
                  "%5.3f\t"         /* coverage */
                  "%c\n",           /* coverage char */
                  gth_sa_gen_id(sa), gth_sa_gen_strand_char(sa),
                  gth_sa_ref_id(sa), gth_sa_ref_strand_char(sa),
                  gth_sa_score(sa),
                  gth_sa_cumlen_scored_exons(sa),
                  gth_sa_coverage(sa),
                  gth_sa_coverage_char(sa));
}
void gth_compute_scores(GthSA *sa, bool proteineop, GthDPParam *dp_param, void *dp_options_est, const unsigned char *gen_seq_tran, const unsigned char *ref_seq_tran, const unsigned char *ref_seq_orig, const GtTransTable *transtable, unsigned long gen_dp_start, unsigned long scoreminexonlen, bool introncutout, bool gs2out, GthSplicedSeq *spliced_seq, unsigned long ref_dp_length, GtAlphabet *gen_alphabet, GtAlphabet *ref_alphabet, GthDPScoresProtein *dp_scores_protein) { Traversealignmentfunctions travfunctions; Traversealignmentstate travstate; Computebordersandscoresdata data; GthFlt score, coverageofgenomicsegment, coverageofreferencesegment; gt_assert(!gth_sa_num_of_exons(sa)); gt_assert(!gth_sa_num_of_introns(sa)); travfunctions.processmismatch = computescoresprocmismatch; travfunctions.processdeletion = computescoresprocdeletion; travfunctions.processinsertion = computebordersandscoresprocinsertion; travfunctions.processmatch = computebordersandscoresprocmatch; travfunctions.processintron = computebordersandscoresprocintron; travfunctions.breakcondition = NULL; /* additional functions for protein edit operations */ travfunctions.processintron_with_1_base_left = computebordersandscoresprocintron; travfunctions.processintron_with_2_bases_left = computebordersandscoresprocintron; travfunctions.processmismatch_with_1_gap = computescoresprocmismatchordeletionwithgap; travfunctions.processmismatch_with_2_gaps = computescoresprocmismatchordeletionwithgap; travfunctions.processdeletion_with_1_gap = computescoresprocmismatchordeletionwithgap; travfunctions.processdeletion_with_2_gaps = computescoresprocmismatchordeletionwithgap; travstate.proteineop = proteineop; travstate.processing_intron_with_1_base_left = false; travstate.processing_intron_with_2_bases_left = false; travstate.alignment = gth_sa_get_editoperations(sa); travstate.alignmentlength = gth_sa_get_editoperations_length(sa); travstate.eopptr = travstate.alignment + travstate.alignmentlength - 1; 
travstate.genomicptr = gth_sa_genomiccutoff_start(sa); travstate.referenceptr = gth_sa_referencecutoff_start(sa); if (travstate.alignmentlength <= 0) { /* in this case the alignmentscore is set to 0, which leads to discarding this alignment later */ gth_sa_set_score(sa, 0.0); return; } /* editoperations contain no zero base exons */ gt_assert(gth_sa_contains_no_zero_base_exons(sa)); /* editoperations contain no leading or terminal introns or insertions */ gt_assert(containsnoleadingorterminalintronsorinsertions(travstate.alignment, travstate .alignmentlength, proteineop)); /* sum of edit operations equals referencelength */ gt_assert(gt_eops_equal_referencelength(travstate.alignment, travstate.alignmentlength, ref_dp_length - gth_sa_referencecutoff_start(sa) - gth_sa_referencecutoff_end(sa), proteineop)); data.proteineop = proteineop; data.newexon = true; data.newintron = true; data.firstexon = true; data.introncutout = introncutout; data.gs2out = gs2out; data.spliced_seq = spliced_seq; data.singleexonweight = (GthFlt) 0.0; data.maxsingleexonweight = (GthFlt) 0.0; data.overallexonweight = (GthFlt) 0.0; data.maxoverallexonweight = (GthFlt) 0.0; data.cumulativelengthofscoredexons = 0; data.exon.leftgenomicexonborder = GT_UNDEF_ULONG; data.exon.rightgenomicexonborder = GT_UNDEF_ULONG; data.exon.leftreferenceexonborder = GT_UNDEF_ULONG; data.exon.rightreferenceexonborder = GT_UNDEF_ULONG; data.exon.exonscore = GTH_UNDEF_GTHDBL; data.intron.donorsiteprobability = GTH_UNDEF_GTHFLT; data.intron.acceptorsiteprobability = GTH_UNDEF_GTHFLT; data.intron.donorsitescore = GTH_UNDEF_GTHDBL; data.intron.acceptorsitescore = GTH_UNDEF_GTHDBL; data.sa = sa; data.dp_param = dp_param; data.dp_options_est = dp_options_est; data.gen_seq_tran = gen_seq_tran; data.ref_seq_tran = ref_seq_tran; data.ref_seq_orig = ref_seq_orig; data.transtable = transtable; data.gen_dp_start = gen_dp_start; data.scoreminexonlen = scoreminexonlen; data.ref_dp_length = ref_dp_length; data.gen_alphabet = 
gen_alphabet; data.gen_alphabet_characters = gen_alphabet ? gt_alphabet_characters(gen_alphabet) : NULL; data.dp_scores_protein = dp_scores_protein; gthtraversealignment(true, &travstate, proteineop, &data, &travfunctions); /* this is for saving the last exon */ evalnewintronifpossible(proteineop, &data.newexon, &data.newintron, true, data.introncutout, data.gs2out, data.spliced_seq, &data.exon, &data.intron, &data.singleexonweight, &data.maxsingleexonweight, &data.overallexonweight, &data.maxoverallexonweight, &data.cumulativelengthofscoredexons, sa, &travstate, gen_alphabet, data.dp_param, data.dp_options_est, data.gen_seq_tran, data.ref_seq_tran, data.gen_dp_start, data.scoreminexonlen); /* saving the scores for the whole alignment */ if (data.maxoverallexonweight > 0.0) { score = data.overallexonweight / data.maxoverallexonweight; /* XXX: the way the alignmentscore is computed, it is possible to get a score > 1.0. Since we don't want this, we cap it */ if (score > 1.0) score = 1.0; } else score = 0.0; gth_sa_set_score(sa, score); gth_sa_set_cumlen_scored_exons(sa, data.cumulativelengthofscoredexons); /* fraction of the gen_dp_length which is scored/weighted */ coverageofgenomicsegment = (GthFlt) data.cumulativelengthofscoredexons / (GthFlt) gth_sa_gen_dp_length(sa); /* coverage of genomic segment is valid value */ gt_assert(coverageofgenomicsegment >= 0.0 && coverageofgenomicsegment <= 1.0); /* fraction of the referencelength which is scored/weighted */ coverageofreferencesegment = (GthFlt) data.cumulativelengthofscoredexons / (GthFlt) ((proteineop ? 
GT_CODON_LENGTH : 1) * gth_sa_ref_total_length(sa)); if (coverageofgenomicsegment > coverageofreferencesegment) { gth_sa_set_coverage(sa, coverageofgenomicsegment); gth_sa_set_highest_cov(sa, true); } else { gth_sa_set_coverage(sa, coverageofreferencesegment); gth_sa_set_highest_cov(sa, false); } /* test the assumption that the coverage is never larger then the default */ gt_assert(gth_sa_coverage(sa) <= GTH_DEFAULT_MAX_COVERAGE); /* compute poly(A) tail position */ gth_sa_calc_polyAtailpos(sa, ref_seq_tran, ref_alphabet); /* determined exons are forward and consecutive */ gt_assert(gth_sa_exons_are_forward_and_consecutive(sa)); }
/* Writes the intermediate XML representation of <sa> to <outfp>: a
   <spliced_alignment> element at <indentlevel> whose children (reference
   alphabet type, edit operations, lengths, offsets, file names, sequence
   numbers, ids, strands, cutoffs, exons, introns, poly(A) tail position,
   score and coverage values) are written one level deeper. <input> is handed
   to the helpers which show the genomic and reference file names. */
static void xml_inter_show_spliced_alignment(GthSA *sa, GthInput *input,
                                             unsigned int indentlevel,
                                             GtFile *outfp)
{
  const unsigned int fieldlevel = indentlevel + 1, /* children of the root */
                     eoplevel   = indentlevel + 2; /* edit operation entries */
  bool dnaalpha = true;

  /* root element */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp,
                  "<spliced_alignment xmlns=\"http://www.GenomeThreader.org/"
                  "SplicedAlignment/spliced_alignment/\">\n");

  /* alphabet type of the reference sequence */
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp, "<referencealphatype>");
  switch (gth_sa_alphatype(sa)) {
    case DNA_ALPHA:
      gt_file_xprintf(outfp, "DNA_ALPHA");
      break;
    case PROTEIN_ALPHA:
      gt_file_xprintf(outfp, "PROTEIN_ALPHA");
      dnaalpha = false;
      break;
    default: gt_assert(0);
  }
  gt_file_xprintf(outfp, "</referencealphatype>\n");

  /* edit operations */
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp, "<editoperations>\n");
  gth_backtrace_path_show_complete(gth_sa_backtrace_path(sa), true, eoplevel,
                                   outfp);
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp, "</editoperations>\n");

  /* lengths, offsets and DP positions */
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp, "<indelcount>"GT_WU"</indelcount>\n",
                  gth_sa_indelcount(sa));
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp, "<genomiclengthDP>"GT_WU"</genomiclengthDP>\n",
                  gth_sa_gen_dp_length(sa));
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp, "<genomiclengthtotal>"GT_WU"</genomiclengthtotal>\n",
                  gth_sa_gen_total_length(sa));
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp, "<genomicoffset>"GT_WU"</genomicoffset>\n",
                  gth_sa_gen_offset(sa));
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp, "<referencelength>"GT_WU"</referencelength>\n",
                  gth_sa_ref_total_length(sa));
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp, "<dpstartpos>"GT_WU"</dpstartpos>\n",
                  gth_sa_gen_dp_start(sa));
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp, "<dpendpos>"GT_WU"</dpendpos>\n",
                  gth_sa_gen_dp_end(sa));

  /* file names and sequence numbers */
  showgenomicfilename(sa, input, fieldlevel, outfp);
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp, "<genomicseqnum>"GT_WU"</genomicseqnum>\n",
                  gth_sa_gen_seq_num(sa));
  showreferencefilename(sa, input, fieldlevel, outfp);
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp, "<referenceseqnum>"GT_WU"</referenceseqnum>\n",
                  gth_sa_ref_seq_num(sa));

  /* ids and strands */
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp, "<genomicid>%s</genomicid>\n", gth_sa_gen_id(sa));
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp, "<referenceid>%s</referenceid>\n",
                  gth_sa_ref_id(sa));
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp,
                  "<genomicstrandisforward>%s</genomicstrandisforward>\n",
                  GTH_SHOWBOOL(gth_sa_gen_strand_forward(sa)));
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp,
                  "<referencestrandisforward>%s</referencestrandisforward>\n",
                  GTH_SHOWBOOL(gth_sa_ref_strand_forward(sa)));

  /* structure of the alignment */
  showalignmentcutoffs(sa, fieldlevel, outfp);
  showexons(sa, fieldlevel, outfp);
  showintrons(sa, dnaalpha, fieldlevel, outfp);
  showpolyAtailpos(sa, fieldlevel, outfp);

  /* score and coverage values */
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp, "<alignmentscore>%.*f</alignmentscore>\n", PRECISION,
                  gth_sa_score(sa));
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp, "<coverage>%.*f</coverage>\n", PRECISION,
                  gth_sa_coverage(sa));
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp,
                  "<coverageofgenomicsegmentishighest>%s"
                  "</coverageofgenomicsegmentishighest>\n",
                  GTH_SHOWBOOL(gth_sa_genomic_cov_is_highest(sa)));
  gth_indent(outfp, fieldlevel);
  gt_file_xprintf(outfp,
                  "<cumulativelengthofscoredexons>"GT_WU""
                  "</cumulativelengthofscoredexons>\n",
                  gth_sa_cumlen_scored_exons(sa));

  /* close the root element */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</spliced_alignment>\n");
}
/* Callback which stores the spliced alignment <sa> (read from
   <outputfilename>) in the subset file it belongs to.

   <data> must point to a Store_in_subset_file_data structure. The alignment
   is first run through the filter stored there; filtered alignments are
   deleted and 0 is returned. Otherwise the subset file is determined from the
   alignment score or the coverage (depending on the split mode) and the split
   range, the file is opened on first use (after writing the XML leader), and
   <sa> is written to it via the stored visitor.

   Takes ownership of <sa>: it is deleted on every path.

   Returns 0 on success and -1 (with <err> set) if the subset file exists
   already and overwriting was not forced. */
static int store_in_subset_file(void *data, GthSA *sa,
                                const char *outputfilename, GtError *err)
{
  Store_in_subset_file_data *ssfd = data;
  double split_determing_percentage = 0.0;
  unsigned long filenum;
  char filenamesuffix[4] = ""; /* "scr" or "cov" plus terminating '\0' */
  int had_err = 0;

  gt_error_check(err);

  /* filter before we do any further processing */
  if (gth_sa_filter_filter_sa(ssfd->sa_filter, sa)) {
    /* and free it afterwards */
    gth_sa_delete(sa);
    /* discard */
    return 0;
  }

  /* check whether we got a new output file to process */
  if (!ssfd->current_outputfilename)
    ssfd->current_outputfilename = gt_cstr_dup(outputfilename);
  else if (strcmp(ssfd->current_outputfilename, outputfilename)) {
    /* close current output files */
    close_output_files(ssfd);
    /* remember the new name; leaving the freed pointer in place would make
       the next call compare against (and later free) released memory */
    gt_free(ssfd->current_outputfilename);
    ssfd->current_outputfilename = gt_cstr_dup(outputfilename);
  }

  /* determine in which file the current sa needs to be put */
  switch (ssfd->gthsplitinfo->splitmode) {
    case ALIGNMENTSCORE_SPLIT:
      split_determing_percentage = gth_sa_score(sa);
      strcpy(filenamesuffix, "scr");
      break;
    case COVERAGE_SPLIT:
      split_determing_percentage = gth_sa_coverage(sa);
      strcpy(filenamesuffix, "cov");
      break;
    default: gt_assert(0);
  }
  gt_assert(split_determing_percentage >= 0.0);
  /* XXX: change into an assertion when coverage problem is fixed */
  if (split_determing_percentage > 1.0)
    split_determing_percentage = 1.0;

  /* a value of exactly 1.0 goes into the last file instead of opening a new
     range */
  if (split_determing_percentage == 1.0)
    filenum = ssfd->num_of_subset_files - 1;
  else {
    filenum = floor(split_determing_percentage * 100.0 /
                    ssfd->gthsplitinfo->range);
  }
  gt_assert(filenum < ssfd->num_of_subset_files);

  /* make sure the file exists and is open */
  if (!ssfd->subset_files[filenum]) {
    gt_assert(ssfd->subset_filenames[filenum] == NULL);

    /* build the name:
       <basename>.<suffix><range start>-<range end><file mode suffix> */
    ssfd->subset_filenames[filenum] = gt_str_new();
    gt_str_append_cstr_nt(ssfd->subset_filenames[filenum], outputfilename,
                          gt_file_basename_length(outputfilename));
    gt_str_append_char(ssfd->subset_filenames[filenum], '.');
    gt_str_append_cstr(ssfd->subset_filenames[filenum], filenamesuffix);
    gt_str_append_ulong(ssfd->subset_filenames[filenum],
                        filenum * ssfd->gthsplitinfo->range);
    gt_str_append_char(ssfd->subset_filenames[filenum], '-');
    gt_str_append_ulong(ssfd->subset_filenames[filenum],
                        (filenum + 1) * ssfd->gthsplitinfo->range);
    gt_str_append_cstr(ssfd->subset_filenames[filenum],
                       gt_file_mode_suffix(ssfd->gthsplitinfo->file_mode));

    /* if not disabled by -force, check if file already exists */
    if (!ssfd->gthsplitinfo->force) {
      /* NOTE(review): if the probe succeeds, the read-mode handle stays in
         subset_files[filenum]; presumably it is released by
         close_output_files() -- confirm */
      ssfd->subset_files[filenum] =
        gt_file_open(ssfd->gthsplitinfo->file_mode,
                     gt_str_get(ssfd->subset_filenames[filenum]), "r", NULL);
      if (ssfd->subset_files[filenum]) {
        gt_error_set(err, "file \"%s\" exists already. use option -%s to "
                     "overwrite",
                     gt_str_get(ssfd->subset_filenames[filenum]),
                     GT_FORCE_OPT_CSTR);
        had_err = -1;
      }
    }
    if (!had_err) {
      /* open split file for writing */
      ssfd->subset_files[filenum] =
        gt_file_xopen_file_mode(ssfd->gthsplitinfo->file_mode,
                                gt_str_get(ssfd->subset_filenames[filenum]),
                                "w");
      /* store XML header in file */
      gth_xml_show_leader(true, ssfd->subset_files[filenum]);
    }
  }

  if (!had_err) {
    /* put it there */
    gth_xml_inter_sa_visitor_set_outfp(ssfd->sa_visitor,
                                       ssfd->subset_files[filenum]);
    gth_sa_visitor_visit_sa(ssfd->sa_visitor, sa);
    /* adjust counter */
    ssfd->subset_file_sa_counter[filenum]++;
  }

  /* and free it afterwards */
  gth_sa_delete(sa);

  return had_err;
}