/* Write the plain-text reference header of the spliced alignment <sa> to
   <outfp>: the kind of reference ("EST Sequence" or "Protein Sequence"), the
   name of the reference file, the strand (DNA references only) and the
   sequence description. If <showseqnums> is true, the reference sequence
   number is appended. The header line is terminated and followed by one
   empty line. */
static void showgthreferenceinformation(GthSA *sa, GthInput *input,
                                        bool showseqnums, GtFile *outfp)
{
  gt_assert(gth_sa_ref_file_num(sa) != GT_UNDEF_UWORD);

  if (gth_sa_alphatype(sa) == DNA_ALPHA) {
    gt_file_xprintf(outfp,
                    "EST Sequence: file=%s, strand=%c, description=",
                    gth_input_get_reference_filename(input,
                                                     gth_sa_ref_file_num(sa)),
                    gth_sa_ref_strand_char(sa));
  }
  else if (gth_sa_alphatype(sa) == PROTEIN_ALPHA) {
    gt_file_xprintf(outfp,
                    "Protein Sequence: file=%s, description=",
                    gth_input_get_reference_filename(input,
                                                     gth_sa_ref_file_num(sa)));
  }
  else {
    /* only DNA and protein references are handled */
    gt_assert(0);
  }

  /* the description completes the "description=" field written above */
  gth_sa_echo_reference_description(sa, input, outfp);

  if (showseqnums)
    gt_file_xprintf(outfp, ", seqnum="GT_WU"", gth_sa_ref_seq_num(sa));

  /* end of the header line plus one empty line */
  gt_file_xfputc('\n', outfp);
  gt_file_xfputc('\n', outfp);
}
/* Write the start tag of the <reference> XML element for the reference
   sequence of <sa> to <outfp>, indented by <indentlevel>. The tag carries
   the reference file name, the reference id, the strand (DNA references
   only) and the sequence description as attributes. Only the start tag is
   written here; the matching </reference> has to be emitted by the caller. */
static void xml_showgthreferenceinformation(GthSA *sa, GthInput *input,
                                            unsigned int indentlevel,
                                            GtFile *outfp)
{
  gt_assert(gth_sa_ref_file_num(sa) != GT_UNDEF_ULONG);

  gth_indent(outfp, indentlevel);

  if (gth_sa_alphatype(sa) == DNA_ALPHA) {
    gt_file_xprintf(outfp,
                    "<reference ref_file=\"%s\" ref_id=\"%s\" "
                    "ref_strand=\"%c\" ref_description=\"",
                    gth_input_get_reference_filename(input,
                                                     gth_sa_ref_file_num(sa)),
                    gth_sa_ref_id(sa),
                    gth_sa_ref_strand_char(sa));
  }
  else if (gth_sa_alphatype(sa) == PROTEIN_ALPHA) {
    gt_file_xprintf(outfp,
                    "<reference ref_file=\"%s\" ref_id=\"%s\" "
                    "ref_description=\"",
                    gth_input_get_reference_filename(input,
                                                     gth_sa_ref_file_num(sa)),
                    gth_sa_ref_id(sa));
  }
  else {
    /* only DNA and protein references are handled */
    gt_assert(0);
  }

  /* value of the ref_description attribute opened above */
  gth_input_echo_reference_description(input, gth_sa_ref_file_num(sa),
                                       gth_sa_ref_seq_num(sa), outfp);

  /* close the attribute and the start tag */
  gt_file_xprintf(outfp, "\">\n");
}
/* The following function prints the "classic" GeneSeqer2 MATCH line as a
   <MATCH_line> XML element. The element is written to <outfp> at
   <indentlevel>; its children (total alignment score, cumulative length of
   scored exons and coverage) are written one level deeper. The reference
   strand attribute is only shown for DNA references. */
static void xml_showmatchline(GthSA *sa, unsigned int indentlevel,
                              GtFile *outfp)
{
  const unsigned int child_indent = indentlevel + 1;

  /* start tag */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<MATCH_line gen_id=\"%s\" gen_strand=\"%c\" ",
                  gth_sa_gen_id(sa), gth_sa_gen_strand_char(sa));
  if (gth_sa_alphatype(sa) != DNA_ALPHA)
    gt_file_xprintf(outfp, "ref_id=\"%s\">\n", gth_sa_ref_id(sa));
  else {
    gt_file_xprintf(outfp, "ref_id=\"%s\" ref_strand=\"%c\">\n",
                    gth_sa_ref_id(sa), gth_sa_ref_strand_char(sa));
  }

  /* children */
  gth_indent(outfp, child_indent);
  gt_file_xprintf(outfp,
                  "<total_alignment_score>%.3f</total_alignment_score>\n",
                  gth_sa_score(sa));

  gth_indent(outfp, child_indent);
  gt_file_xprintf(outfp,
                  "<cumulative_length_of_scored_exons>%lu"
                  "</cumulative_length_of_scored_exons>\n",
                  gth_sa_cumlen_scored_exons(sa));

  gth_indent(outfp, child_indent);
  gt_file_xprintf(outfp, "<coverage percentage=\"%.3f\" high_type=\"",
                  gth_sa_coverage(sa));
  gt_file_xfputc(gth_sa_coverage_char(sa), outfp);
  gt_file_xprintf(outfp, "\"/>\n");

  /* end tag */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</MATCH_line>\n");
}
/* Return the one-letter code describing which coverage of <sa> is reported
   as the highest one: 'G' if the genomic coverage is the highest, otherwise
   'C' for DNA references and 'P' for all other references. */
char gth_sa_coverage_char(const GthSA *sa)
{
  gt_assert(sa);

  if (gth_sa_genomic_cov_is_highest(sa))
    return 'G';

  /* the reference coverage is the highest one */
  return gth_sa_alphatype(sa) == DNA_ALPHA ? 'C' : 'P';
}
/* Write the <supporting_evidence> XML element for all spliced alignments
   stored in <alignments> (an array of GthSA pointers) to <outfp>, starting
   at <indentlevel>. For every alignment one <PGS_line> element is written
   which contains the genomic exon coordinates and the id of the reference
   (plus its strand for DNA references). */
static void xml_outputPGSlines(GtArray *alignments, unsigned int indentlevel,
                               GtFile *outfp)
{
  const unsigned int pgs_indent = indentlevel + 1,    /* <PGS_line> */
                     member_indent = indentlevel + 2, /* its children */
                     exon_indent = indentlevel + 3;   /* <exon/> entries */
  unsigned long sa_idx, exon_idx;
  GthSA *sa;

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<supporting_evidence xmlns=\""
                  "http://www.genomethreader.org/"
                  "GTH_output/PGL_module/predicted_gene_location/"
                  "AGS_information/supporting_evidence/\">\n");

  for (sa_idx = 0; sa_idx < gt_array_size(alignments); sa_idx++) {
    sa = *(GthSA**) gt_array_get(alignments, sa_idx);

    gth_indent(outfp, pgs_indent);
    gt_file_xprintf(outfp, "<PGS_line>\n");

    /* genomic exon coordinates */
    gth_indent(outfp, member_indent);
    gt_file_xprintf(outfp, "<gDNA_exon_coordinates>\n");
    for (exon_idx = 0; exon_idx < gth_sa_num_of_exons(sa); exon_idx++) {
      gth_indent(outfp, exon_indent);
      gt_file_xprintf(outfp, "<exon start=\"%lu\" stop=\"%lu\"/>\n",
                      gth_sa_left_genomic_exon_border(sa, exon_idx),
                      gth_sa_right_genomic_exon_border(sa, exon_idx));
    }
    gth_indent(outfp, member_indent);
    gt_file_xprintf(outfp, "</gDNA_exon_coordinates>\n");

    /* reference the alignment was computed with */
    gth_indent(outfp, member_indent);
    if (gth_sa_alphatype(sa) != DNA_ALPHA) {
      gt_file_xprintf(outfp, "<referenceProtein id=\"%s\"/>\n",
                      gth_sa_ref_id(sa));
    }
    else {
      gt_file_xprintf(outfp, "<referenceDNA id=\"%s\" strand=\"%c\"/>\n",
                      gth_sa_ref_id(sa), gth_sa_ref_strand_char(sa));
    }

    gth_indent(outfp, pgs_indent);
    gt_file_xprintf(outfp, "</PGS_line>\n");
  }

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</supporting_evidence>\n");
}
/* The following function prints the "classic" GeneSeqer2 PGS line as a
   <PGS_line> XML element. The element is written to <outfp> at
   <indentlevel> and contains the genomic id and strand, the reference id
   (plus strand for DNA references) and the genomic coordinates of all exons
   of <sa>. */
static void xml_showpgsline(GthSA *sa, unsigned int indentlevel,
                            GtFile *outfp)
{
  const unsigned int member_indent = indentlevel + 1, /* children */
                     exon_indent = indentlevel + 2;   /* <exon/> entries */
  unsigned long exon_idx, numofexons;

  gt_assert(sa);
  numofexons = gth_sa_num_of_exons(sa);

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<PGS_line>\n");

  /* genomic sequence */
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<gDNA gen_id=\"%s\" gen_strand=\"%c\"/>\n",
                  gth_sa_gen_id(sa), gth_sa_gen_strand_char(sa));

  /* reference sequence */
  gth_indent(outfp, member_indent);
  if (gth_sa_alphatype(sa) != DNA_ALPHA)
    gt_file_xprintf(outfp, "<rProt rProt_id=\"%s\"/>\n", gth_sa_ref_id(sa));
  else {
    gt_file_xprintf(outfp, "<rDNA rDNA_id=\"%s\" rDNA_strand=\"%c\"/>\n",
                    gth_sa_ref_id(sa), gth_sa_ref_strand_char(sa));
  }

  /* genomic exon coordinates */
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<gDNA_exon_coordinates>\n");
  for (exon_idx = 0; exon_idx < numofexons; exon_idx++) {
    gth_indent(outfp, exon_indent);
    gt_file_xprintf(outfp, "<exon e_start=\"%lu\" e_stop=\"%lu\"/>\n",
                    gth_sa_left_genomic_exon_border(sa, exon_idx),
                    gth_sa_right_genomic_exon_border(sa, exon_idx));
  }
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "</gDNA_exon_coordinates>\n");

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</PGS_line>\n");
}
/* End-tag callback of the XML parser used to read spliced alignments.
   <info> is the Parseinfo state of the running parse, <name> the name of the
   element which has just been closed. The character data collected for the
   element is taken from parseinfo->databuf and, depending on <name>, is
   either stored in the spliced alignment currently under construction
   (parseinfo->currentSA) or buffered in <parseinfo> (edit operation type,
   cutoffs, exon info, intron info, file names) until the enclosing element
   is closed.

   NOTE(review): SCANUINT, SCANDOUBLE and ILLEGAL_DATA are macros defined
   outside this block. From their use here SCANUINT presumably parses <data>
   into <ret>, SCANDOUBLE into <retdouble>, and ILLEGAL_DATA reports
   unexpected element content -- confirm against their definitions before
   renaming any local variable or restructuring this function. */
static void end_element_handler(void *info, const XML_Char *name)
{
  Parseinfo *parseinfo = (Parseinfo*) info;
  GthSA *sa = parseinfo->currentSA;
  GtUword datalength;
  double retdouble;
  GtWord ret;
  char *data;

  /* save data and data length */
  data = gt_str_get(parseinfo->databuf);
  datalength = gt_str_length(parseinfo->databuf);

  /* perform actions depending on end tag */
  if (strcmp(name, SPLICEDALIGNMENT_TAG) == 0) {
    /* before we store the spliced alignment we have to reverse its edit
       operations */
    gt_assert(sa && gth_sa_backtrace_path(sa));
    gth_backtrace_path_reverse(gth_sa_backtrace_path(sa));

    /* ensure that before an intron which is not in phase the edit operation
       has length 1 (only for protein spliced alignments) */
    gth_backtrace_path_ensure_length_1_before_introns(
                                                 gth_sa_backtrace_path(sa));

    /* hand the completed spliced alignment over to the processing callback;
       a non-zero return value is treated as a fatal error */
    if (parseinfo->saprocessfunc(parseinfo->data , sa,
                                 parseinfo->outputfilename, parseinfo->err)) {
      /* XXX: the error is not propagated to the caller, the process is
         terminated right here */
      fprintf(stderr, "error: %s\n", gt_error_get(parseinfo->err));
      exit(EXIT_FAILURE);
    }
    /* reset current spliced alignment */
    parseinfo->currentSA = NULL;
  }
  else if (strcmp(name, REFERENCEALPHATYPE_TAG) == 0) {
    if (strcmp(data, "DNA_ALPHA") == 0)
      gth_sa_set_alphatype(sa, DNA_ALPHA);
    else if (strcmp(data, "PROTEIN_ALPHA") == 0) {
      gth_sa_set_alphatype(sa, PROTEIN_ALPHA);
    }
    else {
      ILLEGAL_DATA;
    }
  }
  else if (strcmp(name, DNA_EOP_TYPE_TAG) == 0) {
    /* remember the edit operation type, it is added to the backtrace path
       when the corresponding length element is closed */
    if (strcmp(data, "match") == 0)
      parseinfo->eoptype = EOP_TYPE_MATCH;
    else if (strcmp(data, "deletion") == 0)
      parseinfo->eoptype = EOP_TYPE_DELETION;
    else if (strcmp(data, "insertion") == 0)
      parseinfo->eoptype = EOP_TYPE_INSERTION;
    else if (strcmp(data, "mismatch") == 0)
      parseinfo->eoptype = EOP_TYPE_MISMATCH;
    else if (strcmp(data, "intron") == 0)
      parseinfo->eoptype = EOP_TYPE_INTRON;
    else {
      ILLEGAL_DATA;
    }
  }
  else if (strcmp(name, DNA_EOP_LENGTH_TAG) == 0) {
    SCANUINT;
    gth_backtrace_path_add_eop(gth_sa_backtrace_path(sa), parseinfo->eoptype,
                               ret);
  }
  else if (strcmp(name, PROTEIN_EOP_TYPE_TAG) == 0) {
    /* same as for DNA, plus the edit operation types which are specific to
       protein spliced alignments */
    if (strcmp(data, "match") == 0)
      parseinfo->eoptype = EOP_TYPE_MATCH;
    else if (strcmp(data, "deletion") == 0)
      parseinfo->eoptype = EOP_TYPE_DELETION;
    else if (strcmp(data, "insertion") == 0)
      parseinfo->eoptype = EOP_TYPE_INSERTION;
    else if (strcmp(data, "mismatch") == 0)
      parseinfo->eoptype = EOP_TYPE_MISMATCH;
    else if (strcmp(data, "intron") == 0)
      parseinfo->eoptype = EOP_TYPE_INTRON;
    else if (strcmp(data, "mismatch_with_1_gap") == 0)
      parseinfo->eoptype = EOP_TYPE_MISMATCH_WITH_1_GAP;
    else if (strcmp(data, "mismatch_with_2_gaps") == 0)
      parseinfo->eoptype = EOP_TYPE_MISMATCH_WITH_2_GAPS;
    else if (strcmp(data, "deletion_with_1_gap") == 0)
      parseinfo->eoptype = EOP_TYPE_DELETION_WITH_1_GAP;
    else if (strcmp(data, "deletion_with_2_gaps") == 0)
      parseinfo->eoptype = EOP_TYPE_DELETION_WITH_2_GAPS;
    else if (strcmp(data, "intron_with_1_base_left") == 0)
      parseinfo->eoptype = EOP_TYPE_INTRON_WITH_1_BASE_LEFT;
    else if (strcmp(data, "intron_with_2_bases_left") == 0)
      parseinfo->eoptype = EOP_TYPE_INTRON_WITH_2_BASES_LEFT;
    else {
      ILLEGAL_DATA;
    }
  }
  else if (strcmp(name, PROTEIN_EOP_LENGTH_TAG) == 0) {
    SCANUINT;
    gth_backtrace_path_add_eop(gth_sa_backtrace_path(sa), parseinfo->eoptype,
                               ret);
  }
  else if (strcmp(name, INDELCOUNT_TAG) == 0) {
    SCANUINT;
    /* ignore indelcount, gets recomputed anyway */
  }
  else if (strcmp(name, GENOMICLENGTHDP_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_dp_length(sa, ret);
  }
  else if (strcmp(name, GENOMICLENGTHTOTAL_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_total_length(sa, ret);
  }
  else if (strcmp(name, GENOMICOFFSET_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_offset(sa, ret);
  }
  else if (strcmp(name, REFERENCELENGTH_TAG) == 0) {
    SCANUINT;
    gth_sa_set_ref_total_length(sa, ret);
  }
  else if (strcmp(name, DPSTARTPOS_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_dp_start(sa, ret);
  }
  else if (strcmp(name, DPENDPOS_TAG) == 0) {
    SCANUINT;
    /* ignore DP end pos, gets recomputed from gen_dp_length anyway */
    gt_assert(gth_sa_gen_dp_end(sa) == ret);
  }
  else if (strcmp(name, GENOMICFILENAME_TAG) == 0) {
    /* save genomic file name, it is consumed when the genomic file hash
       element is closed */
    gt_str_append_cstr_nt(parseinfo->genomicfilename, data, datalength);
  }
  else if (strcmp(name, GENOMICFILEHASH_TAG) == 0) {
    /* <data> holds the content of the hash element; the result of
       process_file() is stored as genomic file number */
    gth_sa_set_gen_file_num(sa,
                            process_file(parseinfo->input,
                                     gt_str_get(parseinfo->genomicfilename),
                                     data, false, UNDEF_ALPHA));
    /* reset genomic filename */
    gt_str_reset(parseinfo->genomicfilename);
  }
  else if (strcmp(name, GENOMICSEQNUM_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_seq_num(sa, ret);
  }
  else if (strcmp(name, REFERENCEFILENAME_TAG) == 0) {
    /* save reference file name, it is consumed when the reference file hash
       element is closed */
    gt_str_append_cstr_nt(parseinfo->referencefilename, data, datalength);
  }
  else if (strcmp(name, REFERENCEFILEHASH_TAG) == 0) {
    /* uses the alphabet type already stored in <sa> */
    gth_sa_set_ref_file_num(sa,
                            process_file(parseinfo->input,
                                   gt_str_get(parseinfo->referencefilename),
                                   data, true, gth_sa_alphatype(sa)));
    /* reset reference filename */
    gt_str_reset(parseinfo->referencefilename);
  }
  else if (strcmp(name, REFERENCESEQNUM_TAG) == 0) {
    SCANUINT;
    gth_sa_set_ref_seq_num(sa, ret);
  }
  else if (strcmp(name, GENOMICID_TAG) == 0)
    gth_sa_set_gen_id(sa, data);
  else if (strcmp(name, REFERENCEID_TAG) == 0)
    gth_sa_set_ref_id(sa, data);
  else if (strcmp(name, GENOMICSTRANDISFORWARD_TAG) == 0)
    gth_sa_set_gen_strand(sa, parse_boolean(data, parseinfo));
  else if (strcmp(name, REFERENCESTRANDISFORWARD_TAG) == 0)
    gth_sa_set_ref_strand(sa, parse_boolean(data, parseinfo));
  else if (strcmp(name, GENOMICCUTOFF_TAG) == 0) {
    /* the three cutoff values are buffered in parseinfo->cutoffs and stored
       when the enclosing cutoffs start/end element is closed */
    SCANUINT;
    parseinfo->cutoffs.genomiccutoff = ret;
  }
  else if (strcmp(name, REFERENCECUTOFF_TAG) == 0) {
    SCANUINT;
    parseinfo->cutoffs.referencecutoff = ret;
  }
  else if (strcmp(name, EOPCUTOFF_TAG) == 0) {
    SCANUINT;
    parseinfo->cutoffs.eopcutoff = ret;
  }
  else if (strcmp(name, CUTOFFSSTART_TAG) == 0)
    gth_sa_set_cutoffs_start(sa, &parseinfo->cutoffs);
  else if (strcmp(name, CUTOFFSEND_TAG) == 0)
    gth_sa_set_cutoffs_end(sa, &parseinfo->cutoffs);
  else if (strcmp(name, LEFTGENOMICEXONBORDER_TAG) == 0) {
    /* exon values are buffered in parseinfo->exoninfo and added to <sa>
       when the exon info element is closed */
    SCANUINT;
    parseinfo->exoninfo.leftgenomicexonborder = ret;
  }
  else if (strcmp(name, RIGHTGENOMICEXONBORDER_TAG) == 0) {
    SCANUINT;
    parseinfo->exoninfo.rightgenomicexonborder = ret;
  }
  else if (strcmp(name, LEFTREFERENCEEXONBORDER_TAG) == 0) {
    SCANUINT;
    parseinfo->exoninfo.leftreferenceexonborder = ret;
  }
  else if (strcmp(name, RIGHTREFERENCEEXONBORDER_TAG) == 0) {
    SCANUINT;
    parseinfo->exoninfo.rightreferenceexonborder = ret;
  }
  else if (strcmp(name, EXONSCORE_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->exoninfo.exonscore = retdouble;
  }
  else if (strcmp(name, EXONINFO_TAG) == 0)
    gth_sa_add_exon(sa, &parseinfo->exoninfo);
  else if (strcmp(name, DONORSITEPROBABILITY_TAG) == 0) {
    /* intron values are buffered in parseinfo->introninfo and added to <sa>
       when the intron info element is closed */
    SCANDOUBLE;
    parseinfo->introninfo.donorsiteprobability = (GthFlt) retdouble;
  }
  else if (strcmp(name, ACCEPTORSITEPROBABILITY_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->introninfo.acceptorsiteprobability = (GthFlt) retdouble;
  }
  else if (strcmp(name, DONORSITESCORE_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->introninfo.donorsitescore = retdouble;
  }
  else if (strcmp(name, ACCEPTORSITESCORE_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->introninfo.acceptorsitescore = retdouble;
  }
  else if (strcmp(name, INTRONINFO_TAG) == 0)
    gth_sa_add_intron(sa, &parseinfo->introninfo);
  else if (strcmp(name, POLYASTART_TAG) == 0) {
    SCANUINT;
    gth_sa_set_polyAtail_start(sa, ret);
  }
  else if (strcmp(name, POLYAEND_TAG) == 0) {
    SCANUINT;
    gth_sa_set_polyAtail_stop(sa, ret);
  }
  else if (strcmp(name, ALIGNMENTSCORE_TAG) == 0) {
    SCANDOUBLE;
    gth_sa_set_score(sa, retdouble);
  }
  else if (strcmp(name, COVERAGE_TAG) == 0) {
    SCANDOUBLE;
    gth_sa_set_coverage(sa, retdouble);
  }
  else if (strcmp(name, COVERAGEOFGENOMICSEGMENTISHIGHEST_TAG) == 0) {
    gth_sa_set_highest_cov(sa, parse_boolean(data, parseinfo));
  }
  else if (strcmp(name, CUMULATIVELENGTHOFSCOREDEXONS_TAG) == 0) {
    SCANUINT;
    gth_sa_set_cumlen_scored_exons(sa, ret);
  }
  /* all other end tags are ignored by this handler */
}
/* Write the spliced alignment <sa> to <outfp> as a <spliced_alignment>
   element of the intermediate XML format, starting at <indentlevel>. All
   values stored in <sa> are written as child elements one level deeper: the
   reference alphabet type, the edit operations, lengths and positions, the
   genomic and reference file/sequence information, cutoffs, exons, introns,
   poly-A tail position, score and coverage information. */
static void xml_inter_show_spliced_alignment(GthSA *sa, GthInput *input,
                                             unsigned int indentlevel,
                                             GtFile *outfp)
{
  const unsigned int member_indent = indentlevel + 1, /* child elements */
                     eop_indent = indentlevel + 2;    /* edit operations */
  bool ref_is_dna = true;

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp,
                  "<spliced_alignment xmlns=\"http://www.GenomeThreader.org/"
                  "SplicedAlignment/spliced_alignment/\">\n");

  /* reference alphabet type */
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<referencealphatype>");
  if (gth_sa_alphatype(sa) == DNA_ALPHA)
    gt_file_xprintf(outfp, "DNA_ALPHA");
  else if (gth_sa_alphatype(sa) == PROTEIN_ALPHA) {
    gt_file_xprintf(outfp, "PROTEIN_ALPHA");
    ref_is_dna = false;
  }
  else {
    /* only DNA and protein references are handled */
    gt_assert(0);
  }
  gt_file_xprintf(outfp, "</referencealphatype>\n");

  /* edit operations */
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<editoperations>\n");
  gth_backtrace_path_show_complete(gth_sa_backtrace_path(sa), true,
                                   eop_indent, outfp);
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "</editoperations>\n");

  /* lengths and positions */
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<indelcount>"GT_WU"</indelcount>\n",
                  gth_sa_indelcount(sa));
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<genomiclengthDP>"GT_WU"</genomiclengthDP>\n",
                  gth_sa_gen_dp_length(sa));
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<genomiclengthtotal>"GT_WU"</genomiclengthtotal>\n",
                  gth_sa_gen_total_length(sa));
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<genomicoffset>"GT_WU"</genomicoffset>\n",
                  gth_sa_gen_offset(sa));
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<referencelength>"GT_WU"</referencelength>\n",
                  gth_sa_ref_total_length(sa));
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<dpstartpos>"GT_WU"</dpstartpos>\n",
                  gth_sa_gen_dp_start(sa));
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<dpendpos>"GT_WU"</dpendpos>\n",
                  gth_sa_gen_dp_end(sa));

  /* genomic and reference file and sequence */
  showgenomicfilename(sa, input, member_indent, outfp);
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<genomicseqnum>"GT_WU"</genomicseqnum>\n",
                  gth_sa_gen_seq_num(sa));
  showreferencefilename(sa, input, member_indent, outfp);
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<referenceseqnum>"GT_WU"</referenceseqnum>\n",
                  gth_sa_ref_seq_num(sa));

  /* ids and strands */
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<genomicid>%s</genomicid>\n", gth_sa_gen_id(sa));
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<referenceid>%s</referenceid>\n",
                  gth_sa_ref_id(sa));
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp,
                  "<genomicstrandisforward>%s</genomicstrandisforward>\n",
                  GTH_SHOWBOOL(gth_sa_gen_strand_forward(sa)));
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp,
                  "<referencestrandisforward>%s</referencestrandisforward>\n",
                  GTH_SHOWBOOL(gth_sa_ref_strand_forward(sa)));

  /* composite members, written by their own functions */
  showalignmentcutoffs(sa, member_indent, outfp);
  showexons(sa, member_indent, outfp);
  showintrons(sa, ref_is_dna, member_indent, outfp);
  showpolyAtailpos(sa, member_indent, outfp);

  /* score and coverage */
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<alignmentscore>%.*f</alignmentscore>\n", PRECISION,
                  gth_sa_score(sa));
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<coverage>%.*f</coverage>\n", PRECISION,
                  gth_sa_coverage(sa));
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<coverageofgenomicsegmentishighest>%s"
                  "</coverageofgenomicsegmentishighest>\n",
                  GTH_SHOWBOOL(gth_sa_genomic_cov_is_highest(sa)));
  gth_indent(outfp, member_indent);
  gt_file_xprintf(outfp, "<cumulativelengthofscoredexons>"GT_WU""
                  "</cumulativelengthofscoredexons>\n",
                  gth_sa_cumlen_scored_exons(sa));

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</spliced_alignment>\n");
}
/* Write the spliced alignment <sa> to <outfp> as a <spliced_alignment>
   element of the final XML output, starting at <indentlevel>.

   The element consists of
   - the <reference> element (its start tag is written by
     xml_showgthreferenceinformation(); the <seq> child and the end tag are
     written here),
   - the genomic information,
   - the <predicted_gene_structure> element (opened by
     xml_showalignmentheader(), closed here) which is completed with the
     <alignment> element holding the alignment lines.

   <minintronlength> is passed on to xml_showalignmentheader(),
   <translationtable> to gth_sa_get_alignment_lines(). */
static void xml_final_show_spliced_alignment(GthSA *sa, GthInput *input,
                                             unsigned long minintronlength,
                                             unsigned long translationtable,
                                             unsigned int indentlevel,
                                             GtFile *outfp)
{
  unsigned char *first_line, *second_line, *third_line;
  unsigned long cols;

  gt_assert(sa && input);

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp,
                  "<spliced_alignment xmlns=\"http://www.genomethreader.org/"
                  "GTH_output/alignment_module/spliced_alignment/\">\n");
  indentlevel++;

  /* <reference ...>: the start tag is left open by the callee */
  xml_showgthreferenceinformation(sa, input, indentlevel, outfp);
  indentlevel++;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<seq>");
  gth_sa_echo_reference_sequence(sa, input, false, outfp);
  gt_file_xprintf(outfp, "</seq>\n");
  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</reference>\n");

  xml_showgthgenomicinformation(sa, input, indentlevel, outfp);

  /* opens <predicted_gene_structure>, which is closed below */
  xml_showalignmentheader(sa, minintronlength, indentlevel, outfp);

  indentlevel++;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<alignment>\n");

  /* compute the alignment lines */
  cols = gth_sa_get_alignment_lines(sa, &first_line, &second_line,
                                    &third_line, translationtable, input);

  indentlevel++;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<genome_strand>");
  showconcreteline(first_line, cols, outfp);
  gt_file_xprintf(outfp, "</genome_strand>\n");

  gth_indent(outfp, indentlevel);
  switch (gth_sa_alphatype(sa)) {
    case DNA_ALPHA:
      /* two-line alignment: genomic and mRNA strand */
      gt_file_xprintf(outfp, "<mrna_strand>");
      showconcreteline(second_line, cols, outfp);
      gt_file_xprintf(outfp, "</mrna_strand>\n");
      break;
    case PROTEIN_ALPHA:
      /* three-line alignment: genomic strand, translated genomic protein
         and query protein */
      gt_file_xprintf(outfp, "<genomeProt>");
      showconcreteline(second_line, cols, outfp);
      gt_file_xprintf(outfp, "</genomeProt>\n");
      gth_indent(outfp, indentlevel);
      gt_file_xprintf(outfp, "<queryProt>");
      showconcreteline(third_line, cols, outfp);
      gt_file_xprintf(outfp, "</queryProt>\n");
      break;
    default: gt_assert(0);
  }

  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</alignment>\n");

  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</predicted_gene_structure>\n");

  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</spliced_alignment>\n");

  /* free the alignment lines */
  gt_free(first_line);
  gt_free(second_line);
  gt_free(third_line);
}
/* Write the <intron> element for the intron with index <intron_idx> of <sa>
   to <outfp> at <indentlevel>. Donor and acceptor site scores are only shown
   for DNA references. If the intron is not longer than <minintronlength> an
   additional <shorter_than_min_intron_len/> tag is written. */
static void xml_show_intron_preceding_exon(GthSA *sa,
                                           unsigned long intron_idx,
                                           unsigned long minintronlength,
                                           unsigned int indentlevel,
                                           GtFile *outfp)
{
  const unsigned int boundary_indent = indentlevel + 1,
                     site_indent = indentlevel + 2;
  Introninfo *introninfo = gth_sa_get_intron(sa, intron_idx);
  GthFlt donorsiteprobability = introninfo->donorsiteprobability,
         acceptorsiteprobability = introninfo->acceptorsiteprobability;
  GthDbl donorsitescore = introninfo->donorsitescore,
         acceptorsitescore = introninfo->acceptorsitescore;

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<intron i_serial=\"%lu\">\n",
                  intron_idx + OUTPUTOFFSET);

  gth_indent(outfp, boundary_indent);
  gt_file_xprintf(outfp,
                  "<gDNA_intron_boundary i_start=\"%lu\" i_stop=\"%lu\" "
                  "i_length=\"%lu\">\n",
                  gth_sa_left_intron_border(sa, intron_idx),
                  gth_sa_right_intron_border(sa, intron_idx),
                  gth_sa_intron_length(sa, intron_idx));

  /* donor site */
  gth_indent(outfp, site_indent);
  gt_file_xprintf(outfp, "<donor d_prob=\"%.3f\"", donorsiteprobability);
  if (gth_sa_alphatype(sa) == DNA_ALPHA)
    gt_file_xprintf(outfp, " d_score=\"%.2f\"", donorsitescore);
  gt_file_xprintf(outfp, "/>\n");

  /* acceptor site */
  gth_indent(outfp, site_indent);
  gt_file_xprintf(outfp, "<acceptor a_prob=\"%.3f\"",
                  acceptorsiteprobability);
  if (gth_sa_alphatype(sa) == DNA_ALPHA)
    gt_file_xprintf(outfp, " a_score=\"%.2f\"", acceptorsitescore);
  gt_file_xprintf(outfp, "/>\n");

  gth_indent(outfp, boundary_indent);
  gt_file_xprintf(outfp, "</gDNA_intron_boundary>\n");

  /* if the intron is shorter or equal than the minimal intron length an
     additional tag is shown */
  if (gth_sa_intron_length(sa, intron_idx) <= minintronlength) {
    gth_indent(outfp, boundary_indent);
    gt_file_xprintf(outfp, "<shorter_than_min_intron_len/>\n");
  }

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</intron>\n");
}

/* Write the start of the <predicted_gene_structure> element for <sa> to
   <outfp> at <indentlevel>: the <exon-intron_info> element listing all exons
   (each one preceded by the intron in front of it, if any), the PPA line
   (DNA references only), the MATCH line and the PGS line.
   The <predicted_gene_structure> element is NOT closed here, the caller has
   to write the end tag. */
static void xml_showalignmentheader(GthSA *sa, unsigned long minintronlength,
                                    unsigned int indentlevel, GtFile *outfp)
{
  const unsigned int section_indent = indentlevel + 1, /* exon-intron_info */
                     item_indent = indentlevel + 2,    /* <exon>, <intron> */
                     boundary_indent = indentlevel + 3;
  unsigned long exon_idx;

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<predicted_gene_structure>\n");

  gth_indent(outfp, section_indent);
  gt_file_xprintf(outfp, "<exon-intron_info>\n");

  for (exon_idx = 0; exon_idx < gth_sa_num_of_exons(sa); exon_idx++) {
    Exoninfo *exoninfo = gth_sa_get_exon(sa, exon_idx);
    unsigned long leftreferenceexonborder = exoninfo->leftreferenceexonborder,
                  rightreferenceexonborder =
                                           exoninfo->rightreferenceexonborder,
                  referenceexonlength = rightreferenceexonborder
                                        - leftreferenceexonborder + 1;
    GthDbl exonscore = exoninfo->exonscore;

    /* every exon but the first one is preceded by an intron */
    if (exon_idx > 0) {
      xml_show_intron_preceding_exon(sa, exon_idx - 1, minintronlength,
                                     item_indent, outfp);
    }

    gth_indent(outfp, item_indent);
    gt_file_xprintf(outfp, "<exon e_serial=\"%lu\">\n",
                    exon_idx + OUTPUTOFFSET);

    gth_indent(outfp, boundary_indent);
    gt_file_xprintf(outfp, "<gDNA_exon_boundary g_start=\"%lu\" g_stop="
                    "\"%lu\" g_length=\"%lu\"/>\n",
                    gth_sa_left_genomic_exon_border(sa, exon_idx),
                    gth_sa_right_genomic_exon_border(sa, exon_idx),
                    gth_sa_genomic_exon_length(sa, exon_idx));

    gth_indent(outfp, boundary_indent);
    gt_file_xprintf(outfp,
                    "<reference_exon_boundary r_type=\"%s\" r_start=\"%lu\" "
                    "r_stop=\"%lu\" r_length=\"%lu\" r_score=\"%5.3f\"/>\n",
                    gth_sa_alphastring(sa),
                    leftreferenceexonborder + OUTPUTOFFSET,
                    rightreferenceexonborder + OUTPUTOFFSET,
                    referenceexonlength,
                    exonscore);

    gth_indent(outfp, item_indent);
    gt_file_xprintf(outfp, "</exon>\n");
  }

  gth_indent(outfp, section_indent);
  gt_file_xprintf(outfp, "</exon-intron_info>\n");

  /* showing PPA line (if an poly-A tail was determined) */
  if (gth_sa_alphatype(sa) == DNA_ALPHA)
    xml_showppaline(sa, section_indent, outfp);

  /* showing MATCH line */
  xml_showmatchline(sa, section_indent, outfp);

  /* showing PGS line */
  xml_showpgsline(sa, section_indent, outfp);
}
/* Return a string naming the kind of reference sequence of <sa>: "cDNA" for
   DNA references and "Protein" for everything else. The returned string is
   a string literal and must not be modified or freed. */
const char* gth_sa_alphastring(const GthSA *sa)
{
  gt_assert(sa);

  if (gth_sa_alphatype(sa) == DNA_ALPHA)
    return "cDNA";
  return "Protein";
}
bool gth_sas_are_equal(const GthSA *saA, const GthSA *saB) { Exoninfo *exoninfoA, *exoninfoB; Introninfo *introninfoA, *introninfoB; GtUword i; /* compare element 0 */ if (gth_sa_alphatype(saA) != gth_sa_alphatype(saB)) return false; /* compare element 1 */ if (gth_backtrace_path_length(saA->backtrace_path) != gth_backtrace_path_length(saB->backtrace_path)) { return false; } for (i = 0; i < gth_backtrace_path_length(saA->backtrace_path); i++) { if (((Editoperation*) gth_backtrace_path_get(saA->backtrace_path))[i] != ((Editoperation*) gth_backtrace_path_get(saB->backtrace_path))[i]) { return false; } } /* element 2 has been removed (indelcount) */ /* compare element 3 */ if (gth_sa_gen_dp_length(saA) != gth_sa_gen_dp_length(saB)) return false; /* compare element 4 */ if (saA->gen_total_length != saB->gen_total_length) return false; /* compare element 5 */ if (saA->gen_offset != saB->gen_offset) return false; /* compare element 6 */ if (gth_sa_ref_total_length(saA) != gth_sa_ref_total_length(saB)) return false; /* compare element 7 */ if (gth_sa_gen_dp_start(saA) != gth_sa_gen_dp_start(saB)) return false; /* element 8 has been removed (gen_dp_end) */ /* compare element 9 */ if (saA->gen_file_num != saB->gen_file_num) return false; /* compare element 10 */ if (saA->gen_seq_num != saB->gen_seq_num) return false; /* compare element 11 */ if (saA->ref_file_num != saB->ref_file_num) return false; /* compare element 12 */ if (saA->ref_seq_num != saB->ref_seq_num) return false; /* compare element 13 */ if (gt_str_cmp(saA->gen_id, saB->gen_id)) return false; /* compare element 14 */ if (gt_str_cmp(saA->ref_id, saB->ref_id)) return false; /* compare element 15 */ if (saA->gen_strand_forward != saB->gen_strand_forward) return false; /* compare element 16 */ if (saA->ref_strand_forward != saB->ref_strand_forward) return false; /* compare element 17 */ if (gth_sa_genomiccutoff_start(saA) != gth_sa_genomiccutoff_start(saB)) return false; if (gth_sa_referencecutoff_start(saA) != 
gth_sa_referencecutoff_start(saB)) return false; if (gth_sa_eopcutoff_start(saA) != gth_sa_eopcutoff_start(saB)) return false; if (gth_sa_genomiccutoff_end(saA) != gth_sa_genomiccutoff_end(saB)) return false; if (gth_sa_referencecutoff_end(saA) != gth_sa_referencecutoff_end(saB)) return false; if (gth_sa_eopcutoff_end(saA) != gth_sa_eopcutoff_end(saB)) return false; /* compare element 18 */ if (gt_array_size(saA->exons) != gt_array_size(saB->exons)) return false; for (i = 0; i < gt_array_size(saA->exons); i++) { exoninfoA = (Exoninfo*) gt_array_get(saA->exons, i); exoninfoB = (Exoninfo*) gt_array_get(saB->exons, i); if (exoninfoA->leftgenomicexonborder != exoninfoB->leftgenomicexonborder) return false; if (exoninfoA->rightgenomicexonborder != exoninfoB->rightgenomicexonborder) return false; if (exoninfoA->leftreferenceexonborder != exoninfoB->leftreferenceexonborder) { return false; } if (exoninfoA->rightreferenceexonborder != exoninfoB->rightreferenceexonborder) { return false; } if (!gt_double_equals_double(exoninfoA->exonscore, exoninfoB->exonscore)) { return false; } } /* compare element 19 */ if (gt_array_size(saA->introns) != gt_array_size(saB->introns)) return false; for (i = 0; i < gt_array_size(saA->introns); i++) { introninfoA = (Introninfo*) gt_array_get(saA->introns, i); introninfoB = (Introninfo*) gt_array_get(saB->introns, i); if (!gt_double_equals_double(introninfoA->donorsiteprobability, introninfoB->donorsiteprobability)) { return false; } if (!gt_double_equals_double(introninfoA->acceptorsiteprobability, introninfoB->acceptorsiteprobability)) { return false; } if (!gt_double_equals_double(introninfoA->donorsitescore, introninfoB->donorsitescore)) { return false; } if (!gt_double_equals_double(introninfoA->acceptorsitescore, introninfoB->acceptorsitescore)) { return false; } } /* compare element 20 */ if (saA->polyAtailpos.start != saB->polyAtailpos.start) return false; if (saA->polyAtailpos.end != saB->polyAtailpos.end) return false; /* compare 
element 21 */ if (saA->alignmentscore != saB->alignmentscore) return false; /* compare element 22 */ if (saA->coverage != saB->coverage) return false; /* compare element 23 */ if (saA->genomic_cov_is_highest != saB->genomic_cov_is_highest) return false; /* compare element 24 */ if (saA->cumlen_scored_exons != saB->cumlen_scored_exons) return false; return true; }
/* Compute the alignment lines of the spliced alignment <sa>.

   The reference file of <sa> is loaded via <input>, the genomic and the
   reference sequence are trimmed by the start/end cutoffs stored in <sa>
   and handed to the fill routine for the alphabet of the reference:
   - DNA references: <*first_line> and <*second_line> are filled and
     <*third_line> is set to NULL,
   - protein references: all three lines are filled, <translationtable> is
     passed on to the fill routine.

   Returns the value reported by the fill routine (callers in this file use
   it as the number of columns of the lines). The lines are released by the
   caller (xml_final_show_spliced_alignment() uses gt_free()).
   For any other alphabet type an assertion fails; if assertions are
   disabled, all three lines are set to NULL and 0 is returned, so that the
   caller's cleanup stays well-defined. */
GtUword gth_sa_get_alignment_lines(const GthSA *sa,
                                   unsigned char **first_line,
                                   unsigned char **second_line,
                                   unsigned char **third_line,
                                   GtUword translationtable,
                                   GthInput *input)
{
  GtUword genomicstartcutoff, genomicendcutoff, genomictotalcutoff,
          referencestartcutoff, referenceendcutoff, referencetotalcutoff,
          cols = 0;
  unsigned char *gen_seq_orig, *ref_seq_orig;
  GthSeqCon *ref_seq_con;

  gt_assert(sa && first_line && second_line && third_line && input);

  /* only for cosmetic reasons */
  genomicstartcutoff   = gth_sa_genomiccutoff_start(sa);
  genomicendcutoff     = gth_sa_genomiccutoff_end(sa);
  genomictotalcutoff   = genomicstartcutoff + genomicendcutoff;
  referencestartcutoff = gth_sa_referencecutoff_start(sa);
  referenceendcutoff   = gth_sa_referencecutoff_end(sa);
  referencetotalcutoff = referencestartcutoff + referenceendcutoff;

  /* make sure that the correct files are loaded */
  gth_input_load_reference_file(input, gth_sa_ref_file_num(sa), false);
  ref_seq_con = gth_input_current_ref_seq_con(input);

  /* get genomic sequence, starting at the DP start position */
  gen_seq_orig = (unsigned char*)
                 gth_input_original_genomic_sequence(input,
                                               gth_sa_gen_file_num(sa),
                                               gth_sa_gen_strand_forward(sa))
                 + gth_sa_gen_dp_start(sa);

  /* get reference sequence (reverse complement for the reverse strand) */
  if (gth_sa_ref_strand_forward(sa)) {
    ref_seq_orig = gth_seq_con_get_orig_seq(ref_seq_con,
                                            gth_sa_ref_seq_num(sa));
  }
  else {
    ref_seq_orig = gth_seq_con_get_orig_seq_rc(ref_seq_con,
                                               gth_sa_ref_seq_num(sa));
  }

  switch (gth_sa_alphatype(sa)) {
    case DNA_ALPHA:
      /* compute the two alignment lines */
      cols = gthfillthetwoalignmentlines(first_line,
                                         second_line,
                                         gen_seq_orig + genomicstartcutoff,
                                         gth_sa_gen_dp_length(sa)
                                         - genomictotalcutoff,
                                         ref_seq_orig + referencestartcutoff,
                                         gth_sa_ref_total_length(sa)
                                         - referencetotalcutoff,
                                         gth_sa_get_editoperations(sa),
                                         gth_sa_get_editoperations_length(sa),
                                         0,   /* linewidth not important here */
                                         0,   /* no short introns here */
                                         NULL,/* therefore no shortintroninfo */
                                         gth_sa_indelcount(sa));
      *third_line = NULL;
      break;
    case PROTEIN_ALPHA:
      /* compute the three alignment lines */
      cols = gthfillthethreealignmentlines(first_line,
                                           second_line,
                                           third_line,
                                           gth_sa_get_editoperations(sa),
                                           gth_sa_get_editoperations_length(sa),
                                           gth_sa_indelcount(sa),
                                           gen_seq_orig + genomicstartcutoff,
                                           gth_sa_gen_dp_length(sa)
                                           - genomictotalcutoff,
                                           ref_seq_orig + referencestartcutoff,
                                           gth_sa_ref_total_length(sa)
                                           - referencetotalcutoff,
                                           translationtable);
      break;
    default:
      gt_assert(0);
      /* never hand back indeterminate pointers, the caller frees them */
      *first_line = NULL;
      *second_line = NULL;
      *third_line = NULL;
  }

  return cols;
}
void gth_sa_echo_alignment(const GthSA *sa, GtUword showintronmaxlen, GtUword translationtable, bool wildcardimplosion, GthInput *input, GtFile *outfp) { GtUword genomicstartcutoff, genomicendcutoff, genomictotalcutoff, referencestartcutoff, referenceendcutoff, referencetotalcutoff; bool reverse_subject_pos = false; const unsigned char *gen_seq_orig, *ref_seq_orig; GthSeqCon *ref_seq_con; GtAlphabet *ref_alphabet; gt_assert(sa && input); /* only for cosmetic reasons */ genomicstartcutoff = gth_sa_genomiccutoff_start(sa); genomicendcutoff = gth_sa_genomiccutoff_end(sa); genomictotalcutoff = genomicstartcutoff + genomicendcutoff; referencestartcutoff = gth_sa_referencecutoff_start(sa); referenceendcutoff = gth_sa_referencecutoff_end(sa); referencetotalcutoff = referencestartcutoff + referenceendcutoff; /* make sure that the correct files are loaded */ gth_input_load_reference_file(input, gth_sa_ref_file_num(sa), false); ref_seq_con = gth_input_current_ref_seq_con(input); ref_alphabet = gth_input_current_ref_alphabet(input); /* If the reverse complement of the genomic DNA is considered, this opition is needed for correct output of the genomic sequence positions by the function showalignmentgeneric() */ if (!gth_sa_gen_strand_forward(sa)) reverse_subject_pos = true; /* get genomic sequence */ gen_seq_orig = gth_input_original_genomic_sequence(input, sa->gen_file_num, sa->gen_strand_forward) + gth_sa_gen_dp_start(sa); /* get reference sequence */ if (gth_sa_ref_strand_forward(sa)) { ref_seq_orig = gth_seq_con_get_orig_seq(ref_seq_con, gth_sa_ref_seq_num(sa)); } else { ref_seq_orig = gth_seq_con_get_orig_seq_rc(ref_seq_con, gth_sa_ref_seq_num(sa)); } switch (gth_sa_alphatype(sa)) { case DNA_ALPHA: gthshowalignmentdna(outfp,ALIGNMENTLINEWIDTH, gth_sa_get_editoperations(sa), gth_sa_get_editoperations_length(sa), gth_sa_indelcount(sa), gen_seq_orig + genomicstartcutoff, gth_sa_gen_dp_length(sa) - genomictotalcutoff, ref_seq_orig + referencestartcutoff, 
gth_sa_ref_total_length(sa) - referencetotalcutoff, gth_sa_gen_dp_start(sa) + genomicstartcutoff - gth_sa_gen_offset(sa), referencestartcutoff, gth_sa_gen_total_length(sa), showintronmaxlen, ref_alphabet, reverse_subject_pos, wildcardimplosion); break; case PROTEIN_ALPHA: gthshowalignmentprotein(outfp, ALIGNMENTLINEWIDTH, gth_sa_get_editoperations(sa), gth_sa_get_editoperations_length(sa), gth_sa_indelcount(sa), gen_seq_orig + genomicstartcutoff, gth_sa_gen_dp_length(sa) - genomictotalcutoff, ref_seq_orig + referencestartcutoff, gth_sa_ref_total_length(sa) - referencetotalcutoff, gth_sa_gen_dp_start(sa) + genomicstartcutoff - gth_sa_gen_offset(sa), referencestartcutoff, gth_sa_gen_total_length(sa), showintronmaxlen, ref_alphabet, translationtable, gth_input_score_matrix(input), gth_input_score_matrix_alpha(input), reverse_subject_pos, wildcardimplosion); break; default: gt_assert(0); } }
/*
  Print the "Predicted gene structure" section for <sa> to <outfp>.

  After the heading (which mentions the shown gDNA segment if <gs2out> is
  true), one line per exon is written, each preceded (from the second exon
  on) by a line for the intron in front of it. An intron line carries the
  donor/acceptor site probabilities (Pd/Pa) and, for DNA alignments, the
  corresponding scores; it ends in " ??" if the intron is not longer than
  <minintronlength>. Genomic positions are printed with a field width of
  <widthforgenpos>. The section is completed by the PPA line (DNA alignments
  only), the MATCH line and the PGS line.
*/
static void showalignmentheader(GthSA *sa, bool gs2out, int widthforgenpos,
                                GtUword minintronlength, GtFile *outfp)
{
  GtUword exon_idx, ref_left, ref_right, ref_len;
  GthDbl score_exon, score_donor, score_acceptor;
  GthFlt prob_donor, prob_acceptor;
  Exoninfo *exon;
  Introninfo *intron;

  /* heading */
  gt_file_xprintf(outfp, "Predicted gene structure");
  if (!gs2out)
    gt_file_xprintf(outfp, ":\n");
  else {
    gt_file_xprintf(outfp, " (within gDNA segment "GT_WU" to "GT_WU"):\n",
                    gth_sa_gen_dp_start_show(sa), gth_sa_gen_dp_end_show(sa));
  }
  gt_file_xfputc('\n', outfp);

  exon_idx = 0;
  while (exon_idx < gth_sa_num_of_exons(sa)) {
    exon       = gth_sa_get_exon(sa, exon_idx);
    ref_left   = exon->leftreferenceexonborder;
    ref_right  = exon->rightreferenceexonborder;
    ref_len    = ref_right - ref_left + 1;
    score_exon = exon->exonscore;

    /* every exon except the first one is preceded by its upstream intron */
    if (exon_idx > 0) {
      GtUword intron_idx = exon_idx - 1;

      intron         = gth_sa_get_intron(sa, intron_idx);
      prob_donor     = intron->donorsiteprobability;
      score_donor    = intron->donorsitescore;
      prob_acceptor  = intron->acceptorsiteprobability;
      score_acceptor = intron->acceptorsitescore;

      gt_file_xprintf(outfp, " Intron %2" GT_WUS " %*" GT_WUS " %*" GT_WUS " (%4" GT_WUS " n); ",
                      intron_idx + OUTPUTOFFSET,
                      widthforgenpos,
                      gth_sa_left_intron_border(sa, intron_idx),
                      widthforgenpos,
                      gth_sa_right_intron_border(sa, intron_idx),
                      gth_sa_intron_length(sa, intron_idx));

      /* donor site: the score is shown for DNA alignments only */
      gt_file_xprintf(outfp, "Pd: %5.3f ", prob_donor);
      if (gth_sa_alphatype(sa) != DNA_ALPHA)
        gt_file_xprintf(outfp, " ");
      else if (score_donor != 0.0)
        gt_file_xprintf(outfp, "(s: %4.2f), ", score_donor);
      else
        gt_file_xprintf(outfp, "(s: 0), ");

      /* acceptor site: the score is shown for DNA alignments only */
      gt_file_xprintf(outfp, "Pa: %5.3f ", prob_acceptor);
      if (gth_sa_alphatype(sa) == DNA_ALPHA) {
        if (score_acceptor != 0.0)
          gt_file_xprintf(outfp, "(s: %4.2f)", score_acceptor);
        else
          gt_file_xprintf(outfp, "(s: 0)");
      }

      /* an intron which is not longer than the minimum intron length gets
         two question marks at the end of its line */
      if (gth_sa_intron_length(sa, intron_idx) <= minintronlength)
        gt_file_xprintf(outfp, " ??");
      gt_file_xfputc('\n', outfp);
    }

    /* the exon line itself */
    gt_file_xprintf(outfp, " Exon %2" GT_WUS " %*" GT_WUS " %*" GT_WUS " (%4" GT_WUS " n); %s %6" GT_WUS " %6" GT_WUS " (%4" GT_WUS " %s); "
                    "score: %5.3f\n",
                    exon_idx + OUTPUTOFFSET,
                    widthforgenpos,
                    gth_sa_left_genomic_exon_border(sa, exon_idx),
                    widthforgenpos,
                    gth_sa_right_genomic_exon_border(sa, exon_idx),
                    gth_sa_genomic_exon_length(sa, exon_idx),
                    gth_sa_alphastring(sa),
                    ref_left + OUTPUTOFFSET,
                    ref_right + OUTPUTOFFSET,
                    ref_len,
                    gth_sa_alphatype(sa) == DNA_ALPHA ? "n" : "aa",
                    score_exon);

    exon_idx++;
  }

  /* PPA line (shown if a poly-A tail was determined), DNA alignments only */
  if (gth_sa_alphatype(sa) == DNA_ALPHA)
    showppaline(sa, outfp);
  gt_file_xfputc('\n', outfp);

  /* MATCH line */
  showmatchline(sa, outfp);

  /* PGS line */
  showpgsline(sa, outfp);
}