/*
 * Write the <gDNA_segment> XML element for the genomic template of the
 * spliced alignment <sa> to <outfp>.
 *
 * sa          - spliced alignment whose genomic file number, id, strand,
 *               sequence number and DP start/end positions are reported;
 *               its genomic file number must be defined
 * input       - used to look up the genomic file name and to echo the
 *               genomic description
 * indentlevel - indentation level of the outermost element; nested elements
 *               are written one level deeper each
 * outfp       - output file
 *
 * The genomic description is echoed directly between the quotes of the
 * temp_description attribute.
 * NOTE(review): no XML escaping is visible in this function -- confirm that
 * the echoed description and file name cannot contain '"', '<' or '&'.
 */
static void xml_showgthgenomicinformation(GthSA *sa, GthInput *input,
                                          unsigned int indentlevel,
                                          GtFile *outfp)
{
  /* the file number is a GtUword, hence compare against the GtUword
     sentinel (as the plain-text variant of this function does) */
  gt_assert(gth_sa_gen_file_num(sa) != GT_UNDEF_UWORD);

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<gDNA_segment>\n");
  indentlevel++;

  /* opening <template ...> tag; the description attribute is left open */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<template temp_file=\"%s\" temp_id=\"%s\" "
                         "temp_strand=\"%c\" temp_description=\"",
                  gth_input_get_genomic_filename(input,
                                                 gth_sa_gen_file_num(sa)),
                  gth_sa_gen_id(sa),
                  gth_sa_gen_strand_char(sa));
  gth_input_echo_genomic_description(input, gth_sa_gen_file_num(sa),
                                     gth_sa_gen_seq_num(sa), outfp);
  /* close the description attribute and the opening tag */
  gt_file_xprintf(outfp, "\">\n");
  indentlevel++;

  /* GT_WU is the printf conversion matching GtUword; "%lu" is only correct
     where GtUword happens to be unsigned long */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp,
                  "<position start=\"" GT_WU "\" stop=\"" GT_WU "\"/>\n",
                  gth_sa_gen_dp_start_show(sa),
                  gth_sa_gen_dp_end_show(sa));
  indentlevel--;

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</template>\n");
  indentlevel--;

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</gDNA_segment>\n");
}
/*
 * Write a <genomicfile> element to <outfp>. It contains the name of the
 * genomic file referenced by <sa> and a <genomicfilehash> element that
 * always carries GTH_UNDEFINED_HASH.
 *
 * sa          - spliced alignment supplying the genomic file number
 * input       - used to map the file number to a file name
 * indentlevel - indentation level of the enclosing <genomicfile> tags;
 *               the two child elements are written one level deeper
 * outfp       - output file
 */
static void showgenomicfilename(GthSA *sa, GthInput *input,
                                unsigned int indentlevel, GtFile *outfp)
{
  const unsigned int child_level = indentlevel + 1;

  /* opening tag */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<genomicfile>\n");

  /* children: file name and (undefined) hash */
  gth_indent(outfp, child_level);
  gt_file_xprintf(outfp, "<genomicfilename>%s</genomicfilename>\n",
                  gth_input_get_genomic_filename(input,
                                                 gth_sa_gen_file_num(sa)));
  gth_indent(outfp, child_level);
  gt_file_xprintf(outfp, "<genomicfilehash>%s</genomicfilehash>\n",
                  GTH_UNDEFINED_HASH);

  /* closing tag */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</genomicfile>\n");
}
/*
 * Print a plain-text "Genomic Template" line for the spliced alignment <sa>
 * to <outfp>, followed by an empty line.
 *
 * The line lists the genomic file name, the strand character, the DP
 * start/end positions and the genomic description. If <showseqnums> is
 * true, the genomic sequence number is appended as ", seqnum=<n>".
 *
 * The genomic file number of <sa> must be defined.
 */
static void showgthgenomicinformation(GthSA *sa, GthInput *input,
                                      bool showseqnums, GtFile *outfp)
{
  gt_assert(gth_sa_gen_file_num(sa) != GT_UNDEF_UWORD);

  /* fixed part of the line, up to and including "description=" */
  gt_file_xprintf(outfp,
                  "Genomic Template: file=%s, strand=%c, from=" GT_WU ", "
                  "to=" GT_WU ", description=",
                  gth_input_get_genomic_filename(input,
                                                 gth_sa_gen_file_num(sa)),
                  gth_sa_gen_strand_char(sa),
                  gth_sa_gen_dp_start_show(sa),
                  gth_sa_gen_dp_end_show(sa));

  /* the description itself is written by the alignment */
  gth_sa_echo_genomic_description(sa, input, outfp);

  /* optional sequence number */
  if (showseqnums) {
    gt_file_xprintf(outfp, ", seqnum=" GT_WU, gth_sa_gen_seq_num(sa));
  }

  /* terminate the line and leave one blank line after it */
  gt_file_xfputc('\n', outfp);
  gt_file_xfputc('\n', outfp);
}
static void make_sequence_region(GtHashmap *sequence_regions, GtStr *sequenceid, GthRegionFactory *srf, GthInput *input, GtUword filenum, GtUword seqnum) { GtUword offset_is_defined = false; GtRange range, descrange; GtGenomeNode *sr = NULL; gt_assert(sequence_regions && sequenceid && srf && input); if (gth_input_use_substring_spec(input)) { range.start = gth_input_genomic_substring_from(input); range.end = gth_input_genomic_substring_to(input); } else { range = gth_input_get_relative_genomic_range(input, filenum, seqnum); } if (srf->use_desc_ranges) { GtStr *description = gt_str_new(); gth_input_get_genomic_description(input, description, filenum, seqnum); if (!gt_parse_description_range(gt_str_get(description), &descrange)) offset_is_defined = true; gt_str_delete(description); } if (offset_is_defined) range = gt_range_offset(&range, descrange.start); else range = gt_range_offset(&range, 1); /* 1-based */ if (!gt_str_length(sequenceid) || (gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid)) && !offset_is_defined)) { /* sequenceid is empty or exists already (and no offset has been parsed) -> make one up */ GtStr *seqid; char *base; base = gt_basename(gth_input_get_genomic_filename(input, filenum)); seqid = gt_str_new_cstr(base); gt_free(base); gt_str_append_char(seqid, '|'); gt_str_append_uword(seqid, seqnum + 1); /* 1-based */ seqid_store_add(srf->seqid_store, filenum, seqnum, seqid, GT_UNDEF_UWORD); gt_assert(!gt_cstr_table_get(srf->used_seqids, gt_str_get(seqid))); gt_cstr_table_add(srf->used_seqids, gt_str_get(seqid)); sr = gt_region_node_new(seqid_store_get(srf->seqid_store, filenum, seqnum), range.start, range.end); gt_hashmap_add(sequence_regions, (void*) gt_cstr_table_get(srf->used_seqids, gt_str_get(seqid)), sr); gt_str_delete(seqid); } else { /* sequenceid does not exists already (or an offset has been parsed) -> use this one */ if (!gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid))) { /* no sequence region with this id exists -> create 
one */ gt_cstr_table_add(srf->used_seqids, gt_str_get(sequenceid)); seqid_store_add(srf->seqid_store, filenum, seqnum, sequenceid, offset_is_defined ? descrange.start : GT_UNDEF_UWORD); sr = gt_region_node_new(seqid_store_get(srf->seqid_store, filenum, seqnum), range.start, range.end); gt_hashmap_add(sequence_regions, (void*) gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid)), sr); } else { GtRange prev_range, new_range; /* sequence region with this id exists already -> modify range */ sr = gt_hashmap_get(sequence_regions, gt_str_get(sequenceid)); gt_assert(sr); prev_range = gt_genome_node_get_range(sr); new_range = gt_range_join(&prev_range, &range); gt_genome_node_set_range(sr, &new_range); seqid_store_add(srf->seqid_store, filenum, seqnum, sequenceid, offset_is_defined ? descrange.start : GT_UNDEF_UWORD); } } gt_assert(sr); }
/*
 * Compute matches between genomic file <gen_file_num> and reference file
 * <ref_file_num> with the matcher supplied in <plugins>, build chains from
 * these matches and store them, sorted, in <chain_collection>.
 *
 * chain_collection - receives the computed chains
 * call_info        - run parameters (similarity filter settings, output
 *                    configuration, ...)
 * input            - sequence input; its current content is released before
 *                    the matcher runs and both files are loaded again
 *                    afterwards
 * stat             - statistics; receives the match number distribution if
 *                    that is enabled
 * directmatches    - forwarded to the chaining info, the matcher arguments
 *                    and the match processor
 * plugins          - must provide matcher_arguments_new,
 *                    matcher_arguments_delete and matcher_runner
 *
 * If the matcher finds no match, an optional comment line is printed and
 * the function returns without reloading the input files.
 */
void gth_chaining(GthChainCollection *chain_collection,
                  GtUword gen_file_num,
                  GtUword ref_file_num,
                  GthCallInfo *call_info,
                  GthInput *input,
                  GthStat *stat,
                  bool directmatches,
                  const GthPlugins *plugins)
{
  GtUword seq_idx, num_ref_seqs = 0;
  GtArray *matches;
  GthChainingInfo chaining_info;
  void *matcher_arguments;
  GtFile *outfp = call_info->out->outfp;
  GthMatchProcessorInfo match_processor_info;
  bool refseqisdna = gth_input_ref_file_is_dna(input, ref_file_num);
  bool genomic_first;

  /* the matcher callbacks are mandatory */
  gt_assert(plugins);
  gt_assert(plugins->matcher_arguments_new);
  gt_assert(plugins->matcher_arguments_delete);
  gt_assert(plugins->matcher_runner);

  /* set up match storage and chaining info */
  matches = gt_array_new(sizeof (GthMatch));

  chaining_info_init(&chaining_info, directmatches, refseqisdna, call_info,
                     input, stat, gen_file_num, ref_file_num);

  /* with the inverse option or a non-DNA reference, the genomic file name is
     passed first and the reference file name second; otherwise the other
     way round */
  genomic_first = call_info->simfilterparam.inverse || !refseqisdna;

  matcher_arguments =
    plugins->matcher_arguments_new(true,
                                   input,
                                   genomic_first
                                   ? gth_input_get_genomic_filename(input,
                                                              gen_file_num)
                                   : gth_input_get_reference_filename(input,
                                                              ref_file_num),
                                   genomic_first
                                   ? gth_input_get_reference_filename(input,
                                                              ref_file_num)
                                   : gth_input_get_genomic_filename(input,
                                                              gen_file_num),
                                   directmatches,
                                   refseqisdna,
                                   call_info->progname,
                                   gt_str_get(gth_input_proteinsmap(input)),
                                   call_info->simfilterparam.exact,
                                   call_info->simfilterparam.edist,
                                   false,
                                   0,
                                   call_info->simfilterparam.minmatchlength,
                                   call_info->simfilterparam.seedlength,
                                   call_info->simfilterparam.exdrop,
                                   call_info->simfilterparam.prminmatchlen,
                                   call_info->simfilterparam.prseedlength,
                                   call_info->simfilterparam.prhdist,
                                   call_info->translationtable,
                                   call_info->simfilterparam.online,
                                   call_info->simfilterparam.noautoindex,
                                   call_info->simfilterparam.maskpolyAtails,
                                   false);

  match_processor_info_init(&match_processor_info,
                            matches,
                            chain_collection,
                            directmatches,
                            refseqisdna,
                            call_info->simfilterparam.online,
                            call_info->simfilterparam.inverse,
                            stat,
                            &chaining_info,
                            call_info->simfilterparam.maxnumofmatches,
                            call_info->simfilterparam.rare,
                            call_info->fragweightfactor,
                            plugins->jump_table_new,
                            plugins->jump_table_new_reverse,
                            plugins->jump_table_delete);

  /* a per-reference-sequence match counter is needed if the number of
     matches is limited or the match number distribution is recorded */
  if (call_info->simfilterparam.maxnumofmatches > 0 ||
      gth_stat_get_matchnumdistri(stat)) {
    num_ref_seqs = gth_input_num_of_ref_seqs(input, ref_file_num);
    match_processor_info.matchnumcounter =
      gt_malloc(sizeof (GtUword) * num_ref_seqs);
    /* all counters start at 0 */
    memset(match_processor_info.matchnumcounter, 0,
           (size_t) num_ref_seqs * sizeof (GtUword));
  }

  /* release the input (it holds the virtual trees): vmatch loads the
     virtual trees itself, so keeping them here would load them twice */
  gth_input_delete_current(input);

  /* run the matcher */
  if (call_info->out->showverbose) {
    call_info->out->showverbose("call vmatch to compute matches");
  }

  plugins->matcher_runner(matcher_arguments,
                          call_info->out->showverbose,
                          call_info->out->showverboseVM,
                          &match_processor_info);

  /* release the matcher arguments now; otherwise the reference file would
     be mapped twice when it is reloaded below */
  plugins->matcher_arguments_delete(matcher_arguments);

  /* release the sequence collections (if the matcher filled them) */
  gth_seq_con_delete(match_processor_info.gen_seq_con);
  gth_seq_con_delete(match_processor_info.ref_seq_con);

  /* record the non-zero match counts in the match number distribution */
  if (gth_stat_get_matchnumdistri(stat)) {
    for (seq_idx = 0; seq_idx < num_ref_seqs; seq_idx++) {
      if (match_processor_info.matchnumcounter[seq_idx] == 0) {
        continue;
      }
      gth_stat_add_to_matchnumdistri(stat,
                             match_processor_info.matchnumcounter[seq_idx]);
    }
  }

  /* the match counter is no longer needed */
  gt_free(match_processor_info.matchnumcounter);

  /* nothing to chain without matches */
  if (gt_array_size(matches) == 0) {
    if (call_info->out->comments) {
      gt_file_xprintf(outfp, "%c no match has been found\n", COMMENTCHAR);
    }
    gt_array_delete(matches);
    return;
  }

  /* bring the genomic and the reference file back into memory */
  gth_input_load_genomic_file(input, gen_file_num, true);
  gth_input_load_reference_file(input, ref_file_num, true);

  /* build chains from the collected matches */
  calc_chains_from_matches(chain_collection,
                           matches,
                           &chaining_info,
                           gth_input_current_gen_seq_con(input),
                           gth_input_current_ref_seq_con(input),
                           call_info->simfilterparam.rare,
                           call_info->fragweightfactor,
                           plugins->jump_table_new,
                           plugins->jump_table_new_reverse,
                           plugins->jump_table_delete);

  if (call_info->out->showverbose) {
    call_info->out->showverbose("sort global chains according to reference "
                                "sequence coverage");
  }

  /* sort the chains */
  gth_chain_collection_sort(chain_collection);

  /* clean up */
  gt_array_delete(matches);
}
/*
 * Write the XML <header> element describing the current run to the output
 * file of <call_info>.
 *
 * call_info   - run parameters; selects the header namespace (intermediate
 *               vs. regular output) and supplies species number and score
 *               matrix file
 * input       - supplies genomic/reference file names, alphabet types, the
 *               BSSM file name and the search directions; must contain at
 *               least one genomic and one reference file
 * timestring  - written as the run_date attribute
 * gth_version - written as the version attribute
 * indentlevel - indentation level of the <header> element itself
 * args        - program arguments, written inside an XML comment
 *
 * NOTE(review): the values are written without visible XML escaping, and the
 * arguments are placed inside "<!-- ... -->" as they are -- confirm that
 * they cannot contain "--" or markup characters.
 */
static void show_xml_run_header(GthCallInfo *call_info, GthInput *input,
                                const char *timestring,
                                const char *gth_version,
                                unsigned int indentlevel,
                                const char **args)
{
  GtFile *outfp = call_info->out->outfp;
  GtUword i;

  /* opening <header> tag; the namespace depends on the output kind */
  gth_indent(outfp, indentlevel);
  if (call_info->intermediate) {
    gt_file_xprintf(outfp, "<header xmlns=\"http://www.GenomeThreader.org/"
                           "SplicedAlignment/header/\">\n");
  }
  else {
    gt_file_xprintf(outfp,
                    "<header xmlns=\"http://www.genomethreader.org/GTH_output/"
                    "header/\">\n");
  }

  /* at least one genomic file defined */
  gt_assert(gth_input_num_of_gen_files(input));
  /* at least one reference file defined */
  gt_assert(gth_input_num_of_ref_files(input));

  /* show a readable version of GthCallInfo. That is, it is shown with which
     parameters the program was called */
  indentlevel++;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<source program=\"GenomeThreader\" version=\"%s\" "
                         "build_date=\"%s\" run_date=\"%s\"/>\n",
                  gth_version, GT_BUILT, timestring);

  /* show genomic file names */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<gDNA_template_files>\n");
  indentlevel++;
  for (i = 0; i < gth_input_num_of_gen_files(input); i++) {
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<temp_name>%s</temp_name>\n",
                    gth_input_get_genomic_filename(input, i));
  }
  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</gDNA_template_files>\n");

  /* show reference file names together with their type */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<reference_files>\n");
  indentlevel++;
  for (i = 0; i < gth_input_num_of_ref_files(input); i++) {
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<file ref_name=\"%s\" type=\"%s\"/>\n",
                    gth_input_get_reference_filename(input, i),
                    gth_input_get_alphatype(input, i) == DNA_ALPHA
                    ? "ESTcDNA"
                    : "Protein");
  }
  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</reference_files>\n");

  /* splice site model; a species number equal to NUMOFSPECIES selects the
     generic species name instead of an entry of speciestab */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<splice_site_parameters parameter_type=\"%s\" "
                         "species=\"%s\"/>\n",
                  SPLICE_SITE_MODEL_NAME,
                  call_info->speciesnum == NUMOFSPECIES
                  ? GENERIC_SPECIES_NAME
                  : speciestab[call_info->speciesnum]);

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<parameters>\n");
  indentlevel++;

  /* output name of BSSM file */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<parameter name=\"bssmfile\" value=\"%s\"/>\n",
                  gth_input_bssmfilename(input));

  /* output name of scorematrix */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp,
                  "<parameter name=\"scorematrixfile\" value=\"%s\"/>\n",
                  gt_str_get(call_info->scorematrixfile));

  /* output searchmode; the value used to end in a stray, unbalanced ')' */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<parameter name=\"searchmode\" "
                         "value=\"forward=%s,reverse=%s\"/>\n",
                  GTH_SHOWBOOL(gth_input_forward(input)),
                  GTH_SHOWBOOL(gth_input_reverse(input)));

  /* output arguments as comment (written without indentation) */
  gt_file_xprintf(outfp, "<!--\n%c Arguments: ", COMMENTCHAR);
  gt_cstr_array_show_genfile(args, outfp);
  gt_file_xprintf(outfp, "-->\n");

  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</parameters>\n");

  show_overall_reference_type(gth_input_overall_alphatype(input), indentlevel,
                              outfp);

  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</header>\n");
}