/* Re-read the intermediate output stored in <outputfilename> and compare it
   with <orig_sa_collection>.

   The handle in <*outfp> is deleted and replaced by a handle on
   <outputfilename> opened for reading with the file mode of the old handle;
   this new handle is still stored in <*outfp> when the function returns.
   If parsing the intermediate output fails, the message stored in <err> is
   printed to stderr and the program exits with EXIT_FAILURE.

   Returns the result of gth_sa_collections_are_equal() applied to
   <orig_sa_collection> and the collection which has been read back. */
bool gth_intermediate_output_is_correct(char *outputfilename,
                                        GthSACollection *orig_sa_collection,
                                        GthInput *input,
                                        GtFile **outfp,
                                        GtError *err)
{
  SACollectionData parse_data;
  GthSACollection *reread_collection;
  GtFileMode mode;
  bool collections_match;
#ifndef NDEBUG
  GtUword gen_files_before, ref_files_before;
#endif

  gt_error_check(err);
  gt_assert(outputfilename);
  gt_assert(*outfp);
#ifndef NDEBUG
  /* remember the file counts, they are checked again after parsing */
  gen_files_before = gth_input_num_of_gen_files(input);
  ref_files_before = gth_input_num_of_ref_files(input);
#endif

  /* the parsed alignments are collected here (no filter, no statistics) */
  reread_collection = gth_sa_collection_new(GTH_DC_NONE);
  parse_data.sa_collection = reread_collection;
  parse_data.sa_filter = NULL;
  parse_data.stat = NULL;

  /* swap the given handle for a read handle with the same file mode */
  mode = gt_file_mode(*outfp);
  gt_file_delete(*outfp);
  *outfp = gt_file_xopen_file_mode(mode, outputfilename, "r");
  gt_assert(*outfp);

  /* parse the intermediate output, a failure is fatal */
  if (gt_parse_intermediate_output(input, store_in_sa_collection, &parse_data,
                                   outputfilename, *outfp, err)) {
    fprintf(stderr, "error: %s\n", gt_error_get(err));
    exit(EXIT_FAILURE);
  }

  /* parsing must not have added genomic or reference files */
  gt_assert(gen_files_before == gth_input_num_of_gen_files(input));
  gt_assert(ref_files_before == gth_input_num_of_ref_files(input));

  collections_match = gth_sa_collections_are_equal(orig_sa_collection,
                                                   reread_collection);

  gth_sa_collection_delete(reread_collection);

  return collections_match;
}
/* Return the index of <filename> in the file array of <input>: the reference
   file array if <isreferencefile> is set, the genomic file array otherwise.

   If the lookup yields -1 (file not contained in the array yet), the file is
   opened and checked against <filehash> with hashes_are_the_same(). If the
   check succeeds, the file is added to the respective array (reference files
   together with <alphatype>) and the index of the last array entry is
   returned. If it fails, a message is printed to stderr and the program exits
   with EXIT_FAILURE. */
static GtUword process_file(GthInput *input, char *filename, char *filehash,
                            bool isreferencefile, GthAlphatype alphatype)
{
  GtWord known_index;
  FILE *stream;

  known_index = isreferencefile
                ? gth_input_determine_reference_file_index(input, filename)
                : gth_input_determine_genomic_file_index(input, filename);

  /* file is already contained in the array -> nothing to verify */
  if (known_index != -1)
    return known_index;

  /* unknown file -> it is only accepted if its hash matches */
  stream = gt_fa_xfopen(filename, "r");
  if (!hashes_are_the_same(filehash, stream)) {
    fprintf(stderr, "apparently file \"%s\" has changed\n", filename);
    exit(EXIT_FAILURE);
  }
  gt_fa_xfclose(stream);

  /* store the new file, its index is the last one of the array */
  if (isreferencefile) {
    gth_input_add_reference_file(input, filename, alphatype);
    return gth_input_num_of_ref_files(input) - 1;
  }
  gth_input_add_genomic_file(input, filename);
  return gth_input_num_of_gen_files(input) - 1;
}
static void calc_chains_from_matches(GthChainCollection *chain_collection, GtArray *matches, GthChainingInfo *chaining_info, GthSeqCon *gen_seq_con, GthSeqCon *ref_seq_con, GtUword rare, double fragweightfactor, GthJumpTableNew jump_table_new, GthJumpTableNewReverse jump_table_new_reverse, GthJumpTableDelete jump_table_delete) { GtUword i, numofchains = 0, num_of_fragments, maxbucketlength = 0; GtRange range; GtFile *outfp = chaining_info->call_info->out->outfp; GtFragment *fragments; GthSaveChainInfo info; GtArray *buckets; Bucket *bucket; /* this is a random sample to check that no equal matches exist either one match to chain or if more than one the first two differ */ gt_assert(gt_array_size(matches) == 1 || (gt_array_size(matches) > 1 && !gth_matches_are_equal(gt_array_get(matches, 0), gt_array_get(matches, 1)))); /* init */ buckets = gt_array_new(sizeof (Bucket)); /* output unsorted matches */ if (chaining_info->call_info->out->comments) { gt_file_xprintf(outfp, "%c output unsorted matches\n", COMMENTCHAR); showmatches(gt_array_get_space(matches), gt_array_size(matches), outfp); } /* transform reference sequence positions to opposite strand if necessary */ if (!chaining_info->directmatches) { if (chaining_info->call_info->out->comments) { gt_file_xprintf(outfp, "%c\n", COMMENTCHAR); gt_file_xprintf(outfp, "%c transform reference sequence positions to " "opposite strand\n", COMMENTCHAR); gt_file_xprintf(outfp, "%c\n", COMMENTCHAR); } transform_refseq_positions(matches, ref_seq_con); /* output transformed matches */ if (chaining_info->call_info->out->comments) { gt_file_xprintf(outfp, "%c output transformed matches\n", COMMENTCHAR); showmatches(gt_array_get_space(matches), gt_array_size(matches), outfp); } } /* sort matches */ sort_matches_and_calc_buckets(matches, buckets, &maxbucketlength); /* output sorted matches */ if (chaining_info->call_info->out->comments) { gt_file_xprintf(outfp, "%c output sorted matches\n", COMMENTCHAR); 
showmatches(gt_array_get_space(matches), gt_array_size(matches), outfp); } /* output buckets */ if (chaining_info->call_info->out->comments) { gt_file_xprintf(outfp, "%c output buckets\n", COMMENTCHAR); outputbuckets(buckets, gt_array_get_space(matches), outfp); } /* alloc space for fragments */ fragments = gt_malloc(sizeof (GtFragment) * maxbucketlength); /* save data to process the chains with saveChainasDPrange; constant part */ info.chain_collection = chain_collection; info.gcmincoverage = chaining_info->call_info->gcmincoverage; info.stat = chaining_info->stat; info.comments = chaining_info->call_info->out->comments; info.stopafterchaining = chaining_info->call_info->simfilterparam .stopafterchaining; info.paralogs = chaining_info->call_info->simfilterparam.paralogs; info.enrichchains = chaining_info->call_info->simfilterparam .enrichchains; info.jump_table = chaining_info->call_info->simfilterparam.jump_table; info.jump_table_new = jump_table_new; info.jump_table_new_reverse = jump_table_new_reverse; info.jump_table_delete = jump_table_delete; info.jtdebug = chaining_info->jtdebug; info.directmatches = chaining_info->directmatches; info.outfp = outfp; info.gen_file_num = chaining_info->gen_file_num; info.ref_file_num = chaining_info->ref_file_num; /* for every bucket a chain and for every chain a DP call (later maybe more than one chain) */ for (i = 0; i < gt_array_size(buckets); i++) { bucket = gt_array_get(buckets, i); if (chaining_info->call_info->out->showverbose) { if (chaining_info->refseqisindex && !chaining_info->call_info->simfilterparam.online) { /* in this case the exact number of chains is known */ numofchains = gt_array_size(buckets); } else { /* this expression gives an upper bound on the number of chains (because we do not know the exact number here) */ numofchains = chaining_info->bucketnum + gth_seq_con_num_of_seqs(gen_seq_con) * (gth_seq_con_num_of_seqs(ref_seq_con) - bucket->seqnum1); if (numofchains > chaining_info->maxbucketnum) 
numofchains = chaining_info->maxbucketnum; else chaining_info->maxbucketnum = numofchains; } } /* compute a set of fragments from every bucket of matches */ gthinitfragments(fragments, &num_of_fragments, (GthMatch*) gt_array_get_space(matches) + bucket->startpos, bucket->length, rare, fragweightfactor); if (chaining_info->call_info->out->showverbose) { show_chain_calc_status (chaining_info->call_info->out->showverbose, ++chaining_info->bucketnum, numofchains, num_of_fragments, chaining_info->gen_file_num, gth_input_num_of_gen_files(chaining_info->input), chaining_info->ref_file_num, gth_input_num_of_ref_files(chaining_info->input), chaining_info->directmatches, chaining_info->call_info->out->verboseseqs, bucket->seqnum2, bucket->seqnum1); } info.gen_seq_num = ((GthMatch*) gt_array_get(matches, bucket->startpos)) ->Storeseqnumgenomic; info.ref_seq_num = ((GthMatch*) gt_array_get(matches, bucket->startpos)) ->Storeseqnumreference; /* store genomic offset */ range = gth_seq_con_get_range(gen_seq_con, info.gen_seq_num); info.gen_total_length = range.end - range.start + 1; info.gen_offset = range.start; /* store length of reference sequence */ range = gth_seq_con_get_range(ref_seq_con, info.ref_seq_num); info.ref_total_length = range.end - range.start + 1; info.ref_offset = range.start; info.referencelength = range.end - range.start + 1; /* set number of remaining buckets */ info.numofremainingbuckets = gt_array_size(buckets) - i; if (chaining_info->call_info->simfilterparam.paralogs) { gt_globalchaining_coverage(fragments, num_of_fragments, chaining_info->call_info->gcmaxgapwidth, info.referencelength, ((double) chaining_info->call_info->gcmincoverage) / 100.0, gth_save_chain, &info); } else { gt_globalchaining_max(fragments, num_of_fragments, chaining_info->call_info->gcmaxgapwidth, gth_save_chain, &info); } } /* free space */ gt_array_delete(buckets); gt_free(fragments); }
/* Write the XML <header> element of a run to the output file of <call_info>.

   The header contains the program version <gth_version>, the build date and
   <timestring> as run date, the names of the genomic and reference files of
   <input>, the splice site parameters, the parameters (BSSM file, score
   matrix file, search mode), the program arguments <args> embedded in an XML
   comment, and the overall reference type. The XML namespace of the header
   depends on <call_info>->intermediate. <indentlevel> is the indentation of
   the <header> element itself; nested elements are indented one or two
   levels deeper.

   NOTE(review): file names and arguments are written as they are, without
   XML escaping -- confirm that this cannot yield malformed XML (e.g. a "--"
   inside the arguments comment). */
static void show_xml_run_header(GthCallInfo *call_info, GthInput *input,
                                const char *timestring,
                                const char *gth_version,
                                unsigned int indentlevel, const char **args)
{
  const unsigned int header_level = indentlevel,     /* <header> itself */
                     section_level = indentlevel + 1, /* children of it */
                     entry_level = indentlevel + 2;   /* grandchildren */
  GtFile *outfp = call_info->out->outfp;
  const char *ref_type, *species;
  GtUword file_idx;

  /* opening tag, the namespace depends on the kind of output */
  gth_indent(outfp, header_level);
  if (!call_info->intermediate) {
    gt_file_xprintf(outfp,
                    "<header xmlns=\"http://www.genomethreader.org/GTH_output/"
                    "header/\">\n");
  }
  else {
    gt_file_xprintf(outfp, "<header xmlns=\"http://www.GenomeThreader.org/"
                    "SplicedAlignment/header/\">\n");
  }

  /* at least one genomic and one reference file have to be defined */
  gt_assert(gth_input_num_of_gen_files(input));
  gt_assert(gth_input_num_of_ref_files(input));

  /* program, version, and dates */
  gth_indent(outfp, section_level);
  gt_file_xprintf(outfp, "<source program=\"GenomeThreader\" version=\"%s\" "
                  "build_date=\"%s\" run_date=\"%s\"/>\n", gth_version,
                  GT_BUILT, timestring);

  /* genomic file names */
  gth_indent(outfp, section_level);
  gt_file_xprintf(outfp, "<gDNA_template_files>\n");
  for (file_idx = 0; file_idx < gth_input_num_of_gen_files(input);
       file_idx++) {
    gth_indent(outfp, entry_level);
    gt_file_xprintf(outfp, "<temp_name>%s</temp_name>\n",
                    gth_input_get_genomic_filename(input, file_idx));
  }
  gth_indent(outfp, section_level);
  gt_file_xprintf(outfp, "</gDNA_template_files>\n");

  /* reference file names with their type */
  gth_indent(outfp, section_level);
  gt_file_xprintf(outfp, "<reference_files>\n");
  for (file_idx = 0; file_idx < gth_input_num_of_ref_files(input);
       file_idx++) {
    if (gth_input_get_alphatype(input, file_idx) == DNA_ALPHA)
      ref_type = "ESTcDNA";
    else
      ref_type = "Protein";
    gth_indent(outfp, entry_level);
    gt_file_xprintf(outfp, "<file ref_name=\"%s\" type=\"%s\"/>\n",
                    gth_input_get_reference_filename(input, file_idx),
                    ref_type);
  }
  gth_indent(outfp, section_level);
  gt_file_xprintf(outfp, "</reference_files>\n");

  /* splice site model; the generic name is used if no species is set */
  if (call_info->speciesnum == NUMOFSPECIES)
    species = GENERIC_SPECIES_NAME;
  else
    species = speciestab[call_info->speciesnum];
  gth_indent(outfp, section_level);
  gt_file_xprintf(outfp, "<splice_site_parameters parameter_type=\"%s\" "
                  "species=\"%s\"/>\n", SPLICE_SITE_MODEL_NAME, species);

  /* the parameters the program was called with */
  gth_indent(outfp, section_level);
  gt_file_xprintf(outfp, "<parameters>\n");

  /* name of BSSM file */
  gth_indent(outfp, entry_level);
  gt_file_xprintf(outfp, "<parameter name=\"bssmfile\" value=\"%s\"/>\n",
                  gth_input_bssmfilename(input));

  /* name of score matrix */
  gth_indent(outfp, entry_level);
  gt_file_xprintf(outfp,
                  "<parameter name=\"scorematrixfile\" value=\"%s\"/>\n",
                  gt_str_get(call_info->scorematrixfile));

  /* search mode
     NOTE(review): the ')' at the end of the value has no opening
     counterpart and looks unintentional; it is kept because it is part of
     the emitted output -- confirm before changing the format. */
  gth_indent(outfp, entry_level);
  gt_file_xprintf(outfp, "<parameter name=\"searchmode\" "
                  "value=\"forward=%s,reverse=%s)\"/>\n",
                  GTH_SHOWBOOL(gth_input_forward(input)),
                  GTH_SHOWBOOL(gth_input_reverse(input)));

  /* the arguments go into an XML comment (which is not indented) */
  gt_file_xprintf(outfp, "<!--\n%c Arguments: ", COMMENTCHAR);
  gt_cstr_array_show_genfile(args, outfp);
  gt_file_xprintf(outfp, "-->\n");

  gth_indent(outfp, section_level);
  gt_file_xprintf(outfp, "</parameters>\n");

  show_overall_reference_type(gth_input_overall_alphatype(input),
                              section_level, outfp);

  /* closing tag */
  gth_indent(outfp, header_level);
  gt_file_xprintf(outfp, "</header>\n");
}
static int compute_sa_collection(GthSACollection *sa_collection, GthCallInfo *call_info, GthInput *input, GthStat *stat, const GthPlugins *plugins) { GthChainCollection *chain_collection; GthMatchInfo match_info; GtUword g, r; int rval = 0; match_info.call_number = 0; match_info.significant_match_found = false; match_info.max_call_number_reached = false; match_info.stop_amino_acid_warning = false; for (g = 0; g < gth_input_num_of_gen_files(input); g++) { for (r = 0; r < gth_input_num_of_ref_files(input); r++) { if (gth_input_get_alphatype(input, r) == DNA_ALPHA || gth_input_forward(input)) { if (call_info->out->showverbose) { show_compute_matches_status(true, call_info->out->showverbose, g, gth_input_num_of_gen_files(input), r, gth_input_num_of_ref_files(input)); } /* compute direct matches */ chain_collection = match_and_chain(call_info, input, stat, g, r, true, &match_info, plugins); if (chain_collection) { rval = calc_spliced_alignments(sa_collection, chain_collection, call_info, input, stat, g, r, true, &match_info, plugins->dna_complete_path_matrix_jt, plugins ->protein_complete_path_matrix_jt); gth_chain_collection_delete(chain_collection); if (rval) break; } } if (match_info.max_call_number_reached) break; if (gth_input_get_alphatype(input, r) == DNA_ALPHA || gth_input_reverse(input)) { if (call_info->out->showverbose) { show_compute_matches_status(false, call_info->out->showverbose, g, gth_input_num_of_gen_files(input), r, gth_input_num_of_ref_files(input)); } /* compute reverse complemented (palindromic) matches */ chain_collection = match_and_chain(call_info, input, stat, g, r, false, &match_info, plugins); if (chain_collection) { rval = calc_spliced_alignments(sa_collection, chain_collection, call_info, input, stat, g, r, false, &match_info, plugins->dna_complete_path_matrix_jt, plugins ->protein_complete_path_matrix_jt); gth_chain_collection_delete(chain_collection); if (rval) break; } if (match_info.max_call_number_reached) break; } } } return rval; }
static int callsahmt(bool call_dna_dp, GthSA *sa, bool forward, GtUword gen_file_num, GtUword ref_file_num, GthChain *raw_chain, GtUword gen_total_length, GtUword gen_offset, const GtRange *gen_seq_bounds, const GtRange *gen_seq_bounds_rc, const unsigned char *ref_seq_tran, const unsigned char *ref_seq_orig, GtUword ref_total_length, GtUword ref_offset, GthInput *input, Introncutoutinfo *introncutoutinfo, GthStat *stat, GtUword chainctr, GtUword num_of_chains, GtUword translationtable, bool directmatches, bool proteinexonpenal, GthSpliceSiteModel *splice_site_model, GthDPOptionsCore *dp_options_core, GthDPOptionsEST *dp_options_est, GthDPOptionsPostpro *dp_options_postpro, GthDNACompletePathMatrixJT dna_complete_path_matrix_jt, GthProteinCompletePathMatrixJT protein_complete_path_matrix_jt, GthOutput *out) { int rval; GthChain *actual_chain, *contracted_chain, *used_chain; GtUword icdelta = introncutoutinfo->icinitialdelta, iciterations = introncutoutinfo->iciterations; bool useintroncutout = introncutoutinfo->introncutout; /* initially useintron is set to the value of introncutoutinfo->introncutout, if the automatic intron cutotu technique is acitvated it can be set to true if an matrix allocation error (ERROR_MATRIX_ALLOCATION_FAILED) occurs */ gt_assert(sa); actual_chain = gth_chain_new(); contracted_chain = gth_chain_new(); for (;;) { /* reset actualDPrange; */ gt_array_set_size(actual_chain->forwardranges, 0); gt_array_set_size(actual_chain->reverseranges, 0); /* copy raw chain to actual chain */ gth_chain_copy(actual_chain, raw_chain); /* shorten potential introns and compute spliced sequence, if the intron cutout technique is used */ if (useintroncutout) { /* shorten potential introns */ gth_chain_shorten_introns(actual_chain, icdelta, introncutoutinfo->icminremintronlength, gen_total_length, gen_offset, out->comments, out->outfp); } else gth_chain_contract(contracted_chain, actual_chain); if (out->showverbose) { 
show_matrix_calculation_status(out->showverbose, forward, gth_sa_ref_strand_forward(sa), useintroncutout, chainctr, num_of_chains, icdelta, gen_file_num, gth_input_num_of_gen_files(input), ref_file_num, gth_input_num_of_ref_files(input), directmatches, out->verboseseqs, gth_sa_gen_id(sa), gth_sa_ref_id(sa)); } /* allocate space for DP parameter */ if (out->comments) { gt_file_xprintf(out->outfp, "%c alloc space for DP param " "(genomicid=%s, referenceid=%s)\n", COMMENTCHAR, gth_sa_gen_id(sa), gth_sa_ref_id(sa)); } used_chain = useintroncutout ? actual_chain : contracted_chain; /* The variable 'forward' denotes the genomic strand on which the DP is applied. */ if (forward) { if (call_dna_dp) { rval = gth_align_dna(sa, used_chain->forwardranges, gth_input_current_gen_seq_tran(input), gth_input_current_gen_seq_orig(input), ref_seq_tran, ref_seq_orig, ref_total_length, gth_input_current_gen_alphabet(input), gth_input_current_ref_alphabet(input), useintroncutout, introncutoutinfo->autoicmaxmatrixsize, out->showeops, out->comments, out->gs2out, gen_seq_bounds, splice_site_model, dp_options_core, dp_options_est, dp_options_postpro, dna_complete_path_matrix_jt, raw_chain->forward_jump_table, ref_offset, stat, out->outfp); } else { /* call_protein_dp */ rval = gth_align_protein(sa, used_chain->forwardranges, gth_input_current_gen_seq_tran(input), ref_seq_tran, ref_seq_orig, ref_total_length, gth_input_current_gen_alphabet(input), gth_input_current_ref_alphabet(input), input, useintroncutout, introncutoutinfo->autoicmaxmatrixsize, proteinexonpenal, out->showeops, out->comments, out->gs2out, translationtable, gen_seq_bounds, splice_site_model, dp_options_core, dp_options_postpro, protein_complete_path_matrix_jt, raw_chain->forward_jump_table, ref_offset, stat, out->outfp); } } else { /* the DP is called with the revers positions specifiers */ if (call_dna_dp) { rval = gth_align_dna(sa, used_chain->reverseranges, gth_input_current_gen_seq_tran_rc(input), 
gth_input_current_gen_seq_orig_rc(input), ref_seq_tran, ref_seq_orig, ref_total_length, gth_input_current_gen_alphabet(input), gth_input_current_ref_alphabet(input), useintroncutout, introncutoutinfo->autoicmaxmatrixsize, out->showeops, out->comments, out->gs2out, gen_seq_bounds_rc, splice_site_model, dp_options_core, dp_options_est, dp_options_postpro, dna_complete_path_matrix_jt, raw_chain->reverse_jump_table, ref_offset, stat, out->outfp); } else { /* call_protein_dp */ rval = gth_align_protein(sa, used_chain->reverseranges, gth_input_current_gen_seq_tran_rc(input), ref_seq_tran, ref_seq_orig, ref_total_length, gth_input_current_gen_alphabet(input), gth_input_current_ref_alphabet(input), input, useintroncutout, introncutoutinfo->autoicmaxmatrixsize, proteinexonpenal, out->showeops, out->comments, out->gs2out, translationtable, gen_seq_bounds, splice_site_model, dp_options_core, dp_options_postpro, protein_complete_path_matrix_jt, raw_chain->reverse_jump_table, ref_offset, stat, out->outfp); } } if (rval == GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED) return GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED; /* handling of special error codes ERROR_CUTOUT_NOT_IN_INTRON and ERROR_MATRIX_ALLOCATION_FAILED from DP the only possible special error code given back by this function is ERROR_SA_COULD_NOT_BE_DETERMINED */ #ifndef NDEBUG if (!useintroncutout) gt_assert(rval != GTH_ERROR_CUTOUT_NOT_IN_INTRON); #endif if (useintroncutout && rval == GTH_ERROR_CUTOUT_NOT_IN_INTRON) { /* the intron cutout technique failed -> increase counter */ gth_stat_increment_numofunsuccessfulintroncutoutDPs(stat); if (--iciterations > 0) { /* if an iterations is left, increase icdelta, decrease the remaining iterations, and continue the while-loop */ icdelta += introncutoutinfo->icdeltaincrease; continue; } else { /* no iteration left, discard SA */ gth_stat_increment_numofundeterminedSAs(stat); gth_chain_delete(actual_chain); gth_chain_delete(contracted_chain); return 
GTH_ERROR_SA_COULD_NOT_BE_DETERMINED; } } else if (rval == GTH_ERROR_MATRIX_ALLOCATION_FAILED) { if (introncutoutinfo->autoicmaxmatrixsize > 0 && !useintroncutout) { /* if the automatic intron cutout technique is enabled and a ``normal'' DP returned with the matrix allocation error, set useintroncutout, increase counter, and continue */ if (out->showverbose) { out->showverbose("matrix allocation failed, use intron cutout " "technique"); } gth_stat_increment_numofautointroncutoutcalls(stat); useintroncutout = true; continue; } else { /* otherwise increase relevant statistics, free space and return with error */ gth_stat_increment_numoffailedmatrixallocations(stat); gth_stat_increment_numofundeterminedSAs(stat); gth_chain_delete(actual_chain); gth_chain_delete(contracted_chain); return GTH_ERROR_SA_COULD_NOT_BE_DETERMINED; } } else if (rval) /* ``normal'' DP */ return -1; break; } #if 0 if (out->comments) { gt_file_xprintf(out->outfp, "%c this SA has been computed:\n", COMMENTCHAR); gth_sa_show(sa, input, out->outfp); } #endif /* free */ gth_chain_delete(actual_chain); gth_chain_delete(contracted_chain); return 0; }