/* Callback that hands <sa> over to the spliced alignment collection stored in
   <data> (which must point to an SACollectionData). If the collection does
   not accept <sa>, it is deleted here. <outputfilename> and <err> are not
   used. Always returns 0. */
static int store_in_sa_collection(void *data, GthSA *sa,
                                  GT_UNUSED const char *outputfilename,
                                  GT_UNUSED GtError *err)
{
  SACollectionData *collection_data = (SACollectionData*) data;

  if (gth_sa_collection_insert_sa(collection_data->sa_collection, sa,
                                  collection_data->sa_filter,
                                  collection_data->stat)) {
    /* the insertion succeeded, nothing left to do */
    return 0;
  }

  /* the insertion did not succeed, so the alignment is discarded here */
  gth_sa_delete(sa);
  return 0;
}
/* Try to insert <sa> into <sa_collection> (using <sa_filter> and <stat>).
   On success <match_info>->significant_match_found is set to true.
   If the insertion fails, <sa> is deleted and <match_info>->call_number is
   decremented. */
static void save_sa(GthSACollection *sa_collection, GthSA *sa,
                    GthSAFilter *sa_filter, GthMatchInfo *match_info,
                    GthStat *stat)
{
  if (gth_sa_collection_insert_sa(sa_collection, sa, sa_filter, stat)) {
    /* successful insertion */
    match_info->significant_match_found = true;
    return;
  }

  /* unsuccessful insertion; discard sa and take back its call number */
  gth_sa_delete(sa);
  match_info->call_number--;
}
/* Callback that writes <sa> into one of several subset files and deletes it
   afterwards.

   <data> must point to a Store_in_subset_file_data. The alignment is first
   run through the filter stored there; if the filter rejects it, it is
   deleted and 0 is returned. Otherwise the subset file is selected from the
   alignment score or the coverage of <sa> (depending on the split mode). The
   file is created on first use, with a name derived from the basename of
   <outputfilename>, the split mode, the covered percentage range and the file
   mode suffix.

   Returns 0 on success. Returns -1 and sets <err> if the subset file exists
   already and overwriting was not forced. <sa> is deleted in every case. */
static int store_in_subset_file(void *data, GthSA *sa,
                                const char *outputfilename, GtError *err)
{
  Store_in_subset_file_data *store_in_subset_file_data =
    (Store_in_subset_file_data*) data;
  double split_determing_percentage = 0.0;
  unsigned long filenum;
  char filenamesuffix[4]; /* three characters plus terminating '\0' */
  int had_err = 0;

  gt_error_check(err);

  /* filter before we do any further processing */
  if (gth_sa_filter_filter_sa(store_in_subset_file_data->sa_filter, sa)) {
    /* and free it afterwards */
    gth_sa_delete(sa);
    /* discard */
    return 0;
  }

  /* check whether we got a new output file to process */
  if (!store_in_subset_file_data->current_outputfilename) {
    store_in_subset_file_data->current_outputfilename =
      gt_cstr_dup(outputfilename);
  }
  else if (strcmp(store_in_subset_file_data->current_outputfilename,
                  outputfilename)) {
    /* close current output files */
    close_output_files(store_in_subset_file_data);
    gt_free(store_in_subset_file_data->current_outputfilename);
    /* remember the name of the new output file; leaving the freed pointer in
       place would make the next call compare against (and later free)
       released memory */
    store_in_subset_file_data->current_outputfilename =
      gt_cstr_dup(outputfilename);
  }

  /* determine in which file the current sa needs to be put */
  switch (store_in_subset_file_data->gthsplitinfo->splitmode) {
    case ALIGNMENTSCORE_SPLIT:
      split_determing_percentage = gth_sa_score(sa);
      strcpy(filenamesuffix, "scr");
      break;
    case COVERAGE_SPLIT:
      split_determing_percentage = gth_sa_coverage(sa);
      strcpy(filenamesuffix, "cov");
      break;
    default: gt_assert(0);
  }
  gt_assert(split_determing_percentage >= 0.0);
  /* XXX: change into an assertion when coverage problem is fixed */
  if (split_determing_percentage > 1.0)
    split_determing_percentage = 1.0;

  /* a value of exactly 1.0 goes into the last file, everything else into the
     file whose percentage range contains it */
  if (split_determing_percentage == 1.0)
    filenum = store_in_subset_file_data->num_of_subset_files - 1;
  else {
    filenum = floor(split_determing_percentage * 100.0 /
                    store_in_subset_file_data->gthsplitinfo->range);
  }
  gt_assert(filenum < store_in_subset_file_data->num_of_subset_files);

  /* make sure the file exists and is open */
  if (!store_in_subset_file_data->subset_files[filenum]) {
    gt_assert(store_in_subset_file_data->subset_filenames[filenum] == NULL);

    /* build file name:
       <basename>.<suffix><lower bound>-<upper bound><file mode suffix> */
    store_in_subset_file_data->subset_filenames[filenum] = gt_str_new();
    gt_str_append_cstr_nt(store_in_subset_file_data->subset_filenames[filenum],
                          outputfilename,
                          gt_file_basename_length(outputfilename));
    gt_str_append_char(store_in_subset_file_data->subset_filenames[filenum],
                       '.');
    gt_str_append_cstr(store_in_subset_file_data->subset_filenames[filenum],
                       filenamesuffix);
    gt_str_append_ulong(store_in_subset_file_data->subset_filenames[filenum],
                        filenum *
                        store_in_subset_file_data->gthsplitinfo->range);
    gt_str_append_char(store_in_subset_file_data->subset_filenames[filenum],
                       '-');
    gt_str_append_ulong(store_in_subset_file_data->subset_filenames[filenum],
                        (filenum + 1) *
                        store_in_subset_file_data->gthsplitinfo->range);
    gt_str_append_cstr(store_in_subset_file_data->subset_filenames[filenum],
                       gt_file_mode_suffix(store_in_subset_file_data
                                           ->gthsplitinfo->file_mode));

    /* if not disabled by -force, check if file already exists */
    if (!store_in_subset_file_data->gthsplitinfo->force) {
      store_in_subset_file_data->subset_files[filenum] =
        gt_file_open(store_in_subset_file_data->gthsplitinfo->file_mode,
                     gt_str_get(store_in_subset_file_data
                                ->subset_filenames[filenum]), "r", NULL);
      if (store_in_subset_file_data->subset_files[filenum]) {
        gt_error_set(err, "file \"%s\" exists already. use option -%s to "
                     "overwrite",
                     gt_str_get(store_in_subset_file_data
                                ->subset_filenames[filenum]),
                     GT_FORCE_OPT_CSTR);
        had_err = -1;
      }
    }

    if (!had_err) {
      /* open split file for writing */
      store_in_subset_file_data->subset_files[filenum] =
        gt_file_xopen_file_mode(store_in_subset_file_data->gthsplitinfo
                                ->file_mode,
                                gt_str_get(store_in_subset_file_data
                                           ->subset_filenames[filenum]), "w");
      /* store XML header in file */
      gth_xml_show_leader(true,
                          store_in_subset_file_data->subset_files[filenum]);
    }
  }

  /* put it there */
  if (!had_err) {
    gth_xml_inter_sa_visitor_set_outfp(store_in_subset_file_data->sa_visitor,
                                       store_in_subset_file_data
                                       ->subset_files[filenum]);
    gth_sa_visitor_visit_sa(store_in_subset_file_data->sa_visitor, sa);
  }

  /* adjust counter */
  if (!had_err)
    store_in_subset_file_data->subset_file_sa_counter[filenum]++;

  /* and free it afterwards */
  gth_sa_delete(sa);

  return had_err;
}
static int calc_spliced_alignments(GthSACollection *sa_collection, GthChainCollection *chain_collection, GthCallInfo *call_info, GthInput *input, GthStat *stat, GtUword gen_file_num, GtUword ref_file_num, bool directmatches, GthMatchInfo *match_info, GthDNACompletePathMatrixJT dna_complete_path_matrix_jt, GthProteinCompletePathMatrixJT protein_complete_path_matrix_jt) { const unsigned char *ref_seq_tran, *ref_seq_orig, *ref_seq_tran_rc = NULL, *ref_seq_orig_rc = NULL; GtUword chainctr, gen_offset = GT_UNDEF_UWORD, gen_total_length, ref_total_length; GtFile *outfp = call_info->out->outfp; GtRange gen_seq_bounds, gen_seq_bounds_rc; bool refseqisdna; GthChain *chain; GtRange range; GthSA *saA; int rval; gt_assert(sa_collection && chain_collection); refseqisdna = gth_input_ref_file_is_dna(input, ref_file_num); for (chainctr = 0; chainctr < gth_chain_collection_size(chain_collection); chainctr++) { chain = gth_chain_collection_get(chain_collection, chainctr); if (++match_info->call_number > call_info->firstalshown && call_info->firstalshown > 0) { if (!(call_info->out->xmlout || call_info->out->gff3out)) gt_file_xfputc('\n', outfp); else if (call_info->out->xmlout) gt_file_xprintf(outfp, "<!--\n"); if (!call_info->out->gff3out) { gt_file_xprintf(outfp, "Maximal matching %s count (%u) reached.\n", refseqisdna ? 
"EST" : "protein", call_info->firstalshown); gt_file_xprintf(outfp, "Only the first %u matches will be " "displayed.\n", call_info->firstalshown); } if (!(call_info->out->xmlout || call_info->out->gff3out)) gt_file_xfputc('\n', outfp); else if (call_info->out->xmlout) gt_file_xprintf(outfp, "-->\n"); match_info->max_call_number_reached = true; break; /* break out of loop */ } /* compute considered genomic regions if not set by -frompos */ if (!gth_input_use_substring_spec(input)) { gen_seq_bounds = gth_input_get_genomic_range(input, chain->gen_file_num, chain->gen_seq_num); gen_total_length = gt_range_length(&gen_seq_bounds); gen_offset = gen_seq_bounds.start; gen_seq_bounds_rc = gen_seq_bounds; } else { /* genomic multiseq contains exactly one sequence */ gt_assert(gth_input_num_of_gen_seqs(input, chain->gen_file_num) == 1); gen_total_length = gth_input_genomic_file_total_length(input, chain ->gen_file_num); gen_seq_bounds.start = gth_input_genomic_substring_from(input); gen_seq_bounds.end = gth_input_genomic_substring_to(input); gen_offset = 0; gen_seq_bounds_rc.start = gen_total_length - 1 - gen_seq_bounds.end; gen_seq_bounds_rc.end = gen_total_length - 1 - gen_seq_bounds.start; } /* "retrieving" the reference sequence */ range = gth_input_get_reference_range(input, chain->ref_file_num, chain->ref_seq_num); ref_seq_tran = gth_input_current_ref_seq_tran(input) + range.start; ref_seq_orig = gth_input_current_ref_seq_orig(input) + range.start; if (refseqisdna) { ref_seq_tran_rc = gth_input_current_ref_seq_tran_rc(input) + range.start; ref_seq_orig_rc = gth_input_current_ref_seq_orig_rc(input) + range.start; } ref_total_length = range.end - range.start + 1; /* check if protein sequences have a stop amino acid */ if (!refseqisdna && !match_info->stop_amino_acid_warning && ref_seq_orig[ref_total_length - 1] != GT_STOP_AMINO) { GtStr *ref_id = gt_str_new(); gth_input_save_ref_id(input, ref_id, chain->ref_file_num, chain->ref_seq_num); gt_warning("protein sequence '%s' 
(#" GT_WU " in file %s) does not end " "with a stop amino acid ('%c'). If it is not a protein " "fragment you should add a stop amino acid to improve the " "prediction. For example with `gt seqtransform " "-addstopaminos` (see http://genometools.org for details).", gt_str_get(ref_id), chain->ref_seq_num, gth_input_get_reference_filename(input, chain->ref_file_num), GT_STOP_AMINO); match_info->stop_amino_acid_warning = true; gt_str_delete(ref_id); } /* allocating space for alignment */ saA = gth_sa_new_and_set(directmatches, true, input, chain->gen_file_num, chain->gen_seq_num, chain->ref_file_num, chain->ref_seq_num, match_info->call_number, gen_total_length, gen_offset, ref_total_length); /* extend the DP borders to the left and to the right */ gth_chain_extend_borders(chain, &gen_seq_bounds, &gen_seq_bounds_rc, gen_total_length, gen_offset); /* From here on the dp positions always refer to the forward strand of the genomic DNA. */ /* call the Dynamic Programming */ if (refseqisdna) { rval = call_dna_DP(directmatches, call_info, input, stat, sa_collection, saA, gen_file_num, ref_file_num, gen_total_length, gen_offset, &gen_seq_bounds, &gen_seq_bounds_rc, ref_total_length, range.start, chainctr, gth_chain_collection_size(chain_collection), match_info, ref_seq_tran, ref_seq_orig, ref_seq_tran_rc, ref_seq_orig_rc, chain, dna_complete_path_matrix_jt, protein_complete_path_matrix_jt); } else { rval = call_protein_DP(directmatches, call_info, input, stat, sa_collection, saA, gen_file_num, ref_file_num, gen_total_length, gen_offset, &gen_seq_bounds, &gen_seq_bounds_rc, ref_total_length, range.start, chainctr, gth_chain_collection_size(chain_collection), match_info, ref_seq_tran, ref_seq_orig, chain, dna_complete_path_matrix_jt, protein_complete_path_matrix_jt); } /* check return value */ if (rval == GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED) { /* statistics bookkeeping */ gth_stat_increment_numoffailedDPparameterallocations(stat); 
gth_stat_increment_numofundeterminedSAs(stat); /* free space */ gth_sa_delete(saA); match_info->call_number--; continue; /* continue with the next DP range */ } else if (rval) return -1; } if (!call_info->out->xmlout && !call_info->out->gff3out && !directmatches && !match_info->significant_match_found && match_info->call_number <= call_info->firstalshown) { show_no_match_line(gth_input_get_alphatype(input, ref_file_num), outfp); } return 0; }
static int call_protein_DP(bool directmatches, GthCallInfo *call_info, GthInput *input, GthStat *stat, GthSACollection *sa_collection, GthSA *saA, GtUword gen_file_num, GtUword ref_file_num, GtUword gen_total_length, GtUword gen_offset, const GtRange *gen_seq_bounds, const GtRange *gen_seq_bounds_rc, GtUword ref_total_length, GtUword ref_offset, GtUword chainctr, GtUword num_of_chains, GthMatchInfo *match_info, const unsigned char *ref_seq_tran, const unsigned char *ref_seq_orig, GthChain *chain, GthDNACompletePathMatrixJT dna_complete_path_matrix_jt, GthProteinCompletePathMatrixJT protein_complete_path_matrix_jt) { GtFile *outfp = call_info->out->outfp; int rval; #ifndef NDEBUG /* strand is in searchmode */ if (directmatches) gt_assert(gth_input_forward(input)); else gt_assert(gth_input_reverse(input)); #endif /* calculate alignment */ rval = callsahmt(false, saA, directmatches, gen_file_num, ref_file_num, chain, gen_total_length, gen_offset, gen_seq_bounds, gen_seq_bounds_rc, ref_seq_tran, ref_seq_orig, ref_total_length, ref_offset, input, &call_info->simfilterparam.introncutoutinfo, stat, chainctr, num_of_chains, call_info->translationtable, directmatches, call_info->proteinexonpenal, call_info->splice_site_model, call_info->dp_options_core, call_info->dp_options_est, call_info->dp_options_postpro, dna_complete_path_matrix_jt, protein_complete_path_matrix_jt, call_info->out); if (rval && rval != GTH_ERROR_SA_COULD_NOT_BE_DETERMINED) { /* ^ this error is treated below */ return rval; } if (rval == GTH_ERROR_SA_COULD_NOT_BE_DETERMINED || isunsuccessfulalignment(saA, call_info->out->comments, outfp)) { match_info->call_number--; /* if the spliced alignment was unsuccessful, it is deleted and the next hit is considered. */ gth_sa_delete(saA); /* continue */ return 0; } /* we can save the alignment now */ save_sa(sa_collection, saA, call_info->sa_filter, match_info, stat); return 0; }
static int call_dna_DP(bool directmatches, GthCallInfo *call_info, GthInput *input, GthStat *stat, GthSACollection *sa_collection, GthSA *saA, GtUword gen_file_num, GtUword ref_file_num, GtUword gen_total_length, GtUword gen_offset, const GtRange *gen_seq_bounds, const GtRange *gen_seq_bounds_rc, GtUword ref_total_length, GtUword ref_offset, GtUword chainctr, GtUword num_of_chains, GthMatchInfo *match_info, const unsigned char *ref_seq_tran, const unsigned char *ref_seq_orig, const unsigned char *ref_seq_tran_rc, const unsigned char *ref_seq_orig_rc, GthChain *chain, GthDNACompletePathMatrixJT dna_complete_path_matrix_jt, GthProteinCompletePathMatrixJT protein_complete_path_matrix_jt) { int rval; bool bothstrandsanalyzed, firstdp = true, GT_UNUSED gs2outdirectmatches = directmatches; GthSA *saB = NULL; GtFile *outfp = call_info->out->outfp; if (directmatches ? gth_input_forward(input) : gth_input_reverse(input)) { /* calculate alignment */ rval = callsahmt(true, saA, directmatches, gen_file_num, ref_file_num, chain, gen_total_length, gen_offset, gen_seq_bounds, gen_seq_bounds_rc, ref_seq_tran, ref_seq_orig, ref_total_length, ref_offset, input, &call_info->simfilterparam.introncutoutinfo, stat, chainctr, num_of_chains, call_info->translationtable, directmatches, call_info->proteinexonpenal, call_info->splice_site_model, call_info->dp_options_core, call_info->dp_options_est, call_info->dp_options_postpro, dna_complete_path_matrix_jt, protein_complete_path_matrix_jt, call_info->out); if (rval && rval != GTH_ERROR_SA_COULD_NOT_BE_DETERMINED) { /* ^ this error is treated below */ return rval; } firstdp = false; bothstrandsanalyzed = gth_input_both(input); if (rval == GTH_ERROR_SA_COULD_NOT_BE_DETERMINED || isunsuccessfulalignment(saA, call_info->out->comments, outfp)) { match_info->call_number--; /* if the spliced alignment was unsuccessful, it is deleted and the next hit is considered. 
*/ gth_sa_delete(saA); return 0; /* continue */ } /* if not both strands are analyzed, we can save this alignment now. Otherwise we have to calculate the alignment to the other strand first and then save the better one. */ if (!bothstrandsanalyzed) save_sa(sa_collection, saA, call_info->sa_filter, match_info, stat); } if (directmatches ? gth_input_reverse(input) : gth_input_forward(input)) { if ((firstdp || gth_sa_is_poor(saA, call_info->minaveragessp)) && !call_info->cdnaforwardonly) { if (firstdp) { /* space for first alignment is already allocated, bu we have to change the direction of the genomic and the reference strand */ gth_sa_set_gen_strand(saA, !directmatches); gth_sa_set_ref_strand(saA, false); } else { /* allocating space for second alignment */ saB = gth_sa_new_and_set(!directmatches, false, input, chain->gen_file_num, chain->gen_seq_num, chain->ref_file_num, chain->ref_seq_num, match_info->call_number, gen_total_length, gen_offset, ref_total_length); } /* setting gs2outdirectmatches (for compatibility) */ gs2outdirectmatches = (bool) !directmatches; /* calculate alignment */ rval = callsahmt(true, firstdp ? saA : saB, !directmatches, gen_file_num, ref_file_num, chain, gen_total_length, gen_offset, gen_seq_bounds, gen_seq_bounds_rc, ref_seq_tran_rc, ref_seq_orig_rc, ref_total_length, ref_offset, input, &call_info->simfilterparam.introncutoutinfo, stat, chainctr, num_of_chains, call_info->translationtable, directmatches, call_info->proteinexonpenal, call_info->splice_site_model, call_info->dp_options_core, call_info->dp_options_est, call_info->dp_options_postpro, dna_complete_path_matrix_jt, protein_complete_path_matrix_jt, call_info->out); if (rval && rval != GTH_ERROR_SA_COULD_NOT_BE_DETERMINED) { /* ^ this error is treated below */ return rval; } if (firstdp) { if (rval == GTH_ERROR_SA_COULD_NOT_BE_DETERMINED || isunsuccessfulalignment(saA, call_info->out->comments, outfp)) { /* for compatibility with GS2 */ /* XXX: makes no sense. 
Possibly only if -gs2out is used. */ match_info->significant_match_found= true; /* if the spliced alignment was unsuccessful, it is deleted and the next hit is considered. */ gth_sa_delete(saA); return 0; /* continue */ } save_sa(sa_collection, saA, call_info->sa_filter, match_info, stat); } else /* !firstdp */ { if (rval == GTH_ERROR_SA_COULD_NOT_BE_DETERMINED || isunsuccessfulalignment(saB, call_info->out->comments, outfp) || !gth_sa_B_is_better_than_A(saA, saB)) { /* insert first SA */ save_sa(sa_collection, saA, call_info->sa_filter, match_info, stat); /* discard second SA */ gth_sa_delete(saB); } else { /* insert second SA */ save_sa(sa_collection, saB, call_info->sa_filter, match_info, stat); /* free first SA */ gth_sa_delete(saA); } } } else save_sa(sa_collection, saA, call_info->sa_filter, match_info, stat); } return 0; }