/*
  Create a new spliced alignment with gth_sa_new() and fill in everything
  that is already known before the alignment itself is computed:
  strand directions, consecutive call number, genomic total length and
  offset, reference total length, file/sequence numbers, and the sequence
  ids, MD5s and descriptions obtained from <input>.

  The reference total length is additionally handed to the backtrace path
  of the new alignment as its reference DP length.

  Returns the newly created alignment object.
*/
GthSA* gth_sa_new_and_set(bool gen_strand_forward,
                          bool ref_strand_forward,
                          GthInput *input,
                          GtUword gen_file_num,
                          GtUword gen_seq_num,
                          GtUword ref_file_num,
                          GtUword ref_seq_num,
                          GtUword call_number,
                          GtUword gen_total_length,
                          GtUword gen_offset,
                          GtUword ref_total_length)
{
  /* allocation and initialization of the contained arrays */
  GthSA *alignment = gth_sa_new();

  /* plain value members: strands, call number, lengths and offset */
  alignment->gen_strand_forward = gen_strand_forward;
  alignment->ref_strand_forward = ref_strand_forward;
  alignment->call_number = call_number;
  alignment->gen_total_length = gen_total_length;
  alignment->gen_offset = gen_offset;
  alignment->ref_total_length = ref_total_length;

  /* remember which file/sequence pair this alignment belongs to */
  alignment->gen_file_num = gen_file_num;
  alignment->gen_seq_num = gen_seq_num;
  alignment->ref_file_num = ref_file_num;
  alignment->ref_seq_num = ref_seq_num;

  /* the backtrace path needs the reference length as well */
  gth_backtrace_path_set_ref_dp_length(alignment->backtrace_path,
                                       ref_total_length);

  /* sequence ids */
  gth_input_save_gen_id(input, alignment->gen_id, gen_file_num, gen_seq_num);
  gth_input_save_ref_id(input, alignment->ref_id, ref_file_num, ref_seq_num);

  /* MD5s, if necessary */
  gth_input_save_gen_md5(input, &alignment->gen_md5, gen_file_num,
                         gen_seq_num);
  gth_input_save_ref_md5(input, &alignment->ref_md5, ref_file_num,
                         ref_seq_num);

  /* descriptions, if necessary */
  gth_input_save_gen_desc(input, &alignment->gen_desc, gen_file_num,
                          gen_seq_num);
  gth_input_save_ref_desc(input, &alignment->ref_desc, ref_file_num,
                          ref_seq_num);

  return alignment;
}
static int calc_spliced_alignments(GthSACollection *sa_collection, GthChainCollection *chain_collection, GthCallInfo *call_info, GthInput *input, GthStat *stat, GtUword gen_file_num, GtUword ref_file_num, bool directmatches, GthMatchInfo *match_info, GthDNACompletePathMatrixJT dna_complete_path_matrix_jt, GthProteinCompletePathMatrixJT protein_complete_path_matrix_jt) { const unsigned char *ref_seq_tran, *ref_seq_orig, *ref_seq_tran_rc = NULL, *ref_seq_orig_rc = NULL; GtUword chainctr, gen_offset = GT_UNDEF_UWORD, gen_total_length, ref_total_length; GtFile *outfp = call_info->out->outfp; GtRange gen_seq_bounds, gen_seq_bounds_rc; bool refseqisdna; GthChain *chain; GtRange range; GthSA *saA; int rval; gt_assert(sa_collection && chain_collection); refseqisdna = gth_input_ref_file_is_dna(input, ref_file_num); for (chainctr = 0; chainctr < gth_chain_collection_size(chain_collection); chainctr++) { chain = gth_chain_collection_get(chain_collection, chainctr); if (++match_info->call_number > call_info->firstalshown && call_info->firstalshown > 0) { if (!(call_info->out->xmlout || call_info->out->gff3out)) gt_file_xfputc('\n', outfp); else if (call_info->out->xmlout) gt_file_xprintf(outfp, "<!--\n"); if (!call_info->out->gff3out) { gt_file_xprintf(outfp, "Maximal matching %s count (%u) reached.\n", refseqisdna ? 
"EST" : "protein", call_info->firstalshown); gt_file_xprintf(outfp, "Only the first %u matches will be " "displayed.\n", call_info->firstalshown); } if (!(call_info->out->xmlout || call_info->out->gff3out)) gt_file_xfputc('\n', outfp); else if (call_info->out->xmlout) gt_file_xprintf(outfp, "-->\n"); match_info->max_call_number_reached = true; break; /* break out of loop */ } /* compute considered genomic regions if not set by -frompos */ if (!gth_input_use_substring_spec(input)) { gen_seq_bounds = gth_input_get_genomic_range(input, chain->gen_file_num, chain->gen_seq_num); gen_total_length = gt_range_length(&gen_seq_bounds); gen_offset = gen_seq_bounds.start; gen_seq_bounds_rc = gen_seq_bounds; } else { /* genomic multiseq contains exactly one sequence */ gt_assert(gth_input_num_of_gen_seqs(input, chain->gen_file_num) == 1); gen_total_length = gth_input_genomic_file_total_length(input, chain ->gen_file_num); gen_seq_bounds.start = gth_input_genomic_substring_from(input); gen_seq_bounds.end = gth_input_genomic_substring_to(input); gen_offset = 0; gen_seq_bounds_rc.start = gen_total_length - 1 - gen_seq_bounds.end; gen_seq_bounds_rc.end = gen_total_length - 1 - gen_seq_bounds.start; } /* "retrieving" the reference sequence */ range = gth_input_get_reference_range(input, chain->ref_file_num, chain->ref_seq_num); ref_seq_tran = gth_input_current_ref_seq_tran(input) + range.start; ref_seq_orig = gth_input_current_ref_seq_orig(input) + range.start; if (refseqisdna) { ref_seq_tran_rc = gth_input_current_ref_seq_tran_rc(input) + range.start; ref_seq_orig_rc = gth_input_current_ref_seq_orig_rc(input) + range.start; } ref_total_length = range.end - range.start + 1; /* check if protein sequences have a stop amino acid */ if (!refseqisdna && !match_info->stop_amino_acid_warning && ref_seq_orig[ref_total_length - 1] != GT_STOP_AMINO) { GtStr *ref_id = gt_str_new(); gth_input_save_ref_id(input, ref_id, chain->ref_file_num, chain->ref_seq_num); gt_warning("protein sequence '%s' 
(#" GT_WU " in file %s) does not end " "with a stop amino acid ('%c'). If it is not a protein " "fragment you should add a stop amino acid to improve the " "prediction. For example with `gt seqtransform " "-addstopaminos` (see http://genometools.org for details).", gt_str_get(ref_id), chain->ref_seq_num, gth_input_get_reference_filename(input, chain->ref_file_num), GT_STOP_AMINO); match_info->stop_amino_acid_warning = true; gt_str_delete(ref_id); } /* allocating space for alignment */ saA = gth_sa_new_and_set(directmatches, true, input, chain->gen_file_num, chain->gen_seq_num, chain->ref_file_num, chain->ref_seq_num, match_info->call_number, gen_total_length, gen_offset, ref_total_length); /* extend the DP borders to the left and to the right */ gth_chain_extend_borders(chain, &gen_seq_bounds, &gen_seq_bounds_rc, gen_total_length, gen_offset); /* From here on the dp positions always refer to the forward strand of the genomic DNA. */ /* call the Dynamic Programming */ if (refseqisdna) { rval = call_dna_DP(directmatches, call_info, input, stat, sa_collection, saA, gen_file_num, ref_file_num, gen_total_length, gen_offset, &gen_seq_bounds, &gen_seq_bounds_rc, ref_total_length, range.start, chainctr, gth_chain_collection_size(chain_collection), match_info, ref_seq_tran, ref_seq_orig, ref_seq_tran_rc, ref_seq_orig_rc, chain, dna_complete_path_matrix_jt, protein_complete_path_matrix_jt); } else { rval = call_protein_DP(directmatches, call_info, input, stat, sa_collection, saA, gen_file_num, ref_file_num, gen_total_length, gen_offset, &gen_seq_bounds, &gen_seq_bounds_rc, ref_total_length, range.start, chainctr, gth_chain_collection_size(chain_collection), match_info, ref_seq_tran, ref_seq_orig, chain, dna_complete_path_matrix_jt, protein_complete_path_matrix_jt); } /* check return value */ if (rval == GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED) { /* statistics bookkeeping */ gth_stat_increment_numoffailedDPparameterallocations(stat); 
gth_stat_increment_numofundeterminedSAs(stat); /* free space */ gth_sa_delete(saA); match_info->call_number--; continue; /* continue with the next DP range */ } else if (rval) return -1; } if (!call_info->out->xmlout && !call_info->out->gff3out && !directmatches && !match_info->significant_match_found && match_info->call_number <= call_info->firstalshown) { show_no_match_line(gth_input_get_alphatype(input, ref_file_num), outfp); } return 0; }