/* Post-process the potential introns stored (as GtRange elements) in
   <intronstoprocess>.

   Every potential intron whose length is at least
   2 * <icdelta> + <icminremintronlength> is kept, shrunk by <icdelta> on both
   sides (such an intron is cut out later). All shorter potential introns are
   dropped from the array (they are not cut out later). The relative order of
   the kept introns is preserved. */
static void potentialintronspostpro(GtArray *intronstoprocess,
                                    unsigned long icdelta,
                                    unsigned long icminremintronlength)
{
  const unsigned long required_length = 2 * icdelta + icminremintronlength;
  unsigned long idx, num_of_candidates;
  GtArray *candidates;

  /* move all (potential) introns into a scratch array and start over with an
     empty result array */
  candidates = gt_array_new(sizeof (GtRange));
  gt_array_add_array(candidates, intronstoprocess);
  gt_array_set_size(intronstoprocess, 0);

  num_of_candidates = gt_array_size(candidates);
  for (idx = 0; idx < num_of_candidates; idx++) {
    GtRange trimmed = *(GtRange*) gt_array_get(candidates, idx);

    /* too short: skip it, that is, this intron is not cut out later */
    if (trimmed.end - trimmed.start + 1 < required_length)
      continue;

    /* long enough: keep it, minus the delta on both borders */
    trimmed.start += icdelta;
    trimmed.end -= icdelta;
    gt_array_add(intronstoprocess, trimmed);
  }

  gt_array_delete(candidates);
}
/* Apply the pending start cutoffs of <bp> and clear them afterwards.

   - a genomic cutoff moves the genomic DP start forward and shortens the
     genomic DP length by the same amount
   - a reference cutoff does the same for the reference DP range
   - an edit operation cutoff removes that many elements from the end of the
     edit operation array

   All four DP range values of <bp> must be defined when this is called. */
void gth_backtrace_path_cutoff_start(GthBacktracePath *bp)
{
  gt_assert(bp);
  gt_assert(bp->gen_dp_start != GT_UNDEF_ULONG);
  gt_assert(bp->gen_dp_length != GT_UNDEF_ULONG);
  gt_assert(bp->ref_dp_start != GT_UNDEF_ULONG);
  gt_assert(bp->ref_dp_length != GT_UNDEF_ULONG);

  /* genomic DP range */
  if (bp->cutoffs.start.genomiccutoff != 0) {
    gt_assert(bp->gen_dp_length >= bp->cutoffs.start.genomiccutoff);
    bp->gen_dp_length -= bp->cutoffs.start.genomiccutoff;
    bp->gen_dp_start += bp->cutoffs.start.genomiccutoff;
    bp->cutoffs.start.genomiccutoff = 0;
  }

  /* reference DP range */
  if (bp->cutoffs.start.referencecutoff != 0) {
    gt_assert(bp->ref_dp_length >= bp->cutoffs.start.referencecutoff);
    bp->ref_dp_length -= bp->cutoffs.start.referencecutoff;
    bp->ref_dp_start += bp->cutoffs.start.referencecutoff;
    bp->cutoffs.start.referencecutoff = 0;
  }

  /* edit operations: shrink the array by the number of cut off eops */
  if (bp->cutoffs.start.eopcutoff != 0) {
    gt_array_set_size(bp->editoperations,
                      gt_array_size(bp->editoperations) -
                      bp->cutoffs.start.eopcutoff);
    bp->cutoffs.start.eopcutoff = 0;
  }
}
/* Reset <bp>: mark its alphabet type and dummy index as undefined and empty
   its edit operation array (the array itself is kept, only its size is set
   to 0). No other member of <bp> is touched here. */
void gth_backtrace_path_reset(GthBacktracePath *bp)
{
  gt_assert(bp);

  bp->dummy_index = GT_UNDEF_ULONG;
  bp->alphatype = UNDEF_ALPHA;
  gt_array_set_size(bp->editoperations, 0);
}
/* Shorten the potential introns of <chain>.

   The chain is converted into an inverted chain, the forward ranges of the
   inverted chain are post-processed with potentialintronspostpro() (using
   <icdelta> and <icminremintronlength>), and the result is converted back
   into <chain>, whose forward and reverse ranges are rebuilt from scratch.
   <gen_total_length> and <gen_offset> are handed to the conversion routines.

   If <comments> is true, the forward DP ranges are shown on <outfp> before
   and after the post-processing. */
void gth_chain_shorten_introns(GthChain *chain, unsigned long icdelta,
                               unsigned long icminremintronlength,
                               unsigned long gen_total_length,
                               unsigned long gen_offset, bool comments,
                               GtFile *outfp)
{
  GthInvertedChain inverted;

  gt_assert(chain);

  inverted_chain_init(&inverted);

  if (comments) {
    gt_file_xprintf(outfp, "%c forward DP ranges (before post processing of "
                           "potential introns):\n", COMMENTCHAR);
    gt_file_xprintf(outfp, "%c ", COMMENTCHAR);
    gt_ranges_show(chain->forwardranges, outfp);
  }

  /* chain -> inverted chain (checked in assertion-enabled builds) */
  convert_chain_to_inverted_chain(&inverted, chain);
  gt_assert(conversion_is_correct(chain, &inverted, gen_total_length,
                                  gen_offset));

  /* the actual work: filter and shrink the potential introns */
  potentialintronspostpro(inverted.forwardranges, icdelta,
                          icminremintronlength);

  /* empty the chain, then refill it: inverted chain -> chain */
  gt_array_set_size(chain->forwardranges, 0);
  gt_array_set_size(chain->reverseranges, 0);
  convert_inverted_chain_to_chain(chain, &inverted, gen_total_length,
                                  gen_offset);

  if (comments) {
    gt_file_xprintf(outfp,"%c forward DP ranges (after post processing of "
                          "potential introns):\n" , COMMENTCHAR);
    gt_file_xprintf(outfp, "%c ", COMMENTCHAR);
    gt_ranges_show(chain->forwardranges, outfp);
  }

  inverted_chain_free(&inverted);
}
/* Cut off from <bp> the part of the path which the path walker <pw> has
   already walked.

   Only forward path walkers are handled; the reverse case is not implemented
   and trips an assertion. For a forward walker:
   - the completely walked edit operations are removed from the end of the
     edit operation array,
   - if the walker stopped inside an edit operation, the length of the (new)
     last edit operation is reduced by the number of steps taken in it,
   - the genomic and reference DP ranges are advanced/shortened by the
     distances the walker has covered.

   If <showeops> is true, the path walker and the backtrace path (before and
   after the removal of complete edit operations) are shown on <outfp>. */
void gth_backtrace_path_cutoff_walked_path(GthBacktracePath *bp,
                                           const GthPathWalker *pw,
                                           bool showeops, GtFile *outfp)
{
  unsigned int eop_length;
  Editoperation *last_eop;

  gt_assert(bp && pw);

  if (!gth_path_walker_is_forward(pw)) {
    gt_assert(0); /* XXX: implement reverse case */
    gt_assert(!backtrace_path_end_cutoffs_are_set(bp));
    return;
  }

  gt_assert(!backtrace_path_start_cutoffs_are_set(bp));

  if (showeops) {
    gt_file_xprintf(outfp, "%s(): show path walker\n", __func__);
    gth_path_walker_show(pw, outfp);
    gt_file_xprintf(outfp, "%s(): show backtrace path (before eop "
                    "removal)\n", __func__);
    gth_backtrace_path_show(bp, false, 0, outfp);
  }

  /* drop the edit operations which have been walked completely */
  gt_array_set_size(bp->editoperations,
                    gt_array_size(bp->editoperations) -
                    gth_path_walker_actual_eops(pw));

  if (showeops) {
    gt_file_xprintf(outfp, "%s(): show backtrace path (after eop "
                    "removal)\n", __func__);
    gth_backtrace_path_show(bp, false, 0, outfp);
  }

  /* the walker stopped inside an edit operation: shorten that one */
  if (gth_path_walker_steps_in_current_eop(pw)) {
    last_eop = gt_array_get_last(bp->editoperations);
    eop_length = gt_editoperation_length(*last_eop,
                                         bp->alphatype == PROTEIN_ALPHA);
    gt_assert(eop_length > gth_path_walker_steps_in_current_eop(pw));
    gt_editoperation_set_length(last_eop,
                                eop_length -
                                gth_path_walker_steps_in_current_eop(pw),
                                bp->alphatype == PROTEIN_ALPHA);
  }

  /* move the DP ranges behind the walked part */
  bp->gen_dp_start += gth_path_walker_gen_distance(pw);
  bp->gen_dp_length -= gth_path_walker_gen_distance(pw);
  bp->ref_dp_start += gth_path_walker_ref_distance(pw);
  bp->ref_dp_length -= gth_path_walker_ref_distance(pw);
}
/* Rewrite <editoperations> (an array of Editoperation values) such that the
   first match edit operation which follows (in array order) an intron with
   1 or 2 bases left is split into a match of length 1 plus a match holding
   the remaining length.

   The search for a match to split is given up as soon as a mismatch or a
   mismatch with 1 gap is encountered after the intron. All other edit
   operations are copied unchanged. If the array contains no such intron
   (other than possibly as its very last element), it is left untouched.

   NOTE(review): a match of length n is written here as the plain value n
   (``eop = 1'', ``eop = eoplength - 1''); this presumably mirrors the
   Editoperation encoding of matches -- confirm against the Editoperation
   definition. */
static void ensure_eop_of_len_1_before_introns(GtArray *editoperations)
{
  Editoperation eop, *eopptr;
  Eoptype eoptype;
  unsigned long eoplength;
  GtArray *backup;
  bool processing_necessary = false,
       split_match = false;

  /* nothing to do for an empty array; returning early also avoids computing
     ``space + 0 - 1'' (a pointer before the array start) in the loop below */
  if (gt_array_size(editoperations) == 0)
    return;

  /* check if processing is necessary
     the check is rather simple, it might be possible that
     ``processing_necessary'' is set to ``true'' whereas in fact no processing
     is necessary; the last element is not inspected, because nothing follows
     it that could be split */
  for (eopptr = gt_array_get_space(editoperations);
       eopptr < (Editoperation*) gt_array_get_space(editoperations) +
                                 gt_array_size(editoperations) - 1;
       eopptr++) {
    if ((eoptype = gt_editoperation_type(*eopptr, true)) ==
        EOP_TYPE_INTRON_WITH_1_BASE_LEFT ||
        eoptype == EOP_TYPE_INTRON_WITH_2_BASES_LEFT) {
      processing_necessary = true;
      break;
    }
  }

  if (processing_necessary) {
    /* init backup for the editoperations */
    backup = gt_array_new(sizeof (Editoperation));

    /* fill backup */
    gt_array_add_array(backup, editoperations);

    /* reset the original edit operations */
    gt_array_set_size(editoperations, 0);

    /* process the backup and fill the original editoperations */
    for (eopptr = gt_array_get_space(backup);
         eopptr < (Editoperation*) gt_array_get_space(backup) +
                                   gt_array_size(backup);
         eopptr++) {
      /* classify by the TYPE of the edit operation (not its length), exactly
         as the detection loop above does */
      if ((eoptype = gt_editoperation_type(*eopptr, true)) ==
          EOP_TYPE_INTRON_WITH_1_BASE_LEFT ||
          eoptype == EOP_TYPE_INTRON_WITH_2_BASES_LEFT) {
        /* from now on, look for the next match to split */
        split_match = true;
      }
      else if (split_match) {
        if (eoptype == EOP_TYPE_MATCH) {
          split_match = false;
          if ((eoplength = gt_editoperation_length(*eopptr, true)) > 1) {
            /* replace the match by a match of length 1 followed by a match
               of the remaining length */
            eop = 1;
            gt_array_add(editoperations, eop);
            eop = eoplength - 1;
            gt_array_add(editoperations, eop);
            continue;
          }
        }
        else if (eoptype == EOP_TYPE_MISMATCH ||
                 eoptype == EOP_TYPE_MISMATCH_WITH_1_GAP) {
          /* no match directly available, give up splitting for this intron */
          split_match = false;
        }
      }
      /* default: copy the edit operation unchanged */
      gt_array_add(editoperations, *eopptr);
    }

    /* free backup */
    gt_array_delete(backup);
  }
}
static int callsahmt(bool call_dna_dp, GthSA *sa, bool forward, GtUword gen_file_num, GtUword ref_file_num, GthChain *raw_chain, GtUword gen_total_length, GtUword gen_offset, const GtRange *gen_seq_bounds, const GtRange *gen_seq_bounds_rc, const unsigned char *ref_seq_tran, const unsigned char *ref_seq_orig, GtUword ref_total_length, GtUword ref_offset, GthInput *input, Introncutoutinfo *introncutoutinfo, GthStat *stat, GtUword chainctr, GtUword num_of_chains, GtUword translationtable, bool directmatches, bool proteinexonpenal, GthSpliceSiteModel *splice_site_model, GthDPOptionsCore *dp_options_core, GthDPOptionsEST *dp_options_est, GthDPOptionsPostpro *dp_options_postpro, GthDNACompletePathMatrixJT dna_complete_path_matrix_jt, GthProteinCompletePathMatrixJT protein_complete_path_matrix_jt, GthOutput *out) { int rval; GthChain *actual_chain, *contracted_chain, *used_chain; GtUword icdelta = introncutoutinfo->icinitialdelta, iciterations = introncutoutinfo->iciterations; bool useintroncutout = introncutoutinfo->introncutout; /* initially useintron is set to the value of introncutoutinfo->introncutout, if the automatic intron cutotu technique is acitvated it can be set to true if an matrix allocation error (ERROR_MATRIX_ALLOCATION_FAILED) occurs */ gt_assert(sa); actual_chain = gth_chain_new(); contracted_chain = gth_chain_new(); for (;;) { /* reset actualDPrange; */ gt_array_set_size(actual_chain->forwardranges, 0); gt_array_set_size(actual_chain->reverseranges, 0); /* copy raw chain to actual chain */ gth_chain_copy(actual_chain, raw_chain); /* shorten potential introns and compute spliced sequence, if the intron cutout technique is used */ if (useintroncutout) { /* shorten potential introns */ gth_chain_shorten_introns(actual_chain, icdelta, introncutoutinfo->icminremintronlength, gen_total_length, gen_offset, out->comments, out->outfp); } else gth_chain_contract(contracted_chain, actual_chain); if (out->showverbose) { 
show_matrix_calculation_status(out->showverbose, forward, gth_sa_ref_strand_forward(sa), useintroncutout, chainctr, num_of_chains, icdelta, gen_file_num, gth_input_num_of_gen_files(input), ref_file_num, gth_input_num_of_ref_files(input), directmatches, out->verboseseqs, gth_sa_gen_id(sa), gth_sa_ref_id(sa)); } /* allocate space for DP parameter */ if (out->comments) { gt_file_xprintf(out->outfp, "%c alloc space for DP param " "(genomicid=%s, referenceid=%s)\n", COMMENTCHAR, gth_sa_gen_id(sa), gth_sa_ref_id(sa)); } used_chain = useintroncutout ? actual_chain : contracted_chain; /* The variable 'forward' denotes the genomic strand on which the DP is applied. */ if (forward) { if (call_dna_dp) { rval = gth_align_dna(sa, used_chain->forwardranges, gth_input_current_gen_seq_tran(input), gth_input_current_gen_seq_orig(input), ref_seq_tran, ref_seq_orig, ref_total_length, gth_input_current_gen_alphabet(input), gth_input_current_ref_alphabet(input), useintroncutout, introncutoutinfo->autoicmaxmatrixsize, out->showeops, out->comments, out->gs2out, gen_seq_bounds, splice_site_model, dp_options_core, dp_options_est, dp_options_postpro, dna_complete_path_matrix_jt, raw_chain->forward_jump_table, ref_offset, stat, out->outfp); } else { /* call_protein_dp */ rval = gth_align_protein(sa, used_chain->forwardranges, gth_input_current_gen_seq_tran(input), ref_seq_tran, ref_seq_orig, ref_total_length, gth_input_current_gen_alphabet(input), gth_input_current_ref_alphabet(input), input, useintroncutout, introncutoutinfo->autoicmaxmatrixsize, proteinexonpenal, out->showeops, out->comments, out->gs2out, translationtable, gen_seq_bounds, splice_site_model, dp_options_core, dp_options_postpro, protein_complete_path_matrix_jt, raw_chain->forward_jump_table, ref_offset, stat, out->outfp); } } else { /* the DP is called with the revers positions specifiers */ if (call_dna_dp) { rval = gth_align_dna(sa, used_chain->reverseranges, gth_input_current_gen_seq_tran_rc(input), 
gth_input_current_gen_seq_orig_rc(input), ref_seq_tran, ref_seq_orig, ref_total_length, gth_input_current_gen_alphabet(input), gth_input_current_ref_alphabet(input), useintroncutout, introncutoutinfo->autoicmaxmatrixsize, out->showeops, out->comments, out->gs2out, gen_seq_bounds_rc, splice_site_model, dp_options_core, dp_options_est, dp_options_postpro, dna_complete_path_matrix_jt, raw_chain->reverse_jump_table, ref_offset, stat, out->outfp); } else { /* call_protein_dp */ rval = gth_align_protein(sa, used_chain->reverseranges, gth_input_current_gen_seq_tran_rc(input), ref_seq_tran, ref_seq_orig, ref_total_length, gth_input_current_gen_alphabet(input), gth_input_current_ref_alphabet(input), input, useintroncutout, introncutoutinfo->autoicmaxmatrixsize, proteinexonpenal, out->showeops, out->comments, out->gs2out, translationtable, gen_seq_bounds, splice_site_model, dp_options_core, dp_options_postpro, protein_complete_path_matrix_jt, raw_chain->reverse_jump_table, ref_offset, stat, out->outfp); } } if (rval == GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED) return GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED; /* handling of special error codes ERROR_CUTOUT_NOT_IN_INTRON and ERROR_MATRIX_ALLOCATION_FAILED from DP the only possible special error code given back by this function is ERROR_SA_COULD_NOT_BE_DETERMINED */ #ifndef NDEBUG if (!useintroncutout) gt_assert(rval != GTH_ERROR_CUTOUT_NOT_IN_INTRON); #endif if (useintroncutout && rval == GTH_ERROR_CUTOUT_NOT_IN_INTRON) { /* the intron cutout technique failed -> increase counter */ gth_stat_increment_numofunsuccessfulintroncutoutDPs(stat); if (--iciterations > 0) { /* if an iterations is left, increase icdelta, decrease the remaining iterations, and continue the while-loop */ icdelta += introncutoutinfo->icdeltaincrease; continue; } else { /* no iteration left, discard SA */ gth_stat_increment_numofundeterminedSAs(stat); gth_chain_delete(actual_chain); gth_chain_delete(contracted_chain); return 
GTH_ERROR_SA_COULD_NOT_BE_DETERMINED; } } else if (rval == GTH_ERROR_MATRIX_ALLOCATION_FAILED) { if (introncutoutinfo->autoicmaxmatrixsize > 0 && !useintroncutout) { /* if the automatic intron cutout technique is enabled and a ``normal'' DP returned with the matrix allocation error, set useintroncutout, increase counter, and continue */ if (out->showverbose) { out->showverbose("matrix allocation failed, use intron cutout " "technique"); } gth_stat_increment_numofautointroncutoutcalls(stat); useintroncutout = true; continue; } else { /* otherwise increase relevant statistics, free space and return with error */ gth_stat_increment_numoffailedmatrixallocations(stat); gth_stat_increment_numofundeterminedSAs(stat); gth_chain_delete(actual_chain); gth_chain_delete(contracted_chain); return GTH_ERROR_SA_COULD_NOT_BE_DETERMINED; } } else if (rval) /* ``normal'' DP */ return -1; break; } #if 0 if (out->comments) { gt_file_xprintf(out->outfp, "%c this SA has been computed:\n", COMMENTCHAR); gth_sa_show(sa, input, out->outfp); } #endif /* free */ gth_chain_delete(actual_chain); gth_chain_delete(contracted_chain); return 0; }