/*
 * Collect the sequence regions of all genomic sequences in <input> and hand
 * them to save_sequence_region() together with <nodes>.
 *
 * A fresh seqid store is created for <rf>, then every genomic file is loaded
 * and make_sequence_region() is called once per contained sequence with that
 * sequence's identifier. Finally the collected entries are visited in key
 * order. The factory may be used only once; this is asserted on entry and
 * recorded on exit.
 */
void gth_region_factory_save(GthRegionFactory *rf, GtArray *nodes,
                             GthInput *input)
{
  GtHashmap *regions_by_seqid;
  GtStr *seqid_buf;
  GtUword filenum, seqnum;
  GT_UNUSED int had_err; /* only consumed by the gt_assert() below */

  gt_assert(rf && nodes && input);
  gt_assert(!rf->factory_was_used);

  rf->seqid_store = seqid_store_new(input);
  regions_by_seqid = gt_hashmap_new(GT_HASH_STRING, NULL, NULL);
  seqid_buf = gt_str_new();

  for (filenum = 0; filenum < gth_input_num_of_gen_files(input); filenum++) {
    /* the sequence count below is queried after the file has been loaded */
    gth_input_load_genomic_file(input, filenum, false);
    for (seqnum = 0;
         seqnum < gth_input_num_of_gen_seqs(input, filenum);
         seqnum++) {
      /* the identifier buffer is reused for every sequence */
      gt_str_reset(seqid_buf);
      gth_input_save_gen_identifier(input, seqid_buf, filenum, seqnum);
      make_sequence_region(regions_by_seqid, seqid_buf, rf, input, filenum,
                           seqnum);
    }
  }
  gt_str_delete(seqid_buf);

  /* visit the collected regions ordered by key, passing <nodes> along */
  had_err = gt_hashmap_foreach_in_key_order(regions_by_seqid,
                                            save_sequence_region, nodes, NULL);
  gt_assert(!had_err); /* should not happen */

  gt_hashmap_delete(regions_by_seqid);
  rf->factory_was_used = true;
}
/*
 * Allocate a SeqidStore dimensioned after the genomic files of <input>.
 *
 * For every genomic file the number of sequences is recorded, a zeroed row
 * of store entries is allocated, and a row of offsets is allocated with each
 * offset set to GT_UNDEF_UWORD. Returns the newly allocated store.
 */
static SeqidStore* seqid_store_new(GthInput *input)
{
  SeqidStore *seqids;
  GtUword filenum, seqnum;

  gt_assert(input);

  seqids = gt_malloc(sizeof *seqids);
  seqids->num_of_files = gth_input_num_of_gen_files(input);
  seqids->num_of_sequences = gt_calloc(seqids->num_of_files,
                                       sizeof (GtUword));

  /* one zero-initialized store row per genomic file */
  seqids->store = gt_calloc(seqids->num_of_files, sizeof *seqids->store);
  for (filenum = 0; filenum < seqids->num_of_files; filenum++) {
    /* the file is loaded before its sequence count is queried */
    gth_input_load_genomic_file(input, filenum, false);
    seqids->num_of_sequences[filenum] = gth_input_num_of_gen_seqs(input,
                                                                  filenum);
    seqids->store[filenum] = gt_calloc(seqids->num_of_sequences[filenum],
                                       sizeof **seqids->store);
  }

  /* one offset row per genomic file, every offset starting out undefined */
  seqids->offsets = gt_malloc(seqids->num_of_files * sizeof *seqids->offsets);
  for (filenum = 0; filenum < seqids->num_of_files; filenum++) {
    GtUword nseqs = seqids->num_of_sequences[filenum];

    seqids->offsets[filenum] = gt_malloc(nseqs * sizeof **seqids->offsets);
    for (seqnum = 0; seqnum < nseqs; seqnum++)
      seqids->offsets[filenum][seqnum] = GT_UNDEF_UWORD;
  }

  return seqids;
}
static int calc_spliced_alignments(GthSACollection *sa_collection, GthChainCollection *chain_collection, GthCallInfo *call_info, GthInput *input, GthStat *stat, GtUword gen_file_num, GtUword ref_file_num, bool directmatches, GthMatchInfo *match_info, GthDNACompletePathMatrixJT dna_complete_path_matrix_jt, GthProteinCompletePathMatrixJT protein_complete_path_matrix_jt) { const unsigned char *ref_seq_tran, *ref_seq_orig, *ref_seq_tran_rc = NULL, *ref_seq_orig_rc = NULL; GtUword chainctr, gen_offset = GT_UNDEF_UWORD, gen_total_length, ref_total_length; GtFile *outfp = call_info->out->outfp; GtRange gen_seq_bounds, gen_seq_bounds_rc; bool refseqisdna; GthChain *chain; GtRange range; GthSA *saA; int rval; gt_assert(sa_collection && chain_collection); refseqisdna = gth_input_ref_file_is_dna(input, ref_file_num); for (chainctr = 0; chainctr < gth_chain_collection_size(chain_collection); chainctr++) { chain = gth_chain_collection_get(chain_collection, chainctr); if (++match_info->call_number > call_info->firstalshown && call_info->firstalshown > 0) { if (!(call_info->out->xmlout || call_info->out->gff3out)) gt_file_xfputc('\n', outfp); else if (call_info->out->xmlout) gt_file_xprintf(outfp, "<!--\n"); if (!call_info->out->gff3out) { gt_file_xprintf(outfp, "Maximal matching %s count (%u) reached.\n", refseqisdna ? 
"EST" : "protein", call_info->firstalshown); gt_file_xprintf(outfp, "Only the first %u matches will be " "displayed.\n", call_info->firstalshown); } if (!(call_info->out->xmlout || call_info->out->gff3out)) gt_file_xfputc('\n', outfp); else if (call_info->out->xmlout) gt_file_xprintf(outfp, "-->\n"); match_info->max_call_number_reached = true; break; /* break out of loop */ } /* compute considered genomic regions if not set by -frompos */ if (!gth_input_use_substring_spec(input)) { gen_seq_bounds = gth_input_get_genomic_range(input, chain->gen_file_num, chain->gen_seq_num); gen_total_length = gt_range_length(&gen_seq_bounds); gen_offset = gen_seq_bounds.start; gen_seq_bounds_rc = gen_seq_bounds; } else { /* genomic multiseq contains exactly one sequence */ gt_assert(gth_input_num_of_gen_seqs(input, chain->gen_file_num) == 1); gen_total_length = gth_input_genomic_file_total_length(input, chain ->gen_file_num); gen_seq_bounds.start = gth_input_genomic_substring_from(input); gen_seq_bounds.end = gth_input_genomic_substring_to(input); gen_offset = 0; gen_seq_bounds_rc.start = gen_total_length - 1 - gen_seq_bounds.end; gen_seq_bounds_rc.end = gen_total_length - 1 - gen_seq_bounds.start; } /* "retrieving" the reference sequence */ range = gth_input_get_reference_range(input, chain->ref_file_num, chain->ref_seq_num); ref_seq_tran = gth_input_current_ref_seq_tran(input) + range.start; ref_seq_orig = gth_input_current_ref_seq_orig(input) + range.start; if (refseqisdna) { ref_seq_tran_rc = gth_input_current_ref_seq_tran_rc(input) + range.start; ref_seq_orig_rc = gth_input_current_ref_seq_orig_rc(input) + range.start; } ref_total_length = range.end - range.start + 1; /* check if protein sequences have a stop amino acid */ if (!refseqisdna && !match_info->stop_amino_acid_warning && ref_seq_orig[ref_total_length - 1] != GT_STOP_AMINO) { GtStr *ref_id = gt_str_new(); gth_input_save_ref_id(input, ref_id, chain->ref_file_num, chain->ref_seq_num); gt_warning("protein sequence '%s' 
(#" GT_WU " in file %s) does not end " "with a stop amino acid ('%c'). If it is not a protein " "fragment you should add a stop amino acid to improve the " "prediction. For example with `gt seqtransform " "-addstopaminos` (see http://genometools.org for details).", gt_str_get(ref_id), chain->ref_seq_num, gth_input_get_reference_filename(input, chain->ref_file_num), GT_STOP_AMINO); match_info->stop_amino_acid_warning = true; gt_str_delete(ref_id); } /* allocating space for alignment */ saA = gth_sa_new_and_set(directmatches, true, input, chain->gen_file_num, chain->gen_seq_num, chain->ref_file_num, chain->ref_seq_num, match_info->call_number, gen_total_length, gen_offset, ref_total_length); /* extend the DP borders to the left and to the right */ gth_chain_extend_borders(chain, &gen_seq_bounds, &gen_seq_bounds_rc, gen_total_length, gen_offset); /* From here on the dp positions always refer to the forward strand of the genomic DNA. */ /* call the Dynamic Programming */ if (refseqisdna) { rval = call_dna_DP(directmatches, call_info, input, stat, sa_collection, saA, gen_file_num, ref_file_num, gen_total_length, gen_offset, &gen_seq_bounds, &gen_seq_bounds_rc, ref_total_length, range.start, chainctr, gth_chain_collection_size(chain_collection), match_info, ref_seq_tran, ref_seq_orig, ref_seq_tran_rc, ref_seq_orig_rc, chain, dna_complete_path_matrix_jt, protein_complete_path_matrix_jt); } else { rval = call_protein_DP(directmatches, call_info, input, stat, sa_collection, saA, gen_file_num, ref_file_num, gen_total_length, gen_offset, &gen_seq_bounds, &gen_seq_bounds_rc, ref_total_length, range.start, chainctr, gth_chain_collection_size(chain_collection), match_info, ref_seq_tran, ref_seq_orig, chain, dna_complete_path_matrix_jt, protein_complete_path_matrix_jt); } /* check return value */ if (rval == GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED) { /* statistics bookkeeping */ gth_stat_increment_numoffailedDPparameterallocations(stat); 
gth_stat_increment_numofundeterminedSAs(stat); /* free space */ gth_sa_delete(saA); match_info->call_number--; continue; /* continue with the next DP range */ } else if (rval) return -1; } if (!call_info->out->xmlout && !call_info->out->gff3out && !directmatches && !match_info->significant_match_found && match_info->call_number <= call_info->firstalshown) { show_no_match_line(gth_input_get_alphatype(input, ref_file_num), outfp); } return 0; }