static void make_sequence_region(GtHashmap *sequence_regions, GtStr *sequenceid, GthRegionFactory *srf, GthInput *input, GtUword filenum, GtUword seqnum) { GtUword offset_is_defined = false; GtRange range, descrange; GtGenomeNode *sr = NULL; gt_assert(sequence_regions && sequenceid && srf && input); if (gth_input_use_substring_spec(input)) { range.start = gth_input_genomic_substring_from(input); range.end = gth_input_genomic_substring_to(input); } else { range = gth_input_get_relative_genomic_range(input, filenum, seqnum); } if (srf->use_desc_ranges) { GtStr *description = gt_str_new(); gth_input_get_genomic_description(input, description, filenum, seqnum); if (!gt_parse_description_range(gt_str_get(description), &descrange)) offset_is_defined = true; gt_str_delete(description); } if (offset_is_defined) range = gt_range_offset(&range, descrange.start); else range = gt_range_offset(&range, 1); /* 1-based */ if (!gt_str_length(sequenceid) || (gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid)) && !offset_is_defined)) { /* sequenceid is empty or exists already (and no offset has been parsed) -> make one up */ GtStr *seqid; char *base; base = gt_basename(gth_input_get_genomic_filename(input, filenum)); seqid = gt_str_new_cstr(base); gt_free(base); gt_str_append_char(seqid, '|'); gt_str_append_uword(seqid, seqnum + 1); /* 1-based */ seqid_store_add(srf->seqid_store, filenum, seqnum, seqid, GT_UNDEF_UWORD); gt_assert(!gt_cstr_table_get(srf->used_seqids, gt_str_get(seqid))); gt_cstr_table_add(srf->used_seqids, gt_str_get(seqid)); sr = gt_region_node_new(seqid_store_get(srf->seqid_store, filenum, seqnum), range.start, range.end); gt_hashmap_add(sequence_regions, (void*) gt_cstr_table_get(srf->used_seqids, gt_str_get(seqid)), sr); gt_str_delete(seqid); } else { /* sequenceid does not exists already (or an offset has been parsed) -> use this one */ if (!gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid))) { /* no sequence region with this id exists -> create one */ gt_cstr_table_add(srf->used_seqids, gt_str_get(sequenceid)); seqid_store_add(srf->seqid_store, filenum, seqnum, sequenceid, offset_is_defined ? descrange.start : GT_UNDEF_UWORD); sr = gt_region_node_new(seqid_store_get(srf->seqid_store, filenum, seqnum), range.start, range.end); gt_hashmap_add(sequence_regions, (void*) gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid)), sr); } else { GtRange prev_range, new_range; /* sequence region with this id exists already -> modify range */ sr = gt_hashmap_get(sequence_regions, gt_str_get(sequenceid)); gt_assert(sr); prev_range = gt_genome_node_get_range(sr); new_range = gt_range_join(&prev_range, &range); gt_genome_node_set_range(sr, &new_range); seqid_store_add(srf->seqid_store, filenum, seqnum, sequenceid, offset_is_defined ? descrange.start : GT_UNDEF_UWORD); } } gt_assert(sr); }
static int calc_spliced_alignments(GthSACollection *sa_collection, GthChainCollection *chain_collection, GthCallInfo *call_info, GthInput *input, GthStat *stat, GtUword gen_file_num, GtUword ref_file_num, bool directmatches, GthMatchInfo *match_info, GthDNACompletePathMatrixJT dna_complete_path_matrix_jt, GthProteinCompletePathMatrixJT protein_complete_path_matrix_jt) { const unsigned char *ref_seq_tran, *ref_seq_orig, *ref_seq_tran_rc = NULL, *ref_seq_orig_rc = NULL; GtUword chainctr, gen_offset = GT_UNDEF_UWORD, gen_total_length, ref_total_length; GtFile *outfp = call_info->out->outfp; GtRange gen_seq_bounds, gen_seq_bounds_rc; bool refseqisdna; GthChain *chain; GtRange range; GthSA *saA; int rval; gt_assert(sa_collection && chain_collection); refseqisdna = gth_input_ref_file_is_dna(input, ref_file_num); for (chainctr = 0; chainctr < gth_chain_collection_size(chain_collection); chainctr++) { chain = gth_chain_collection_get(chain_collection, chainctr); if (++match_info->call_number > call_info->firstalshown && call_info->firstalshown > 0) { if (!(call_info->out->xmlout || call_info->out->gff3out)) gt_file_xfputc('\n', outfp); else if (call_info->out->xmlout) gt_file_xprintf(outfp, "<!--\n"); if (!call_info->out->gff3out) { gt_file_xprintf(outfp, "Maximal matching %s count (%u) reached.\n", refseqisdna ? "EST" : "protein", call_info->firstalshown); gt_file_xprintf(outfp, "Only the first %u matches will be " "displayed.\n", call_info->firstalshown); } if (!(call_info->out->xmlout || call_info->out->gff3out)) gt_file_xfputc('\n', outfp); else if (call_info->out->xmlout) gt_file_xprintf(outfp, "-->\n"); match_info->max_call_number_reached = true; break; /* break out of loop */ } /* compute considered genomic regions if not set by -frompos */ if (!gth_input_use_substring_spec(input)) { gen_seq_bounds = gth_input_get_genomic_range(input, chain->gen_file_num, chain->gen_seq_num); gen_total_length = gt_range_length(&gen_seq_bounds); gen_offset = gen_seq_bounds.start; gen_seq_bounds_rc = gen_seq_bounds; } else { /* genomic multiseq contains exactly one sequence */ gt_assert(gth_input_num_of_gen_seqs(input, chain->gen_file_num) == 1); gen_total_length = gth_input_genomic_file_total_length(input, chain ->gen_file_num); gen_seq_bounds.start = gth_input_genomic_substring_from(input); gen_seq_bounds.end = gth_input_genomic_substring_to(input); gen_offset = 0; gen_seq_bounds_rc.start = gen_total_length - 1 - gen_seq_bounds.end; gen_seq_bounds_rc.end = gen_total_length - 1 - gen_seq_bounds.start; } /* "retrieving" the reference sequence */ range = gth_input_get_reference_range(input, chain->ref_file_num, chain->ref_seq_num); ref_seq_tran = gth_input_current_ref_seq_tran(input) + range.start; ref_seq_orig = gth_input_current_ref_seq_orig(input) + range.start; if (refseqisdna) { ref_seq_tran_rc = gth_input_current_ref_seq_tran_rc(input) + range.start; ref_seq_orig_rc = gth_input_current_ref_seq_orig_rc(input) + range.start; } ref_total_length = range.end - range.start + 1; /* check if protein sequences have a stop amino acid */ if (!refseqisdna && !match_info->stop_amino_acid_warning && ref_seq_orig[ref_total_length - 1] != GT_STOP_AMINO) { GtStr *ref_id = gt_str_new(); gth_input_save_ref_id(input, ref_id, chain->ref_file_num, chain->ref_seq_num); gt_warning("protein sequence '%s' (#" GT_WU " in file %s) does not end " "with a stop amino acid ('%c'). If it is not a protein " "fragment you should add a stop amino acid to improve the " "prediction. For example with `gt seqtransform " "-addstopaminos` (see http://genometools.org for details).", gt_str_get(ref_id), chain->ref_seq_num, gth_input_get_reference_filename(input, chain->ref_file_num), GT_STOP_AMINO); match_info->stop_amino_acid_warning = true; gt_str_delete(ref_id); } /* allocating space for alignment */ saA = gth_sa_new_and_set(directmatches, true, input, chain->gen_file_num, chain->gen_seq_num, chain->ref_file_num, chain->ref_seq_num, match_info->call_number, gen_total_length, gen_offset, ref_total_length); /* extend the DP borders to the left and to the right */ gth_chain_extend_borders(chain, &gen_seq_bounds, &gen_seq_bounds_rc, gen_total_length, gen_offset); /* From here on the dp positions always refer to the forward strand of the genomic DNA. */ /* call the Dynamic Programming */ if (refseqisdna) { rval = call_dna_DP(directmatches, call_info, input, stat, sa_collection, saA, gen_file_num, ref_file_num, gen_total_length, gen_offset, &gen_seq_bounds, &gen_seq_bounds_rc, ref_total_length, range.start, chainctr, gth_chain_collection_size(chain_collection), match_info, ref_seq_tran, ref_seq_orig, ref_seq_tran_rc, ref_seq_orig_rc, chain, dna_complete_path_matrix_jt, protein_complete_path_matrix_jt); } else { rval = call_protein_DP(directmatches, call_info, input, stat, sa_collection, saA, gen_file_num, ref_file_num, gen_total_length, gen_offset, &gen_seq_bounds, &gen_seq_bounds_rc, ref_total_length, range.start, chainctr, gth_chain_collection_size(chain_collection), match_info, ref_seq_tran, ref_seq_orig, chain, dna_complete_path_matrix_jt, protein_complete_path_matrix_jt); } /* check return value */ if (rval == GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED) { /* statistics bookkeeping */ gth_stat_increment_numoffailedDPparameterallocations(stat); gth_stat_increment_numofundeterminedSAs(stat); /* free space */ gth_sa_delete(saA); match_info->call_number--; continue; /* continue with the next DP range */ } else if (rval) return -1; } if (!call_info->out->xmlout && !call_info->out->gff3out && !directmatches && !match_info->significant_match_found && match_info->call_number <= call_info->firstalshown) { show_no_match_line(gth_input_get_alphatype(input, ref_file_num), outfp); } return 0; }