/* Reverse the order of the edit operations stored in <bp> in place.
   Paths with fewer than two edit operations are left untouched. */
void gth_backtrace_path_reverse(GthBacktracePath *bp)
{
  Editoperation *front, *back, tmp;
  unsigned long num_of_eops;
  gt_assert(bp);
  num_of_eops = gt_array_size(bp->editoperations);
  /* nothing to swap; this also avoids forming a pointer in front of the
     array start (space + 0 - 1) for an empty array */
  if (num_of_eops < 2)
    return;
  front = gt_array_get_space(bp->editoperations);
  back = front + num_of_eops - 1;
  /* swap the outermost pair and move both pointers towards the middle */
  while (front < back) {
    tmp = *front;
    *front = *back;
    *back = tmp;
    front++;
    back--;
  }
}
/* Print the exon and CDS features collected for <feature_node> to the output
   file of <gtf_visitor>, one line per feature.
   The feature arrays of the visitor are reset, then the direct children of
   <feature_node> are traversed with save_exon_node() (defined elsewhere;
   presumably it fills the exon/CDS arrays). Both arrays are sorted with
   gt_genome_node_compare() before they are printed. The transcript id of the
   visitor is incremented only if at least one exon feature was collected.
   Returns the error code of the traversal; the output is written regardless
   of that code. */
static int gtf_show_transcript(GtFeatureNode *feature_node,
                               GtGTFVisitor *gtf_visitor, GtError *err)
{
  GtFeatureNode *node;
  GtUword idx, count;
  int had_err;

  gt_error_check(err);
  gt_assert(feature_node && gtf_visitor);

  /* start with empty feature arrays and collect the children */
  gt_array_reset(gtf_visitor->exon_features);
  gt_array_reset(gtf_visitor->CDS_features);
  had_err = gt_feature_node_traverse_direct_children(feature_node,
                                                     gtf_visitor,
                                                     save_exon_node, err);

  /* exon features: sort, then print */
  count = gt_array_size(gtf_visitor->exon_features);
  if (count > 0) {
    qsort(gt_array_get_space(gtf_visitor->exon_features), count,
          sizeof (GtGenomeNode*), (GtCompare) gt_genome_node_compare);
    gtf_visitor->transcript_id++;
    for (idx = 0; idx < count; idx++) {
      node = *(GtFeatureNode**) gt_array_get(gtf_visitor->exon_features, idx);
      gt_gff3_output_leading(node, gtf_visitor->outfp);
      gt_file_xprintf(gtf_visitor->outfp,
                      "gene_id \""GT_WU"\"; transcript_id "
                      "\""GT_WU"."GT_WU"\";\n",
                      gtf_visitor->gene_id, gtf_visitor->gene_id,
                      gtf_visitor->transcript_id);
    }
  }

  /* CDS features: sort, then print */
  count = gt_array_size(gtf_visitor->CDS_features);
  if (count > 0) {
    qsort(gt_array_get_space(gtf_visitor->CDS_features), count,
          sizeof (GtGenomeNode*), (GtCompare) gt_genome_node_compare);
    /* XXX: showing the start_codon feature is still to be done; the first
       CDS feature is only accessed here, its value is not used */
    (void) gt_array_get(gtf_visitor->CDS_features, 0);
    for (idx = 0; idx < count; idx++) {
      node = *(GtFeatureNode**) gt_array_get(gtf_visitor->CDS_features, idx);
      gt_gff3_output_leading(node, gtf_visitor->outfp);
      gt_file_xprintf(gtf_visitor->outfp,
                      "gene_id \""GT_WU"\"; transcript_id "
                      "\""GT_WU"."GT_WU"\";\n",
                      gtf_visitor->gene_id, gtf_visitor->gene_id,
                      gtf_visitor->transcript_id);
    }
    /* XXX: show stop_codon feature and shorten last CDS feature */
  }

  return had_err;
}
unsigned long gth_backtrace_path_indelcount(const GthBacktracePath *bp) { gt_assert(bp); gt_assert(bp->alphatype == DNA_ALPHA || bp->alphatype == PROTEIN_ALPHA); return gt_compute_indelcount(gt_array_get_space(bp->editoperations), gt_array_size(bp->editoperations), bp->alphatype == PROTEIN_ALPHA); }
/* Show all edit operations stored in <bp> on <outfp> by delegating to
   gt_editoperation_show(). <xmlout> and <indentlevel> are passed through
   unchanged. The alphabet type of <bp> must be DNA or protein. */
void gth_backtrace_path_show_complete(const GthBacktracePath *bp, bool xmlout,
                                      unsigned int indentlevel, GtFile *outfp)
{
  Editoperation *eops;
  bool is_protein;

  gt_assert(bp);
  gt_assert(bp->alphatype == DNA_ALPHA || bp->alphatype == PROTEIN_ALPHA);

  eops = gt_array_get_space(bp->editoperations);
  is_protein = (bp->alphatype == PROTEIN_ALPHA);
  gt_editoperation_show(eops, gt_array_size(bp->editoperations), is_protein,
                        xmlout, indentlevel, outfp);
}
/* Sort <matches> (which must not be empty) with compare_matches() and split
   the sorted array into buckets: maximal runs of consecutive matches which
   share the same reference and genomic sequence number. One Bucket per run
   is appended to <buckets>; the largest bucket length found in <buckets> is
   stored in <maxbucketlength>. */
static void sort_matches_and_calc_buckets(GtArray *matches, GtArray *buckets,
                                          GtUword *maxbucketlength)
{
  GtUword idx, num_of_matches, bucket_begin = 0;
  GthMatch *match;
  Bucket current, *stored;

  gt_assert(gt_array_size(matches));
  num_of_matches = gt_array_size(matches);

  /* sort matches */
  qsort(gt_array_get_space(matches), num_of_matches, sizeof (GthMatch),
        compare_matches);

  /* the first bucket starts with the first match */
  match = gt_array_get_first(matches);
  current.seqnum1 = match->Storeseqnumreference;
  current.seqnum2 = match->Storeseqnumgenomic;
  current.startpos = 0;

  /* walk over the remaining matches and close a bucket whenever the pair of
     sequence numbers changes */
  for (idx = 1; idx < num_of_matches; idx++) {
    match = gt_array_get(matches, idx);
    if (match->Storeseqnumreference == current.seqnum1 &&
        match->Storeseqnumgenomic == current.seqnum2) {
      continue; /* still inside the current bucket */
    }
    /* the current bucket covers the matches bucket_begin .. idx - 1 */
    current.length = idx - bucket_begin;
    gt_array_add(buckets, current);
    /* open the next bucket at this match */
    bucket_begin = idx;
    current.seqnum1 = match->Storeseqnumreference;
    current.seqnum2 = match->Storeseqnumgenomic;
    current.startpos = idx;
  }

  /* the last bucket is still open, close it */
  current.length = idx - bucket_begin;
  gt_array_add(buckets, current);

  /* determine the maximum bucket length */
  *maxbucketlength = 0;
  for (idx = 0; idx < gt_array_size(buckets); idx++) {
    stored = gt_array_get(buckets, idx);
    if (*maxbucketlength < stored->length)
      *maxbucketlength = stored->length;
  }

  gt_assert(sum_of_bucket_lengths_equals_num_of_matches(buckets,
                                                   gt_array_size(matches)));
}
/* Replace the forward ranges of <chain> by the merged genomic ranges of all
   <fragments> which overlap the genomic range of the chain (as returned by
   chain_get_genomicrange()). Ranges which overlap after sorting are merged
   into one. If <comments> is set, the forward ranges are shown on <outfp>
   before they are replaced. */
static void enrich_chain(GthChain *chain, GtFragment *fragments,
                         unsigned long num_of_fragments, bool comments,
                         GtFile *outfp)
{
  GtRange chainrange, candidate, *last;
  GtArray *enrichment;
  unsigned long idx;

  gt_assert(chain && fragments && num_of_fragments);

  if (comments) {
    gt_file_xprintf(outfp, "%c enrich global chain with the following "
                           "forward ranges:\n",COMMENTCHAR);
    gt_file_xprintf(outfp, "%c ", COMMENTCHAR);
    gt_ranges_show(chain->forwardranges, outfp);
  }

  /* genomic range covered by the chain so far */
  chainrange = chain_get_genomicrange(chain);

  /* collect the genomic range of every fragment overlapping that range */
  enrichment = gt_array_new(sizeof (GtRange));
  for (idx = 0; idx < num_of_fragments; idx++) {
    candidate.start = fragments[idx].startpos2;
    candidate.end = fragments[idx].endpos2;
    if (gt_range_overlap(&chainrange, &candidate))
      gt_array_add(enrichment, candidate);
  }
  gt_assert(gt_array_size(enrichment));

  /* sort the collected ranges */
  qsort(gt_array_get_space(enrichment), gt_array_size(enrichment),
        sizeof (GtRange), (GtCompare) gt_range_compare);

  /* rebuild the forward ranges from the sorted enrichment */
  gt_array_reset(chain->forwardranges);
  candidate = *(GtRange*) gt_array_get_first(enrichment);
  gt_array_add(chain->forwardranges, candidate);
  for (idx = 1; idx < gt_array_size(enrichment); idx++) {
    candidate = *(GtRange*) gt_array_get(enrichment, idx);
    /* fetched anew in every iteration, adding elements may move the array */
    last = gt_array_get_last(chain->forwardranges);
    if (candidate.start > last->end) {
      /* no overlap with the last stored range -> store as new range */
      gt_array_add(chain->forwardranges, candidate);
    }
    else if (last->end < candidate.end) {
      /* overlap -> extend the last stored range */
      last->end = candidate.end;
    }
  }

  gt_array_delete(enrichment);
}
/* Create a new PGL collection from <sacollection>.
   The spliced alignments are clustered into PGLs by gthclusterSAstoPGLs().
   For every PGL the alignments are sorted with gth_sa_cmp_genomic_actual(),
   clustered by assemble_cluster() (which receives <disableclustersas>) and
   then handed to gt_consensus_sa(). The caller owns the returned object. */
GthPGLCollection* gth_pgl_collection_new(GthSACollection *sacollection,
                                         bool disableclustersas)
{
  GthPGLCollection *collection;
  GthPGL *current_pgl;
  GtUword pglnum;

  gt_assert(sacollection);

  collection = gt_malloc(sizeof *collection);
  collection->pgls = gt_array_new(sizeof (GthPGL*));

  /* cluster alignments */
  gthclusterSAstoPGLs(collection->pgls, sacollection);

  /* assemble (clustered) alignments */
  for (pglnum = 0; pglnum < gt_array_size(collection->pgls); pglnum++) {
    current_pgl = *(GthPGL**) gt_array_get(collection->pgls, pglnum);

    /* sort the spliced alignments of this PGL */
    qsort(gt_array_get_space(current_pgl->alignments),
          gt_array_size(current_pgl->alignments), sizeof (GthSA*),
          gth_sa_cmp_genomic_actual);

    /* cluster spliced alignments which are equal on the genomic sequence, so
       that only one spliced alignment per cluster has to be considered
       later on */
    assemble_cluster(current_pgl, disableclustersas);

    /* consensus phase */
    gt_consensus_sa(gt_array_get_space(current_pgl->saclusters),
                    gt_array_size(current_pgl->saclusters),
                    sizeof (GthSACluster*), pgl_get_genomic_range,
                    pgl_get_strand, get_exons_func, process_splice_form_func,
                    current_pgl);
  }

  return collection;
}
bool gth_backtrace_path_is_valid(const GthBacktracePath *bp) { bool is_valid; gt_assert(bp); gt_assert(bp->alphatype == DNA_ALPHA || bp->alphatype == PROTEIN_ALPHA); gt_assert(bp->ref_dp_length != GT_UNDEF_ULONG); is_valid = gt_eops_equal_referencelength((Editoperation*) gt_array_get_space(bp->editoperations) + bp->cutoffs.end.eopcutoff, gt_safe_cast2long(gt_array_size(bp ->editoperations)) - bp->cutoffs.start.eopcutoff - bp->cutoffs.end.eopcutoff, gt_safe_cast2long(bp->ref_dp_length) - bp->cutoffs.start.referencecutoff - bp->cutoffs.end.referencecutoff, bp->alphatype == PROTEIN_ALPHA); return is_valid; }
/* Chain processing callback: convert the global chain <chain> (given as
   indices into <fragments>) into a GthChain and add it to the chain
   collection stored in <data>, which must point to a GthSaveChainInfo.
   The chain is only saved if globalchainislongenough() accepts it; otherwise
   it is deleted again. Depending on the flags in <data> the saved chain is
   enriched with further fragments, equipped with jump tables and shown on
   the output file. <max_gap_width> is only used in an assertion. */
void gth_save_chain(GtChain *chain, GtFragment *fragments,
                    unsigned long num_of_fragments,
                    GT_UNUSED unsigned long max_gap_width, void *data)
{
  GthSaveChainInfo *info = (GthSaveChainInfo*) data;
  GtRange range, *last;
  GthChain *gth_chain;
  unsigned long i, fragnum;

  gt_assert(chain_is_colinear(chain, fragments));

  if (info->comments) {
    gt_file_xprintf(info->outfp, "%c process global chain with score %ld\n",
                    COMMENTCHAR, gt_chain_get_score(chain));
    gt_file_xprintf(info->outfp, "%c process global chain with the "
                    "following fragments\n", COMMENTCHAR);
    for (i = 0; i < gt_chain_size(chain); i++)
      showfragment(fragments + gt_chain_get_fragnum(chain, i), info->outfp);
  }

  /* init */
  gth_chain = gth_chain_new();
  gth_chain->gen_file_num = info->gen_file_num;
  gth_chain->gen_seq_num = info->gen_seq_num;
  gth_chain->ref_file_num = info->ref_file_num;
  gth_chain->ref_seq_num = info->ref_seq_num;

  /* chain has a minimum length of 1 */
  gt_assert(gt_chain_size(chain));

  /* global chain filter; the coverage is stored in the new chain */
  if (globalchainislongenough(chain, fragments, &gth_chain->refseqcoverage,
                              info->gcmincoverage, info->referencelength,
                              info->stat, info->comments, info->outfp)) {
    /* save all potential exons */
    for (i = 0; i < gt_chain_size(chain); i++) {
      fragnum = gt_chain_get_fragnum(chain, i);
      range.start = fragments[fragnum].startpos2;
      range.end = fragments[fragnum].endpos2;

      /* for i == 0 no range has been stored yet */
      last = i > 0 ? gt_array_get_last(gth_chain->forwardranges) : NULL;

      /* check for overlap */
      if (last && range.start <= last->end) {
        /* overlap found -> modify last range */
        gt_assert(last->end <= range.end);
        last->end = range.end;
      }
      else {
#ifndef NDEBUG
        if (last) {
          /* gap width is smaller or equal than the maximum gap width */
          gt_assert((range.start - 1 - last->end + 1 - 1) <= max_gap_width);
        }
#endif
        /* save range */
        gt_array_add(gth_chain->forwardranges, range);
      }
    }

    /* genomic range of the chain before a possible enrichment; needed for
       the jump table computation below */
    GtRange genomicrange = chain_get_genomicrange(gth_chain);

    if (info->enrichchains) {
      enrich_chain(gth_chain, fragments, num_of_fragments, info->comments,
                   info->outfp);
    }

    gt_assert(gt_ranges_are_consecutive(gth_chain->forwardranges));

    /* copy ranges to opposite strand */
    gt_ranges_copy_to_opposite_strand(gth_chain->reverseranges,
                                      gth_chain->forwardranges,
                                      info->gen_total_length,
                                      info->gen_offset);

    /* compute jump table if necessary */
    if (info->jump_table) {
      GthJumpTable *forward_jump_table, *reverse_jump_table;
      GtArray *chain_fragments;
      chain_fragments = make_list_of_chain_fragments(chain, fragments,
                                                     num_of_fragments,
                                                     info->enrichchains,
                                                     &genomicrange);
      forward_jump_table =
        info->jump_table_new(gt_array_get_space(chain_fragments),
                             gt_array_size(chain_fragments), info->jtdebug);
      reverse_jump_table =
        info->jump_table_new_reverse(forward_jump_table,
                                     info->gen_total_length, info->gen_offset,
                                     info->ref_total_length, info->ref_offset);
      gt_assert(!gth_chain->forward_jump_table);
      gth_chain->forward_jump_table = forward_jump_table;
      gt_assert(!gth_chain->reverse_jump_table);
      gth_chain->reverse_jump_table = reverse_jump_table;
      gt_array_delete(chain_fragments);
      gth_chain->jump_table_delete = info->jump_table_delete;
    }

    /* save array of potential exons */
    gth_chain_collection_add(info->chain_collection, gth_chain);

    if (info->comments) {
      gt_file_xprintf(info->outfp, "%c global chain with the following "
                                   "ranges has been saved\n",COMMENTCHAR);
      gt_file_xprintf(info->outfp, "%c forward ranges:\n", COMMENTCHAR);
      gt_file_xprintf(info->outfp, "%c ", COMMENTCHAR);
      gt_ranges_show(gth_chain->forwardranges, info->outfp);
      gt_file_xprintf(info->outfp, "%c reverse ranges:\n", COMMENTCHAR);
      gt_file_xprintf(info->outfp, "%c ", COMMENTCHAR);
      gt_ranges_show(gth_chain->reverseranges, info->outfp);
    }

    /* output stored chains here (Mohamed needed this to compare the chaining
       phase of gth with CHAINER) */
    if (info->stopafterchaining) {
      gt_file_xprintf(info->outfp,
                      "%c gl. chain with coverage=%.2f and score %ld "
                      "(genseq=%lu, str.=%c, refseq=%lu)\n", COMMENTCHAR,
                      gth_chain->refseqcoverage, gt_chain_get_score(chain),
                      gth_chain->gen_seq_num, SHOWSTRAND(info->directmatches),
                      gth_chain->ref_seq_num);
      for (i = 0; i < gt_chain_size(chain); i++)
        showfragment(fragments + gt_chain_get_fragnum(chain, i), info->outfp);
    }
  }
  else {
    /* for -paralogs this case is not supposed to occur */
    gt_assert(!info->paralogs);
    if (info->comments)
      gt_file_xprintf(info->outfp, "%c global chain discarded\n", COMMENTCHAR);
    gth_chain_delete(gth_chain);
  }
}
/* Sort the GtRange elements of <ranges> in place with gt_msort(), using
   gt_range_compare_by_length_ptr() as comparison function. */
void gt_ranges_sort_by_length_stable(GtArray *ranges)
{
  GtRange *space;
  gt_assert(ranges);
  space = gt_array_get_space(ranges);
  gt_msort(space, gt_array_size(ranges), sizeof *space,
           (GtCompare) gt_range_compare_by_length_ptr);
}
/* Sort the GtRange elements of <ranges> in place with qsort(), using
   gt_range_compare() as comparison function. */
void gt_ranges_sort(GtArray *ranges)
{
  GtRange *space;
  gt_assert(ranges);
  space = gt_array_get_space(ranges);
  qsort(space, gt_array_size(ranges), sizeof *space,
        (GtCompare) gt_range_compare);
}
/* Rewrite <editoperations> in place such that the first match edit operation
   which follows (in array order) an intron with 1 or 2 bases left and has a
   length greater than 1 is split into a match of length 1 followed by a
   match with the remaining length. A mismatch or a mismatch with 1 gap which
   follows such an intron ends the search for a match to split. All other
   edit operations are copied unchanged. The edit operations are decoded with
   the second argument of the decoding functions set to true. */
static void ensure_eop_of_len_1_before_introns(GtArray *editoperations)
{
  Editoperation eop, *eopptr, *eopend;
  Eoptype eoptype;
  unsigned long eoplength;
  GtArray *backup;
  bool processing_necessary = false,
       split_match = false;

  /* With fewer than two edit operations the check below never inspects an
     element. Returning early also avoids computing space + size - 1 for an
     empty array, which would point in front of the array. */
  if (gt_array_size(editoperations) < 2)
    return;

  /* check if processing is necessary
     the check is rather simple, it might be possible that
     ``processing_necessary'' is set to ``true'' whereas in fact no processing
     is necessary; the last edit operation is not inspected */
  eopend = (Editoperation*) gt_array_get_space(editoperations)
           + gt_array_size(editoperations) - 1;
  for (eopptr = gt_array_get_space(editoperations); eopptr < eopend;
       eopptr++) {
    eoptype = gt_editoperation_type(*eopptr, true);
    if (eoptype == EOP_TYPE_INTRON_WITH_1_BASE_LEFT ||
        eoptype == EOP_TYPE_INTRON_WITH_2_BASES_LEFT) {
      processing_necessary = true;
      break;
    }
  }

  if (!processing_necessary)
    return;

  /* copy the edit operations to a backup and empty the original array */
  backup = gt_array_new(sizeof (Editoperation));
  gt_array_add_array(backup, editoperations);
  gt_array_set_size(editoperations, 0);

  /* process the backup and refill the original edit operations */
  eopend = (Editoperation*) gt_array_get_space(backup)
           + gt_array_size(backup);
  for (eopptr = gt_array_get_space(backup); eopptr < eopend; eopptr++) {
    /* the type (not the length) of the edit operation decides what to do */
    eoptype = gt_editoperation_type(*eopptr, true);
    if (eoptype == EOP_TYPE_INTRON_WITH_1_BASE_LEFT ||
        eoptype == EOP_TYPE_INTRON_WITH_2_BASES_LEFT) {
      split_match = true;
    }
    else if (split_match) {
      if (eoptype == EOP_TYPE_MATCH) {
        split_match = false;
        eoplength = gt_editoperation_length(*eopptr, true);
        if (eoplength > 1) {
          /* replace the match by a match of length 1 ... */
          eop = 1;
          gt_array_add(editoperations, eop);
          /* ... followed by a match with the remaining length */
          eop = eoplength - 1;
          gt_array_add(editoperations, eop);
          continue;
        }
      }
      else if (eoptype == EOP_TYPE_MISMATCH ||
               eoptype == EOP_TYPE_MISMATCH_WITH_1_GAP) {
        split_match = false;
      }
    }
    /* copy the edit operation unchanged */
    gt_array_add(editoperations, *eopptr);
  }

  /* free backup */
  gt_array_delete(backup);
}
Editoperation* gth_backtrace_path_get(const GthBacktracePath *bp) { gt_assert(bp); return (Editoperation*) gt_array_get_space(bp->editoperations) + bp->cutoffs.end.eopcutoff; }
static void calc_chains_from_matches(GthChainCollection *chain_collection, GtArray *matches, GthChainingInfo *chaining_info, GthSeqCon *gen_seq_con, GthSeqCon *ref_seq_con, GtUword rare, double fragweightfactor, GthJumpTableNew jump_table_new, GthJumpTableNewReverse jump_table_new_reverse, GthJumpTableDelete jump_table_delete) { GtUword i, numofchains = 0, num_of_fragments, maxbucketlength = 0; GtRange range; GtFile *outfp = chaining_info->call_info->out->outfp; GtFragment *fragments; GthSaveChainInfo info; GtArray *buckets; Bucket *bucket; /* this is a random sample to check that no equal matches exist either one match to chain or if more than one the first two differ */ gt_assert(gt_array_size(matches) == 1 || (gt_array_size(matches) > 1 && !gth_matches_are_equal(gt_array_get(matches, 0), gt_array_get(matches, 1)))); /* init */ buckets = gt_array_new(sizeof (Bucket)); /* output unsorted matches */ if (chaining_info->call_info->out->comments) { gt_file_xprintf(outfp, "%c output unsorted matches\n", COMMENTCHAR); showmatches(gt_array_get_space(matches), gt_array_size(matches), outfp); } /* transform reference sequence positions to opposite strand if necessary */ if (!chaining_info->directmatches) { if (chaining_info->call_info->out->comments) { gt_file_xprintf(outfp, "%c\n", COMMENTCHAR); gt_file_xprintf(outfp, "%c transform reference sequence positions to " "opposite strand\n", COMMENTCHAR); gt_file_xprintf(outfp, "%c\n", COMMENTCHAR); } transform_refseq_positions(matches, ref_seq_con); /* output transformed matches */ if (chaining_info->call_info->out->comments) { gt_file_xprintf(outfp, "%c output transformed matches\n", COMMENTCHAR); showmatches(gt_array_get_space(matches), gt_array_size(matches), outfp); } } /* sort matches */ sort_matches_and_calc_buckets(matches, buckets, &maxbucketlength); /* output sorted matches */ if (chaining_info->call_info->out->comments) { gt_file_xprintf(outfp, "%c output sorted matches\n", COMMENTCHAR); 
showmatches(gt_array_get_space(matches), gt_array_size(matches), outfp); } /* output buckets */ if (chaining_info->call_info->out->comments) { gt_file_xprintf(outfp, "%c output buckets\n", COMMENTCHAR); outputbuckets(buckets, gt_array_get_space(matches), outfp); } /* alloc space for fragments */ fragments = gt_malloc(sizeof (GtFragment) * maxbucketlength); /* save data to process the chains with saveChainasDPrange; constant part */ info.chain_collection = chain_collection; info.gcmincoverage = chaining_info->call_info->gcmincoverage; info.stat = chaining_info->stat; info.comments = chaining_info->call_info->out->comments; info.stopafterchaining = chaining_info->call_info->simfilterparam .stopafterchaining; info.paralogs = chaining_info->call_info->simfilterparam.paralogs; info.enrichchains = chaining_info->call_info->simfilterparam .enrichchains; info.jump_table = chaining_info->call_info->simfilterparam.jump_table; info.jump_table_new = jump_table_new; info.jump_table_new_reverse = jump_table_new_reverse; info.jump_table_delete = jump_table_delete; info.jtdebug = chaining_info->jtdebug; info.directmatches = chaining_info->directmatches; info.outfp = outfp; info.gen_file_num = chaining_info->gen_file_num; info.ref_file_num = chaining_info->ref_file_num; /* for every bucket a chain and for every chain a DP call (later maybe more than one chain) */ for (i = 0; i < gt_array_size(buckets); i++) { bucket = gt_array_get(buckets, i); if (chaining_info->call_info->out->showverbose) { if (chaining_info->refseqisindex && !chaining_info->call_info->simfilterparam.online) { /* in this case the exact number of chains is known */ numofchains = gt_array_size(buckets); } else { /* this expression gives an upper bound on the number of chains (because we do not know the exact number here) */ numofchains = chaining_info->bucketnum + gth_seq_con_num_of_seqs(gen_seq_con) * (gth_seq_con_num_of_seqs(ref_seq_con) - bucket->seqnum1); if (numofchains > chaining_info->maxbucketnum) 
numofchains = chaining_info->maxbucketnum; else chaining_info->maxbucketnum = numofchains; } } /* compute a set of fragments from every bucket of matches */ gthinitfragments(fragments, &num_of_fragments, (GthMatch*) gt_array_get_space(matches) + bucket->startpos, bucket->length, rare, fragweightfactor); if (chaining_info->call_info->out->showverbose) { show_chain_calc_status (chaining_info->call_info->out->showverbose, ++chaining_info->bucketnum, numofchains, num_of_fragments, chaining_info->gen_file_num, gth_input_num_of_gen_files(chaining_info->input), chaining_info->ref_file_num, gth_input_num_of_ref_files(chaining_info->input), chaining_info->directmatches, chaining_info->call_info->out->verboseseqs, bucket->seqnum2, bucket->seqnum1); } info.gen_seq_num = ((GthMatch*) gt_array_get(matches, bucket->startpos)) ->Storeseqnumgenomic; info.ref_seq_num = ((GthMatch*) gt_array_get(matches, bucket->startpos)) ->Storeseqnumreference; /* store genomic offset */ range = gth_seq_con_get_range(gen_seq_con, info.gen_seq_num); info.gen_total_length = range.end - range.start + 1; info.gen_offset = range.start; /* store length of reference sequence */ range = gth_seq_con_get_range(ref_seq_con, info.ref_seq_num); info.ref_total_length = range.end - range.start + 1; info.ref_offset = range.start; info.referencelength = range.end - range.start + 1; /* set number of remaining buckets */ info.numofremainingbuckets = gt_array_size(buckets) - i; if (chaining_info->call_info->simfilterparam.paralogs) { gt_globalchaining_coverage(fragments, num_of_fragments, chaining_info->call_info->gcmaxgapwidth, info.referencelength, ((double) chaining_info->call_info->gcmincoverage) / 100.0, gth_save_chain, &info); } else { gt_globalchaining_max(fragments, num_of_fragments, chaining_info->call_info->gcmaxgapwidth, gth_save_chain, &info); } } /* free space */ gt_array_delete(buckets); gt_free(fragments); }