/*
  Fill the (still empty) <inverted_chain> from <chain>.

  The file/sequence numbers are copied, the start of the first and the end of
  the last forward range are stored as startpos/endpos, and for every pair of
  neighbouring forward ranges the gap between them (end + 1 .. start - 1) is
  appended to the forward ranges of <inverted_chain>.
*/
static void convert_chain_to_inverted_chain(GthInvertedChain *inverted_chain,
                                            GthChain *chain)
{
  unsigned long idx, num_of_gaps = gt_array_size(chain->forwardranges) - 1;
  GtRange *left, *right, gap;

  /* the target has to be empty, the source must contain at least one range */
  gt_assert(!gt_array_size(inverted_chain->forwardranges));
  gt_assert(gt_array_size(chain->forwardranges));

  /* take over the file and sequence numbers unchanged */
  inverted_chain->gen_file_num = chain->gen_file_num;
  inverted_chain->gen_seq_num  = chain->gen_seq_num;
  inverted_chain->ref_file_num = chain->ref_file_num;
  inverted_chain->ref_seq_num  = chain->ref_seq_num;

  /* remember the outer borders of the chain */
  left = (GtRange*) gt_array_get_first(chain->forwardranges);
  right = (GtRange*) gt_array_get_last(chain->forwardranges);
  inverted_chain->startpos = left->start;
  inverted_chain->endpos = right->end;

  /* every gap between two neighbouring ranges becomes one inverted range */
  for (idx = 0; idx < num_of_gaps; idx++) {
    left = (GtRange*) gt_array_get(chain->forwardranges, idx);
    right = (GtRange*) gt_array_get(chain->forwardranges, idx + 1);
    gap.start = left->end + 1;
    gap.end = right->start - 1;
    gt_array_add(inverted_chain->forwardranges, gap);
  }
}
GtRange gth_sa_range_forward(const GthSA *sa) { GtRange range; GtUword leftgenomicborder, rightgenomicborder; gt_assert(sa); leftgenomicborder = ((Exoninfo*) gt_array_get_first(sa->exons)) ->leftgenomicexonborder; rightgenomicborder = ((Exoninfo*) gt_array_get_last(sa->exons)) ->rightgenomicexonborder; if (sa->gen_strand_forward) { range.start = leftgenomicborder; range.end = rightgenomicborder; } else { /* genomic offset is defined */ gt_assert(sa->gen_offset != GT_UNDEF_UWORD); range.start = sa->gen_total_length - 1 - (rightgenomicborder - sa->gen_offset) + sa->gen_offset; range.end = sa->gen_total_length - 1 - (leftgenomicborder - sa->gen_offset) + sa->gen_offset; } return range; }
/*
  Partition pgl->alignments into clusters and append them to pgl->saclusters.

  The first alignment becomes the representative of the first cluster. Each
  further alignment either joins the members of the current cluster (if
  <disableclustersas> is false and gth_sa_cmp_genomic_actual() returns 0 when
  comparing it with the current representative) or closes the current cluster
  and becomes the representative of a new one.
*/
void assemble_cluster(GthPGL *pgl, bool disableclustersas)
{
  GthSACluster *current;
  GthSA *candidate;
  GtUword idx, num_of_alignments;

  /* open the first cluster with the first alignment as representative */
  current = gt_malloc(sizeof (GthSACluster));
  current->representative = *(GthSA**) gt_array_get_first(pgl->alignments);
  current->members = gt_array_new(sizeof (GthSA*));

  num_of_alignments = gt_array_size(pgl->alignments);
  for (idx = 1; idx < num_of_alignments; idx++) {
    candidate = *(GthSA**) gt_array_get(pgl->alignments, idx);

    if (!disableclustersas &&
        !gth_sa_cmp_genomic_actual(&current->representative, &candidate)) {
      /* equal to the representative -> becomes a member of this cluster */
      gt_array_add(current->members, candidate);
      continue;
    }

    /* different (or clustering disabled) -> close cluster, open a new one */
    gt_array_add(pgl->saclusters, current);
    current = gt_malloc(sizeof (GthSACluster));
    current->representative = candidate;
    current->members = gt_array_new(sizeof (GthSA*));
  }

  /* the cluster which is still open has to be stored as well */
  gt_array_add(pgl->saclusters, current);
}
/*
  Return gt_range_length() of the range which reaches from the start of the
  first element of <ranges> to the end of its last element.
*/
GtUword gt_ranges_spanned_length(const GtArray *ranges)
{
  GtRange *first, *last, span;

  gt_assert(ranges);

  first = (GtRange*) gt_array_get_first(ranges);
  span.start = first->start;
  last = (GtRange*) gt_array_get_last(ranges);
  span.end = last->end;

  return gt_range_length(&span);
}
/*
  Return the range from the start of the first to the end of the last forward
  range of <chain>. The resulting start must not exceed the resulting end.
*/
static GtRange chain_get_genomicrange(GthChain *chain)
{
  GtRange *first, *last, span;

  gt_assert(chain);

  first = (GtRange*) gt_array_get_first(chain->forwardranges);
  span.start = first->start;
  last = (GtRange*) gt_array_get_last(chain->forwardranges);
  span.end = last->end;

  gt_assert(span.start <= span.end);
  return span;
}
/*
  Sort the (non-empty) array <matches> with compare_matches() and split the
  sorted sequence into buckets: a bucket is a maximal run of consecutive
  matches which share the same reference and genomic sequence number. For each
  run a Bucket (sequence numbers, index of its first match, number of matches)
  is appended to <buckets>. The largest bucket length found in <buckets> is
  stored in <maxbucketlength>.
*/
static void sort_matches_and_calc_buckets(GtArray *matches, GtArray *buckets,
                                          GtUword *maxbucketlength)
{
  GtUword idx, bucket_start = 0;
  GthMatch *match;
  Bucket current, *stored;

  gt_assert(gt_array_size(matches));

  /* bring the matches into bucket order */
  qsort(gt_array_get_space(matches), gt_array_size(matches),
        sizeof (GthMatch), compare_matches);

  /* the first match opens the first bucket */
  match = gt_array_get_first(matches);
  current.seqnum1 = match->Storeseqnumreference;
  current.seqnum2 = match->Storeseqnumgenomic;
  current.startpos = 0;

  for (idx = 1; idx < gt_array_size(matches); idx++) {
    match = gt_array_get(matches, idx);
    if (match->Storeseqnumreference == current.seqnum1 &&
        match->Storeseqnumgenomic == current.seqnum2) {
      continue; /* still inside the current bucket */
    }
    /* sequence numbers changed: close the bucket covering
       [bucket_start, idx - 1] ... */
    current.length = idx - bucket_start;
    gt_array_add(buckets, current);
    /* ... and open the next one at the current match */
    bucket_start = idx;
    current.seqnum1 = match->Storeseqnumreference;
    current.seqnum2 = match->Storeseqnumgenomic;
    current.startpos = idx;
  }

  /* close the bucket which is still open (idx == number of matches here) */
  current.length = idx - bucket_start;
  gt_array_add(buckets, current);

  /* determine the maximum over all stored bucket lengths */
  *maxbucketlength = 0;
  for (idx = 0; idx < gt_array_size(buckets); idx++) {
    stored = gt_array_get(buckets, idx);
    if (*maxbucketlength < stored->length)
      *maxbucketlength = stored->length;
  }

  gt_assert(sum_of_bucket_lengths_equals_num_of_matches(buckets,
                                                  gt_array_size(matches)));
}
GtRange gth_sa_range_actual(const GthSA *sa) { GtRange range; gt_assert(sa); range.start = ((Exoninfo*) gt_array_get_first(sa->exons)) ->leftgenomicexonborder; range.end = ((Exoninfo*) gt_array_get_last(sa->exons)) ->rightgenomicexonborder; return range; }
/*
  Replace the forward ranges of <chain> by the merged genomic ranges
  (startpos2 .. endpos2) of all <fragments> which overlap the range spanned by
  the current forward ranges.

  The overlapping fragment ranges are collected, sorted with
  gt_range_compare() and then merged: a range whose start is not larger than
  the end of the previously stored range is folded into it, any other range is
  stored as a new forward range. If <comments> is set, the forward ranges are
  shown on <outfp> before they are modified.
*/
static void enrich_chain(GthChain *chain, GtFragment *fragments,
                         unsigned long num_of_fragments, bool comments,
                         GtFile *outfp)
{
  GtRange spanned, candidate, *tail;
  GtArray *overlapping;
  unsigned long idx;

  gt_assert(chain && fragments && num_of_fragments);

  if (comments) {
    gt_file_xprintf(outfp, "%c enrich global chain with the following "
                    "forward ranges:\n",COMMENTCHAR);
    gt_file_xprintf(outfp, "%c ", COMMENTCHAR);
    gt_ranges_show(chain->forwardranges, outfp);
  }

  /* the genomic range covered by the current forward ranges */
  spanned = chain_get_genomicrange(chain);

  /* collect the genomic range of every fragment overlapping it */
  overlapping = gt_array_new(sizeof (GtRange));
  for (idx = 0; idx < num_of_fragments; idx++) {
    candidate.start = fragments[idx].startpos2;
    candidate.end = fragments[idx].endpos2;
    if (gt_range_overlap(&spanned, &candidate))
      gt_array_add(overlapping, candidate);
  }
  gt_assert(gt_array_size(overlapping));

  qsort(gt_array_get_space(overlapping), gt_array_size(overlapping),
        sizeof (GtRange), (GtCompare) gt_range_compare);

  /* start over with the first collected range ... */
  gt_array_reset(chain->forwardranges);
  candidate = *(GtRange*) gt_array_get_first(overlapping);
  gt_array_add(chain->forwardranges, candidate);

  /* ... and merge the remaining ones into the forward ranges */
  for (idx = 1; idx < gt_array_size(overlapping); idx++) {
    candidate = *(GtRange*) gt_array_get(overlapping, idx);
    /* fetched anew in every round, because adding may move the elements */
    tail = (GtRange*) gt_array_get_last(chain->forwardranges);

    if (candidate.start > tail->end) {
      /* no overlap with the last stored range -> store as new range */
      gt_array_add(chain->forwardranges, candidate);
      continue;
    }
    /* overlap -> extend the last stored range, if the candidate ends later */
    if (tail->end < candidate.end)
      tail->end = candidate.end;
  }

  gt_array_delete(overlapping);
}
/*
  Return true, if the start of the first element of <ranges> is not smaller
  than the start of <region> and the end of the last element of <ranges> is
  not larger than the end of <region>. Return false otherwise.
*/
bool gt_ranges_borders_are_in_region(GtArray *ranges, const GtRange *region)
{
  gt_assert(ranges && region);

  /* the end is only inspected, if the start border is fine */
  return ((GtRange*) gt_array_get_first(ranges))->start >= region->start &&
         ((GtRange*) gt_array_get_last(ranges))->end <= region->end;
}
/*
  Store a contracted version of <src> in <dest>: after chain_copy_core() has
  been applied, exactly one forward and one reverse range are appended to
  <dest>, each reaching from the start of the first to the end of the last
  corresponding range of <src>. <src> must contain as many forward as reverse
  ranges.
*/
void gth_chain_contract(GthChain *dest, const GthChain *src)
{
  GtRange *first, *last, fwd_span, rev_span;

  gt_assert(gt_array_size(src->forwardranges) ==
            gt_array_size(src->reverseranges));

  chain_copy_core(dest, src);

  /* span of all forward ranges */
  first = (GtRange*) gt_array_get_first(src->forwardranges);
  last = (GtRange*) gt_array_get_last(src->forwardranges);
  fwd_span.start = first->start;
  fwd_span.end = last->end;

  /* span of all reverse ranges */
  first = (GtRange*) gt_array_get_first(src->reverseranges);
  last = (GtRange*) gt_array_get_last(src->reverseranges);
  rev_span.start = first->start;
  rev_span.end = last->end;

  gt_array_add(dest->forwardranges, fwd_span);
  gt_array_add(dest->reverseranges, rev_span);
}
/*
  Append a new element (<seqnum>, <filenum>, <range>) to <seqid_info>.

  This is refused, if the end of <range> or the description range end of the
  first element already stored in <seqid_info> equals GT_UNDEF_ULONG: then
  <err> is set (mentioning <filename> and <seqid>) and -1 is returned.
  Returns 0 on success.
*/
static int seqid_info_add(SeqidInfo *seqid_info, unsigned long seqnum,
                          unsigned long filenum, const GtRange *range,
                          const char *filename, const char *seqid,
                          GtError *err)
{
  SeqidInfoElem *first_elem, new_elem;

  gt_error_check(err);
  gt_assert(range);

  first_elem = gt_array_get_first(seqid_info);
  if (range->end == GT_UNDEF_ULONG ||
      first_elem->descrange.end == GT_UNDEF_ULONG) {
    gt_error_set(err, "sequence file \"%s\" does contain multiple sequences "
                      "with ID \"%s\" and not all of them have description "
                      "ranges", filename, seqid);
    return -1;
  }

  new_elem.descrange = *range;
  new_elem.filenum = filenum;
  new_elem.seqnum = seqnum;
  gt_array_add(seqid_info, new_elem);

  return 0;
}
/*
  Append a new element (<seqnum>, <filenum>, <range>) to <seqid_info>.

  This is refused, if the end of <range> or the description range end of the
  first element already stored in <seqid_info> equals GT_UNDEF_UWORD: then
  <err> is set (mentioning <seqid>) and -1 is returned. Returns 0 on success.
  <filename> is not used.
*/
static int seqid_info_add(SeqidInfo *seqid_info, GtUword seqnum,
                          GtUword filenum, const GtRange *range,
                          GT_UNUSED const char *filename, const char *seqid,
                          GtError *err)
{
  SeqidInfoElem *first_elem, new_elem;

  gt_error_check(err);
  gt_assert(range);

  first_elem = gt_array_get_first(seqid_info);
  if (range->end == GT_UNDEF_UWORD ||
      first_elem->descrange.end == GT_UNDEF_UWORD) {
    gt_error_set(err, "input sequence(s) contain multiple sequences "
                      "with ID \"%s\" and not all of them have description "
                      "ranges", seqid);
    return -1;
  }

  new_elem.descrange = *range;
  new_elem.filenum = filenum;
  new_elem.seqnum = seqnum;
  gt_array_add(seqid_info, new_elem);

  return 0;
}
/*
  Check the phases of the (non-empty) array <cds_features>.

  If the first feature lies on the reverse strand, the array is reversed in
  place before it is processed. The first feature may have any defined phase;
  every further feature must have the phase
  (3 - (length - phase) % 3) % 3 computed from its predecessor.

  A feature with an unexpected phase is handled as follows:
  - If it has already been recorded in v->cds_features, it is registered in
    v->cds_features_to_split (with a warning), provided that v->tidy is set,
    <is_multi> is false and the feature has no children. Otherwise this is an
    error.
  - If it has not been recorded yet, its phase is corrected if v->tidy is set
    (with a warning unless <second_pass> is set). Otherwise this is an error.

  Every feature processed without error is recorded in v->cds_features.
  Returns 0 on success and -1 on error (<err> is set accordingly).
*/
static int check_cds_phases(GtArray *cds_features, GtCDSCheckVisitor *v,
                            bool is_multi, bool second_pass, GtError *err)
{
  GtPhase found_phase, expected_phase = GT_PHASE_ZERO;
  GtFeatureNode *cds;
  GtGenomeNode *gn;
  unsigned long idx, cds_length;
  bool phase_is_wrong;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(cds_features);
  gt_assert(gt_array_size(cds_features));

  /* the strand of the first feature decides about the processing order */
  cds = *(GtFeatureNode**) gt_array_get_first(cds_features);
  if (gt_feature_node_get_strand(cds) == GT_STRAND_REVERSE)
    gt_array_reverse(cds_features);

  for (idx = 0; !had_err && idx < gt_array_size(cds_features); idx++) {
    cds = *(GtFeatureNode**) gt_array_get(cds_features, idx);
    gn = (GtGenomeNode*) cds;
    found_phase = gt_feature_node_get_phase(cds);

    /* The first phase only has to be defined, because according to the GFF3
       specification CDS features must have a defined phase field (otherwise
       the polypeptides of partially annotated genes cannot be inferred). All
       other phases have to match the expected one. */
    if (idx == 0)
      phase_is_wrong = found_phase == GT_PHASE_UNDEFINED;
    else
      phase_is_wrong = found_phase != expected_phase;

    if (phase_is_wrong) {
      if (gt_hashmap_get(v->cds_features, cds)) {
        /* feature has been seen before, i.e., via another parent */
        if (v->tidy && !is_multi && !gt_feature_node_has_children(cds)) {
          /* splittable -> remember it, the split itself is done later */
          gt_warning("%s feature on line %u in file \"%s\" has multiple "
                     "parents which require different phases; split feature",
                     gt_ft_CDS, gt_genome_node_get_line_number(gn),
                     gt_genome_node_get_filename(gn));
          gt_hashmap_add(v->cds_features_to_split, cds, cds);
          v->splitting_is_necessary = true;
        }
        else {
          gt_error_set(err, "%s feature on line %u in file \"%s\" has multiple "
                       "parents which require different phases", gt_ft_CDS,
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn));
          had_err = -1;
        }
      }
      else if (!v->tidy) {
        gt_error_set(err, "%s feature on line %u in file \"%s\" has the "
                     "wrong phase %c (should be %c)", gt_ft_CDS,
                     gt_genome_node_get_line_number(gn),
                     gt_genome_node_get_filename(gn),
                     GT_PHASE_CHARS[found_phase],
                     GT_PHASE_CHARS[expected_phase]);
        had_err = -1;
      }
      else {
        /* tidy mode: correct the phase (silently in the second pass) */
        if (!second_pass) {
          gt_warning("%s feature on line %u in file \"%s\" has the wrong "
                     "phase %c -> correcting it to %c", gt_ft_CDS,
                     gt_genome_node_get_line_number(gn),
                     gt_genome_node_get_filename(gn),
                     GT_PHASE_CHARS[found_phase],
                     GT_PHASE_CHARS[expected_phase]);
        }
        gt_feature_node_set_phase(cds, expected_phase);
      }
    }

    if (!had_err) {
      /* read the phase again, it might have been corrected above */
      found_phase = gt_feature_node_get_phase(cds);
      cds_length = gt_genome_node_get_length(gn);
      expected_phase = (3 - (cds_length - found_phase) % 3) % 3;
      gt_hashmap_add(v->cds_features, cds, cds); /* record CDS feature */
    }
  }

  return had_err;
}
/*
  Extend the outer borders of <chain> by DPEXTENSION positions: the start of
  the first forward and the first reverse range is moved to the left, the end
  of the last forward and the last reverse range is moved to the right.
  Forward borders are clipped to <gen_seq_bounds>, reverse borders to
  <gen_seq_bounds_rc>. <gen_total_length> and <gen_offset> are only used for
  the final consistency assertion.

  XXX: change this function: add more sophisticated extension strategy
*/
void gth_chain_extend_borders(GthChain *chain, const GtRange *gen_seq_bounds,
                              const GtRange *gen_seq_bounds_rc,
                              GT_UNUSED unsigned long gen_total_length,
                              GT_UNUSED unsigned long gen_offset)
{
  GtRange *fwd_first, *fwd_last, *rev_first, *rev_last;
  long border;

  /* the chain contains at least one range */
  gt_assert(gt_array_size(chain->forwardranges));
  /* the borders of both strands lie in the respective genomic region */
  gt_assert(gt_ranges_borders_are_in_region(chain->forwardranges,
                                            gen_seq_bounds));
  gt_assert(gt_ranges_borders_are_in_region(chain->reverseranges,
                                            gen_seq_bounds_rc));
  /* the forward ranges are consecutive */
  gt_assert(gt_ranges_are_consecutive(chain->forwardranges));
  /* the sequence bounds are valid */
  gt_assert(gen_seq_bounds->start <= gen_seq_bounds->end);
  gt_assert(gen_seq_bounds_rc->start <= gen_seq_bounds_rc->end);

  /* the arrays are not resized below, so these pointers stay valid */
  fwd_first = (GtRange*) gt_array_get_first(chain->forwardranges);
  fwd_last = (GtRange*) gt_array_get_last(chain->forwardranges);
  rev_first = (GtRange*) gt_array_get_first(chain->reverseranges);
  rev_last = (GtRange*) gt_array_get_last(chain->reverseranges);

  /* forward strand: move start to the left, but not below the lower bound */
  border = gt_safe_cast2long(fwd_first->start);
  border -= DPEXTENSION;
  if (border < gt_safe_cast2long(gen_seq_bounds->start))
    border = gen_seq_bounds->start;
  fwd_first->start = gt_safe_cast2ulong(border);

  /* reverse complement strand: same for its start */
  border = gt_safe_cast2long(rev_first->start);
  border -= DPEXTENSION;
  if (border < gt_safe_cast2long(gen_seq_bounds_rc->start))
    border = gen_seq_bounds_rc->start;
  rev_first->start = gt_safe_cast2ulong(border);

  /* forward strand: move end to the right, but not above the upper bound */
  border = gt_safe_cast2long(fwd_last->end);
  border += DPEXTENSION;
  if (border > gt_safe_cast2long(gen_seq_bounds->end))
    border = gen_seq_bounds->end;
  fwd_last->end = gt_safe_cast2ulong(border);

  /* reverse complement strand: same for its end */
  border = gt_safe_cast2long(rev_last->end);
  border += DPEXTENSION;
  if (border > gt_safe_cast2long(gen_seq_bounds_rc->end))
    border = gen_seq_bounds_rc->end;
  rev_last->end = gt_safe_cast2ulong(border);

  gt_assert(chain_is_filled_and_consistent(chain, gen_total_length,
                                           gen_offset));
}
/*
  Determine sa->polyAtailpos (both values are 0, if nothing is found).

  First <ref_seq_tran> is scanned upwards until the total reference length is
  reached, counting positions which equal the encoded character 'A'. The scan
  starts behind the right reference border of the last exon, if this position
  is not smaller than (reference length - 1 - CALCPOLYATAILWINDOW); otherwise
  it starts at (reference length - 1 - CALCPOLYATAILWINDOW), or at 0 if the
  reference is shorter than 1 + CALCPOLYATAILWINDOW. One non-matching position
  is tolerated after at least one match; a further one either ends the scan
  (if at least MINIMUMPOLYATAILLENGTH matches have been counted) or resets the
  counters.

  If this does not yield MINIMUMPOLYATAILLENGTH matches, the same scan is done
  downwards to position 0 for the encoded character 'T', starting at the
  smaller of (left reference border of the first exon - 1) and
  (CALCPOLYATAILWINDOW - 1). Note that for a hit found this way the stored
  start is larger than the stored end.
*/
void gth_sa_calc_polyAtailpos(GthSA *sa, const unsigned char *ref_seq_tran,
                              GtAlphabet *ref_alphabet)
{
  GtUword num_hits, num_mismatches, right_ref_border, ref_length;
  GtWord pos, left_ref_border;

  sa->polyAtailpos.start = 0;
  sa->polyAtailpos.end = 0;
  num_hits = num_mismatches = 0;

  right_ref_border = ((Exoninfo*) gt_array_get_last(sa->exons))
                     ->rightreferenceexonborder;
  left_ref_border = ((Exoninfo*) gt_array_get_first(sa->exons))
                    ->leftreferenceexonborder;

  /* choose the start position of the upward scan */
  ref_length = gth_sa_ref_total_length(sa);
  if ((right_ref_border + 1) >= (ref_length - 1 - CALCPOLYATAILWINDOW))
    pos = gt_safe_cast2long(right_ref_border + 1);
  else if (ref_length < 1 + CALCPOLYATAILWINDOW)
    pos = 0;
  else
    pos = ref_length - 1 - CALCPOLYATAILWINDOW;

  /* upward scan for 'A' */
  for (/* pos already set */; pos < gt_safe_cast2long(ref_length); pos++) {
    if (ref_seq_tran[pos] == gt_alphabet_encode(ref_alphabet, 'A')) {
      num_hits++;
      continue;
    }
    if (num_hits > 0 && num_mismatches < 1) {
      num_mismatches++; /* tolerate a single mismatch */
      continue;
    }
    if (num_hits >= MINIMUMPOLYATAILLENGTH)
      break; /* run is long enough */
    num_hits = num_mismatches = 0; /* run too short -> start over */
  }

  if (num_hits >= MINIMUMPOLYATAILLENGTH) {
    sa->polyAtailpos.start = gt_safe_cast2ulong(pos - num_hits
                                                - num_mismatches);
    sa->polyAtailpos.end = pos - 1;
    return;
  }

  /* nothing found -> choose the start position of the downward scan */
  num_hits = num_mismatches = 0;
  if ((left_ref_border - 1) <= CALCPOLYATAILWINDOW)
    pos = left_ref_border - 1;
  else
    pos = CALCPOLYATAILWINDOW - 1;

  /* downward scan for 'T' */
  for (/* pos already set */; pos >= 0; pos--) {
    if (ref_seq_tran[pos] == gt_alphabet_encode(ref_alphabet, 'T')) {
      num_hits++;
      continue;
    }
    if (num_hits > 0 && num_mismatches < 1) {
      num_mismatches++; /* tolerate a single mismatch */
      continue;
    }
    if (num_hits >= MINIMUMPOLYATAILLENGTH)
      break; /* run is long enough */
    num_hits = num_mismatches = 0; /* run too short -> start over */
  }

  if (num_hits >= MINIMUMPOLYATAILLENGTH) {
    sa->polyAtailpos.start = gt_safe_cast2ulong(pos + num_hits
                                                + num_mismatches);
    sa->polyAtailpos.end = pos + 1;
  }
}