Ejemplo n.º 1
0
/*
  Rebuild the forward ranges of <chain> from all fragments (out of the
  <num_of_fragments> entries of <fragments>) whose range startpos2..endpos2
  overlaps the range currently spanned by the chain. The overlapping ranges
  are sorted with gt_range_compare() and a range which starts at or before
  the end of the previously stored range is merged into it.
  If <comments> is set, the forward ranges present on entry are written to
  <outfp> first.
*/
static void enrich_chain(GthChain *chain, GtFragment *fragments,
                         unsigned long num_of_fragments, bool comments,
                         GtFile *outfp)
{
  GtRange spanned, candidate, *tail;
  GtArray *overlapping;
  unsigned long idx;
  gt_assert(chain && fragments && num_of_fragments);
  if (comments) {
    gt_file_xprintf(outfp, "%c enrich global chain with the following "
                              "forward ranges:\n",COMMENTCHAR);
    gt_file_xprintf(outfp, "%c ", COMMENTCHAR);
    gt_ranges_show(chain->forwardranges, outfp);
  }

  /* the range spanned by the chain on entry */
  spanned = chain_get_genomicrange(chain);

  /* collect every fragment range which overlaps with the spanned range */
  overlapping = gt_array_new(sizeof (GtRange));
  for (idx = 0; idx < num_of_fragments; idx++) {
    candidate.start = fragments[idx].startpos2;
    candidate.end = fragments[idx].endpos2;
    if (gt_range_overlap(&spanned, &candidate))
      gt_array_add(overlapping, candidate);
  }
  gt_assert(gt_array_size(overlapping));

  qsort(gt_array_get_space(overlapping), gt_array_size(overlapping),
        sizeof (GtRange), (GtCompare) gt_range_compare);

  /* throw away the old ranges and start over with the smallest one */
  gt_array_reset(chain->forwardranges);
  candidate = *(GtRange*) gt_array_get_first(overlapping);
  gt_array_add(chain->forwardranges, candidate);

  for (idx = 1; idx < gt_array_size(overlapping); idx++) {
    candidate = *(GtRange*) gt_array_get(overlapping, idx);
    tail = (GtRange*) gt_array_get_last(chain->forwardranges);
    if (candidate.start > tail->end) {
      /* no overlap with the last stored range -> store as new range */
      gt_array_add(chain->forwardranges, candidate);
      continue;
    }
    /* overlap -> extend the last stored range, if necessary */
    if (tail->end < candidate.end)
      tail->end = candidate.end;
  }
  gt_array_delete(overlapping);
}
Ejemplo n.º 2
0
/*
  Append the ranges of <in_ranges> (which must be sorted) to <out_ranges>,
  leaving out every range which is identical to its direct predecessor.
  If <count> is true, a new array of GtUword is returned which stores for
  each appended range how many times it occurred in a row; otherwise NULL is
  returned.
*/
static GtArray* generic_ranges_uniq(GtArray *out_ranges,
                                    const GtArray *in_ranges, bool count)
{
  GtUword idx, *last_count, initial_count = 1;
  GtArray *counts = NULL;
  GtRange current  = { GT_UNDEF_UWORD, GT_UNDEF_UWORD },
          previous = { GT_UNDEF_UWORD, GT_UNDEF_UWORD };
  bool duplicate;
  gt_assert(out_ranges && in_ranges);
  gt_assert(gt_ranges_are_sorted(in_ranges));
  if (count)
    counts = gt_array_new(sizeof (GtUword));
  for (idx = 0; idx < gt_array_size(in_ranges); idx++) {
    current = *(GtRange*) gt_array_get(in_ranges, idx);
    /* the very first range has no predecessor to compare against */
    duplicate = idx > 0 && previous.start == current.start &&
                previous.end == current.end;
    if (!duplicate) {
      gt_array_add(out_ranges, current);
      if (count)
        gt_array_add(counts, initial_count);
    }
    else if (count) {
      /* same range again -> bump the counter of the range stored last */
      last_count = gt_array_get_last(counts);
      (*last_count)++;
    }
    previous = current;
  }
  return counts;
}
/*
  Record the range of the feature node <fn> in the range list stored for its
  sequence ID in the visitor's region2rangelist map. If the list already
  contains a range and the last one (with its end extended by
  max_feature_dist) overlaps the new range, the end of the last range is
  raised to the end of the new range if that is larger. In all other cases
  the new range is appended. Always returns 0.
*/
static int gt_regioncov_visitor_feature_node(GtNodeVisitor *nv,
                                             GtFeatureNode *fn,
                                             GT_UNUSED GtError *err)
{
  GtRange *last_ptr, extended_last, feature_range;
  GtArray *rangelist;
  GtRegionCovVisitor *visitor;
  GtGenomeNode *gn = (GtGenomeNode*) fn;
  gt_error_check(err);
  visitor = gt_regioncov_visitor_cast(nv);
  rangelist = gt_hashmap_get(visitor->region2rangelist,
                             gt_str_get(gt_genome_node_get_seqid(gn)));
  gt_assert(rangelist);
  feature_range = gt_genome_node_get_range(gn);
  if (gt_array_size(rangelist)) {
    last_ptr = gt_array_get_last(rangelist);
    /* work on a copy, the stored range must not get the extension */
    extended_last = *last_ptr;
    extended_last.end += visitor->max_feature_dist;
    if (gt_range_overlap(&extended_last, &feature_range)) {
      if (feature_range.end > last_ptr->end)
        last_ptr->end = feature_range.end;
      return 0;
    }
  }
  gt_array_add(rangelist, feature_range);
  return 0;
}
Ejemplo n.º 4
0
/*
  Fill the (empty) <inverted_chain> from the (non-empty) <chain>: the file
  and sequence numbers are copied, startpos/endpos are set to the start of
  the first and the end of the last forward range, and for each pair of
  neighboring forward ranges the gap between them (end + 1 to start - 1) is
  stored as a forward range of the inverted chain.
*/
static void convert_chain_to_inverted_chain(GthInvertedChain *inverted_chain,
                                            GthChain *chain)
{
  unsigned long gap, num_of_gaps;
  GtRange intron, *left_exon, *right_exon;

  /* inverted chain is empty, chain is not */
  gt_assert(!gt_array_size(inverted_chain->forwardranges));
  gt_assert(gt_array_size(chain->forwardranges));

  /* n ranges are separated by n - 1 gaps */
  num_of_gaps = gt_array_size(chain->forwardranges) - 1;

  /* file and sequence numbers */
  inverted_chain->gen_file_num = chain->gen_file_num;
  inverted_chain->gen_seq_num  = chain->gen_seq_num;
  inverted_chain->ref_file_num = chain->ref_file_num;
  inverted_chain->ref_seq_num  = chain->ref_seq_num;

  /* outer borders */
  left_exon = (GtRange*) gt_array_get_first(chain->forwardranges);
  inverted_chain->startpos = left_exon->start;
  right_exon = (GtRange*) gt_array_get_last(chain->forwardranges);
  inverted_chain->endpos = right_exon->end;

  /* (potential) exons -> (potential) introns */
  for (gap = 0; gap < num_of_gaps; gap++) {
    left_exon = (GtRange*) gt_array_get(chain->forwardranges, gap);
    intron.start = left_exon->end + 1;
    right_exon = (GtRange*) gt_array_get(chain->forwardranges, gap+1);
    intron.end = right_exon->start - 1;
    gt_array_add(inverted_chain->forwardranges, intron);
  }
}
Ejemplo n.º 5
0
/*
  Return the genomic range of <sa> which reaches from the left genomic
  border of its first exon to the right genomic border of its last exon.
  If sa->gen_strand_forward is not set, each border b is transformed to
  gen_total_length - 1 - (b - gen_offset) + gen_offset and the two borders
  trade places; this requires a defined sa->gen_offset.
*/
GtRange gth_sa_range_forward(const GthSA *sa)
{
  GtRange range;
  GtUword left_border, right_border;

  gt_assert(sa);

  left_border  = ((Exoninfo*) gt_array_get_first(sa->exons))
                 ->leftgenomicexonborder;
  right_border = ((Exoninfo*) gt_array_get_last(sa->exons))
                 ->rightgenomicexonborder;

  if (sa->gen_strand_forward) {
    /* nothing to transform */
    range.start = left_border;
    range.end = right_border;
    return range;
  }

  /* genomic offset is defined */
  gt_assert(sa->gen_offset != GT_UNDEF_UWORD);
  range.start = sa->gen_total_length - 1
                - (right_border - sa->gen_offset)
                + sa->gen_offset;
  range.end = sa->gen_total_length - 1
              - (left_border - sa->gen_offset)
              + sa->gen_offset;
  return range;
}
Ejemplo n.º 6
0
/*
  Return the length (as computed by gt_range_length()) of the range which
  reaches from the start of the first to the end of the last range stored in
  <ranges>.
*/
GtUword gt_ranges_spanned_length(const GtArray *ranges)
{
  GtRange span;
  const GtRange *first, *last;
  gt_assert(ranges);
  first = (GtRange*) gt_array_get_first(ranges);
  last = (GtRange*) gt_array_get_last(ranges);
  span.start = first->start;
  span.end = last->end;
  return gt_range_length(&span);
}
Ejemplo n.º 7
0
/*
  Remove from the backtrace path <bp> the part which the path walker <pw>
  has already walked. Only forward path walkers are supported, the reverse
  case triggers an assertion.
  The number of edit operations reported by gth_path_walker_actual_eops() is
  dropped from the end of bp->editoperations; if the walker stopped inside
  an edit operation, the length of the (new) last edit operation is reduced
  by the number of steps taken in it. Afterwards the genomic and reference
  DP start/length values are shifted by the distances the walker covered.
  If <showeops> is set, the path walker and the backtrace path (before and
  after the removal) are written to <outfp>.
*/
void gth_backtrace_path_cutoff_walked_path(GthBacktracePath *bp,
                                           const GthPathWalker *pw,
                                           bool showeops, GtFile *outfp)
{
  unsigned int last_eop_length;
  gt_assert(bp && pw);

  if (!gth_path_walker_is_forward(pw)) {
    gt_assert(0); /* XXX: implement reverse case */
    gt_assert(!backtrace_path_end_cutoffs_are_set(bp));
    return;
  }

  gt_assert(!backtrace_path_start_cutoffs_are_set(bp));
  if (showeops) {
    gt_file_xprintf(outfp, "%s(): show path walker\n", __func__);
    gth_path_walker_show(pw, outfp);
    gt_file_xprintf(outfp, "%s(): show backtrace path (before eop "
                       "removal)\n", __func__);
    gth_backtrace_path_show(bp, false, 0, outfp);
  }

  /* drop the edit operations which have been walked completely */
  gt_array_set_size(bp->editoperations,
                    gt_array_size(bp->editoperations) -
                    gth_path_walker_actual_eops(pw));
  if (showeops) {
    gt_file_xprintf(outfp, "%s(): show backtrace path (after eop "
                       "removal)\n", __func__);
    gth_backtrace_path_show(bp, false, 0, outfp);
  }

  /* shorten the last edit operation, if it has been walked partially */
  if (gth_path_walker_steps_in_current_eop(pw)) {
    last_eop_length =
      gt_editoperation_length(*(Editoperation*)
                              gt_array_get_last(bp->editoperations),
                              bp->alphatype == PROTEIN_ALPHA);
    gt_assert(last_eop_length > gth_path_walker_steps_in_current_eop(pw));
    gt_editoperation_set_length(gt_array_get_last(bp->editoperations),
                                last_eop_length -
                                gth_path_walker_steps_in_current_eop(pw),
                                bp->alphatype == PROTEIN_ALPHA);
  }

  /* move the genomic and reference DP ranges accordingly */
  bp->gen_dp_start += gth_path_walker_gen_distance(pw);
  bp->gen_dp_length -= gth_path_walker_gen_distance(pw);
  bp->ref_dp_start += gth_path_walker_ref_distance(pw);
  bp->ref_dp_length -= gth_path_walker_ref_distance(pw);
}
Ejemplo n.º 8
0
/*
  Return the range spanned by the forward ranges of <chain>, i.e., from the
  start of the first to the end of the last forward range.
*/
static GtRange chain_get_genomicrange(GthChain *chain)
{
  GtRange spanned;
  const GtRange *first, *last;
  gt_assert(chain);
  first = (GtRange*) gt_array_get_first(chain->forwardranges);
  last = (GtRange*) gt_array_get_last(chain->forwardranges);
  spanned.start = first->start;
  spanned.end = last->end;
  gt_assert(spanned.start <= spanned.end);
  return spanned;
}
Ejemplo n.º 9
0
GtRange gth_sa_range_actual(const GthSA *sa)
{
  GtRange range;

  gt_assert(sa);

  range.start = ((Exoninfo*) gt_array_get_first(sa->exons))
                ->leftgenomicexonborder;
  range.end = ((Exoninfo*) gt_array_get_last(sa->exons))
              ->rightgenomicexonborder;

  return range;
}
Ejemplo n.º 10
0
/*
  Append the first (<end> - <start> + 1) characters of <original_sequence>
  to the spliced sequence of <ss> and record the positions <start> to <end>
  (inclusive) in its position mapping. Parts have to be added in ascending
  order: <start> must be larger than the last position recorded so far.
*/
void gt_splicedseq_add(Splicedseq *ss, unsigned long start, unsigned long end,
                       const char *original_sequence)
{
  unsigned long pos;
  gt_assert(ss && start <= end && original_sequence);
  gt_str_append_cstr_nt(ss->splicedseq, original_sequence,
                        end - start + 1);
  /* make sure elements are added in ascending order */
  gt_assert(!gt_array_size(ss->positionmapping) ||
            start > *(unsigned long*) gt_array_get_last(ss->positionmapping));
  pos = start;
  while (pos <= end) {
    gt_array_add(ss->positionmapping, pos);
    pos++;
  }
}
Ejemplo n.º 11
0
/*
  Return true if the start of the first range and the end of the last range
  in <ranges> both lie within <region>, false otherwise.
*/
bool gt_ranges_borders_are_in_region(GtArray *ranges, const GtRange *region)
{
  gt_assert(ranges && region);

  /* the region end is only looked at if the region start check passed */
  return ((GtRange*) gt_array_get_first(ranges))->start >= region->start &&
         ((GtRange*) gt_array_get_last(ranges))->end <= region->end;
}
Ejemplo n.º 12
0
/*
  Store a contracted version of <src> in <dest>: after copying the core with
  chain_copy_core(), a single forward range and a single reverse range are
  added to <dest>, each reaching from the start of the first to the end of
  the last corresponding range of <src>.
*/
void gth_chain_contract(GthChain *dest, const GthChain *src)
{
  GtRange forward_span, reverse_span;
  const GtRange *first, *last;

  gt_assert(gt_array_size(src->forwardranges) ==
            gt_array_size(src->reverseranges));

  /* copy core */
  chain_copy_core(dest, src);

  /* span of the forward ranges */
  first = (GtRange*) gt_array_get_first(src->forwardranges);
  last = (GtRange*) gt_array_get_last(src->forwardranges);
  forward_span.start = first->start;
  forward_span.end = last->end;

  /* span of the reverse ranges */
  first = (GtRange*) gt_array_get_first(src->reverseranges);
  last = (GtRange*) gt_array_get_last(src->reverseranges);
  reverse_span.start = first->start;
  reverse_span.end = last->end;

  /* store both spans */
  gt_array_add(dest->forwardranges, forward_span);
  gt_array_add(dest->reverseranges, reverse_span);
}
Ejemplo n.º 13
0
/*
  Add a single intron step to the backtrace path <bp>.
  If the last stored edit operation has DELETIONEOP in the bits outside of
  bp->max_identical_length and its length bits are larger than 0 but smaller
  than bp->max_identical_length, it is incremented in place. Otherwise a new
  edit operation with value DELETIONEOP + 1 is appended.
*/
void gth_backtrace_path_add_intron(GthBacktracePath *bp)
{
  Editoperation *last_eop, new_intron_eop = DELETIONEOP + 1;
  unsigned long type_bits, length_bits;
  gt_assert(bp);
  gt_assert(bp->alphatype == DNA_ALPHA || bp->alphatype == PROTEIN_ALPHA);
  if (gt_array_size(bp->editoperations)) {
    last_eop = gt_array_get_last(bp->editoperations);
    type_bits   = *last_eop & ~bp->max_identical_length;
    length_bits = *last_eop &  bp->max_identical_length;
    if (type_bits == DELETIONEOP && length_bits > 0 &&
        length_bits < bp->max_identical_length) {
      /* there is still room in the last edit operation */
      (*last_eop)++;
      return;
    }
  }
  gt_array_add(bp->editoperations, new_intron_eop);
}
Ejemplo n.º 14
0
/*
  Add a single match step to the backtrace path <bp>.
  Unless <ensure_single_match> is set, a last stored edit operation whose
  bits outside of bp->max_identical_length are 0 and whose length bits are
  larger than 0 but smaller than bp->max_identical_length is incremented in
  place. In all other cases a new edit operation with value 1 is appended.
*/
void gth_backtrace_path_add_match(GthBacktracePath *bp,
                                  bool ensure_single_match)
{
  Editoperation *last_eop, single_match_eop = 1;
  unsigned long type_bits, length_bits;
  gt_assert(bp);
  gt_assert(bp->alphatype == DNA_ALPHA || bp->alphatype == PROTEIN_ALPHA);
  if (gt_array_size(bp->editoperations) && !ensure_single_match) {
    last_eop = gt_array_get_last(bp->editoperations);
    type_bits   = *last_eop & ~bp->max_identical_length;
    length_bits = *last_eop &  bp->max_identical_length;
    if (type_bits == 0 && length_bits > 0 &&
        length_bits < bp->max_identical_length) {
      /* there is still room in the last edit operation */
      (*last_eop)++;
      return;
    }
  }
  gt_array_add(bp->editoperations, single_match_eop);
}
Ejemplo n.º 15
0
/*
  Process a single <match> and store it in info->matches. Always returns 0.

  - If info->matchnumcounter is set, the counter of the reference sequence
    of <match> is incremented; the match is discarded if the counter exceeds
    a positive info->maxnumofmatches.
  - Unless the reference sequence is an index which is not processed online,
    a change of the reference sequence number (compared to
    info->lastrefseqnum) causes all matches stored so far to be chained with
    calc_chains_from_matches() and to be removed afterwards.
  - The match is stored (and info->lastrefseqnum updated) only if it does
    not equal the match stored last.
*/
int gth_match_processor(GthMatchProcessorInfo *info, GthSeqCon *gen_seq_con,
                        GthSeqCon *ref_seq_con, GthMatch *match)
{
  if (info->matchnumcounter) {
    info->matchnumcounter[match->Storeseqnumreference]++;

    if (info->maxnumofmatches > 0 &&
        info->matchnumcounter[match->Storeseqnumreference] >
        info->maxnumofmatches) {
      /* too many matches for this reference sequence -> discard match */
      return 0;
    }
  }

  if ((!info->refseqisindex || info->online) &&
      match->Storeseqnumreference != info->lastrefseqnum &&
      gt_array_size(info->matches)) {
    gt_assert(info->chain_collection && info->chaining_info);

    /* chain all current matches */
    calc_chains_from_matches(info->chain_collection, info->matches,
                             info->chaining_info, gen_seq_con, ref_seq_con,
                             info->rare, info->fragweightfactor,
                             info->jump_table_new, info->jump_table_new_reverse,
                             info->jump_table_delete);

    /* and remove them afterwards */
    gt_array_reset(info->matches);
  }

  /* store the match, but only if it does not equal the last one */
  if (!gt_array_size(info->matches) ||
      !gth_matches_are_equal(gt_array_get_last(info->matches), match)) {
    gt_array_add_elem(info->matches, match, sizeof *match);

    /* update last reference sequence number */
    info->lastrefseqnum = match->Storeseqnumreference;
  }

  return 0;
}
Ejemplo n.º 16
0
/*
  Add a single "intron with 2 bases left" step to the protein backtrace path
  <bp>.
  If the last stored edit operation has DELETION_WITH_2_GAPS_EOP in the bits
  outside of bp->max_identical_length and its length bits are larger than 0
  but smaller than bp->max_identical_length, it is incremented in place.
  Otherwise a new edit operation with value DELETION_WITH_2_GAPS_EOP + 1 is
  appended.
*/
void gth_backtrace_path_add_intron_with_2_bases_left(GthBacktracePath *bp)
{
  Editoperation *last_eop,
                new_intron_eop = DELETION_WITH_2_GAPS_EOP + 1;
  unsigned long type_bits, length_bits;
  gt_assert(bp);
  gt_assert(bp->alphatype == PROTEIN_ALPHA);
  gt_assert(bp->max_identical_length == MAXIDENTICALLENGTH_PROTEIN);
  if (gt_array_size(bp->editoperations)) {
    last_eop = gt_array_get_last(bp->editoperations);
    type_bits   = *last_eop & ~bp->max_identical_length;
    length_bits = *last_eop &  bp->max_identical_length;
    if (type_bits == DELETION_WITH_2_GAPS_EOP && length_bits > 0 &&
        length_bits < bp->max_identical_length) {
      /* there is still room in the last edit operation */
      (*last_eop)++;
      return;
    }
  }
  gt_array_add(bp->editoperations, new_intron_eop);
}
Ejemplo n.º 17
0
/*
  Return true if the last edit operation of <bp> has one of the three intron
  types (intron, intron with 1 base left, intron with 2 bases left).
  Return false otherwise, in particular if the last edit operation is the
  dummy denoted by bp->dummy_index.
*/
bool gth_backtrace_path_last_is_intron(const GthBacktracePath *bp)
{
  Eoptype last_type;

  gt_assert(bp);

  /* a dummy which has just been inserted never counts as intron */
  if (bp->dummy_index != GT_UNDEF_ULONG &&
      gt_array_size(bp->editoperations) - 1 == bp->dummy_index) {
    return false;
  }

  last_type = gt_editoperation_type(*(Editoperation*)
                                    gt_array_get_last(bp->editoperations),
                                    bp->alphatype == PROTEIN_ALPHA);
  switch (last_type) {
    case EOP_TYPE_INTRON:
    case EOP_TYPE_INTRON_WITH_1_BASE_LEFT:
    case EOP_TYPE_INTRON_WITH_2_BASES_LEFT:
      return true;
    default:
      return false;
  }
}
Ejemplo n.º 18
0
/*
  Convert the global chain <chain> (whose members are indices into
  <fragments>) into a GthChain and, if it passes the global chain filter
  globalchainislongenough(), add it to info->chain_collection. If it does not
  pass, the freshly created GthChain is deleted again.

  <data> is interpreted as a GthSaveChainInfo* which supplies the file and
  sequence numbers, the filter parameters, the output file and the various
  option flags used below.

  For an accepted chain the following steps are performed:
  - the ranges startpos2..endpos2 of the chain fragments are stored as
    forward ranges, whereby a range which starts at or before the end of the
    previously stored range is merged into it
  - if info->enrichchains is set, the forward ranges are enriched with
    enrich_chain()
  - the forward ranges are copied to the opposite strand
  - if info->jump_table is set, a forward and a reverse jump table are
    computed and attached to the chain
  - optional comment/debug output is written to info->outfp

  <max_gap_width> is only read inside an assertion (in the NDEBUG guarded
  section), which is presumably why it is marked as GT_UNUSED.
*/
void gth_save_chain(GtChain *chain, GtFragment *fragments,
                    unsigned long num_of_fragments,
                    GT_UNUSED unsigned long max_gap_width,
                    void *data)
{
  GthSaveChainInfo *info = (GthSaveChainInfo*) data;
  GtRange range;
  GthChain *gth_chain;
  unsigned long i, fragnum;

  gt_assert(chain_is_colinear(chain, fragments));

  if (info->comments) {
    gt_file_xprintf(info->outfp, "%c process global chain with score %ld\n",
                       COMMENTCHAR, gt_chain_get_score(chain));
    gt_file_xprintf(info->outfp, "%c process global chain with the "
                       "following fragments\n", COMMENTCHAR);
    for (i = 0; i < gt_chain_size(chain); i++)
      showfragment(fragments + gt_chain_get_fragnum(chain, i), info->outfp);
  }

  /* init: create the new chain and copy the file and sequence numbers */
  gth_chain = gth_chain_new();
  gth_chain->gen_file_num = info->gen_file_num;
  gth_chain->gen_seq_num  = info->gen_seq_num;
  gth_chain->ref_file_num = info->ref_file_num;
  gth_chain->ref_seq_num  = info->ref_seq_num;

  /* chain has a minimum length of 1 */
  gt_assert(gt_chain_size(chain));

  /* global chain filter (also receives &gth_chain->refseqcoverage) */
  if (globalchainislongenough(chain, fragments,
                              &gth_chain->refseqcoverage, info->gcmincoverage,
                              info->referencelength, info->stat, info->comments,
                              info->outfp)) {
    /* save all potential exons */
    for (i = 0; i < gt_chain_size(chain); i++) {
      fragnum = gt_chain_get_fragnum(chain, i);
      range.start = fragments[fragnum].startpos2;
      range.end = fragments[fragnum].endpos2;

      /* check for overlap with the range stored last */
      if (i > 0 &&
         range.start <=
         ((GtRange*) gt_array_get_last(gth_chain->forwardranges))->end) {
        /* overlap found -> modify last range */
        gt_assert(((GtRange*) gt_array_get_last(gth_chain->forwardranges))
                  ->end <= range.end);
        ((GtRange*) gt_array_get_last(gth_chain->forwardranges))->end =
          range.end;
      }
      else {
#ifndef NDEBUG
        if (i > 0) {
          /* gap width is smaller or equal than the maximum gap width */
          gt_assert((range.start - 1 -
                 ((GtRange*) gt_array_get_last(gth_chain->forwardranges))
                 ->end + 1 - 1) <= max_gap_width);
        }
#endif
        /* save range */
        gt_array_add(gth_chain->forwardranges, range);
      }
    }

    /* remember the range spanned by the chain before a possible enrichment,
       it is passed to make_list_of_chain_fragments() below */
    GtRange genomicrange = chain_get_genomicrange(gth_chain);

    if (info->enrichchains) {
      enrich_chain(gth_chain, fragments, num_of_fragments, info->comments,
                   info->outfp);
    }

    gt_assert(gt_ranges_are_consecutive(gth_chain->forwardranges));

    /* copy ranges to opposite strand */
    gt_ranges_copy_to_opposite_strand(gth_chain->reverseranges,
                                      gth_chain->forwardranges,
                                      info->gen_total_length,
                                      info->gen_offset);

    /* compute jump table if necessary */
    if (info->jump_table) {
      GthJumpTable *forward_jump_table, *reverse_jump_table;
      GtArray *chain_fragments;
      chain_fragments = make_list_of_chain_fragments(chain, fragments,
                                                     num_of_fragments,
                                                     info->enrichchains,
                                                     &genomicrange);
      /* the reverse jump table is derived from the forward one */
      forward_jump_table =
        info->jump_table_new(gt_array_get_space(chain_fragments),
                             gt_array_size(chain_fragments), info->jtdebug);
      reverse_jump_table =
        info->jump_table_new_reverse(forward_jump_table,
                                     info->gen_total_length, info->gen_offset,
                                     info->ref_total_length, info->ref_offset);
      gt_assert(!gth_chain->forward_jump_table);
      gth_chain->forward_jump_table = forward_jump_table;
      gt_assert(!gth_chain->reverse_jump_table);
      gth_chain->reverse_jump_table = reverse_jump_table;
      gt_array_delete(chain_fragments);
      gth_chain->jump_table_delete = info->jump_table_delete;
    }

    /* save array of potential exons */
    gth_chain_collection_add(info->chain_collection, gth_chain);
    if (info->comments) {
      gt_file_xprintf(info->outfp, "%c global chain with the following "
                                   "ranges has been saved\n",COMMENTCHAR);
      gt_file_xprintf(info->outfp, "%c forward ranges:\n", COMMENTCHAR);
      gt_file_xprintf(info->outfp, "%c ", COMMENTCHAR);
      gt_ranges_show(gth_chain->forwardranges, info->outfp);
      gt_file_xprintf(info->outfp, "%c reverse ranges:\n", COMMENTCHAR);
      gt_file_xprintf(info->outfp, "%c ", COMMENTCHAR);
      gt_ranges_show(gth_chain->reverseranges, info->outfp);
    }

    /* output stored chains here
       (Mohamed needed this to compare the chaining phase of gth with CHAINER)
     */
    if (info->stopafterchaining) {
      gt_file_xprintf(info->outfp,
                      "%c gl. chain with coverage=%.2f and score %ld "
                      "(genseq=%lu, str.=%c, refseq=%lu)\n", COMMENTCHAR,
                      gth_chain->refseqcoverage, gt_chain_get_score(chain),
                      gth_chain->gen_seq_num, SHOWSTRAND(info->directmatches),
                      gth_chain->ref_seq_num);

      for (i = 0; i < gt_chain_size(chain); i++)
        showfragment(fragments + gt_chain_get_fragnum(chain, i), info->outfp);
    }
  }
  else {
    /* for -paralogs this case is not supposed to occur */
    gt_assert(!info->paralogs);
    if (info->comments)
      gt_file_xprintf(info->outfp, "%c global chain discarded\n",
                         COMMENTCHAR);
    /* the chain has not been added to the collection -> free it here */
    gth_chain_delete(gth_chain);
  }
}
Ejemplo n.º 19
0
/*
  Determine sa->polyAtailpos by scanning the encoded reference sequence
  <ref_seq_tran> (symbols are compared against the <ref_alphabet> encoding of
  'A' and 'T', respectively). Both borders are set to 0 if nothing is found.

  1. Forward scan for 'A' symbols up to the end of the reference sequence.
     The scan starts right behind the right reference border of the last
     exon, if that position lies within the last CALCPOLYATAILWINDOW + 1
     positions of the reference; otherwise it starts at the beginning of
     that window (or at 0, if the reference is shorter than the window).
  2. If step 1 was not successful: backward scan for 'T' symbols down to
     position 0, starting left of the left reference border of the first
     exon or at CALCPOLYATAILWINDOW - 1, whichever is smaller.

  In both scans <ppa> counts the matching symbols and <mma> the tolerated
  mismatches: after at least one matching symbol a single non-matching
  symbol is tolerated; any further non-matching symbol ends the scan, if
  <ppa> has reached MINIMUMPOLYATAILLENGTH, and resets both counters
  otherwise. A scan was successful if <ppa> >= MINIMUMPOLYATAILLENGTH holds
  afterwards.

  Note that for a hit of the backward scan the stored start is larger than
  or equal to the stored end.
*/
void gth_sa_calc_polyAtailpos(GthSA *sa, const unsigned char *ref_seq_tran,
                              GtAlphabet *ref_alphabet)
{
  GtUword ppa, mma, rightreferenceborder, referencelength;
  GtWord i, leftreferenceborder;

  /* default: nothing found */
  sa->polyAtailpos.start = 0;
  sa->polyAtailpos.end = 0;
  ppa = mma = 0;

  rightreferenceborder = ((Exoninfo*) gt_array_get_last(sa->exons))
                         ->rightreferenceexonborder;
  leftreferenceborder  = ((Exoninfo*) gt_array_get_first(sa->exons))
                         ->leftreferenceexonborder;

  /* setting i (start position of the forward scan);
     NOTE(review): for referencelength < 1 + CALCPOLYATAILWINDOW the
     subtraction in the first condition presumably wraps around (unsigned
     arithmetic), that case is then handled in the else branch -- confirm */
  referencelength = gth_sa_ref_total_length(sa);
  if ((rightreferenceborder + 1) >=
      (referencelength - 1 - CALCPOLYATAILWINDOW)) {
    i = gt_safe_cast2long(rightreferenceborder + 1);
  }
  else {
    if (referencelength < 1 + CALCPOLYATAILWINDOW)
      i = 0;
    else
      i =  referencelength - 1 - CALCPOLYATAILWINDOW;
  }

  /* forward scan for 'A' symbols */
  for (/* i already set */; i < gt_safe_cast2long(referencelength); i++) {
    if (ref_seq_tran[i] == gt_alphabet_encode(ref_alphabet, 'A'))
      ppa++;
    else {
      if (ppa > 0 && mma < 1) {
        /* tolerate a single mismatch */
        mma++;
        continue;
      }
      else {
        if (ppa >= MINIMUMPOLYATAILLENGTH)
          break;
        else {
          /* run too short -> start over */
          ppa = mma = 0;
          continue;
        }
      }
    }
  }

  if (ppa >= MINIMUMPOLYATAILLENGTH) {
    /* i is the position right behind the last scanned symbol of the run */
    sa->polyAtailpos.start = gt_safe_cast2ulong(i - ppa - mma);
    sa->polyAtailpos.end = i - 1;
  }
  else {
    ppa = mma = 0;

    /* setting i (start position of the backward scan) */
    if ((leftreferenceborder - 1) <= CALCPOLYATAILWINDOW)
      i = leftreferenceborder - 1;
    else
      i =  CALCPOLYATAILWINDOW - 1;

    /* backward scan for 'T' symbols; i is signed, so the loop terminates
       after position 0 has been processed */
    for (/* i already set */; i >= 0; i--) {
      if (ref_seq_tran[i] == gt_alphabet_encode(ref_alphabet, 'T'))
        ppa++;
      else {
        if (ppa > 0 && mma < 1) {
          /* tolerate a single mismatch */
          mma++;
          continue;
        }
        else {
          if (ppa >= MINIMUMPOLYATAILLENGTH)
            break;
          else {
            /* run too short -> start over */
            ppa = mma = 0;
            continue;
          }
        }
      }
    }

    if (ppa >= MINIMUMPOLYATAILLENGTH) {
      /* i is the position right before the first scanned symbol of the run,
         hence start >= end here */
      sa->polyAtailpos.start  = gt_safe_cast2ulong(i + ppa + mma);
      sa->polyAtailpos.end = i + 1;
    }
  }
}
Ejemplo n.º 20
0
/* XXX: change this function: add more sophisticated extension strategy */
/*
  Extend the outer borders of <chain> by DPEXTENSION on both strands: the
  start of the first and the end of the last forward range are moved
  outwards, but not beyond <gen_seq_bounds>; the same is done for the reverse
  ranges with respect to <gen_seq_bounds_rc>.
  <gen_total_length> and <gen_offset> are only used by the final consistency
  assertion.
*/
void gth_chain_extend_borders(GthChain *chain, const GtRange *gen_seq_bounds,
                              const GtRange *gen_seq_bounds_rc,
                              GT_UNUSED unsigned long gen_total_length,
                              GT_UNUSED unsigned long gen_offset)
{
  GtRange *outer;
  long border;

  /* at least one range in chain */
  gt_assert(gt_array_size(chain->forwardranges));
  /* the outer borders of both strands lie in the considered region */
  gt_assert(gt_ranges_borders_are_in_region(chain->forwardranges,
                                            gen_seq_bounds));
  gt_assert(gt_ranges_borders_are_in_region(chain->reverseranges,
                                            gen_seq_bounds_rc));
  /* chain->forwardranges is forward and consecutive */
  gt_assert(gt_ranges_are_consecutive(chain->forwardranges));
  /* valid sequence bounds */
  gt_assert(gen_seq_bounds->start <= gen_seq_bounds->end);
  gt_assert(gen_seq_bounds_rc->start <= gen_seq_bounds_rc->end);

  /* start border, forward strand */
  outer = (GtRange*) gt_array_get_first(chain->forwardranges);
  border = gt_safe_cast2long(outer->start);
  border -= DPEXTENSION;
  if (border < gt_safe_cast2long(gen_seq_bounds->start))
    border = gen_seq_bounds->start;
  outer->start = gt_safe_cast2ulong(border);

  /* start border, reverse complement strand */
  outer = (GtRange*) gt_array_get_first(chain->reverseranges);
  border = gt_safe_cast2long(outer->start);
  border -= DPEXTENSION;
  if (border < gt_safe_cast2long(gen_seq_bounds_rc->start))
    border = gen_seq_bounds_rc->start;
  outer->start = gt_safe_cast2ulong(border);

  /* end border, forward strand */
  outer = (GtRange*) gt_array_get_last(chain->forwardranges);
  border = gt_safe_cast2long(outer->end);
  border += DPEXTENSION;
  if (border > gt_safe_cast2long(gen_seq_bounds->end))
    border = gen_seq_bounds->end;
  outer->end = gt_safe_cast2ulong(border);

  /* end border, reverse complement strand */
  outer = (GtRange*) gt_array_get_last(chain->reverseranges);
  border = gt_safe_cast2long(outer->end);
  border += DPEXTENSION;
  if (border > gt_safe_cast2long(gen_seq_bounds_rc->end))
    border = gen_seq_bounds_rc->end;
  outer->end = gt_safe_cast2ulong(border);

  gt_assert(chain_is_filled_and_consistent(chain, gen_total_length,
                                           gen_offset));
}
Ejemplo n.º 21
0
/*
  Append edit operations of type <eoptype> with a total length of <length>
  (which must be larger than 0) to the edit operation array <bp>.
  <proteineop> selects the maximum length which can be stored in a single
  edit operation (MAXIDENTICALLENGTH_PROTEIN or MAXIDENTICALLENGTH).

  - For matches and the three intron types the length is stored inside the
    edit operation: one edit operation holding <length> (as left behind by
    DETERMINE_TIMES_MAXLEN) is appended first, followed by <times_maxlen>
    edit operations of maximum length.
  - For all other types <length> identical edit operations are appended.

  NOTE(review): the macro DETERMINE_TIMES_MAXLEN is defined elsewhere. It
  presumably splits <length> into <times_maxlen> chunks of size <maxlen> plus
  a rest which remains in <length> and relies on exactly these local variable
  names -- confirm before renaming any of them.
*/
static void add_eop_type_to_eop_array(GtArray *bp, Eoptype eoptype,
                                      unsigned long length, bool proteineop)
{
  Editoperation eop,
                maxlen = proteineop ? (Editoperation) MAXIDENTICALLENGTH_PROTEIN
                                    : (Editoperation) MAXIDENTICALLENGTH;
  Eoptype tmp_eoptype;
  unsigned long i, times_maxlen = 0;

  gt_assert(length > 0);

  switch (eoptype) {
    case EOP_TYPE_MATCH:
      /* here we reproduce the artifact resulting from the dummys used in the
         backtracing procedure to make sure that the parsed array of edit
         operations is exactly the same as the one we have in memory */
      if (proteineop && /* this needs only to be checked for protein bp */
          length > 1 &&       /* and when the length is larger 1 */
          gt_array_size(bp)) { /* we have already stored an eop */
        tmp_eoptype = gt_editoperation_type(*(Editoperation*)
                                         gt_array_get_last(bp), proteineop);
        if (tmp_eoptype == EOP_TYPE_INTRON_WITH_1_BASE_LEFT ||
            tmp_eoptype == EOP_TYPE_INTRON_WITH_2_BASES_LEFT) {
          /* store a separate match of length 1 directly after the intron */
          eop = 1;
          gt_array_add(bp, eop);
          length--;
        }
      }

      /* we store the eop which has not maximal length first to make sure that
         after reversing the array of editoperations has the same form as the
         original one */
      DETERMINE_TIMES_MAXLEN;
      gt_assert(length > 0);
      eop = (Editoperation) length;
      gt_array_add(bp, eop);
      for (i = 0; i < times_maxlen; i++)
        gt_array_add(bp, maxlen);
      break;
    case EOP_TYPE_INTRON:
      /* type constant plus length, the eop with the rest length goes first */
      DETERMINE_TIMES_MAXLEN;
      eop  = DELETIONEOP;
      eop += length;
      gt_array_add(bp, eop);
      eop  = DELETIONEOP;
      eop += maxlen;
      for (i = 0; i < times_maxlen; i++)
        gt_array_add(bp, eop);
      break;
    case EOP_TYPE_INTRON_WITH_1_BASE_LEFT:
      /* same scheme as for EOP_TYPE_INTRON with a different type constant */
      DETERMINE_TIMES_MAXLEN;
      eop  = DELETION_WITH_1_GAP_EOP;
      eop += length;
      gt_array_add(bp, eop);
      eop  = DELETION_WITH_1_GAP_EOP;
      eop += maxlen;
      for (i = 0; i < times_maxlen; i++)
        gt_array_add(bp, eop);
      break;
    case EOP_TYPE_INTRON_WITH_2_BASES_LEFT:
      /* same scheme as for EOP_TYPE_INTRON with a different type constant */
      DETERMINE_TIMES_MAXLEN;
      eop  = DELETION_WITH_2_GAPS_EOP;
      eop += length;
      gt_array_add(bp, eop);
      eop  = DELETION_WITH_2_GAPS_EOP;
      eop += maxlen;
      for (i = 0; i < times_maxlen; i++)
        gt_array_add(bp, eop);
      break;
    /* all remaining types are stored as <length> single edit operations */
    case EOP_TYPE_MISMATCH:
      eop = MISMATCHEOP;
      for (i = 0; i < length; i++)
        gt_array_add(bp, eop);
      break;
    case EOP_TYPE_DELETION:
      eop = DELETIONEOP;
      for (i = 0; i < length; i++)
        gt_array_add(bp, eop);
      break;
    case EOP_TYPE_INSERTION:
      eop = INSERTIONEOP;
      for (i = 0; i < length; i++)
        gt_array_add(bp, eop);
      break;
    case EOP_TYPE_MISMATCH_WITH_1_GAP:
      eop = MISMATCH_WITH_1_GAP_EOP;
      for (i = 0; i < length; i++)
        gt_array_add(bp, eop);
      break;
    case EOP_TYPE_MISMATCH_WITH_2_GAPS:
      eop = MISMATCH_WITH_2_GAPS_EOP;
      for (i = 0; i < length; i++)
        gt_array_add(bp, eop);
      break;
    case EOP_TYPE_DELETION_WITH_1_GAP:
      eop = DELETION_WITH_1_GAP_EOP;
      for (i = 0; i < length; i++)
        gt_array_add(bp, eop);
      break;
    case EOP_TYPE_DELETION_WITH_2_GAPS:
      eop = DELETION_WITH_2_GAPS_EOP;
      for (i = 0; i < length; i++)
        gt_array_add(bp, eop);
      break;
    /* unknown edit operation type */
    default: gt_assert(0);
  }
}