Exemple #1
0
/* Computes a splice site score for the alignment position described by
   <oldstate> and stores it in <*splicesitescore>.

   A private copy of <oldstate> is traversed with the calcsplicesitescore*
   callbacks; <oldstate> itself is never modified. The traversal direction is
   selected by <acceptorsite> (passed as first argument to
   gthtraversealignment()).

   The stored score is the ratio splicesiteweight / maxsplicesiteweight as
   accumulated by the callbacks, provided that at least
   SSSWINDOW_MINSIZE_FACTOR * SPLICE_SITE_SCORE_WINDOW alignment positions
   have been processed and both weights are positive. Otherwise 0.0 is
   stored. <dp_options_est> must not be NULL. */
void gthcalcsplicesitescore(GthDbl *splicesitescore,
                            Traversealignmentstate *oldstate,
                            const unsigned char *gen_seq_tran,
                            const unsigned char *ref_seq_tran,
                            GtAlphabet *gen_alphabet,
                            GthDPOptionsEST *dp_options_est,
                            bool acceptorsite)
{
  Calcsplicesitescoredata scoredata;
  Traversealignmentfunctions callbacks;
  Traversealignmentstate workstate;

  gt_assert(dp_options_est);

  /* traverse a copy, so that the state of the caller stays untouched */
  workstate = *oldstate;
  if (!acceptorsite) {
    /* donor site: go to the eopptr before the oldstate->eopptr ... */
    workstate.eopptr++;
    /* ... and adjust the sequence pointers to be able to correctly go
       backwards in the alignment */
    workstate.genomicptr--;
    workstate.referenceptr--;
  }

  /* NOTE(review): the protein specific callbacks are left unset here;
     presumably they are never invoked, because the traversal below is started
     with false as third argument -- confirm against gthtraversealignment() */
  callbacks.processmatch     = calcsplicesitescoreprocmatch;
  callbacks.processmismatch  = calcsplicesitescoreprocmismatchordeletion;
  callbacks.processdeletion  = calcsplicesitescoreprocmismatchordeletion;
  callbacks.processinsertion = calcsplicesitescoreprocinsertion;
  callbacks.processintron    = calcsplicesitescoreprocintron;
  callbacks.breakcondition   = calcsplicesitescorebreakcondition;

  /* input for the callbacks */
  scoredata.gen_seq_tran   = gen_seq_tran;
  scoredata.ref_seq_tran   = ref_seq_tran;
  scoredata.gen_alphabet   = gen_alphabet;
  scoredata.dp_options_est = dp_options_est;

  /* accumulators filled by the callbacks */
  scoredata.breaktraversealignment      = false;
  scoredata.processedalignmentpositions = 0;
  scoredata.splicesiteweight            = (GthFlt) 0.0;
  scoredata.maxsplicesiteweight         = (GthFlt) 0.0;

  /* for acceptorsites going forward, for donorsites going backward */
  gthtraversealignment(acceptorsite, &workstate, false, &scoredata,
                       &callbacks);

  /* window too small or non-positive weights: no meaningful score */
  if (!((scoredata.processedalignmentpositions >=
         (SSSWINDOW_MINSIZE_FACTOR * SPLICE_SITE_SCORE_WINDOW)) &&
        (scoredata.splicesiteweight > 0.0) &&
        (scoredata.maxsplicesiteweight > 0.0))) {
    *splicesitescore = 0.0;
    return;
  }

  *splicesitescore = (GthDbl) (scoredata.splicesiteweight /
                               scoredata.maxsplicesiteweight);
}
Exemple #2
0
/* Traverses the edit operations of the spliced alignment <sa> and stores the
   derived values in <sa>: the alignment score (capped at 1.0), the cumulative
   length of the scored exons, the coverage (the larger one of genomic and
   reference coverage, together with a flag telling which one it was) and the
   poly(A) tail position.

   <sa> must not contain any exons or introns when this function is called.
   If <sa> has no edit operations, its score is set to 0.0 and nothing else is
   computed. <proteineop> selects the protein edit operation handling in the
   traversal; the remaining arguments are handed through to the traversal
   callbacks via the Computebordersandscoresdata structure. */
void gth_compute_scores(GthSA *sa,
                        bool proteineop,
                        GthDPParam *dp_param,
                        void *dp_options_est,
                        const unsigned char *gen_seq_tran,
                        const unsigned char *ref_seq_tran,
                        const unsigned char *ref_seq_orig,
                        const GtTransTable *transtable,
                        unsigned long gen_dp_start,
                        unsigned long scoreminexonlen,
                        bool introncutout,
                        bool gs2out,
                        GthSplicedSeq *spliced_seq,
                        unsigned long ref_dp_length,
                        GtAlphabet *gen_alphabet,
                        GtAlphabet *ref_alphabet,
                        GthDPScoresProtein *dp_scores_protein)
{
  Traversealignmentfunctions travfunctions;
  Traversealignmentstate travstate;
  Computebordersandscoresdata data;
  GthFlt score, coverageofgenomicsegment, coverageofreferencesegment;

  gt_assert(!gth_sa_num_of_exons(sa));
  gt_assert(!gth_sa_num_of_introns(sa));

  travfunctions.processmismatch  = computescoresprocmismatch;
  travfunctions.processdeletion  = computescoresprocdeletion;
  travfunctions.processinsertion = computebordersandscoresprocinsertion;
  travfunctions.processmatch     = computebordersandscoresprocmatch;
  travfunctions.processintron    = computebordersandscoresprocintron;
  travfunctions.breakcondition   = NULL;

  /* additional functions for protein edit operations */
  travfunctions.processintron_with_1_base_left  =
    computebordersandscoresprocintron;
  travfunctions.processintron_with_2_bases_left =
    computebordersandscoresprocintron;
  travfunctions.processmismatch_with_1_gap      =
    computescoresprocmismatchordeletionwithgap;
  travfunctions.processmismatch_with_2_gaps     =
    computescoresprocmismatchordeletionwithgap;
  travfunctions.processdeletion_with_1_gap      =
    computescoresprocmismatchordeletionwithgap;
  travfunctions.processdeletion_with_2_gaps     =
    computescoresprocmismatchordeletionwithgap;

  travstate.proteineop = proteineop;
  travstate.processing_intron_with_1_base_left  = false;
  travstate.processing_intron_with_2_bases_left = false;
  travstate.alignment = gth_sa_get_editoperations(sa);
  travstate.alignmentlength =  gth_sa_get_editoperations_length(sa);
  travstate.genomicptr   = gth_sa_genomiccutoff_start(sa);
  travstate.referenceptr = gth_sa_referencecutoff_start(sa);

  if (travstate.alignmentlength <= 0) {
    /* in this case the alignmentscore is set to 0, which leads to discarding
       this alignment later */
    gth_sa_set_score(sa, 0.0);
    return;
  }

  /* The pointer to the last edit operation is computed only after the check
     above: for an empty alignment "alignment + alignmentlength - 1" would
     point before the start of the array, which is undefined behavior even if
     the pointer is never dereferenced. */
  travstate.eopptr = travstate.alignment + travstate.alignmentlength - 1;

  /* editoperations contain no zero base exons */
  gt_assert(gth_sa_contains_no_zero_base_exons(sa));
  /* editoperations contain no leading or terminal introns or insertions */
  gt_assert(containsnoleadingorterminalintronsorinsertions(travstate.alignment,
                                                        travstate
                                                        .alignmentlength,
                                                        proteineop));
  /* sum of edit operations equals referencelength */
  gt_assert(gt_eops_equal_referencelength(travstate.alignment,
                                       travstate.alignmentlength,
                                       ref_dp_length
                                       - gth_sa_referencecutoff_start(sa)
                                       - gth_sa_referencecutoff_end(sa),
                                       proteineop));

  data.proteineop                     = proteineop;
  data.newexon                        = true;
  data.newintron                      = true;
  data.firstexon                      = true;
  data.introncutout                   = introncutout;
  data.gs2out                         = gs2out;
  data.spliced_seq                    = spliced_seq;
  data.singleexonweight               = (GthFlt) 0.0;
  data.maxsingleexonweight            = (GthFlt) 0.0;
  data.overallexonweight              = (GthFlt) 0.0;
  data.maxoverallexonweight           = (GthFlt) 0.0;
  data.cumulativelengthofscoredexons  = 0;

  data.exon.leftgenomicexonborder     = GT_UNDEF_ULONG;
  data.exon.rightgenomicexonborder    = GT_UNDEF_ULONG;
  data.exon.leftreferenceexonborder   = GT_UNDEF_ULONG;
  data.exon.rightreferenceexonborder  = GT_UNDEF_ULONG;
  data.exon.exonscore                 = GTH_UNDEF_GTHDBL;

  data.intron.donorsiteprobability    = GTH_UNDEF_GTHFLT;
  data.intron.acceptorsiteprobability = GTH_UNDEF_GTHFLT;
  data.intron.donorsitescore          = GTH_UNDEF_GTHDBL;
  data.intron.acceptorsitescore       = GTH_UNDEF_GTHDBL;

  data.sa                             = sa;
  data.dp_param                       = dp_param;
  data.dp_options_est                 = dp_options_est;
  data.gen_seq_tran                   = gen_seq_tran;
  data.ref_seq_tran                   = ref_seq_tran;
  data.ref_seq_orig                   = ref_seq_orig;
  data.transtable                     = transtable;
  data.gen_dp_start                   = gen_dp_start;
  data.scoreminexonlen                = scoreminexonlen;
  data.ref_dp_length                  = ref_dp_length;
  data.gen_alphabet                   = gen_alphabet;
  data.gen_alphabet_characters        = gen_alphabet
                                        ? gt_alphabet_characters(gen_alphabet)
                                        : NULL;
  data.dp_scores_protein              = dp_scores_protein;

  gthtraversealignment(true, &travstate, proteineop, &data, &travfunctions);

  /* this is for saving the last exon */
  evalnewintronifpossible(proteineop, &data.newexon, &data.newintron, true,
                          data.introncutout, data.gs2out, data.spliced_seq,
                          &data.exon, &data.intron, &data.singleexonweight,
                          &data.maxsingleexonweight, &data.overallexonweight,
                          &data.maxoverallexonweight,
                          &data.cumulativelengthofscoredexons, sa, &travstate,
                          gen_alphabet, data.dp_param, data.dp_options_est,
                          data.gen_seq_tran, data.ref_seq_tran,
                          data.gen_dp_start, data.scoreminexonlen);

  /* saving the scores for the whole alignment */
  if (data.maxoverallexonweight > 0.0) {
    score = data.overallexonweight / data.maxoverallexonweight;
    /* XXX: the way the alignmentscore is computed, it is possible to get a
       score > 1.0. Since we don't want this, we cap it */
    if (score > 1.0)
      score = 1.0;
  }
  else
    score = 0.0;
  gth_sa_set_score(sa, score);
  gth_sa_set_cumlen_scored_exons(sa, data.cumulativelengthofscoredexons);

  /* fraction of the gen_dp_length which is scored/weighted */
  coverageofgenomicsegment   = (GthFlt) data.cumulativelengthofscoredexons /
                               (GthFlt) gth_sa_gen_dp_length(sa);
  /* coverage of genomic segment is valid value */
  gt_assert(coverageofgenomicsegment >= 0.0 && coverageofgenomicsegment <= 1.0);

  /* fraction of the referencelength which is scored/weighted; for protein
     edit operations the reference length is multiplied by GT_CODON_LENGTH */
  coverageofreferencesegment = (GthFlt) data.cumulativelengthofscoredexons /
                               (GthFlt) ((proteineop ? GT_CODON_LENGTH : 1) *
                                         gth_sa_ref_total_length(sa));

  /* store the larger coverage and remember whether it was the genomic one */
  if (coverageofgenomicsegment > coverageofreferencesegment) {
    gth_sa_set_coverage(sa, coverageofgenomicsegment);
    gth_sa_set_highest_cov(sa, true);
  }
  else {
    gth_sa_set_coverage(sa, coverageofreferencesegment);
    gth_sa_set_highest_cov(sa, false);
  }

  /* test the assumption that the coverage is never larger than the default */
  gt_assert(gth_sa_coverage(sa) <= GTH_DEFAULT_MAX_COVERAGE);

  /* compute poly(A) tail position */
  gth_sa_calc_polyAtailpos(sa, ref_seq_tran, ref_alphabet);

  /* determined exons are forward and consecutive */
  gt_assert(gth_sa_exons_are_forward_and_consecutive(sa));
}
/* Determines the leading and the terminal cutoffs of the backtrace path <bp>.

   The path is traversed twice. The first traversal starts at the last
   element of the edit operation array (first argument of
   gthtraversealignment() is true) and fills <bp->cutoffs.start> according to
   <leadcutoffsmode>. The second one starts at the first element (first
   argument is false) and fills <bp->cutoffs.end> according to
   <termcutoffsmode>. <cutoffsminexonlen> is only used in STRICT mode.
   Both modes must be one of RELAXED, STRICT or MINIMAL. */
static void determine_cutoffs(GthBacktracePath *bp,
                              GthCutoffmode leadcutoffsmode,
                              GthCutoffmode termcutoffsmode,
                              unsigned long cutoffsminexonlen)
{
  Traversealignmentfunctions funcs;
  Traversealignmentstate state;
  Relaxedcutoffsdata relaxed;
  Strictcutoffsdata strict;
  Minimalcutoffsdata minimal;
  bool proteineop = bp->alphatype == PROTEIN_ALPHA;

  /* sum of edit operations equals referencelength (before cutoffs) */
  gt_assert(gth_backtrace_path_is_valid(bp));

  /* set up the traverse alignment state for the leading cutoffs */
  state.proteineop                          = proteineop;
  state.processing_intron_with_1_base_left  = false;
  state.processing_intron_with_2_bases_left = false;
  state.alignment                           = gth_backtrace_path_get(bp);
  state.alignmentlength                     = gth_backtrace_path_length(bp);
  state.eopptr       = state.alignment + state.alignmentlength - 1;
  state.genomicptr   = 0;
  state.referenceptr = 0;

  /* cutting off leading indels in the sequences */
  if (leadcutoffsmode == RELAXED) {
    gt_initRelaxedcutoffsTravfunctions(&funcs);
    gt_initRelaxedcutoffsdata(&relaxed, &bp->cutoffs.start);
    gthtraversealignment(true, &state, proteineop, &relaxed, &funcs);
  }
  else if (leadcutoffsmode == STRICT) {
    gt_initStrictcutoffsTravfunctions(&funcs);
    gt_initStrictcutoffsdata(&strict, &bp->cutoffs.start, cutoffsminexonlen);
    gthtraversealignment(true, &state, proteineop, &strict, &funcs);
  }
  else if (leadcutoffsmode == MINIMAL) {
    gt_initMinimalcutoffsTravfunctions(&funcs);
    gt_initMinimalcutoffsdata(&minimal, &bp->cutoffs.start);
    gthtraversealignment(true, &state, proteineop, &minimal, &funcs);
  }
  else {
    /* unknown cutoff mode */
    gt_assert(0);
  }

  /* reset the traverse alignment state for the terminal cutoffs; this time
     the traversal starts at the first element of the edit operation array */
  state.processing_intron_with_1_base_left  = false;
  state.processing_intron_with_2_bases_left = false;
  state.eopptr       = gth_backtrace_path_get(bp);
  state.genomicptr   = 0;
  state.referenceptr = 0;

  /* cutting off terminal indels in the sequences */
  if (termcutoffsmode == RELAXED) {
    gt_initRelaxedcutoffsTravfunctions(&funcs);
    gt_initRelaxedcutoffsdata(&relaxed, &bp->cutoffs.end);
    gthtraversealignment(false, &state, proteineop, &relaxed, &funcs);
  }
  else if (termcutoffsmode == STRICT) {
    gt_initStrictcutoffsTravfunctions(&funcs);
    gt_initStrictcutoffsdata(&strict, &bp->cutoffs.end, cutoffsminexonlen);
    gthtraversealignment(false, &state, proteineop, &strict, &funcs);
  }
  else if (termcutoffsmode == MINIMAL) {
    gt_initMinimalcutoffsTravfunctions(&funcs);
    gt_initMinimalcutoffsdata(&minimal, &bp->cutoffs.end);
    gthtraversealignment(false, &state, proteineop, &minimal, &funcs);
  }
  else {
    /* unknown cutoff mode */
    gt_assert(0);
  }

  /* sum of edit operations equals referencelength (after cutoffs) */
  gt_assert(gth_backtrace_path_is_valid(bp));
}