Ejemplo n.º 1
0
/*
  Prepare <sa> for (re-)use: store the genomic DP range and the reference
  alphabet type on its backtrace path, zero the score related fields and
  empty the edit operations as well as the exon and intron arrays.

  <sa>             spliced alignment to set up (dereferenced unconditionally)
  <ref_alphatype>  alphabet type handed to the backtrace path
  <gen_dp_start>   genomic DP start handed to the backtrace path
  <gen_dp_length>  genomic DP length handed to the backtrace path
*/
void gth_sa_set(GthSA *sa,
                GthAlphatype ref_alphatype,
                GtUword gen_dp_start,
                GtUword gen_dp_length)
{
  /* genomic DP range is kept on the backtrace path */
  gth_backtrace_path_set_gen_dp_start(sa->backtrace_path, gen_dp_start);
  gth_backtrace_path_set_gen_dp_length(sa->backtrace_path, gen_dp_length);

  /* scoring results start out at zero; highest_cov is preset to true */
  gth_sa_set_score(sa, 0.0);
  gth_sa_set_coverage(sa, 0.0);
  gth_sa_set_highest_cov(sa, true);
  gth_sa_set_cumlen_scored_exons(sa, 0);

  /* drop the edit operations, then record the reference alphabet type */
  gth_backtrace_path_reset(sa->backtrace_path);
  gth_backtrace_path_set_alphatype(sa->backtrace_path, ref_alphatype);

  /* empty the exon and intron arrays */
  gt_array_reset(sa->exons);
  gt_array_reset(sa->introns);
}
Ejemplo n.º 2
0
/*
  Traverse the edit operations stored in <sa> with the
  computebordersandscores and computescores callbacks and store the results
  in <sa>: the overall alignment score (capped at 1.0), the cumulative length
  of scored exons, the coverage (the larger of the genomic and the reference
  coverage, together with a flag telling which one it was) and the poly(A)
  tail position.

  Preconditions (checked with gt_assert): <sa> contains neither exons nor
  introns yet.

  If <sa> has no edit operations, only the score is set to 0.0 and the
  function returns; all other fields of <sa> are left untouched.

  <proteineop> selects protein edit operation handling; it also scales the
  reference length by GT_CODON_LENGTH when the reference coverage is
  computed. <gen_alphabet> may be NULL, in which case no alphabet characters
  are passed on. The remaining parameters are forwarded unchanged to the
  traversal callbacks through the Computebordersandscoresdata structure.
*/
void gth_compute_scores(GthSA *sa,
                        bool proteineop,
                        GthDPParam *dp_param,
                        void *dp_options_est,
                        const unsigned char *gen_seq_tran,
                        const unsigned char *ref_seq_tran,
                        const unsigned char *ref_seq_orig,
                        const GtTransTable *transtable,
                        unsigned long gen_dp_start,
                        unsigned long scoreminexonlen,
                        bool introncutout,
                        bool gs2out,
                        GthSplicedSeq *spliced_seq,
                        unsigned long ref_dp_length,
                        GtAlphabet *gen_alphabet,
                        GtAlphabet *ref_alphabet,
                        GthDPScoresProtein *dp_scores_protein)
{
  Traversealignmentfunctions travfunctions;
  Traversealignmentstate travstate;
  Computebordersandscoresdata data;
  GthFlt score, coverageofgenomicsegment, coverageofreferencesegment;

  gt_assert(!gth_sa_num_of_exons(sa));
  gt_assert(!gth_sa_num_of_introns(sa));

  travfunctions.processmismatch  = computescoresprocmismatch;
  travfunctions.processdeletion  = computescoresprocdeletion;
  travfunctions.processinsertion = computebordersandscoresprocinsertion;
  travfunctions.processmatch     = computebordersandscoresprocmatch;
  travfunctions.processintron    = computebordersandscoresprocintron;
  travfunctions.breakcondition   = NULL;

  /* additional functions for protein edit operations */
  travfunctions.processintron_with_1_base_left  =
    computebordersandscoresprocintron;
  travfunctions.processintron_with_2_bases_left =
    computebordersandscoresprocintron;
  travfunctions.processmismatch_with_1_gap      =
    computescoresprocmismatchordeletionwithgap;
  travfunctions.processmismatch_with_2_gaps     =
    computescoresprocmismatchordeletionwithgap;
  travfunctions.processdeletion_with_1_gap      =
    computescoresprocmismatchordeletionwithgap;
  travfunctions.processdeletion_with_2_gaps     =
    computescoresprocmismatchordeletionwithgap;

  travstate.proteineop = proteineop;
  travstate.processing_intron_with_1_base_left  = false;
  travstate.processing_intron_with_2_bases_left = false;
  travstate.alignment = gth_sa_get_editoperations(sa);
  travstate.alignmentlength =  gth_sa_get_editoperations_length(sa);
  travstate.genomicptr   = gth_sa_genomiccutoff_start(sa);
  travstate.referenceptr = gth_sa_referencecutoff_start(sa);

  if (travstate.alignmentlength <= 0) {
    /* in this case the alignmentscore is set to 0, which leads to discarding
       this alignment later */
    gth_sa_set_score(sa, 0.0);
    return;
  }

  /* Point at the last edit operation. This is done only after the emptiness
     check above: for an empty alignment "alignment + 0 - 1" would form a
     pointer before the start of the array, which is undefined behavior. */
  travstate.eopptr       = travstate.alignment + travstate.alignmentlength - 1;

  /* editoperations contain no zero base exons */
  gt_assert(gth_sa_contains_no_zero_base_exons(sa));
  /* editoperations contain no leading or terminal introns or insertions */
  gt_assert(containsnoleadingorterminalintronsorinsertions(travstate.alignment,
                                                        travstate
                                                        .alignmentlength,
                                                        proteineop));
  /* sum of edit operations equals referencelength */
  gt_assert(gt_eops_equal_referencelength(travstate.alignment,
                                       travstate.alignmentlength,
                                       ref_dp_length
                                       - gth_sa_referencecutoff_start(sa)
                                       - gth_sa_referencecutoff_end(sa),
                                       proteineop));

  data.proteineop                     = proteineop;
  data.newexon                        = true;
  data.newintron                      = true;
  data.firstexon                      = true;
  data.introncutout                   = introncutout;
  data.gs2out                         = gs2out;
  data.spliced_seq                    = spliced_seq;
  data.singleexonweight               = (GthFlt) 0.0;
  data.maxsingleexonweight            = (GthFlt) 0.0;
  data.overallexonweight              = (GthFlt) 0.0;
  data.maxoverallexonweight           = (GthFlt) 0.0;
  data.cumulativelengthofscoredexons  = 0;

  data.exon.leftgenomicexonborder     = GT_UNDEF_ULONG;
  data.exon.rightgenomicexonborder    = GT_UNDEF_ULONG;
  data.exon.leftreferenceexonborder   = GT_UNDEF_ULONG;
  data.exon.rightreferenceexonborder  = GT_UNDEF_ULONG;
  data.exon.exonscore                 = GTH_UNDEF_GTHDBL;

  data.intron.donorsiteprobability    = GTH_UNDEF_GTHFLT;
  data.intron.acceptorsiteprobability = GTH_UNDEF_GTHFLT;
  data.intron.donorsitescore          = GTH_UNDEF_GTHDBL;
  data.intron.acceptorsitescore       = GTH_UNDEF_GTHDBL;

  data.sa                             = sa;
  data.dp_param                       = dp_param;
  data.dp_options_est                 = dp_options_est;
  data.gen_seq_tran                   = gen_seq_tran;
  data.ref_seq_tran                   = ref_seq_tran;
  data.ref_seq_orig                   = ref_seq_orig;
  data.transtable                     = transtable;
  data.gen_dp_start                   = gen_dp_start;
  data.scoreminexonlen                = scoreminexonlen;
  data.ref_dp_length                  = ref_dp_length;
  data.gen_alphabet                   = gen_alphabet;
  data.gen_alphabet_characters        = gen_alphabet
                                        ? gt_alphabet_characters(gen_alphabet)
                                        : NULL;
  data.dp_scores_protein              = dp_scores_protein;

  gthtraversealignment(true, &travstate, proteineop, &data, &travfunctions);

  /* this is for saving the last exon */
  evalnewintronifpossible(proteineop, &data.newexon, &data.newintron, true,
                          data.introncutout, data.gs2out, data.spliced_seq,
                          &data.exon, &data.intron, &data.singleexonweight,
                          &data.maxsingleexonweight, &data.overallexonweight,
                          &data.maxoverallexonweight,
                          &data.cumulativelengthofscoredexons, sa, &travstate,
                          gen_alphabet, data.dp_param, data.dp_options_est,
                          data.gen_seq_tran, data.ref_seq_tran,
                          data.gen_dp_start, data.scoreminexonlen);

  /* saving the scores for the whole alignment */
  if (data.maxoverallexonweight > 0.0) {
    score = data.overallexonweight / data.maxoverallexonweight;
    /* XXX: the way the alignmentscore is computed, it is possible to get a
       score > 1.0. Since we don't want this, we cap it */
    if (score > 1.0)
      score = 1.0;
  }
  else
    score = 0.0;
  gth_sa_set_score(sa, score);
  gth_sa_set_cumlen_scored_exons(sa, data.cumulativelengthofscoredexons);

  /* fraction of the gen_dp_length which is scored/weighted */
  coverageofgenomicsegment   = (GthFlt) data.cumulativelengthofscoredexons /
                               (GthFlt) gth_sa_gen_dp_length(sa);
  /* coverage of genomic segment is valid value */
  gt_assert(coverageofgenomicsegment >= 0.0 && coverageofgenomicsegment <= 1.0);

  /* fraction of the referencelength which is scored/weighted */
  coverageofreferencesegment = (GthFlt) data.cumulativelengthofscoredexons /
                               (GthFlt) ((proteineop ? GT_CODON_LENGTH : 1) *
                                         gth_sa_ref_total_length(sa));

  /* store the larger of the two coverages and remember which one it was */
  if (coverageofgenomicsegment > coverageofreferencesegment) {
    gth_sa_set_coverage(sa, coverageofgenomicsegment);
    gth_sa_set_highest_cov(sa, true);
  }
  else {
    gth_sa_set_coverage(sa, coverageofreferencesegment);
    gth_sa_set_highest_cov(sa, false);
  }

  /* test the assumption that the coverage is never larger than the default */
  gt_assert(gth_sa_coverage(sa) <= GTH_DEFAULT_MAX_COVERAGE);

  /* compute poly(A) tail position */
  gth_sa_calc_polyAtailpos(sa, ref_seq_tran, ref_alphabet);

  /* determined exons are forward and consecutive */
  gt_assert(gth_sa_exons_are_forward_and_consecutive(sa));
}
Ejemplo n.º 3
0
/*
  End-tag callback of the XML parser for spliced alignments.
  NOTE(review): the XML_Char parameter suggests this is registered as an
  Expat end-element handler -- confirm at the registration site.

  <info> is the Parseinfo structure of the running parse, <name> is the name
  of the element which has just been closed. The character data of that
  element is read from parseinfo->databuf and, depending on <name>, either
  stored directly in the current spliced alignment (parseinfo->currentSA) or
  staged in parseinfo (eoptype, cutoffs, exoninfo, introninfo) until the
  enclosing element is closed. Tag names which match none of the branches are
  silently ignored.

  Closing SPLICEDALIGNMENT_TAG finishes the current spliced alignment: its
  backtrace path is reversed and passed on to parseinfo->saprocessfunc; if
  that function returns non-zero, the error is printed to stderr and the
  process exits.

  NOTE(review): SCANUINT, SCANDOUBLE and ILLEGAL_DATA are macros defined
  outside of this block. From their use here SCANUINT presumably parses
  <data> into <ret>, SCANDOUBLE into <retdouble>, and ILLEGAL_DATA reports
  malformed content -- confirm against their definitions. They rely on the
  local variable names used below, so these must not be renamed.
*/
static void end_element_handler(void *info, const XML_Char *name)
{
  Parseinfo *parseinfo = (Parseinfo*) info;
  GthSA *sa = parseinfo->currentSA;
  GtUword datalength;
  double retdouble;
  GtWord ret;
  char *data;

  /* save data and data length */
  data       = gt_str_get(parseinfo->databuf);
  datalength = gt_str_length(parseinfo->databuf);

  /* perform actions depending on end tag */
  if (strcmp(name, SPLICEDALIGNMENT_TAG) == 0) {
    /* before we store the spliced alignment we have to reverse its edit
       operations */
    gt_assert(sa && gth_sa_backtrace_path(sa));
    gth_backtrace_path_reverse(gth_sa_backtrace_path(sa));

    /* ensure that before an intron which is not in phase the edit operation
       has length 1 (only for protein spliced alignments) */
    gth_backtrace_path_ensure_length_1_before_introns(
                                                     gth_sa_backtrace_path(sa));

    /* hand the completed spliced alignment over to the processing callback */
    if (parseinfo->saprocessfunc(parseinfo->data , sa,
                                 parseinfo->outputfilename, parseinfo->err)) {
      /* XXX: the failure is reported on stderr and terminates the process
         here instead of being propagated to the caller */
      fprintf(stderr, "error: %s\n", gt_error_get(parseinfo->err));
      exit(EXIT_FAILURE);
    }
    /* reset current spliced alignment */
    parseinfo->currentSA = NULL;
 }
  else if (strcmp(name, REFERENCEALPHATYPE_TAG) == 0) {
    /* only the two literal alphabet type names are accepted */
    if (strcmp(data, "DNA_ALPHA") == 0)
      gth_sa_set_alphatype(sa, DNA_ALPHA);
    else if (strcmp(data, "PROTEIN_ALPHA") == 0) {
      gth_sa_set_alphatype(sa, PROTEIN_ALPHA);
    }
    else {
      ILLEGAL_DATA;
    }
  }
  else if (strcmp(name, DNA_EOP_TYPE_TAG) == 0) {
    /* remember the edit operation type; the edit operation itself is added
       when the corresponding length tag is closed */
    if (strcmp(data, "match") == 0)
      parseinfo->eoptype = EOP_TYPE_MATCH;
    else if (strcmp(data, "deletion") == 0)
      parseinfo->eoptype = EOP_TYPE_DELETION;
    else if (strcmp(data, "insertion") == 0)
      parseinfo->eoptype = EOP_TYPE_INSERTION;
    else if (strcmp(data, "mismatch") == 0)
      parseinfo->eoptype = EOP_TYPE_MISMATCH;
    else if (strcmp(data, "intron") == 0)
      parseinfo->eoptype = EOP_TYPE_INTRON;
    else {
      ILLEGAL_DATA;
    }
  }
  else if (strcmp(name, DNA_EOP_LENGTH_TAG) == 0) {
    SCANUINT;
    /* add an edit operation of the previously stored type */
    gth_backtrace_path_add_eop(gth_sa_backtrace_path(sa), parseinfo->eoptype,
                               ret);
  }
  else if (strcmp(name, PROTEIN_EOP_TYPE_TAG) == 0) {
    /* same names as for DNA edit operations plus the gap and
       intron-with-bases-left variants */
    if (strcmp(data, "match") == 0)
      parseinfo->eoptype = EOP_TYPE_MATCH;
    else if (strcmp(data, "deletion") == 0)
      parseinfo->eoptype = EOP_TYPE_DELETION;
    else if (strcmp(data, "insertion") == 0)
      parseinfo->eoptype = EOP_TYPE_INSERTION;
    else if (strcmp(data, "mismatch") == 0)
      parseinfo->eoptype = EOP_TYPE_MISMATCH;
    else if (strcmp(data, "intron") == 0)
      parseinfo->eoptype = EOP_TYPE_INTRON;
    else if (strcmp(data, "mismatch_with_1_gap") == 0)
      parseinfo->eoptype = EOP_TYPE_MISMATCH_WITH_1_GAP;
    else if (strcmp(data, "mismatch_with_2_gaps") == 0)
      parseinfo->eoptype = EOP_TYPE_MISMATCH_WITH_2_GAPS;
    else if (strcmp(data, "deletion_with_1_gap") == 0)
      parseinfo->eoptype = EOP_TYPE_DELETION_WITH_1_GAP;
    else if (strcmp(data, "deletion_with_2_gaps") == 0)
      parseinfo->eoptype = EOP_TYPE_DELETION_WITH_2_GAPS;
    else if (strcmp(data, "intron_with_1_base_left") == 0)
      parseinfo->eoptype = EOP_TYPE_INTRON_WITH_1_BASE_LEFT;
    else if (strcmp(data, "intron_with_2_bases_left") == 0)
      parseinfo->eoptype = EOP_TYPE_INTRON_WITH_2_BASES_LEFT;
    else {
      ILLEGAL_DATA;
    }
  }
  else if (strcmp(name, PROTEIN_EOP_LENGTH_TAG) == 0) {
    SCANUINT;
    /* add an edit operation of the previously stored type */
    gth_backtrace_path_add_eop(gth_sa_backtrace_path(sa), parseinfo->eoptype,
                               ret);
  }
  else if (strcmp(name, INDELCOUNT_TAG) == 0) {
    SCANUINT;
    /* ignore indelcount, gets recomputed anyway */
  }
  else if (strcmp(name, GENOMICLENGTHDP_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_dp_length(sa, ret);
  }
  else if (strcmp(name, GENOMICLENGTHTOTAL_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_total_length(sa, ret);
  }
  else if (strcmp(name, GENOMICOFFSET_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_offset(sa, ret);
  }
  else if (strcmp(name, REFERENCELENGTH_TAG) == 0) {
    SCANUINT;
    gth_sa_set_ref_total_length(sa, ret);
  }
  else if (strcmp(name, DPSTARTPOS_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_dp_start(sa, ret);
  }
  else if (strcmp(name, DPENDPOS_TAG) == 0) {
    SCANUINT;
    /* ignore DP end pos, gets recomputed from gen_dp_length anyway */
    gt_assert(gth_sa_gen_dp_end(sa) == ret);
  }
  else if (strcmp(name, GENOMICFILENAME_TAG) == 0) {
    /* save genomic file name */
    gt_str_append_cstr_nt(parseinfo->genomicfilename, data, datalength);
  }
  else if (strcmp(name, GENOMICFILEHASH_TAG) == 0) {
    /* the file name saved before is passed to process_file() together with
       the hash; its result becomes the genomic file number */
    gth_sa_set_gen_file_num(sa, process_file(parseinfo->input,
                            gt_str_get(parseinfo->genomicfilename), data, false,
                            UNDEF_ALPHA));
    /* reset genomic filename */
    gt_str_reset(parseinfo->genomicfilename);
  }
  else if (strcmp(name, GENOMICSEQNUM_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_seq_num(sa, ret);
  }
  else if (strcmp(name, REFERENCEFILENAME_TAG) == 0) {
    /* save reference file name */
    gt_str_append_cstr_nt(parseinfo->referencefilename, data, datalength);
  }
  else if (strcmp(name, REFERENCEFILEHASH_TAG) == 0) {
    /* the file name saved before is passed to process_file() together with
       the hash and the alphabet type of the spliced alignment; its result
       becomes the reference file number */
    gth_sa_set_ref_file_num(sa, process_file(parseinfo->input,
                                       gt_str_get(parseinfo->referencefilename),
                                                  data, true,
                                                  gth_sa_alphatype(sa)));

    /* reset reference filename */
    gt_str_reset(parseinfo->referencefilename);
  }
  else if (strcmp(name, REFERENCESEQNUM_TAG) == 0) {
    SCANUINT;
    gth_sa_set_ref_seq_num(sa, ret);
  }
  else if (strcmp(name, GENOMICID_TAG) == 0)
    gth_sa_set_gen_id(sa, data);
  else if (strcmp(name, REFERENCEID_TAG) == 0)
    gth_sa_set_ref_id(sa, data);
  else if (strcmp(name, GENOMICSTRANDISFORWARD_TAG) == 0)
    gth_sa_set_gen_strand(sa, parse_boolean(data, parseinfo));
  else if (strcmp(name, REFERENCESTRANDISFORWARD_TAG) == 0)
    gth_sa_set_ref_strand(sa, parse_boolean(data, parseinfo));
  /* the individual cutoff values are staged in parseinfo->cutoffs and stored
     in the spliced alignment when the enclosing start/end cutoffs tag is
     closed */
  else if (strcmp(name, GENOMICCUTOFF_TAG) == 0) {
    SCANUINT;
    parseinfo->cutoffs.genomiccutoff = ret;
  }
  else if (strcmp(name, REFERENCECUTOFF_TAG) == 0) {
    SCANUINT;
    parseinfo->cutoffs.referencecutoff = ret;
  }
  else if (strcmp(name, EOPCUTOFF_TAG) == 0) {
    SCANUINT;
    parseinfo->cutoffs.eopcutoff = ret;
  }
  else if (strcmp(name, CUTOFFSSTART_TAG) == 0)
    gth_sa_set_cutoffs_start(sa, &parseinfo->cutoffs);
  else if (strcmp(name, CUTOFFSEND_TAG) == 0)
    gth_sa_set_cutoffs_end(sa, &parseinfo->cutoffs);
  /* exon borders and score are staged in parseinfo->exoninfo and added to
     the spliced alignment when EXONINFO_TAG is closed */
  else if (strcmp(name, LEFTGENOMICEXONBORDER_TAG) == 0) {
    SCANUINT;
    parseinfo->exoninfo.leftgenomicexonborder = ret;
  }
  else if (strcmp(name, RIGHTGENOMICEXONBORDER_TAG) == 0) {
    SCANUINT;
    parseinfo->exoninfo.rightgenomicexonborder = ret;
  }
  else if (strcmp(name, LEFTREFERENCEEXONBORDER_TAG) == 0) {
    SCANUINT;
    parseinfo->exoninfo.leftreferenceexonborder = ret;
  }
  else if (strcmp(name, RIGHTREFERENCEEXONBORDER_TAG) == 0) {
    SCANUINT;
    parseinfo->exoninfo.rightreferenceexonborder = ret;
  }
  else if (strcmp(name, EXONSCORE_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->exoninfo.exonscore = retdouble;
  }
  else if (strcmp(name, EXONINFO_TAG) == 0)
    gth_sa_add_exon(sa, &parseinfo->exoninfo);
  /* splice site probabilities and scores are staged in parseinfo->introninfo
     and added to the spliced alignment when INTRONINFO_TAG is closed */
  else if (strcmp(name, DONORSITEPROBABILITY_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->introninfo.donorsiteprobability = (GthFlt) retdouble;
  }
  else if (strcmp(name, ACCEPTORSITEPROBABILITY_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->introninfo.acceptorsiteprobability = (GthFlt) retdouble;
  }
  else if (strcmp(name, DONORSITESCORE_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->introninfo.donorsitescore = retdouble;
  }
  else if (strcmp(name, ACCEPTORSITESCORE_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->introninfo.acceptorsitescore = retdouble;
  }
  else if (strcmp(name, INTRONINFO_TAG) == 0)
    gth_sa_add_intron(sa, &parseinfo->introninfo);
  else if (strcmp(name, POLYASTART_TAG) == 0) {
    SCANUINT;
    gth_sa_set_polyAtail_start(sa, ret);
  }
  else if (strcmp(name, POLYAEND_TAG) == 0) {
    SCANUINT;
    gth_sa_set_polyAtail_stop(sa, ret);
  }
  else if (strcmp(name, ALIGNMENTSCORE_TAG) == 0) {
    SCANDOUBLE;
    gth_sa_set_score(sa, retdouble);
  }
  else if (strcmp(name, COVERAGE_TAG) == 0) {
    SCANDOUBLE;
    gth_sa_set_coverage(sa, retdouble);
  }
  else if (strcmp(name, COVERAGEOFGENOMICSEGMENTISHIGHEST_TAG) == 0) {
    gth_sa_set_highest_cov(sa, parse_boolean(data, parseinfo));
  }
  else if (strcmp(name, CUMULATIVELENGTHOFSCOREDEXONS_TAG) == 0) {
    SCANUINT;
    gth_sa_set_cumlen_scored_exons(sa, ret);
  }
}