Example #1
0
static void evalnewintronifpossible(bool proteineop, bool *newexon,
                                    bool *newintron, bool lastintron,
                                    bool introncutout, bool gs2out,
                                    GthSplicedSeq *spliced_seq,
                                    Exoninfo *exon, Introninfo *intron,
                                    GthFlt *singleexonweight,
                                    GthFlt *maxsingleexonweight,
                                    GthFlt *overallexonweight,
                                    GthFlt *maxoverallexonweight,
                                    unsigned long
                                    *cumulativelengthofscoredexons,
                                    GthSA *sa,
                                    Traversealignmentstate *travstate,
                                    GtAlphabet *gen_alphabet,
                                    GthDPParam *dp_param,
                                    GthDPOptionsEST *dp_options_est,
                                    const unsigned char *gen_seq_tran,
                                    const unsigned char *ref_seq_tran,
                                    unsigned long gen_dp_start,
                                    unsigned long scoreminexonlen)
{
  unsigned long genomicexonlength, splicedpos;

  if (*newintron) { /* in this case an exon will be saved */
    exon->rightgenomicexonborder = gen_dp_start + travstate->genomicptr - 1;
    exon->rightreferenceexonborder = gt_safe_cast2ulong(travstate
                                                        ->referenceptr - 1);
    *newintron = false;
    *newexon   = true;
    if (*maxsingleexonweight > 0.0) {
      exon->exonscore = (GthDbl) ((*singleexonweight) /
                                            (*maxsingleexonweight));
    }
    else
      exon->exonscore = 0.0;

    /* for calculating the alignmentscore and the cumulative length of scored
       exons */
    genomicexonlength = exon->rightgenomicexonborder -
                        exon->leftgenomicexonborder + 1;

    /* short exons are not used for the alignment score */
    if (genomicexonlength >= scoreminexonlen) {
      *overallexonweight             += *singleexonweight;
      *maxoverallexonweight          += *maxsingleexonweight;
    }
    /* coverage includes short exons (not for gs2out): */
    if (gs2out) {
      if (genomicexonlength >= scoreminexonlen)
        *cumulativelengthofscoredexons += genomicexonlength;
    }
    else
      *cumulativelengthofscoredexons += genomicexonlength;

    /* saving the exon */
    gt_assert(exon->leftgenomicexonborder <= exon->rightgenomicexonborder);
    gth_sa_add_exon(sa, exon);

    /* resetting scores */
    *singleexonweight    = (GthFlt) 0.0;
    *maxsingleexonweight = (GthFlt) 0.0;

    /* if this is not the last intron, save the donor site stuff.
       if this is the last intron, this function has been called to save the
       last exon. Therefore, no saving of donor site stuff is necessary. */
    if (!lastintron) {
      /* save donorsiteprobability */
      if (introncutout) {
        splicedpos =
          gth_spliced_seq_orig_to_spliced_pos(spliced_seq,
                      gt_safe_cast2ulong(travstate->genomicptr + gen_dp_start));
        if (splicedpos == GT_UNDEF_ULONG) {
          /* XXX: no spliced position has been found -> this is an artificially
             introduced intron, use 0.0 as donor site probabilty */
          intron->donorsiteprobability = 0.0;
        }
        else {
          intron->donorsiteprobability = (GthFlt)
                                 exp((double) dp_param->log_Pdonor[splicedpos]);
        }
      }
      else {
        intron->donorsiteprobability = (GthFlt)
                      exp((double) dp_param->log_Pdonor[travstate->genomicptr]);
      }

      /* for the cDNAs/ESTs: calculationg donorsitescore: going back from here
       */
      if (proteineop)
        intron->donorsitescore = UNDEFINED_SPLICE_SITE_SCORE;
      else {
        gthcalcsplicesitescore(&intron->donorsitescore, travstate, gen_seq_tran,
                               ref_seq_tran, gen_alphabet, dp_options_est,
                               false);
      }
    }
  }
}
Example #2
0
static void end_element_handler(void *info, const XML_Char *name)
{
  Parseinfo *parseinfo = (Parseinfo*) info;
  GthSA *sa = parseinfo->currentSA;
  GtUword datalength;
  double retdouble;
  GtWord ret;
  char *data;

  /* save data and data length */
  data       = gt_str_get(parseinfo->databuf);
  datalength = gt_str_length(parseinfo->databuf);

  /* perform actions depending on end tag */
  if (strcmp(name, SPLICEDALIGNMENT_TAG) == 0) {
    /* before we store the spliced alignment we have to reverse its edit
       operations */
    gt_assert(sa && gth_sa_backtrace_path(sa));
    gth_backtrace_path_reverse(gth_sa_backtrace_path(sa));

    /* ensure that before an intron which is not in phase the edit operation
       has length 1 (only for protein spliced alignments) */
    gth_backtrace_path_ensure_length_1_before_introns(
                                                     gth_sa_backtrace_path(sa));

    if (parseinfo->saprocessfunc(parseinfo->data , sa,
                                 parseinfo->outputfilename, parseinfo->err)) {
      /* XXX */
      fprintf(stderr, "error: %s\n", gt_error_get(parseinfo->err));
      exit(EXIT_FAILURE);
    }
    /* reset current spliced alignment */
    parseinfo->currentSA = NULL;
 }
  else if (strcmp(name, REFERENCEALPHATYPE_TAG) == 0) {
    if (strcmp(data, "DNA_ALPHA") == 0)
      gth_sa_set_alphatype(sa, DNA_ALPHA);
    else if (strcmp(data, "PROTEIN_ALPHA") == 0) {
      gth_sa_set_alphatype(sa, PROTEIN_ALPHA);
    }
    else {
      ILLEGAL_DATA;
    }
  }
  else if (strcmp(name, DNA_EOP_TYPE_TAG) == 0) {
    if (strcmp(data, "match") == 0)
      parseinfo->eoptype = EOP_TYPE_MATCH;
    else if (strcmp(data, "deletion") == 0)
      parseinfo->eoptype = EOP_TYPE_DELETION;
    else if (strcmp(data, "insertion") == 0)
      parseinfo->eoptype = EOP_TYPE_INSERTION;
    else if (strcmp(data, "mismatch") == 0)
      parseinfo->eoptype = EOP_TYPE_MISMATCH;
    else if (strcmp(data, "intron") == 0)
      parseinfo->eoptype = EOP_TYPE_INTRON;
    else {
      ILLEGAL_DATA;
    }
  }
  else if (strcmp(name, DNA_EOP_LENGTH_TAG) == 0) {
    SCANUINT;
    gth_backtrace_path_add_eop(gth_sa_backtrace_path(sa), parseinfo->eoptype,
                               ret);
  }
  else if (strcmp(name, PROTEIN_EOP_TYPE_TAG) == 0) {
    if (strcmp(data, "match") == 0)
      parseinfo->eoptype = EOP_TYPE_MATCH;
    else if (strcmp(data, "deletion") == 0)
      parseinfo->eoptype = EOP_TYPE_DELETION;
    else if (strcmp(data, "insertion") == 0)
      parseinfo->eoptype = EOP_TYPE_INSERTION;
    else if (strcmp(data, "mismatch") == 0)
      parseinfo->eoptype = EOP_TYPE_MISMATCH;
    else if (strcmp(data, "intron") == 0)
      parseinfo->eoptype = EOP_TYPE_INTRON;
    else if (strcmp(data, "mismatch_with_1_gap") == 0)
      parseinfo->eoptype = EOP_TYPE_MISMATCH_WITH_1_GAP;
    else if (strcmp(data, "mismatch_with_2_gaps") == 0)
      parseinfo->eoptype = EOP_TYPE_MISMATCH_WITH_2_GAPS;
    else if (strcmp(data, "deletion_with_1_gap") == 0)
      parseinfo->eoptype = EOP_TYPE_DELETION_WITH_1_GAP;
    else if (strcmp(data, "deletion_with_2_gaps") == 0)
      parseinfo->eoptype = EOP_TYPE_DELETION_WITH_2_GAPS;
    else if (strcmp(data, "intron_with_1_base_left") == 0)
      parseinfo->eoptype = EOP_TYPE_INTRON_WITH_1_BASE_LEFT;
    else if (strcmp(data, "intron_with_2_bases_left") == 0)
      parseinfo->eoptype = EOP_TYPE_INTRON_WITH_2_BASES_LEFT;
    else {
      ILLEGAL_DATA;
    }
  }
  else if (strcmp(name, PROTEIN_EOP_LENGTH_TAG) == 0) {
    SCANUINT;
    gth_backtrace_path_add_eop(gth_sa_backtrace_path(sa), parseinfo->eoptype,
                               ret);
  }
  else if (strcmp(name, INDELCOUNT_TAG) == 0) {
    SCANUINT;
    /* ignore indelcount, gets recomputed anyway */
  }
  else if (strcmp(name, GENOMICLENGTHDP_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_dp_length(sa, ret);
  }
  else if (strcmp(name, GENOMICLENGTHTOTAL_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_total_length(sa, ret);
  }
  else if (strcmp(name, GENOMICOFFSET_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_offset(sa, ret);
  }
  else if (strcmp(name, REFERENCELENGTH_TAG) == 0) {
    SCANUINT;
    gth_sa_set_ref_total_length(sa, ret);
  }
  else if (strcmp(name, DPSTARTPOS_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_dp_start(sa, ret);
  }
  else if (strcmp(name, DPENDPOS_TAG) == 0) {
    SCANUINT;
    /* ignore DP end pos, gets recomputed from gen_dp_length anyway */
    gt_assert(gth_sa_gen_dp_end(sa) == ret);
  }
  else if (strcmp(name, GENOMICFILENAME_TAG) == 0) {
    /* save genomic file name */
    gt_str_append_cstr_nt(parseinfo->genomicfilename, data, datalength);
  }
  else if (strcmp(name, GENOMICFILEHASH_TAG) == 0) {
    gth_sa_set_gen_file_num(sa, process_file(parseinfo->input,
                            gt_str_get(parseinfo->genomicfilename), data, false,
                            UNDEF_ALPHA));
    /* reset genomic filename */
    gt_str_reset(parseinfo->genomicfilename);
  }
  else if (strcmp(name, GENOMICSEQNUM_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_seq_num(sa, ret);
  }
  else if (strcmp(name, REFERENCEFILENAME_TAG) == 0) {
    /* save reference file name */
    gt_str_append_cstr_nt(parseinfo->referencefilename, data, datalength);
  }
  else if (strcmp(name, REFERENCEFILEHASH_TAG) == 0) {
    gth_sa_set_ref_file_num(sa, process_file(parseinfo->input,
                                       gt_str_get(parseinfo->referencefilename),
                                                  data, true,
                                                  gth_sa_alphatype(sa)));

    /* reset reference filename */
    gt_str_reset(parseinfo->referencefilename);
  }
  else if (strcmp(name, REFERENCESEQNUM_TAG) == 0) {
    SCANUINT;
    gth_sa_set_ref_seq_num(sa, ret);
  }
  else if (strcmp(name, GENOMICID_TAG) == 0)
    gth_sa_set_gen_id(sa, data);
  else if (strcmp(name, REFERENCEID_TAG) == 0)
    gth_sa_set_ref_id(sa, data);
  else if (strcmp(name, GENOMICSTRANDISFORWARD_TAG) == 0)
    gth_sa_set_gen_strand(sa, parse_boolean(data, parseinfo));
  else if (strcmp(name, REFERENCESTRANDISFORWARD_TAG) == 0)
    gth_sa_set_ref_strand(sa, parse_boolean(data, parseinfo));
  else if (strcmp(name, GENOMICCUTOFF_TAG) == 0) {
    SCANUINT;
    parseinfo->cutoffs.genomiccutoff = ret;
  }
  else if (strcmp(name, REFERENCECUTOFF_TAG) == 0) {
    SCANUINT;
    parseinfo->cutoffs.referencecutoff = ret;
  }
  else if (strcmp(name, EOPCUTOFF_TAG) == 0) {
    SCANUINT;
    parseinfo->cutoffs.eopcutoff = ret;
  }
  else if (strcmp(name, CUTOFFSSTART_TAG) == 0)
    gth_sa_set_cutoffs_start(sa, &parseinfo->cutoffs);
  else if (strcmp(name, CUTOFFSEND_TAG) == 0)
    gth_sa_set_cutoffs_end(sa, &parseinfo->cutoffs);
  else if (strcmp(name, LEFTGENOMICEXONBORDER_TAG) == 0) {
    SCANUINT;
    parseinfo->exoninfo.leftgenomicexonborder = ret;
  }
  else if (strcmp(name, RIGHTGENOMICEXONBORDER_TAG) == 0) {
    SCANUINT;
    parseinfo->exoninfo.rightgenomicexonborder = ret;
  }
  else if (strcmp(name, LEFTREFERENCEEXONBORDER_TAG) == 0) {
    SCANUINT;
    parseinfo->exoninfo.leftreferenceexonborder = ret;
  }
  else if (strcmp(name, RIGHTREFERENCEEXONBORDER_TAG) == 0) {
    SCANUINT;
    parseinfo->exoninfo.rightreferenceexonborder = ret;
  }
  else if (strcmp(name, EXONSCORE_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->exoninfo.exonscore = retdouble;
  }
  else if (strcmp(name, EXONINFO_TAG) == 0)
    gth_sa_add_exon(sa, &parseinfo->exoninfo);
  else if (strcmp(name, DONORSITEPROBABILITY_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->introninfo.donorsiteprobability = (GthFlt) retdouble;
  }
  else if (strcmp(name, ACCEPTORSITEPROBABILITY_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->introninfo.acceptorsiteprobability = (GthFlt) retdouble;
  }
  else if (strcmp(name, DONORSITESCORE_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->introninfo.donorsitescore = retdouble;
  }
  else if (strcmp(name, ACCEPTORSITESCORE_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->introninfo.acceptorsitescore = retdouble;
  }
  else if (strcmp(name, INTRONINFO_TAG) == 0)
    gth_sa_add_intron(sa, &parseinfo->introninfo);
  else if (strcmp(name, POLYASTART_TAG) == 0) {
    SCANUINT;
    gth_sa_set_polyAtail_start(sa, ret);
  }
  else if (strcmp(name, POLYAEND_TAG) == 0) {
    SCANUINT;
    gth_sa_set_polyAtail_stop(sa, ret);
  }
  else if (strcmp(name, ALIGNMENTSCORE_TAG) == 0) {
    SCANDOUBLE;
    gth_sa_set_score(sa, retdouble);
  }
  else if (strcmp(name, COVERAGE_TAG) == 0) {
    SCANDOUBLE;
    gth_sa_set_coverage(sa, retdouble);
  }
  else if (strcmp(name, COVERAGEOFGENOMICSEGMENTISHIGHEST_TAG) == 0) {
    gth_sa_set_highest_cov(sa, parse_boolean(data, parseinfo));
  }
  else if (strcmp(name, CUMULATIVELENGTHOFSCOREDEXONS_TAG) == 0) {
    SCANUINT;
    gth_sa_set_cumlen_scored_exons(sa, ret);
  }
}