Beispiel #1
0
/*
  Handler invoked with the name of an XML element whose end tag was reached
  (the signature matches an Expat end-element callback; presumably it is
  registered as such -- confirm at the registration site).

  info: opaque user pointer, cast to the Parseinfo parsing state.
  name: name of the element that just ended; compared against the *_TAG
        macros to decide what to do.

  The character data collected for the element is read from
  parseinfo->databuf. Depending on the tag it is either stored directly in
  the current spliced alignment (parseinfo->currentSA), or buffered in
  parseinfo (eoptype, cutoffs, exoninfo, introninfo, file names) until the
  enclosing element ends and the buffered values are handed over.
  Element names that match none of the tags are silently ignored.

  SCANUINT, SCANDOUBLE and ILLEGAL_DATA are macros defined outside this
  function. Since <ret> and <retdouble> are never assigned here explicitly,
  SCANUINT presumably parses <data> into <ret> and SCANDOUBLE parses <data>
  into <retdouble> -- verify against the macro definitions. Because these
  macros refer to local names, the locals below should not be renamed
  without checking them.

  NOTE(review): only the SPLICEDALIGNMENT_TAG branch asserts that <sa> is
  non-NULL; all other branches pass <sa> on unchecked.
  NOTE(review): this function reads parseinfo->databuf but never clears it;
  presumably that happens elsewhere -- confirm.
*/
static void end_element_handler(void *info, const XML_Char *name)
{
  Parseinfo *parseinfo = (Parseinfo*) info;
  GthSA *sa = parseinfo->currentSA; /* alignment currently being built */
  GtUword datalength;
  double retdouble; /* result slot used after SCANDOUBLE */
  GtWord ret;       /* result slot used after SCANUINT */
  char *data;

  /* save data and data length */
  data       = gt_str_get(parseinfo->databuf);
  datalength = gt_str_length(parseinfo->databuf);

  /* perform actions depending on end tag */
  if (strcmp(name, SPLICEDALIGNMENT_TAG) == 0) {
    /* the spliced alignment is complete: finalize it and hand it over */

    /* before we store the spliced alignment we have to reverse its edit
       operations */
    gt_assert(sa && gth_sa_backtrace_path(sa));
    gth_backtrace_path_reverse(gth_sa_backtrace_path(sa));

    /* ensure that before an intron which is not in phase the edit operation
       has length 1 (only for protein spliced alignments) */
    gth_backtrace_path_ensure_length_1_before_introns(
                                                     gth_sa_backtrace_path(sa));

    /* pass the finished alignment to the user-supplied processing function;
       a non-zero return value is treated as an error */
    if (parseinfo->saprocessfunc(parseinfo->data , sa,
                                 parseinfo->outputfilename, parseinfo->err)) {
      /* XXX: the error is fatal here, this void handler has no way to report
         a failure to its caller */
      fprintf(stderr, "error: %s\n", gt_error_get(parseinfo->err));
      exit(EXIT_FAILURE);
    }
    /* reset current spliced alignment */
    parseinfo->currentSA = NULL;
 }
  else if (strcmp(name, REFERENCEALPHATYPE_TAG) == 0) {
    /* alphabet type of the reference sequence, given as symbolic name */
    if (strcmp(data, "DNA_ALPHA") == 0)
      gth_sa_set_alphatype(sa, DNA_ALPHA);
    else if (strcmp(data, "PROTEIN_ALPHA") == 0) {
      gth_sa_set_alphatype(sa, PROTEIN_ALPHA);
    }
    else {
      ILLEGAL_DATA;
    }
  }
  else if (strcmp(name, DNA_EOP_TYPE_TAG) == 0) {
    /* remember the edit operation type; the edit operation itself is added
       once the corresponding length element ends */
    if (strcmp(data, "match") == 0)
      parseinfo->eoptype = EOP_TYPE_MATCH;
    else if (strcmp(data, "deletion") == 0)
      parseinfo->eoptype = EOP_TYPE_DELETION;
    else if (strcmp(data, "insertion") == 0)
      parseinfo->eoptype = EOP_TYPE_INSERTION;
    else if (strcmp(data, "mismatch") == 0)
      parseinfo->eoptype = EOP_TYPE_MISMATCH;
    else if (strcmp(data, "intron") == 0)
      parseinfo->eoptype = EOP_TYPE_INTRON;
    else {
      ILLEGAL_DATA;
    }
  }
  else if (strcmp(name, DNA_EOP_LENGTH_TAG) == 0) {
    /* combine the remembered type with the length just read */
    SCANUINT;
    gth_backtrace_path_add_eop(gth_sa_backtrace_path(sa), parseinfo->eoptype,
                               ret);
  }
  else if (strcmp(name, PROTEIN_EOP_TYPE_TAG) == 0) {
    /* same as for DNA, but additional edit operation types are accepted */
    if (strcmp(data, "match") == 0)
      parseinfo->eoptype = EOP_TYPE_MATCH;
    else if (strcmp(data, "deletion") == 0)
      parseinfo->eoptype = EOP_TYPE_DELETION;
    else if (strcmp(data, "insertion") == 0)
      parseinfo->eoptype = EOP_TYPE_INSERTION;
    else if (strcmp(data, "mismatch") == 0)
      parseinfo->eoptype = EOP_TYPE_MISMATCH;
    else if (strcmp(data, "intron") == 0)
      parseinfo->eoptype = EOP_TYPE_INTRON;
    else if (strcmp(data, "mismatch_with_1_gap") == 0)
      parseinfo->eoptype = EOP_TYPE_MISMATCH_WITH_1_GAP;
    else if (strcmp(data, "mismatch_with_2_gaps") == 0)
      parseinfo->eoptype = EOP_TYPE_MISMATCH_WITH_2_GAPS;
    else if (strcmp(data, "deletion_with_1_gap") == 0)
      parseinfo->eoptype = EOP_TYPE_DELETION_WITH_1_GAP;
    else if (strcmp(data, "deletion_with_2_gaps") == 0)
      parseinfo->eoptype = EOP_TYPE_DELETION_WITH_2_GAPS;
    else if (strcmp(data, "intron_with_1_base_left") == 0)
      parseinfo->eoptype = EOP_TYPE_INTRON_WITH_1_BASE_LEFT;
    else if (strcmp(data, "intron_with_2_bases_left") == 0)
      parseinfo->eoptype = EOP_TYPE_INTRON_WITH_2_BASES_LEFT;
    else {
      ILLEGAL_DATA;
    }
  }
  else if (strcmp(name, PROTEIN_EOP_LENGTH_TAG) == 0) {
    /* combine the remembered type with the length just read */
    SCANUINT;
    gth_backtrace_path_add_eop(gth_sa_backtrace_path(sa), parseinfo->eoptype,
                               ret);
  }
  else if (strcmp(name, INDELCOUNT_TAG) == 0) {
    /* the value is still run through SCANUINT, but not stored */
    SCANUINT;
    /* ignore indelcount, gets recomputed anyway */
  }
  else if (strcmp(name, GENOMICLENGTHDP_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_dp_length(sa, ret);
  }
  else if (strcmp(name, GENOMICLENGTHTOTAL_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_total_length(sa, ret);
  }
  else if (strcmp(name, GENOMICOFFSET_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_offset(sa, ret);
  }
  else if (strcmp(name, REFERENCELENGTH_TAG) == 0) {
    SCANUINT;
    gth_sa_set_ref_total_length(sa, ret);
  }
  else if (strcmp(name, DPSTARTPOS_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_dp_start(sa, ret);
  }
  else if (strcmp(name, DPENDPOS_TAG) == 0) {
    SCANUINT;
    /* ignore DP end pos, gets recomputed from gen_dp_length anyway;
       the value read is only compared against the recomputed one via
       gt_assert() */
    gt_assert(gth_sa_gen_dp_end(sa) == ret);
  }
  else if (strcmp(name, GENOMICFILENAME_TAG) == 0) {
    /* save genomic file name (consumed when GENOMICFILEHASH_TAG ends) */
    gt_str_append_cstr_nt(parseinfo->genomicfilename, data, datalength);
  }
  else if (strcmp(name, GENOMICFILEHASH_TAG) == 0) {
    /* <data> holds the content of the hash element; together with the saved
       file name it is passed to process_file(), whose result is stored as
       genomic file number */
    gth_sa_set_gen_file_num(sa, process_file(parseinfo->input,
                            gt_str_get(parseinfo->genomicfilename), data, false,
                            UNDEF_ALPHA));
    /* reset genomic filename */
    gt_str_reset(parseinfo->genomicfilename);
  }
  else if (strcmp(name, GENOMICSEQNUM_TAG) == 0) {
    SCANUINT;
    gth_sa_set_gen_seq_num(sa, ret);
  }
  else if (strcmp(name, REFERENCEFILENAME_TAG) == 0) {
    /* save reference file name (consumed when REFERENCEFILEHASH_TAG ends) */
    gt_str_append_cstr_nt(parseinfo->referencefilename, data, datalength);
  }
  else if (strcmp(name, REFERENCEFILEHASH_TAG) == 0) {
    /* like the genomic case, but the alphabet type already stored in <sa> is
       passed along */
    gth_sa_set_ref_file_num(sa, process_file(parseinfo->input,
                                       gt_str_get(parseinfo->referencefilename),
                                                  data, true,
                                                  gth_sa_alphatype(sa)));

    /* reset reference filename */
    gt_str_reset(parseinfo->referencefilename);
  }
  else if (strcmp(name, REFERENCESEQNUM_TAG) == 0) {
    SCANUINT;
    gth_sa_set_ref_seq_num(sa, ret);
  }
  else if (strcmp(name, GENOMICID_TAG) == 0)
    gth_sa_set_gen_id(sa, data);
  else if (strcmp(name, REFERENCEID_TAG) == 0)
    gth_sa_set_ref_id(sa, data);
  else if (strcmp(name, GENOMICSTRANDISFORWARD_TAG) == 0)
    gth_sa_set_gen_strand(sa, parse_boolean(data, parseinfo));
  else if (strcmp(name, REFERENCESTRANDISFORWARD_TAG) == 0)
    gth_sa_set_ref_strand(sa, parse_boolean(data, parseinfo));
  /* the three cutoff values are buffered in parseinfo->cutoffs and handed to
     the alignment when the enclosing start/end cutoffs element ends */
  else if (strcmp(name, GENOMICCUTOFF_TAG) == 0) {
    SCANUINT;
    parseinfo->cutoffs.genomiccutoff = ret;
  }
  else if (strcmp(name, REFERENCECUTOFF_TAG) == 0) {
    SCANUINT;
    parseinfo->cutoffs.referencecutoff = ret;
  }
  else if (strcmp(name, EOPCUTOFF_TAG) == 0) {
    SCANUINT;
    parseinfo->cutoffs.eopcutoff = ret;
  }
  else if (strcmp(name, CUTOFFSSTART_TAG) == 0)
    gth_sa_set_cutoffs_start(sa, &parseinfo->cutoffs);
  else if (strcmp(name, CUTOFFSEND_TAG) == 0)
    gth_sa_set_cutoffs_end(sa, &parseinfo->cutoffs);
  /* exon borders and score are buffered in parseinfo->exoninfo and added to
     the alignment when the exon info element ends */
  else if (strcmp(name, LEFTGENOMICEXONBORDER_TAG) == 0) {
    SCANUINT;
    parseinfo->exoninfo.leftgenomicexonborder = ret;
  }
  else if (strcmp(name, RIGHTGENOMICEXONBORDER_TAG) == 0) {
    SCANUINT;
    parseinfo->exoninfo.rightgenomicexonborder = ret;
  }
  else if (strcmp(name, LEFTREFERENCEEXONBORDER_TAG) == 0) {
    SCANUINT;
    parseinfo->exoninfo.leftreferenceexonborder = ret;
  }
  else if (strcmp(name, RIGHTREFERENCEEXONBORDER_TAG) == 0) {
    SCANUINT;
    parseinfo->exoninfo.rightreferenceexonborder = ret;
  }
  else if (strcmp(name, EXONSCORE_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->exoninfo.exonscore = retdouble;
  }
  else if (strcmp(name, EXONINFO_TAG) == 0)
    gth_sa_add_exon(sa, &parseinfo->exoninfo);
  /* splice site values are buffered in parseinfo->introninfo and added to
     the alignment when the intron info element ends; the probabilities are
     narrowed to GthFlt, the scores are stored as read */
  else if (strcmp(name, DONORSITEPROBABILITY_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->introninfo.donorsiteprobability = (GthFlt) retdouble;
  }
  else if (strcmp(name, ACCEPTORSITEPROBABILITY_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->introninfo.acceptorsiteprobability = (GthFlt) retdouble;
  }
  else if (strcmp(name, DONORSITESCORE_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->introninfo.donorsitescore = retdouble;
  }
  else if (strcmp(name, ACCEPTORSITESCORE_TAG) == 0) {
    SCANDOUBLE;
    parseinfo->introninfo.acceptorsitescore = retdouble;
  }
  else if (strcmp(name, INTRONINFO_TAG) == 0)
    gth_sa_add_intron(sa, &parseinfo->introninfo);
  else if (strcmp(name, POLYASTART_TAG) == 0) {
    SCANUINT;
    gth_sa_set_polyAtail_start(sa, ret);
  }
  else if (strcmp(name, POLYAEND_TAG) == 0) {
    SCANUINT;
    gth_sa_set_polyAtail_stop(sa, ret);
  }
  else if (strcmp(name, ALIGNMENTSCORE_TAG) == 0) {
    SCANDOUBLE;
    gth_sa_set_score(sa, retdouble);
  }
  else if (strcmp(name, COVERAGE_TAG) == 0) {
    SCANDOUBLE;
    gth_sa_set_coverage(sa, retdouble);
  }
  else if (strcmp(name, COVERAGEOFGENOMICSEGMENTISHIGHEST_TAG) == 0) {
    gth_sa_set_highest_cov(sa, parse_boolean(data, parseinfo));
  }
  else if (strcmp(name, CUMULATIVELENGTHOFSCOREDEXONS_TAG) == 0) {
    SCANUINT;
    gth_sa_set_cumlen_scored_exons(sa, ret);
  }
  /* any other element name: nothing to do */
}
/*
  Run the DNA dynamic programming (via callsahmt()) for one chain on the
  strand(s) selected in <input> and store the resulting spliced alignment.

  The strand given by <directmatches> is tried first, if the input enables
  it (gth_input_forward() for direct matches, gth_input_reverse()
  otherwise). The opposite strand is tried afterwards if the input enables
  it, provided that call_info->cdnaforwardonly is not set and either no DP
  was run so far or the first alignment is poor according to
  gth_sa_is_poor(). If both strands yield an alignment, the one selected by
  gth_sa_B_is_better_than_A() is saved and the other one is deleted.

  <saA> is the pre-allocated alignment for the first DP that is run. When a
  DP was run and did not fail fatally, <saA> has been either handed to
  save_sa() or deleted with gth_sa_delete() by the time this function
  returns 0. On a fatal error <saA> is left untouched.

  Returns 0 if processing can continue with the next hit (this includes the
  case that no usable alignment could be determined), otherwise the non-zero
  error code returned by callsahmt().
*/
static int call_dna_DP(bool directmatches, GthCallInfo *call_info,
                       GthInput *input, GthStat *stat,
                       GthSACollection *sa_collection, GthSA *saA,
                       GtUword gen_file_num,
                       GtUword ref_file_num,
                       GtUword gen_total_length,
                       GtUword gen_offset,
                       const GtRange *gen_seq_bounds,
                       const GtRange *gen_seq_bounds_rc,
                       GtUword ref_total_length, GtUword ref_offset,
                       GtUword chainctr,
                       GtUword num_of_chains, GthMatchInfo *match_info,
                       const unsigned char *ref_seq_tran,
                       const unsigned char *ref_seq_orig,
                       const unsigned char *ref_seq_tran_rc,
                       const unsigned char *ref_seq_orig_rc,
                       GthChain *chain,
                       GthDNACompletePathMatrixJT dna_complete_path_matrix_jt,
                       GthProteinCompletePathMatrixJT
                       protein_complete_path_matrix_jt)
{
  int rval;
  /* firstdp stays true until a DP has been run on the first strand;
     gs2outdirectmatches is only written in this function, never read */
  bool bothstrandsanalyzed, firstdp = true,
       GT_UNUSED gs2outdirectmatches = directmatches;
  GthSA *saB = NULL; /* second alignment, owned by this function until it is
                        saved or deleted */
  GtFile *outfp = call_info->out->outfp;

  /* first strand: the one matching <directmatches> */
  if (directmatches ? gth_input_forward(input)
                    : gth_input_reverse(input)) {
    /* calculate alignment */
    rval = callsahmt(true, saA, directmatches, gen_file_num, ref_file_num,
                     chain, gen_total_length, gen_offset, gen_seq_bounds,
                     gen_seq_bounds_rc,
                     ref_seq_tran, ref_seq_orig, ref_total_length, ref_offset,
                     input, &call_info->simfilterparam.introncutoutinfo, stat,
                     chainctr, num_of_chains, call_info->translationtable,
                     directmatches, call_info->proteinexonpenal,
                     call_info->splice_site_model, call_info->dp_options_core,
                     call_info->dp_options_est, call_info->dp_options_postpro,
                     dna_complete_path_matrix_jt,
                     protein_complete_path_matrix_jt, call_info->out);
    if (rval && rval != GTH_ERROR_SA_COULD_NOT_BE_DETERMINED) {
                     /* ^ this error is treated below */
      return rval;
    }

    firstdp = false;
    bothstrandsanalyzed = gth_input_both(input);

    if (rval == GTH_ERROR_SA_COULD_NOT_BE_DETERMINED ||
        isunsuccessfulalignment(saA, call_info->out->comments, outfp)) {
      match_info->call_number--;
      /* if the spliced alignment was unsuccessful, it is deleted and the
         next hit is considered. */
      gth_sa_delete(saA);
      return 0; /* continue */
    }

    /* if not both strands are analyzed, we can save this alignment now.
       Otherwise we have to calculate the alignment to the other strand
       first and then save the better one. */
    if (!bothstrandsanalyzed)
      save_sa(sa_collection, saA, call_info->sa_filter, match_info, stat);
  }

  /* second strand: the opposite of <directmatches> */
  if (directmatches ? gth_input_reverse(input)
                    : gth_input_forward(input)) {
    if ((firstdp || gth_sa_is_poor(saA, call_info->minaveragessp)) &&
        !call_info->cdnaforwardonly) {
      if (firstdp) {
        /* space for first alignment is already allocated, but we have to
           change the direction of the genomic and the reference strand */
        gth_sa_set_gen_strand(saA, !directmatches);
        gth_sa_set_ref_strand(saA, false);
      }
      else {
        /* allocating space for second alignment */
        saB = gth_sa_new_and_set(!directmatches, false, input,
                                 chain->gen_file_num, chain->gen_seq_num,
                                 chain->ref_file_num, chain->ref_seq_num,
                                 match_info->call_number, gen_total_length,
                                 gen_offset, ref_total_length);
      }

      /* setting gs2outdirectmatches (for compatibility) */
      gs2outdirectmatches = (bool) !directmatches;

      /* calculate alignment */
      rval = callsahmt(true, firstdp ? saA : saB, !directmatches,
                       gen_file_num, ref_file_num, chain, gen_total_length,
                       gen_offset, gen_seq_bounds, gen_seq_bounds_rc,
                       ref_seq_tran_rc, ref_seq_orig_rc, ref_total_length,
                       ref_offset, input,
                       &call_info->simfilterparam.introncutoutinfo, stat,
                       chainctr, num_of_chains, call_info->translationtable,
                       directmatches, call_info->proteinexonpenal,
                       call_info->splice_site_model, call_info->dp_options_core,
                       call_info->dp_options_est, call_info->dp_options_postpro,
                       dna_complete_path_matrix_jt,
                       protein_complete_path_matrix_jt, call_info->out);
      if (rval && rval != GTH_ERROR_SA_COULD_NOT_BE_DETERMINED) {
                       /* ^ this error is treated below */
        /* saB (if allocated) is known to this function only, free it here
           to avoid leaking it on the fatal error path */
        if (saB)
          gth_sa_delete(saB);
        return rval;
      }

      if (firstdp) {
        if (rval == GTH_ERROR_SA_COULD_NOT_BE_DETERMINED ||
            isunsuccessfulalignment(saA, call_info->out->comments, outfp)) {
          /* for compatibility with GS2 */
          /* XXX: makes no sense. Possibly only if -gs2out is used. */
          match_info->significant_match_found= true;

          /* if the spliced alignment was unsuccessful, it is deleted and
             the next hit is considered. */
          gth_sa_delete(saA);
          return 0; /* continue */
        }

        save_sa(sa_collection, saA, call_info->sa_filter, match_info, stat);
      }
      else /* !firstdp */
      {
        /* both strands were aligned: keep the first alignment unless the
           second one was determined successfully and is better */
        if (rval == GTH_ERROR_SA_COULD_NOT_BE_DETERMINED ||
            isunsuccessfulalignment(saB, call_info->out->comments, outfp) ||
            !gth_sa_B_is_better_than_A(saA, saB)) {
          /* insert first SA */
          save_sa(sa_collection, saA, call_info->sa_filter, match_info, stat);
          /* discard second SA */
          gth_sa_delete(saB);
        }
        else {
          /* insert second SA */
          save_sa(sa_collection, saB, call_info->sa_filter, match_info, stat);
          /* free first SA */
          gth_sa_delete(saA);
        }
      }
    }
    else
      save_sa(sa_collection, saA, call_info->sa_filter, match_info, stat);
  }

  return 0;
}