示例#1
0
/* Writes the <MATCH_line> XML element for the spliced alignment <sa> to
   <outfp>. The opening and closing tags are indented by <indentlevel>, the
   child elements (total alignment score, cumulative length of scored exons
   and coverage) by one level more. The ref_strand attribute is only written
   if the reference alphabet type is DNA_ALPHA. */
static void xml_showmatchline(GthSA *sa, unsigned int indentlevel,
                              GtFile *outfp)
{
  const unsigned int childlevel = indentlevel + 1;

  /* opening tag: genomic attributes first, then the reference attributes */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<MATCH_line gen_id=\"%s\" gen_strand=\"%c\" ",
                  gth_sa_gen_id(sa), gth_sa_gen_strand_char(sa));
  if (gth_sa_alphatype(sa) != DNA_ALPHA) {
    /* no strand attribute for non-DNA references */
    gt_file_xprintf(outfp, "ref_id=\"%s\">\n", gth_sa_ref_id(sa));
  }
  else {
    gt_file_xprintf(outfp, "ref_id=\"%s\" ref_strand=\"%c\">\n",
                    gth_sa_ref_id(sa), gth_sa_ref_strand_char(sa));
  }

  /* child elements */
  gth_indent(outfp, childlevel);
  gt_file_xprintf(outfp,
                  "<total_alignment_score>%.3f</total_alignment_score>\n",
                  gth_sa_score(sa));

  gth_indent(outfp, childlevel);
  gt_file_xprintf(outfp, "<cumulative_length_of_scored_exons>%lu"
                  "</cumulative_length_of_scored_exons>\n",
                  gth_sa_cumlen_scored_exons(sa));

  /* the coverage element is assembled from three pieces: the percentage, the
     single coverage character and the closing of the attribute/tag */
  gth_indent(outfp, childlevel);
  gt_file_xprintf(outfp, "<coverage percentage=\"%.3f\" high_type=\"",
                  gth_sa_coverage(sa));
  gt_file_xfputc(gth_sa_coverage_char(sa), outfp);
  gt_file_xprintf(outfp, "\"/>\n");

  /* closing tag */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</MATCH_line>\n");
}
示例#2
0
/* Decides whether the spliced alignment <sa> is rejected by <sa_filter>.
   Returns true if the alignment score or the coverage of <sa> lies outside
   the [min, max] ranges stored in <sa_filter> (i.e., <sa> should be
   discarded), false otherwise. Both arguments must be non-NULL. */
bool gth_sa_filter_filter_sa(const GthSAFilter *sa_filter, GthSA *sa)
{
  gt_assert(sa_filter && sa);
  /* sanity checks: score and coverage of <sa> lie within the default limits */
  gt_assert(gth_sa_score(sa) >= GTH_DEFAULT_MIN_ALIGNMENTSCORE);
  gt_assert(gth_sa_score(sa) <= GTH_DEFAULT_MAX_ALIGNMENTSCORE);
  gt_assert(gth_sa_coverage(sa) >= GTH_DEFAULT_MIN_COVERAGE);
  gt_assert(gth_sa_coverage(sa) <= GTH_DEFAULT_MAX_COVERAGE);

  /* alignment score below the allowed minimum */
  if (gth_sa_score(sa) < sa_filter->min_alignmentscore)
    return true;
  /* alignment score above the allowed maximum */
  if (gth_sa_score(sa) > sa_filter->max_alignmentscore)
    return true;
  /* coverage below the allowed minimum */
  if (gth_sa_coverage(sa) < sa_filter->min_coverage)
    return true;
  /* coverage above the allowed maximum */
  if (gth_sa_coverage(sa) > sa_filter->max_coverage)
    return true;

  /* all values within range: keep the alignment */
  return false;
}
/* Prints the "classic" GeneSeqer2 MATCH line for <sa> to <outfp>: a single
   tab-separated line holding genomic id and strand, reference id and strand,
   alignment score, cumulative length of scored exons, coverage and the
   coverage character. */
static void showmatchline(GthSA *sa, GtFile *outfp)
{
  const char *genomic_id   = gth_sa_gen_id(sa);
  const char *reference_id = gth_sa_ref_id(sa);
  const int genomic_strand   = gth_sa_gen_strand_char(sa);
  const int reference_strand = gth_sa_ref_strand_char(sa);
  const int coverage_type    = gth_sa_coverage_char(sa);

  gt_file_xprintf(outfp, "MATCH\t%s%c\t%s%c\t%5.3f\t"GT_WU"\t%5.3f\t%c\n",
                  genomic_id, genomic_strand,
                  reference_id, reference_strand,
                  gth_sa_score(sa),
                  gth_sa_cumlen_scored_exons(sa),
                  gth_sa_coverage(sa),
                  coverage_type);
}
示例#4
0
/* Computes exon/intron borders and scores for the spliced alignment <sa> by
   traversing its edit operations, and stores the results in <sa>:
   - the alignment score (overall exon weight divided by the maximum overall
     exon weight, capped at 1.0; 0.0 if the maximum weight is not positive),
   - the cumulative length of scored exons,
   - the coverage (the larger one of the genomic segment coverage and the
     reference segment coverage) together with a flag telling which of the
     two was used,
   - the poly(A) tail position (via gth_sa_calc_polyAtailpos()).
   <sa> must not contain any exons or introns on entry. If <sa> has no edit
   operations, only the score is set (to 0.0) and the function returns early.
   <proteineop> selects protein edit operation handling; in that case the
   reference length is multiplied by GT_CODON_LENGTH for the reference
   coverage. The remaining parameters are stored in the traversal data
   structure and/or passed on to the helper functions called below. */
void gth_compute_scores(GthSA *sa,
                        bool proteineop,
                        GthDPParam *dp_param,
                        void *dp_options_est,
                        const unsigned char *gen_seq_tran,
                        const unsigned char *ref_seq_tran,
                        const unsigned char *ref_seq_orig,
                        const GtTransTable *transtable,
                        unsigned long gen_dp_start,
                        unsigned long scoreminexonlen,
                        bool introncutout,
                        bool gs2out,
                        GthSplicedSeq *spliced_seq,
                        unsigned long ref_dp_length,
                        GtAlphabet *gen_alphabet,
                        GtAlphabet *ref_alphabet,
                        GthDPScoresProtein *dp_scores_protein)
{
  Traversealignmentfunctions travfunctions;
  Traversealignmentstate travstate;
  Computebordersandscoresdata data;
  GthFlt score, coverageofgenomicsegment, coverageofreferencesegment;

  /* exons and introns are determined here, so none may exist yet */
  gt_assert(!gth_sa_num_of_exons(sa));
  gt_assert(!gth_sa_num_of_introns(sa));

  /* callbacks invoked by the traversal for each kind of edit operation */
  travfunctions.processmismatch  = computescoresprocmismatch;
  travfunctions.processdeletion  = computescoresprocdeletion;
  travfunctions.processinsertion = computebordersandscoresprocinsertion;
  travfunctions.processmatch     = computebordersandscoresprocmatch;
  travfunctions.processintron    = computebordersandscoresprocintron;
  travfunctions.breakcondition   = NULL;

  /* additional functions for protein edit operations */
  travfunctions.processintron_with_1_base_left  =
    computebordersandscoresprocintron;
  travfunctions.processintron_with_2_bases_left =
    computebordersandscoresprocintron;
  travfunctions.processmismatch_with_1_gap      =
    computescoresprocmismatchordeletionwithgap;
  travfunctions.processmismatch_with_2_gaps     =
    computescoresprocmismatchordeletionwithgap;
  travfunctions.processdeletion_with_1_gap      =
    computescoresprocmismatchordeletionwithgap;
  travfunctions.processdeletion_with_2_gaps     =
    computescoresprocmismatchordeletionwithgap;

  /* initial traversal state; eopptr is set to the last element of the edit
     operation array.
     NOTE(review): eopptr is computed before the empty-alignment check below,
     i.e., for a length of 0 it points one element before the array (it is not
     dereferenced in this function in that case) -- confirm this is intended */
  travstate.proteineop = proteineop;
  travstate.processing_intron_with_1_base_left  = false;
  travstate.processing_intron_with_2_bases_left = false;
  travstate.alignment = gth_sa_get_editoperations(sa);
  travstate.alignmentlength =  gth_sa_get_editoperations_length(sa);
  travstate.eopptr       = travstate.alignment + travstate.alignmentlength - 1;
  travstate.genomicptr   = gth_sa_genomiccutoff_start(sa);
  travstate.referenceptr = gth_sa_referencecutoff_start(sa);

  if (travstate.alignmentlength <= 0) {
    /* in this case the alignmentscore is set to 0, which leads to discarding
       this alignment later */
    gth_sa_set_score(sa, 0.0);
    return;
  }

  /* editoperations contain no zero base exons */
  gt_assert(gth_sa_contains_no_zero_base_exons(sa));
  /* editoperations contain no leading or terminal introns or insertions */
  gt_assert(containsnoleadingorterminalintronsorinsertions(travstate.alignment,
                                                        travstate
                                                        .alignmentlength,
                                                        proteineop));
  /* sum of edit operations equals referencelength */
  gt_assert(gt_eops_equal_referencelength(travstate.alignment,
                                       travstate.alignmentlength,
                                       ref_dp_length
                                       - gth_sa_referencecutoff_start(sa)
                                       - gth_sa_referencecutoff_end(sa),
                                       proteineop));

  /* accumulators and flags used by the callbacks, reset to start values */
  data.proteineop                     = proteineop;
  data.newexon                        = true;
  data.newintron                      = true;
  data.firstexon                      = true;
  data.introncutout                   = introncutout;
  data.gs2out                         = gs2out;
  data.spliced_seq                    = spliced_seq;
  data.singleexonweight               = (GthFlt) 0.0;
  data.maxsingleexonweight            = (GthFlt) 0.0;
  data.overallexonweight              = (GthFlt) 0.0;
  data.maxoverallexonweight           = (GthFlt) 0.0;
  data.cumulativelengthofscoredexons  = 0;

  /* exon and intron currently under construction start out undefined */
  data.exon.leftgenomicexonborder     = GT_UNDEF_ULONG;
  data.exon.rightgenomicexonborder    = GT_UNDEF_ULONG;
  data.exon.leftreferenceexonborder   = GT_UNDEF_ULONG;
  data.exon.rightreferenceexonborder  = GT_UNDEF_ULONG;
  data.exon.exonscore                 = GTH_UNDEF_GTHDBL;

  data.intron.donorsiteprobability    = GTH_UNDEF_GTHFLT;
  data.intron.acceptorsiteprobability = GTH_UNDEF_GTHFLT;
  data.intron.donorsitescore          = GTH_UNDEF_GTHDBL;
  data.intron.acceptorsitescore       = GTH_UNDEF_GTHDBL;

  /* plain copies of the function arguments for use by the callbacks */
  data.sa                             = sa;
  data.dp_param                       = dp_param;
  data.dp_options_est                 = dp_options_est;
  data.gen_seq_tran                   = gen_seq_tran;
  data.ref_seq_tran                   = ref_seq_tran;
  data.ref_seq_orig                   = ref_seq_orig;
  data.transtable                     = transtable;
  data.gen_dp_start                   = gen_dp_start;
  data.scoreminexonlen                = scoreminexonlen;
  data.ref_dp_length                  = ref_dp_length;
  data.gen_alphabet                   = gen_alphabet;
  /* <gen_alphabet> may be NULL, in which case no character table is stored */
  data.gen_alphabet_characters        = gen_alphabet
                                        ? gt_alphabet_characters(gen_alphabet)
                                        : NULL;
  data.dp_scores_protein              = dp_scores_protein;

  gthtraversealignment(true, &travstate, proteineop, &data, &travfunctions);

  /* this is for saving the last exon */
  evalnewintronifpossible(proteineop, &data.newexon, &data.newintron, true,
                          data.introncutout, data.gs2out, data.spliced_seq,
                          &data.exon, &data.intron, &data.singleexonweight,
                          &data.maxsingleexonweight, &data.overallexonweight,
                          &data.maxoverallexonweight,
                          &data.cumulativelengthofscoredexons, sa, &travstate,
                          gen_alphabet, data.dp_param, data.dp_options_est,
                          data.gen_seq_tran, data.ref_seq_tran,
                          data.gen_dp_start, data.scoreminexonlen);

  /* saving the scores for the whole alignment */
  if (data.maxoverallexonweight > 0.0) {
    score = data.overallexonweight / data.maxoverallexonweight;
    /* XXX: the way the alignmentscore is computed, it is possible to get a
       score > 1.0. Since we don't want this, we cap it */
    if (score > 1.0)
      score = 1.0;
  }
  else
    score = 0.0;
  gth_sa_set_score(sa, score);
  gth_sa_set_cumlen_scored_exons(sa, data.cumulativelengthofscoredexons);

  /* fraction of the gen_dp_length which is scored/weighted
     NOTE(review): the divisor gth_sa_gen_dp_length(sa) is not checked for
     zero here -- presumably guaranteed to be positive by the caller */
  coverageofgenomicsegment   = (GthFlt) data.cumulativelengthofscoredexons /
                               (GthFlt) gth_sa_gen_dp_length(sa);
  /* coverage of genomic segment is valid value */
  gt_assert(coverageofgenomicsegment >= 0.0 && coverageofgenomicsegment <= 1.0);

  /* fraction of the referencelength which is scored/weighted; for protein
     edit operations the reference length is scaled by GT_CODON_LENGTH
     NOTE(review): the divisor is not checked for zero here either */
  coverageofreferencesegment = (GthFlt) data.cumulativelengthofscoredexons /
                               (GthFlt) ((proteineop ? GT_CODON_LENGTH : 1) *
                                         gth_sa_ref_total_length(sa));

  /* store the larger coverage and remember whether it was the genomic one */
  if (coverageofgenomicsegment > coverageofreferencesegment) {
    gth_sa_set_coverage(sa, coverageofgenomicsegment);
    gth_sa_set_highest_cov(sa, true);
  }
  else {
    gth_sa_set_coverage(sa, coverageofreferencesegment);
    gth_sa_set_highest_cov(sa, false);
  }

  /* test the assumption that the coverage is never larger than the default */
  gt_assert(gth_sa_coverage(sa) <= GTH_DEFAULT_MAX_COVERAGE);

  /* compute poly(A) tail position */
  gth_sa_calc_polyAtailpos(sa, ref_seq_tran, ref_alphabet);

  /* determined exons are forward and consecutive */
  gt_assert(gth_sa_exons_are_forward_and_consecutive(sa));
}
/* Writes the complete intermediate XML representation of the spliced
   alignment <sa> to <outfp> as a <spliced_alignment> element. The enclosing
   tags are indented by <indentlevel>, all child elements by one level more
   and the edit operations by two levels more. <input> is passed on to the
   helpers which show the genomic and reference file names. */
static void xml_inter_show_spliced_alignment(GthSA *sa, GthInput *input,
                                             unsigned int indentlevel,
                                             GtFile *outfp)
{
  /* indentation of the child elements of <spliced_alignment> */
  const unsigned int inner = indentlevel + 1;
  /* tells showintrons() whether the reference alphabet is DNA */
  bool ref_is_dna = true;

  /* opening tag */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp,
                  "<spliced_alignment xmlns=\"http://www.GenomeThreader.org/"
                  "SplicedAlignment/spliced_alignment/\">\n");

  /* reference alphabet type */
  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<referencealphatype>");
  switch (gth_sa_alphatype(sa)) {
    case DNA_ALPHA:
      gt_file_xprintf(outfp, "DNA_ALPHA");
      break;
    case PROTEIN_ALPHA:
      gt_file_xprintf(outfp, "PROTEIN_ALPHA");
      ref_is_dna = false;
      break;
    default: gt_assert(0);
  }
  gt_file_xprintf(outfp, "</referencealphatype>\n");

  /* edit operations, nested one level deeper than the other children */
  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<editoperations>\n");
  gth_backtrace_path_show_complete(gth_sa_backtrace_path(sa), true, inner + 1,
                                   outfp);
  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "</editoperations>\n");

  /* lengths, offsets and DP positions */
  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<indelcount>"GT_WU"</indelcount>\n",
                  gth_sa_indelcount(sa));

  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<genomiclengthDP>"GT_WU"</genomiclengthDP>\n",
                  gth_sa_gen_dp_length(sa));

  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<genomiclengthtotal>"GT_WU"</genomiclengthtotal>\n",
                  gth_sa_gen_total_length(sa));

  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<genomicoffset>"GT_WU"</genomicoffset>\n",
                  gth_sa_gen_offset(sa));

  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<referencelength>"GT_WU"</referencelength>\n",
                  gth_sa_ref_total_length(sa));

  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<dpstartpos>"GT_WU"</dpstartpos>\n",
                  gth_sa_gen_dp_start(sa));

  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<dpendpos>"GT_WU"</dpendpos>\n",
                  gth_sa_gen_dp_end(sa));

  /* genomic file name and sequence number */
  showgenomicfilename(sa, input, inner, outfp);

  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<genomicseqnum>"GT_WU"</genomicseqnum>\n",
                  gth_sa_gen_seq_num(sa));

  /* reference file name and sequence number */
  showreferencefilename(sa, input, inner, outfp);

  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<referenceseqnum>"GT_WU"</referenceseqnum>\n",
                  gth_sa_ref_seq_num(sa));

  /* sequence ids */
  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<genomicid>%s</genomicid>\n", gth_sa_gen_id(sa));

  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<referenceid>%s</referenceid>\n",
                  gth_sa_ref_id(sa));

  /* strand orientation */
  gth_indent(outfp, inner);
  gt_file_xprintf(outfp,
                  "<genomicstrandisforward>%s</genomicstrandisforward>\n",
                  GTH_SHOWBOOL(gth_sa_gen_strand_forward(sa)));

  gth_indent(outfp, inner);
  gt_file_xprintf(outfp,
                  "<referencestrandisforward>%s</referencestrandisforward>\n",
                  GTH_SHOWBOOL(gth_sa_ref_strand_forward(sa)));

  /* structural parts, each shown by its own helper */
  showalignmentcutoffs(sa, inner, outfp);
  showexons(sa, inner, outfp);
  showintrons(sa, ref_is_dna, inner, outfp);
  showpolyAtailpos(sa, inner, outfp);

  /* scores and coverage */
  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<alignmentscore>%.*f</alignmentscore>\n",
                  PRECISION, gth_sa_score(sa));

  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<coverage>%.*f</coverage>\n", PRECISION,
                  gth_sa_coverage(sa));

  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<coverageofgenomicsegmentishighest>%s"
                  "</coverageofgenomicsegmentishighest>\n",
                  GTH_SHOWBOOL(gth_sa_genomic_cov_is_highest(sa)));

  gth_indent(outfp, inner);
  gt_file_xprintf(outfp, "<cumulativelengthofscoredexons>"GT_WU""
                  "</cumulativelengthofscoredexons>\n",
                  gth_sa_cumlen_scored_exons(sa));

  /* closing tag */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</spliced_alignment>\n");
}
示例#6
0
/* Stores the spliced alignment <sa> in the subset (split) file it belongs to.
   <data> must point to a Store_in_subset_file_data object. <sa> is first
   checked against the contained filter and silently discarded if it is
   rejected. Otherwise the target file is determined from the alignment score
   or the coverage of <sa> (depending on the split mode), the file is created
   on demand (including the XML leader) and <sa> is written to it via the
   contained visitor.
   <outputfilename> is the name of the file <sa> stems from; its basename is
   used as prefix of the subset file names. When it differs from the one seen
   in the previous call, all currently open subset files are closed first.
   This function takes ownership of <sa>: it is deleted on every path.
   Returns 0 on success (also if <sa> was filtered out). Returns -1 and sets
   <err> if the subset file already exists and overwriting was not forced. */
static int store_in_subset_file(void *data, GthSA *sa,
                                const char *outputfilename, GtError *err)
{
  Store_in_subset_file_data *store_in_subset_file_data =
    (Store_in_subset_file_data*) data;
  double split_determing_percentage = 0.0;
  unsigned long filenum;
  /* three character suffix plus terminator; initialized so that it is never
     read uninitialized if the assertion in the switch below is compiled out */
  char filenamesuffix[4] = "";
  int had_err = 0;

  gt_error_check(err);

  /* filter before we do any further processing */
  if (gth_sa_filter_filter_sa(store_in_subset_file_data->sa_filter, sa)) {
    /* and free it afterwards */
    gth_sa_delete(sa);
    /* discard */
    return 0;
  }

  /* check whether we got a new output file to process */
  if (!store_in_subset_file_data->current_outputfilename) {
    store_in_subset_file_data->current_outputfilename =
      gt_cstr_dup(outputfilename);
  }
  else if (strcmp(store_in_subset_file_data->current_outputfilename,
                  outputfilename)) {
    /* close current output files */
    close_output_files(store_in_subset_file_data);
    /* remember the new name; without this the freed pointer would be compared
       (and possibly freed again) in the next call */
    gt_free(store_in_subset_file_data->current_outputfilename);
    store_in_subset_file_data->current_outputfilename =
      gt_cstr_dup(outputfilename);
  }

  /* determine in which file the current sa needs to be put */
  switch (store_in_subset_file_data->gthsplitinfo->splitmode) {
    case ALIGNMENTSCORE_SPLIT:
      split_determing_percentage = gth_sa_score(sa);
      strcpy(filenamesuffix, "scr");
      break;
    case COVERAGE_SPLIT:
      split_determing_percentage = gth_sa_coverage(sa);
      strcpy(filenamesuffix, "cov");
      break;
    default: gt_assert(0);
  }
  gt_assert(split_determing_percentage >= 0.0);
  /* XXX: change into an assertion when coverage problem is fixed */
  if (split_determing_percentage > 1.0)
    split_determing_percentage = 1.0;

  /* a value of exactly 1.0 goes into the last file, everything else into the
     file whose percentage range contains it */
  if (split_determing_percentage == 1.0)
    filenum = store_in_subset_file_data->num_of_subset_files - 1;
  else {
    filenum =  floor(split_determing_percentage * 100.0 /
                           store_in_subset_file_data->gthsplitinfo->range);
  }
  gt_assert(filenum < store_in_subset_file_data->num_of_subset_files);

  /* make sure the file exists and is open */
  if (!store_in_subset_file_data->subset_files[filenum]) {
    /* build the file name:
       <basename>.<suffix><lower bound>-<upper bound><file mode suffix> */
    gt_assert(store_in_subset_file_data->subset_filenames[filenum] == NULL);
    store_in_subset_file_data->subset_filenames[filenum] = gt_str_new();
    gt_str_append_cstr_nt(store_in_subset_file_data->subset_filenames[filenum],
                          outputfilename,
                          gt_file_basename_length(outputfilename));
    gt_str_append_char(store_in_subset_file_data->subset_filenames[filenum],
                       '.');
    gt_str_append_cstr(store_in_subset_file_data->subset_filenames[filenum],
                       filenamesuffix);
    gt_str_append_ulong(store_in_subset_file_data->subset_filenames[filenum],
                        filenum *
                        store_in_subset_file_data->gthsplitinfo->range);
    gt_str_append_char(store_in_subset_file_data->subset_filenames[filenum],
                       '-');
    gt_str_append_ulong(store_in_subset_file_data->subset_filenames[filenum],
                     (filenum + 1) *
                     store_in_subset_file_data->gthsplitinfo->range);
    gt_str_append_cstr(store_in_subset_file_data->subset_filenames[filenum],
                       gt_file_mode_suffix(store_in_subset_file_data
                                           ->gthsplitinfo->file_mode));

    /* if not disabled by -force, check if file already exists */
    if (!store_in_subset_file_data->gthsplitinfo->force) {
      /* NOTE(review): if the file exists, the handle opened for reading stays
         stored in subset_files[filenum] and is not closed here -- confirm
         that the error path of the caller cleans it up */
      store_in_subset_file_data->subset_files[filenum] =
        gt_file_open(store_in_subset_file_data->gthsplitinfo->file_mode,
                     gt_str_get(store_in_subset_file_data
                                ->subset_filenames[filenum]), "r", NULL);
      if (store_in_subset_file_data->subset_files[filenum]) {
        gt_error_set(err, "file \"%s\" exists already. use option -%s to "
                     "overwrite", gt_str_get(store_in_subset_file_data
                                             ->subset_filenames[filenum]),
                     GT_FORCE_OPT_CSTR);
        had_err = -1;
      }
    }
    if (!had_err) {
      /* open split file for writing */
      store_in_subset_file_data->subset_files[filenum] =
          gt_file_xopen_file_mode(store_in_subset_file_data->gthsplitinfo
                                  ->file_mode,
                                  gt_str_get(store_in_subset_file_data
                                             ->subset_filenames[filenum]), "w");
      /* store XML header in file */
      gth_xml_show_leader(true,
                          store_in_subset_file_data->subset_files[filenum]);
    }
  }

  /* put it there */
  if (!had_err) {
    gth_xml_inter_sa_visitor_set_outfp(store_in_subset_file_data->sa_visitor,
                                       store_in_subset_file_data
                                       ->subset_files[filenum]);
    gth_sa_visitor_visit_sa(store_in_subset_file_data->sa_visitor, sa);
  }

  /* adjust counter */
  if (!had_err)
    store_in_subset_file_data->subset_file_sa_counter[filenum]++;

  /* and free it afterwards */
  gth_sa_delete(sa);

  return had_err;
}