예제 #1
0
/* Write one delimiter line to <outfp>: DELIMITERLINELENGTH copies of
   SA_DELIMITERLINECHAR, terminated by a newline. */
static void showdelimiterline(GtFile *outfp)
{
  GtUword remaining = DELIMITERLINELENGTH;

  while (remaining-- > 0) {
    gt_file_xfputc(SA_DELIMITERLINECHAR, outfp);
  }
  gt_file_xfputc('\n', outfp);
}
예제 #2
0
/*
  Print the reference sequence header line of <sa> to <outfp>: the kind of
  sequence ("EST Sequence" for DNA_ALPHA, "Protein Sequence" for
  PROTEIN_ALPHA), the reference file name, the strand (DNA_ALPHA only), the
  sequence description and, if <showseqnums> is set, the sequence number.
  The line is followed by one empty line.
*/
static void showgthreferenceinformation(GthSA *sa, GthInput *input,
                                        bool showseqnums,
                                        GtFile *outfp)
{
  int alphatype;

  gt_assert(gth_sa_ref_file_num(sa) != GT_UNDEF_UWORD);

  alphatype = gth_sa_alphatype(sa);
  if (alphatype == DNA_ALPHA) {
    gt_file_xprintf(outfp,
                    "EST Sequence: file=%s, strand=%c, description=",
                    gth_input_get_reference_filename(input,
                                                     gth_sa_ref_file_num(sa)),
                    gth_sa_ref_strand_char(sa));
  }
  else if (alphatype == PROTEIN_ALPHA) {
    gt_file_xprintf(outfp, "Protein Sequence: file=%s, description=",
                    gth_input_get_reference_filename(input,
                                                     gth_sa_ref_file_num(sa)));
  }
  else {
    /* no other alphabet type is expected here */
    gt_assert(0);
  }

  gth_sa_echo_reference_description(sa, input, outfp);

  if (showseqnums) {
    gt_file_xprintf(outfp, ", seqnum="GT_WU"",  gth_sa_ref_seq_num(sa));
  }

  /* end of the header line plus one empty line */
  gt_file_xfputc('\n', outfp);
  gt_file_xfputc('\n', outfp);
}
예제 #3
0
/*
  Write one GFF3 line for the feature node <fn>. <data> must point to the
  GtGFF3Visitor whose output file and id tables are used.

  After the leading columns the attribute column is assembled from up to
  three parts, separated by ';':
    - GT_GFF_ID, if a unique id is registered for <fn>,
    - GT_GFF_PARENT with a comma separated list of parent ids, if any,
    - all remaining attributes, emitted by show_attribute().
  If none of these parts produced output a single '.' is written instead.

  Always returns 0; <err> is only checked, never set.
*/
static int gff3_show_feature_node(GtFeatureNode *fn, void *data,
                                  GT_UNUSED GtError *err)
{
  GtGFF3Visitor *visitor = (GtGFF3Visitor*) data;
  ShowAttributeInfo attr_info;
  GtArray *parents = NULL;
  GtStr *unique_id;
  GtUword idx;
  bool any_shown = false;

  gt_error_check(err);
  gt_assert(fn && visitor);

  /* leading columns */
  gt_gff3_output_leading(fn, visitor->outfp);

  /* ID attribute */
  unique_id = gt_hashmap_get(visitor->feature_node_to_unique_id_str, fn);
  if (unique_id) {
    gt_file_xprintf(visitor->outfp, "%s=%s", GT_GFF_ID,
                    gt_str_get(unique_id));
    any_shown = true;
  }

  /* Parent attribute */
  parents = gt_hashmap_get(visitor->feature_node_to_id_array, fn);
  if (gt_array_size(parents)) {
    if (any_shown) {
      gt_file_xfputc(';', visitor->outfp);
    }
    gt_file_xprintf(visitor->outfp, "%s=", GT_GFF_PARENT);
    for (idx = 0; idx < gt_array_size(parents); idx++) {
      if (idx > 0) {
        gt_file_xfputc(',', visitor->outfp);
      }
      gt_file_xprintf(visitor->outfp, "%s",
                      *(char**) gt_array_get(parents, idx));
    }
    any_shown = true;
  }

  /* all other attributes; the callback updates <any_shown> through the
     pointer stored in <attr_info> */
  attr_info.attribute_shown = &any_shown;
  attr_info.outfp = visitor->outfp;
  gt_feature_node_foreach_attribute(fn, show_attribute, &attr_info);

  /* empty attribute column */
  if (!any_shown) {
    gt_file_xfputc('.', visitor->outfp);
  }

  gt_file_xfputc('\n', visitor->outfp);

  return 0;
}
예제 #4
0
/*
  Print the "classic" GeneSeqer2 MATCH line of <sa> as an XML element
  <MATCH_line> at indentation <indentlevel>. The child elements (total
  alignment score, cumulative length of scored exons, coverage) are indented
  one level deeper. The ref_strand attribute is only written for DNA_ALPHA
  alignments.

  NOTE(review): the cumulative length is printed with "%lu"; if
  gth_sa_cumlen_scored_exons() returns GtUword, GT_WU would be the portable
  conversion -- confirm against its declaration.
*/
static void xml_showmatchline(GthSA *sa, unsigned int indentlevel,
                              GtFile *outfp)
{
  const unsigned int childlevel = indentlevel + 1;

  /* opening tag with its attributes */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<MATCH_line gen_id=\"%s\" gen_strand=\"%c\" ",
                  gth_sa_gen_id(sa),
                  gth_sa_gen_strand_char(sa));
  if (gth_sa_alphatype(sa) != DNA_ALPHA) {
    gt_file_xprintf(outfp, "ref_id=\"%s\">\n", gth_sa_ref_id(sa));
  }
  else {
    gt_file_xprintf(outfp, "ref_id=\"%s\" ref_strand=\"%c\">\n",
                    gth_sa_ref_id(sa),
                    gth_sa_ref_strand_char(sa));
  }

  /* child elements */
  gth_indent(outfp, childlevel);
  gt_file_xprintf(outfp,
                  "<total_alignment_score>%.3f</total_alignment_score>\n",
                  gth_sa_score(sa));

  gth_indent(outfp, childlevel);
  gt_file_xprintf(outfp, "<cumulative_length_of_scored_exons>%lu"
                  "</cumulative_length_of_scored_exons>\n",
                  gth_sa_cumlen_scored_exons(sa));

  gth_indent(outfp, childlevel);
  gt_file_xprintf(outfp, "<coverage percentage=\"%.3f\" high_type=\"",
                  gth_sa_coverage(sa));
  gt_file_xfputc(gth_sa_coverage_char(sa), outfp);
  gt_file_xprintf(outfp, "\"/>\n");

  /* closing tag */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</MATCH_line>\n");
}
예제 #5
0
/*
  Write the first <cols> symbols of <alignmentline> to <outfp>, replacing
  ABSTRACTGAPSYMBOL by CONCRETEGAPSYMBOL and ABSTRACTINTRONSYMBOL by
  CONCRETEINTRONSYMBOL. All other symbols are written unchanged. No newline
  is appended.
*/
static void showconcreteline(const unsigned char *alignmentline,
                             unsigned long cols, GtFile *outfp)
{
  unsigned long col = 0;
  int symbol;

  while (col < cols) {
    symbol = alignmentline[col++];
    switch (symbol) {
      case ABSTRACTGAPSYMBOL:
        symbol = CONCRETEGAPSYMBOL;
        break;
      case ABSTRACTINTRONSYMBOL:
        symbol = CONCRETEINTRONSYMBOL;
        break;
      default:
        /* ordinary symbol, shown as is */
        break;
    }
    gt_file_xfputc(symbol, outfp);
  }
}
예제 #6
0
/*
  Show <pgl> (with number <pglnum>) on out->outfp, followed by every entry
  of pgl->assemblies (via show_ags()).

  If out->xmlout is set, a <predicted_gene_location> element is written at
  <indentlevel> and its content one level deeper; otherwise a plain text
  "PGL" line is written, optionally extended by the genomic template id
  (out->pglgentemplate). Must not be used for GFF3 output.
*/
static void show_pgl(GthPGL *pgl, GtUword pglnum,
                     GtUword translationtable, GthInput *input,
                     unsigned int indentlevel, GthOutput *out)
{
  GtFile *outfp = out->outfp;
  GtUword agsnum, serial, startpos, endpos;
  unsigned int childlevel = indentlevel;
  int strandchar;

  gt_assert(!out->gff3out);

  /* these values are shown identically in both output formats */
  serial     = pglnum + OUTPUTOFFSET;
  strandchar = SHOWSTRAND(gth_pgl_is_forward(pgl));
  startpos   = SHOWGENPOS(gth_pgl_is_forward(pgl),
                          gth_pgl_total_length(pgl),
                          gth_pgl_genomic_offset(pgl),
                          pgl->maxrange.start);
  endpos     = SHOWGENPOS(gth_pgl_is_forward(pgl),
                          gth_pgl_total_length(pgl),
                          gth_pgl_genomic_offset(pgl),
                          pgl->maxrange.end);

  if (!out->xmlout) {
    gt_file_xprintf(outfp, "PGL %3" GT_WUS " (%c strand):      " GT_WU "     "
                    GT_WU,
                    serial, strandchar, startpos, endpos);
    if (out->pglgentemplate) {
      gt_file_xprintf(outfp, " (genomic template '%s')", gth_pgl_gen_id(pgl));
    }
    gt_file_xfputc('\n', outfp);
  }
  else {
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<predicted_gene_location>\n");
    /* everything inside the element is nested one level deeper */
    childlevel = indentlevel + 1;
    gth_indent(outfp, childlevel);
    gt_file_xprintf(outfp, "<PGL_line PGL_serial=\"" GT_WU "\" "
                    "PGL_strand=\"%c\" PGL_start=\"" GT_WU "\" PGL_stop=\""
                    GT_WU "\"/>\n",
                    serial, strandchar, startpos, endpos);
  }

  for (agsnum = 0; agsnum < gt_array_size(pgl->assemblies); agsnum++) {
    show_ags(gth_pgl_get_ags(pgl, agsnum), pglnum, agsnum, translationtable,
             input, childlevel, out);
  }

  if (out->xmlout) {
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "</predicted_gene_location>\n");
  }
}
예제 #7
0
/*
  Write one FASTQ block to <outfp>: a header line consisting of <separator>
  and (if not NULL) <description>, followed by the first <buffer_length>
  characters of <buffer>. If <width> is non-zero, a newline is inserted
  after every <width> characters of the buffer. The block always ends with a
  newline.
*/
static void gt_fastq_show_buffer(char separator, const char *description,
    const char *buffer, GtUword buffer_length, GtUword width,
    GtFile *outfp)
{
  GtUword pos, column = 0;

  /* header line */
  gt_file_xfputc(separator, outfp);
  if (description != NULL) {
    gt_file_xfputs(description, outfp);
  }
  gt_file_xfputc('\n', outfp);

  /* content, wrapped if requested */
  for (pos = 0; pos < buffer_length; pos++) {
    if (width && column == width) {
      gt_file_xfputc('\n', outfp);
      column = 0;
    }
    gt_file_xfputc(buffer[pos], outfp);
    column++;
  }
  gt_file_xfputc('\n', outfp);
}
예제 #8
0
/*
  Write the preface of the PGL section to the visitor's output file: a
  delimiter line of DELIMITERLINELENGTH PGLS_DELIMITERCHAR characters, an
  empty line and the headline stating <num_of_pgls>, followed by two empty
  lines.
*/
static void txt_pgl_visitor_preface(GthPGLVisitor *pgl_visitor,
                                    GtUword num_of_pgls)
{
  GthTxtPGLVisitor *visitor = txt_pgl_visitor_cast(pgl_visitor);
  GtUword remaining = DELIMITERLINELENGTH;

  while (remaining-- > 0) {
    gt_file_xfputc(PGLS_DELIMITERCHAR, visitor->out->outfp);
  }
  gt_file_xprintf(visitor->out->outfp,
                  "\n\n" "Predicted gene locations (" GT_WU "):\n\n\n",
                  num_of_pgls);
}
예제 #9
0
/* Show all GtRange elements of <ranges> on <outfp> as "(start,end)" pairs on
   a single line, terminated by a newline. <ranges> must not be NULL. */
void gt_ranges_show(GtArray *ranges, GtFile *outfp)
{
  GtUword idx, num_of_ranges;
  GtRange *current;

  gt_assert(ranges);

  num_of_ranges = gt_array_size(ranges);
  for (idx = 0; idx < num_of_ranges; idx++) {
    current = gt_array_get(ranges, idx);
    gt_file_xprintf(outfp, "("GT_WU","GT_WU")", current->start, current->end);
  }
  gt_file_xfputc('\n', outfp);
}
예제 #10
0
파일: sa.c 프로젝트: AnnSeidel/genometools
/* Show the genomic borders of all exons of <sa> on <outfp> as
   "(left,right)" pairs on a single line, terminated by a newline.
   <sa> must not be NULL. */
void gth_sa_show_exons(const GthSA *sa, GtFile *outfp)
{
  GtUword exonnum, numofexons;
  Exoninfo *exon;

  gt_assert(sa);

  numofexons = gt_array_size(sa->exons);
  for (exonnum = 0; exonnum < numofexons; exonnum++) {
    exon = (Exoninfo*) gt_array_get(sa->exons, exonnum);
    gt_file_xprintf(outfp, "("GT_WU","GT_WU")", exon->leftgenomicexonborder,
                    exon->rightgenomicexonborder);
  }
  gt_file_xfputc('\n', outfp);
}
예제 #11
0
/*
  Print the genomic template header line of <sa> to <outfp>: the genomic
  file name, the strand, the shown DP start and end positions, the sequence
  description and, if <showseqnums> is set, the sequence number. The line is
  followed by one empty line.
*/
static void showgthgenomicinformation(GthSA *sa, GthInput *input,
                                      bool showseqnums, GtFile *outfp)
{
  gt_assert(gth_sa_gen_file_num(sa) != GT_UNDEF_UWORD);

  /* everything up to (but excluding) the description */
  gt_file_xprintf(outfp,
                  "Genomic Template: file=%s, strand=%c, from="GT_WU", "
                  "to="GT_WU", description=",
                  gth_input_get_genomic_filename(input,
                                                 gth_sa_gen_file_num(sa)),
                  gth_sa_gen_strand_char(sa),
                  gth_sa_gen_dp_start_show(sa),
                  gth_sa_gen_dp_end_show(sa));

  gth_sa_echo_genomic_description(sa, input, outfp);

  if (showseqnums) {
    gt_file_xprintf(outfp, ", seqnum="GT_WU"",  gth_sa_gen_seq_num(sa));
  }

  /* end of the header line plus one empty line */
  gt_file_xfputc('\n', outfp);
  gt_file_xfputc('\n', outfp);
}
예제 #12
0
/*
  Write one FASTA entry to <outfp>: a header line consisting of
  GT_FASTA_SEPARATOR and (if not NULL) <description>, followed by the first
  <sequence_length> characters of <sequence> directly continued by <suffix>
  (which may be NULL). If <width> is non-zero, a newline is inserted after
  every <width> characters; sequence and suffix are wrapped as one string.
  The entry always ends with a newline. <sequence> must not be NULL.
*/
void gt_fasta_show_entry_with_suffix(const char *description,
                                     const char *sequence,
                                     GtUword sequence_length,
                                     const char *suffix, GtUword width,
                                     GtFile *outfp)
{
  GtUword pos, total_length, column = 0, suffix_length = 0;

  gt_assert(sequence);

  /* header line */
  gt_file_xfputc(GT_FASTA_SEPARATOR, outfp);
  if (description) {
    gt_file_xfputs(description, outfp);
  }
  gt_file_xfputc('\n', outfp);

  if (suffix) {
    suffix_length = strlen(suffix);
  }
  total_length = sequence_length + suffix_length;

  for (pos = 0; pos < total_length; pos++) {
    if (width && column == width) {
      gt_file_xfputc('\n', outfp);
      column = 0;
    }
    /* positions behind the sequence are taken from the suffix */
    if (pos >= sequence_length) {
      gt_file_xfputc(suffix[pos - sequence_length], outfp);
    }
    else {
      gt_file_xfputc(sequence[pos], outfp);
    }
    column++;
  }
  gt_file_xfputc('\n', outfp);
}
예제 #13
0
/*
  Attribute callback: write "<attr_name>=<attr_value>" to the output file
  stored in <data> (a ShowAttributeInfo), preceded by ';' if an attribute
  has already been shown. The attributes GT_GFF_ID and GT_GFF_PARENT are
  skipped. The shown-flag referenced by <data> is set once something has
  been written.
*/
static void show_attribute(const char *attr_name, const char *attr_value,
                           void *data)
{
  ShowAttributeInfo *attr_info = (ShowAttributeInfo*) data;

  gt_assert(attr_name && attr_value && attr_info);

  /* ID and Parent are not handled by this callback */
  if (strcmp(attr_name, GT_GFF_ID) == 0 ||
      strcmp(attr_name, GT_GFF_PARENT) == 0) {
    return;
  }

  if (*attr_info->attribute_shown) {
    gt_file_xfputc(';', attr_info->outfp);
  }
  else {
    *attr_info->attribute_shown = true;
  }
  gt_file_xprintf(attr_info->outfp, "%s=%s", attr_name, attr_value);
}
예제 #14
0
/*
  Write one "PGS" line per spliced alignment stored (as GthSA pointer) in
  <alignments>: the comma separated genomic exon borders in parentheses,
  followed by a tab, the reference id and the reference strand character.
  An empty line is written after the last PGS line.
*/
static void outputPGSlines(GtArray *alignments, GtFile *outfp)
{
  GtUword sanum, exonnum;
  GthSA *current_sa;

  for (sanum = 0; sanum < gt_array_size(alignments); sanum++) {
    current_sa = *(GthSA**) gt_array_get(alignments, sanum);

    /* exon list */
    gt_file_xprintf(outfp, "  PGS (");
    for (exonnum = 0; exonnum < gth_sa_num_of_exons(current_sa); exonnum++) {
      if (exonnum != 0) {
        gt_file_xfputc(',', outfp);
      }
      gt_file_xprintf(outfp, GT_WU "  " GT_WU ,
                      gth_sa_left_genomic_exon_border(current_sa, exonnum),
                      gth_sa_right_genomic_exon_border(current_sa, exonnum));
    }

    /* reference id and strand */
    gt_file_xprintf(outfp, ")\t%s%c\n", gth_sa_ref_id(current_sa),
                    gth_sa_ref_strand_char(current_sa));
  }

  gt_file_xfputc('\n', outfp);
}
예제 #15
0
/*
  Write the "AGS" line of <ags> to <outfp>: the number <agsnum> (shifted by
  OUTPUTOFFSET) and, in parentheses, the comma separated start and end
  positions of all exons as computed by SHOWGENPOSAGS.
*/
static void outputAGSline(const GthAGS *ags, GtUword agsnum,
                          GtFile *outfp)
{
  GtUword exonnum;
  GthExonAGS *current_exon;

  gt_file_xprintf(outfp, "AGS-" GT_WU " (",  agsnum + OUTPUTOFFSET);

  for (exonnum = 0; exonnum < gth_ags_num_of_exons(ags); exonnum++) {
    current_exon = gth_ags_get_exon(ags, exonnum);
    if (exonnum != 0) {
      gt_file_xfputc(',', outfp);
    }
    gt_file_xprintf(outfp, GT_WU "  " GT_WU,
                    SHOWGENPOSAGS(current_exon->range.start),
                    SHOWGENPOSAGS(current_exon->range.end));
  }

  gt_file_xprintf(outfp, ")\n");
}
예제 #16
0
/*
  Write one "Exon" line per element of ags->exons to <outfp>. Every exon but
  the first is preceded by an "Intron" line covering the positions between
  the previous exon and the current one, together with the donor (Pd) and
  acceptor (Pa) site probabilities taken from ags->splicesiteprobs.
  Positions are shown via SHOWGENPOSAGS with a field width of
  <widthforgenpos>. An empty line is written at the end.
*/
static void output_exon_intron_lines(const GthAGS *ags, int widthforgenpos,
                                     GtFile *outfp)
{
  GtUword exonnum, numofexons,
          exonstart, exonend,
          intronstart = GT_UNDEF_UWORD, /* set after the first exon */
          intronend;
  GthExonAGS *current_exon;
  GthSpliceSiteProb *ssp;
  GthDbl score;
  GthFlt pdonor, pacceptor;

  numofexons = gt_array_size(ags->exons);
  for (exonnum = 0; exonnum < numofexons; exonnum++) {
    current_exon = (GthExonAGS*) gt_array_get(ags->exons, exonnum);
    exonstart    = current_exon->range.start;
    exonend      = current_exon->range.end;
    score        = current_exon->score;

    /* the intron between the previous and the current exon */
    if (exonnum > 0) {
      ssp       = (GthSpliceSiteProb*)
                  gt_array_get(ags->splicesiteprobs, exonnum - 1);
      pdonor    = ssp->donorsiteprob;
      pacceptor = ssp->acceptorsiteprob;
      intronend = exonstart - 1;

      gt_file_xprintf(outfp,
                      "    Intron %2" GT_WUS " %*" GT_WUS " %*" GT_WUS " (%4"
                      GT_WUS " n);           " "Pd: %5.3f  Pa: %5.3f\n",
                      exonnum - 1 + OUTPUTOFFSET, widthforgenpos,
                      SHOWGENPOSAGS(intronstart), widthforgenpos,
                      SHOWGENPOSAGS(intronend),
                      intronend - intronstart + 1,
                      pdonor, pacceptor);
    }
    /* the next intron starts right behind the current exon */
    intronstart = exonend + 1;

    /* the exon itself */
    gt_file_xprintf(outfp,
                    "  Exon %2" GT_WUS " %*" GT_WUS " %*" GT_WUS " (%4" GT_WUS
                    " n); score: %5.3f\n",
                    exonnum + OUTPUTOFFSET, widthforgenpos,
                    SHOWGENPOSAGS(exonstart), widthforgenpos,
                    SHOWGENPOSAGS(exonend),
                    exonend - exonstart + 1, score);
  }
  gt_file_xfputc('\n', outfp);
}
예제 #17
0
/*
  Write the "SCR" line of <ags> to <outfp>: for every element of ags->exons
  its score ("e"), and between two consecutive exons the donor ("d") and
  acceptor ("a") site probability of the corresponding element of
  ags->splicesiteprobs. The line is followed by an empty line.

  If ags->exons is empty, "SCR   ()" is written and no array element is
  accessed.
*/
static void outputSCRline(const GthAGS *ags, GtFile *outfp)
{
  GthSpliceSiteProb *splicesiteprob;
  GtUword i, numofexons = gt_array_size(ags->exons);

  gt_file_xprintf(outfp, "SCR   (");

  /* all exons but the last one, each followed by its splice site
     probabilities; the bound is written as "i + 1 < numofexons" because
     "numofexons - 1" would wrap around for an empty exon array */
  for (i = 0; i + 1 < numofexons; i++) {
    splicesiteprob = (GthSpliceSiteProb*) gt_array_get(ags->splicesiteprobs, i);
    gt_file_xprintf(outfp, "e %5.3f  d %5.3f a %5.3f,",
                    ((GthExonAGS*) gt_array_get(ags->exons, i))->score,
                    splicesiteprob->donorsiteprob,
                    splicesiteprob->acceptorsiteprob);
  }

  /* the last exon (i == numofexons - 1 here, unless there is no exon) */
  if (i < numofexons) {
    gt_file_xprintf(outfp, "e %5.3f",
                    ((GthExonAGS*) gt_array_get(ags->exons, i))->score);
  }
  gt_file_xprintf(outfp, ")\n");
  gt_file_xfputc('\n', outfp);
}
예제 #18
0
/*
  Write the leading, tab separated GFF3 columns of the feature node <fn> to
  <outfp>: sequence id, source, type, start, end, score ('.' if undefined),
  strand and phase. The output ends with a tab; the attribute column and the
  newline are left to the caller. <fn> must not be NULL.
*/
void gt_gff3_output_leading(GtFeatureNode *fn, GtFile *outfp)
{
  GtGenomeNode *node;

  gt_assert(fn);
  node = (GtGenomeNode*) fn;

  /* seqid, source, type, start, end */
  gt_file_xprintf(outfp, "%s\t%s\t%s\t"GT_WU"\t"GT_WU"\t",
                  gt_str_get(gt_genome_node_get_seqid(node)),
                  gt_feature_node_get_source(fn),
                  gt_feature_node_get_type(fn),
                  gt_genome_node_get_start(node),
                  gt_genome_node_get_end(node));

  /* score */
  if (!gt_feature_node_score_is_defined(fn)) {
    gt_file_xfputc('.', outfp);
  }
  else {
    gt_file_xprintf(outfp, "%.3g", gt_feature_node_get_score(fn));
  }

  /* strand and phase */
  gt_file_xprintf(outfp, "\t%c\t%c\t",
                  GT_STRAND_CHARS[gt_feature_node_get_strand(fn)],
                  GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]);
}
예제 #19
0
/*
  Print the "classic" GeneSeqer2 PGS line of <sa> to <outfp>:
  "PGS_<gen id><gen strand>_<ref id><ref strand>", a tab and the comma
  separated genomic exon borders in parentheses, followed by an empty line.

  The exon list is always closed, i.e. an alignment without exons yields
  "...\t()" instead of an unterminated line. <sa> must not be NULL.
*/
static void showpgsline(GthSA *sa, GtFile *outfp)
{
  GtUword i, numofexons;
  gt_assert(sa);
  numofexons = gth_sa_num_of_exons(sa);
  gt_file_xprintf(outfp, "PGS_%s%c_%s%c\t(",
                     gth_sa_gen_id(sa),
                     gth_sa_gen_strand_char(sa),
                     gth_sa_ref_id(sa),
                     gth_sa_ref_strand_char(sa));

  for (i = 0; i < numofexons; i++) {
    /* separator in front of every exon but the first */
    if (i > 0)
      gt_file_xfputc(',', outfp);
    gt_file_xprintf(outfp, ""GT_WU"  "GT_WU"",
                    gth_sa_left_genomic_exon_border(sa, i),
                    gth_sa_right_genomic_exon_border(sa, i));
  }
  /* written outside the loop so that the line is terminated even if there
     are no exons at all */
  gt_file_xprintf(outfp, ")\n\n");
}
예제 #20
0
/*
  Show the header of the spliced alignment <sa> on <outfp>:
    - the headline "Predicted gene structure" (with the shown gDNA segment
      borders if <gs2out> is set),
    - one "Exon" line per exon; every exon but the first is preceded by an
      "Intron" line with donor/acceptor site probabilities (and, for
      DNA_ALPHA alignments, the site scores). Introns not longer than
      <minintronlength> are marked with " ??",
    - the PPA line (DNA_ALPHA alignments only, via showppaline()),
    - the MATCH line and the PGS line.
  Genomic positions are shown with a field width of <widthforgenpos>.
*/
static void showalignmentheader(GthSA *sa, bool gs2out, int widthforgenpos,
                                GtUword minintronlength, GtFile *outfp)
{
  const bool is_dna = gth_sa_alphatype(sa) == DNA_ALPHA;
  GtUword exonnum, numofexons, refleft, refright, intronlength;
  GthDbl exonscore, donorscore, acceptorscore;
  GthFlt donorprob, acceptorprob;
  Exoninfo *exon;
  Introninfo *intron;

  /* headline */
  gt_file_xprintf(outfp, "Predicted gene structure");
  if (!gs2out) {
    gt_file_xprintf(outfp, ":\n");
  }
  else {
    gt_file_xprintf(outfp, " (within gDNA segment "GT_WU" to "GT_WU"):\n",
                    gth_sa_gen_dp_start_show(sa),
                    gth_sa_gen_dp_end_show(sa));
  }
  gt_file_xfputc('\n', outfp);

  numofexons = gth_sa_num_of_exons(sa);
  for (exonnum = 0; exonnum < numofexons; exonnum++) {
    exon      = gth_sa_get_exon(sa, exonnum);
    refleft   = exon->leftreferenceexonborder;
    refright  = exon->rightreferenceexonborder;
    exonscore = exon->exonscore;

    /* the intron in front of the current exon */
    if (exonnum > 0) {
      intron        = gth_sa_get_intron(sa, exonnum - 1);
      donorprob     = intron->donorsiteprobability;
      donorscore    = intron->donorsitescore;
      acceptorprob  = intron->acceptorsiteprobability;
      acceptorscore = intron->acceptorsitescore;
      intronlength  = gth_sa_intron_length(sa, exonnum - 1);

      gt_file_xprintf(outfp, "  Intron %2" GT_WUS " %*" GT_WUS " %*" GT_WUS
                      " (%4" GT_WUS " n); ",
                      exonnum - 1 + OUTPUTOFFSET, widthforgenpos,
                      gth_sa_left_intron_border(sa, exonnum - 1),
                      widthforgenpos,
                      gth_sa_right_intron_border(sa, exonnum - 1),
                      intronlength);

      /* donor site */
      gt_file_xprintf(outfp, "Pd: %5.3f ", donorprob);
      if (!is_dna) {
        gt_file_xprintf(outfp, "  ");
      }
      else if (donorsitescore_is_zero(donorscore)) {
        gt_file_xprintf(outfp, "(s:    0), ");
      }
      else {
        gt_file_xprintf(outfp, "(s: %4.2f), ", donorscore);
      }

      /* acceptor site */
      gt_file_xprintf(outfp, "Pa: %5.3f ", acceptorprob);
      if (is_dna) {
        if (donorsitescore_is_zero(acceptorscore)) {
          gt_file_xprintf(outfp, "(s:    0)");
        }
        else {
          gt_file_xprintf(outfp, "(s: %4.2f)", acceptorscore);
        }
      }

      /* mark introns which are not longer than the minimum intron length */
      if (intronlength <= minintronlength) {
        gt_file_xprintf(outfp, " ??");
      }
      gt_file_xfputc('\n', outfp);
    }

    /* the exon itself */
    gt_file_xprintf(outfp,
                    " Exon %2" GT_WUS " %*" GT_WUS " %*" GT_WUS " (%4" GT_WUS
                    " n);  %s %6" GT_WUS " %6" GT_WUS " (%4" GT_WUS " %s); "
                    "score: %5.3f\n", exonnum + OUTPUTOFFSET, widthforgenpos,
                    gth_sa_left_genomic_exon_border(sa, exonnum),
                    widthforgenpos,
                    gth_sa_right_genomic_exon_border(sa, exonnum),
                    gth_sa_genomic_exon_length(sa, exonnum),
                    gth_sa_alphastring(sa),
                    refleft  + OUTPUTOFFSET,
                    refright + OUTPUTOFFSET,
                    refright - refleft + 1,
                    is_dna ? "n" : "aa", exonscore);
  }

  /* PPA line (only DNA alignments can have one) */
  if (is_dna) {
    showppaline(sa, outfp);
  }
  gt_file_xfputc('\n', outfp);

  /* MATCH line */
  showmatchline(sa, outfp);

  /* PGS line */
  showpgsline(sa, outfp);
}
예제 #21
0
파일: stat.c 프로젝트: 9beckert/TIR
/*
  Show the statistics collected in <stat> on <outfp>.

  The enabled distributions (exon lengths, intron lengths, match numbers,
  reference sequence coverage, spliced alignment statistics) are shown
  first, separated from each other by a line containing only COMMENTCHAR.
  They are followed by the general statistics, the memory statistics (both
  honoring <show_full_stats>), the finishing date and, if any of the
  corresponding counters is set, a list of important messages.

  If <xmlout> is set, the whole output is wrapped into an XML comment.

  NOTE: the match number headline has an unbalanced '(' -- the text is kept
  as is so that the output stays unchanged.
*/
void gth_stat_show(GthStat *stat, bool show_full_stats, bool xmlout,
                   GtFile *outfp)
{
  bool section_shown = false; /* has a distribution been shown already? */
  char *finished;

  gt_assert(stat);

  /* begin XML comment */
  if (xmlout) {
    gt_file_xprintf(outfp, "<!--\n");
  }

  /* exon length distribution */
  if (stat->exondistri) {
    gt_file_xprintf(outfp, "%c length distribution of all exons:\n",
                    COMMENTCHAR);
    gt_disc_distri_show(stat->exondistribution, outfp);
    section_shown = true;
  }

  /* intron length distribution */
  if (stat->introndistri) {
    if (section_shown) {
      gt_file_xprintf(outfp, "%c\n", COMMENTCHAR);
    }
    gt_file_xprintf(outfp, "%c length distribution of all introns:\n",
                    COMMENTCHAR);
    gt_disc_distri_show(stat->introndistribution, outfp);
    section_shown = true;
  }

  /* match number distribution */
  if (stat->matchnumdistri) {
    if (section_shown) {
      gt_file_xprintf(outfp, "%c\n", COMMENTCHAR);
    }
    gt_file_xprintf(outfp, "%c distribution of match numbers (per genomic "
                    "file, per reference sequence:\n", COMMENTCHAR);
    gt_disc_distri_show(stat->matchnumdistribution, outfp);
    section_shown = true;
  }

  /* reference sequence coverage distribution */
  if (stat->refseqcovdistri) {
    if (section_shown) {
      gt_file_xprintf(outfp, "%c\n", COMMENTCHAR);
    }
    gt_file_xprintf(outfp, "%c reference sequence coverage distribution (of "
                    "global chains):\n", COMMENTCHAR);
    gt_disc_distri_show(stat->refseqcoveragedistribution, outfp);
    section_shown = true;
  }

  /* spliced alignment statistics */
  if (stat->sa_stats) {
    if (section_shown) {
      gt_file_xprintf(outfp, "%c\n", COMMENTCHAR);
    }
    INFOCHAR;
    gt_file_xprintf(outfp,
                    "spliced alignment alignment score distribution:\n");
    gt_disc_distri_show(stat->sa_alignment_score_distribution, outfp);
    INFOCHAR;
    gt_file_xfputc('\n', outfp);
    INFOCHAR;
    gt_file_xprintf(outfp, "spliced alignment coverage distribution:\n");
    gt_disc_distri_show(stat->sa_coverage_distribution, outfp);
  }

  /* general statistics */
  outputgeneralstatistics(stat, show_full_stats, outfp);
  INFOCHAR;
  gt_file_xfputc('\n', outfp);

  /* memory statistics */
  outputmemorystatistics(stat, show_full_stats, outfp);

  /* finishing date; the time string has to be freed by us */
  INFOCHAR;
  gt_file_xfputc('\n', outfp);
  INFOCHAR;
  finished = gth_get_time();
  gt_file_xprintf(outfp, "date finished: %s\n", finished);
  gt_free(finished);

  /* important messages, shown only if at least one counter is set */
  if (stat->numofremovedzerobaseexons         ||
      stat->numofautointroncutoutcalls        ||
      stat->numofunsuccessfulintroncutoutDPs  ||
      stat->numoffailedDPparameterallocations ||
      stat->numoffailedmatrixallocations      ||
      stat->numofundeterminedSAs              ||
      stat->numoffilteredpolyAtailmatches) {
    gt_file_xprintf(outfp, "%c\n%c important messages:\n", COMMENTCHAR,
                    COMMENTCHAR);
    if (stat->numofremovedzerobaseexons > 0) {
      gt_file_xprintf(outfp, "%c %lu removed zero base exons\n",
                      COMMENTCHAR, stat->numofremovedzerobaseexons);
    }
    if (stat->numofautointroncutoutcalls > 0) {
      gt_file_xprintf(outfp, "%c %lu times the intron cutout technique was "
                      "used automatically\n", COMMENTCHAR,
                      stat->numofautointroncutoutcalls);
    }
    if (stat->numofunsuccessfulintroncutoutDPs > 0) {
      gt_file_xprintf(outfp, "%c %lu unsuccessful DP calls using intron "
                      "cutout technique\n", COMMENTCHAR,
                      stat->numofunsuccessfulintroncutoutDPs);
    }
    if (stat->numoffailedDPparameterallocations > 0) {
      gt_file_xprintf(outfp, "%c %lu DP parameter allocations failed\n",
                      COMMENTCHAR, stat->numoffailedDPparameterallocations);
    }
    if (stat->numoffailedmatrixallocations > 0) {
      gt_file_xprintf(outfp, "%c %lu matrix allocations failed\n",
                      COMMENTCHAR, stat->numoffailedmatrixallocations);
    }
    if (stat->numofundeterminedSAs > 0) {
      gt_file_xprintf(outfp, "%c %lu undetermined spliced alignments\n",
                      COMMENTCHAR, stat->numofundeterminedSAs);
    }
    if (stat->numoffilteredpolyAtailmatches > 0) {
      gt_file_xprintf(outfp,
                      "%c %lu matches containing a poly(A) tail filtered\n",
                      COMMENTCHAR, stat->numoffilteredpolyAtailmatches);
    }
  }

  /* end XML comment */
  if (xmlout) {
    gt_file_xprintf(outfp, "-->\n");
  }
}
/*
  Tool runner: loads the condenseq named by argv[parsed_args] and writes
  part of its content to arguments->outfp.

  Two output modes exist:
    - position range (arguments->range set, or mode "concat" together with a
      sequence range): the decoded characters of the range are written as
      one stream, wrapped after arguments->width characters if width is
      non-zero, with the first character of arguments->sepchar passed to the
      decoder as separator,
    - sequence range (otherwise): every sequence of arguments->seqrange is
      written as a FASTA entry via gt_fasta_show_entry_nt().
  If neither a position range nor a sequence range is given, all sequences
  are extracted.

  Returns 0 on success and -1 on failure. For range errors <err> is set
  here; if loading fails, <err> is expected to be set by
  gt_condenseq_new_from_file() (not visible here).
*/
static int gt_condenseq_extract_runner(GT_UNUSED int argc,
                                       const char **argv,
                                       int parsed_args,
                                       void *tool_arguments,
                                       GtError *err)
{
  int had_err = 0;
  GtCondenserExtractArguments *arguments = tool_arguments;
  GtCondenseq *condenseq = NULL;
  GtLogger *logger = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  if (!had_err) {
    condenseq = gt_condenseq_new_from_file(argv[parsed_args], logger, err);
    if (condenseq == NULL) {
      had_err = -1;
    }
  }

  if (!had_err) {
    const char *buffer = NULL;
    const char *desc = NULL;
    /* rend/send start out as the upper limits (total length, number of
       sequences) and are later narrowed to the requested range ends */
    GtUword desclen,
            seqlen,
            rend = gt_condenseq_total_length(condenseq),
            send = gt_condenseq_num_of_sequences(condenseq);
    bool concat = strcmp(gt_str_get(arguments->mode), "concat") == 0;
    /* single sequence to extract = range of length 1 */
    if (arguments->seq != GT_UNDEF_UWORD) {
      arguments->seqrange.start = arguments->seqrange.end = arguments->seq;
    }
    /* no range given at all: extract all seqs */
    if (arguments->range.start == GT_UNDEF_UWORD &&
        arguments->seqrange.start == GT_UNDEF_UWORD) {
      arguments->seqrange.start = 0;
      arguments->seqrange.end = send - 1;
    }
    /* if seqs are specified, and concat is given, switch to posrange */
    if (concat && arguments->seqrange.start != GT_UNDEF_UWORD) {
      if (arguments->seqrange.end >= send) {
        had_err = -1;
        gt_error_set(err, "range end " GT_WU " excedes number of sequences "
                     GT_WU " (ranges are zero based sequence ids)",
                     arguments->seqrange.end, send);
      }
      else {
        /* from the first position of the first sequence up to the last
           position of the last sequence of the sequence range */
        arguments->range.start =
          gt_condenseq_seqstartpos(condenseq, arguments->seqrange.start);
        arguments->range.end =
          gt_condenseq_seqstartpos(condenseq, arguments->seqrange.end) +
          gt_condenseq_seqlength(condenseq, arguments->seqrange.end) - 1;
      }
    }
    /* extract sequence region */
    if (!had_err && arguments->range.start != GT_UNDEF_UWORD) {
      const GtUword maxbuffsize = ((GtUword) 1) << 17; /* 2^17 = 131072 */
      GtUword clen,
              rstart,
              current_length = 0, i;
      const char sepchar = gt_str_get(arguments->sepchar)[0];

      if (arguments->range.end >= rend) {
        had_err = -1;
        gt_error_set(err, "range end " GT_WU " excedes length of sequence "
                     GT_WU " (ranges are zero based positions)",
                     arguments->range.end, rend);
      }
      if (!had_err) {
        rstart = arguments->range.start;
        rend = arguments->range.end;
        /* nextlength = gt_condenseq_seqlength(condenseq, seqnum); */
        /* seqstart = gt_condenseq_seqstartpos(condenseq, seqnum); */
        /* gt_assert(rstart >= seqstart); */
        /* nextlength -= rstart - seqstart; [> handle first seq <] */
        /* decode the range chunk by chunk; current_length (the current
           output column) is carried over from chunk to chunk */
        while (rstart <= rend) {
          GtRange cur_range;
          if (rend - rstart > maxbuffsize) {
            /* the chunk ends one position before the start of the sequence
               containing position rstart + maxbuffsize
               NOTE(review): if that sequence starts at or before rstart
               (presumably a single sequence longer than maxbuffsize) the
               assertion below fails, and for seqnum 0 the "- 1" wraps
               around -- confirm how such input is meant to be handled */
            GtUword seqnum = gt_condenseq_pos2seqnum(condenseq,
                                                     rstart + maxbuffsize),
                    closest_sep = gt_condenseq_seqstartpos(condenseq,
                                                           seqnum) - 1;
            gt_assert(closest_sep > rstart);
            clen = closest_sep - rstart + 1;
          }
          else
            clen = rend - rstart + 1; /* the rest fits into one chunk */

          cur_range.start = rstart;
          cur_range.end = rstart + clen - 1;
          buffer = gt_condenseq_extract_decoded_range(condenseq, cur_range,
                                                      sepchar);
          gt_assert(buffer != NULL);
          /* write the chunk, breaking lines after width characters */
          for (i = 0; i < clen; i++, current_length++) {
            if (arguments->width && current_length == arguments->width) {
              gt_file_xfputc('\n', arguments->outfp);
              current_length = 0;
            }
            gt_file_xfputc(buffer[i], arguments->outfp);
          }
          rstart += clen;
        }
        gt_file_xfputc('\n', arguments->outfp);
      }
    }
    else if (!had_err) { /* extract seqwise and always fasta */
      GtUword seqnum,
              sstart = arguments->seqrange.start;

      if (arguments->seqrange.end >= send) {
        had_err = -1;
        gt_error_set(err, "range end " GT_WU " excedes number of sequences "
                     GT_WU " (ranges are zero based sequence ids)",
                     arguments->seqrange.end, send);
      }
      /* from here on send is the last sequence to show; the loop below is
         skipped if the check above failed */
      send = arguments->seqrange.end;
      for (seqnum = sstart;
           !had_err && seqnum <= send;
           ++seqnum) {
        buffer = gt_condenseq_extract_decoded(condenseq, &seqlen, seqnum);
        desc = gt_condenseq_description(condenseq, &desclen, seqnum);
        gt_fasta_show_entry_nt(desc, desclen,
                               buffer, seqlen,
                               arguments->width,
                               arguments->outfp);
      }
    }
  }
  /* condenseq may still be NULL here if loading failed */
  gt_condenseq_delete(condenseq);
  gt_logger_delete(logger);
  return had_err;
}
예제 #23
0
/* Runner of the convertseq tool.

   The command line arguments <argv>[<parsed_args>] .. <argv>[<argc> - 1] are
   taken as input sequence files. They are read through a sequence buffer
   (type guessed from the files) and a sequence iterator. Unless
   <arguments>->showseq is set, every sequence is written to
   <arguments>->outfp as a FASTA entry:

   - with <arguments>->revcomp the reverse complement of the sequence is
     written instead of the sequence itself;
   - with <arguments>->reduce_wc_dna every run of characters other than
     a/c/g/t/u (either case) is collapsed into a single 'N' or 'n' (the case
     follows the first character of the run);
   - with <arguments>->reduce_wc_prot every run of X/B/Z (x/b/z) is collapsed
     into a single 'N' ('n');
   - with <arguments>->fastawidth > 0 a newline is inserted after every
     <fastawidth> characters that were actually written.

   With <arguments>->showflv the length/effective length recorded per input
   file is printed to stderr; with <arguments>->verbose a progress bar is
   shown while reading.

   Returns -1 if the sequence buffer could not be created; otherwise the
   status that terminated the read loop, i.e. the last return value of
   gt_seq_iterator_next() (anything other than 1 ends the loop) or the
   non-zero return value of gt_reverse_complement(). */
static int gt_convertseq_runner(int argc, const char **argv, int parsed_args,
                              void *tool_arguments, GtError *err)
{
  GtConvertseqArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtFilelengthvalues *flv;
  GtSeqIterator *seqit;
  GtSequenceBuffer *sb = NULL;
  GtStrArray *files;
  const GtUchar *sequence;
  char *desc;
  GtUword len, j, pos;
  off_t totalsize;
  gt_error_check(err);
  gt_assert(arguments != NULL);

  /* collect the input file names */
  files = gt_str_array_new();
  for (i = parsed_args; i < argc; i++)
  {
    gt_str_array_add_cstr(files, argv[i]);
  }
  totalsize = gt_files_estimate_total_size(files);

  /* one length record per input file, filled in via the sequence buffer */
  flv = gt_calloc((size_t) gt_str_array_size(files),
                  sizeof (GtFilelengthvalues));

  sb = gt_sequence_buffer_new_guess_type(files, err);
  if (!sb) {
    had_err = -1;
  }
  if (!had_err) {
    gt_sequence_buffer_set_filelengthtab(sb, flv);
    /* read input using seqiterator */
    seqit = gt_seq_iterator_sequence_buffer_new_with_buffer(sb);
    if (arguments->verbose)
    {
      gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                           (GtUint64)
                                                           totalsize),
                           (GtUint64) totalsize);
    }
    while (true)
    {
      GtUchar *seq = NULL;
      desc = NULL;
      j = 0UL; /* number of characters written to the current output line */
      had_err = gt_seq_iterator_next(seqit, &sequence, &len, &desc, err);
      if (had_err != 1)
        break;
      if (arguments->revcomp) {
        /* work on a private copy, the iterator's buffer is const */
        GtUchar *newseq = gt_calloc((size_t) len+1, sizeof (GtUchar));
        memcpy(newseq, sequence, (size_t) len*sizeof (GtUchar));
        had_err = gt_reverse_complement((char*) newseq, len, err);
        if (had_err) {
          /* the copy is not reachable after the loop, release it here */
          gt_free(newseq);
          break;
        }
        seq = newseq;
      } else seq = (GtUchar*) sequence;

      if (!arguments->showseq) {
        bool in_wildcard = false;
        gt_file_xprintf(arguments->outfp, ">%s\n", desc);
        /* index with GtUword: <len> may exceed the range of int */
        for (pos = 0; pos < len; pos++) {
          /* false iff the character is swallowed as part of a wildcard run */
          bool emitted = true;
          if (arguments->reduce_wc_dna) {
            switch (seq[pos]) {
              case 'a':
              case 'A':
              case 'c':
              case 'C':
              case 'g':
              case 'G':
              case 't':
              case 'u':
              case 'T':
              case 'U':
                in_wildcard = false;
                gt_file_xfputc((int) seq[pos], arguments->outfp);
                break;
              default:
                if (!in_wildcard) {
                  in_wildcard = true;
                  if (isupper((int) seq[pos]))
                    gt_file_xfputc((int) 'N', arguments->outfp);
                  else
                    gt_file_xfputc((int) 'n', arguments->outfp);
                }
                else
                  emitted = false;
            }
          }
          else if (arguments->reduce_wc_prot) {
            switch (seq[pos]) {
              case 'X':
              case 'B':
              case 'Z':
                if (!in_wildcard) {
                  in_wildcard = true;
                  gt_file_xfputc((int) 'N', arguments->outfp);
                }
                else
                  emitted = false;
                break;
              case 'x':
              case 'b':
              case 'z':
                if (!in_wildcard) {
                  in_wildcard = true;
                  gt_file_xfputc((int) 'n', arguments->outfp);
                }
                else
                  emitted = false;
                break;
              default:
                in_wildcard = false;
                gt_file_xfputc((int) seq[pos], arguments->outfp);
            }
          }
          else {
            gt_file_xfputc((int) seq[pos], arguments->outfp);
          }
          /* Only characters that were really written count towards the line
             width. Checking the width for swallowed characters as well would
             emit an empty line for each of them directly after a wrap. */
          if (!emitted)
            continue;
          j++;
          if (arguments->fastawidth > 0 && j % arguments->fastawidth == 0) {
            j = 0;
            gt_file_xprintf(arguments->outfp, "\n");
          }
        }
        /* Terminate the last line unless it has just been wrapped. This is
           decided on the written characters (j), not on <len>, because
           wildcard reduction can write fewer than <len> characters. */
        if (arguments->fastawidth == 0 || j != 0)
            gt_file_xprintf(arguments->outfp, "\n");
      }
      if (arguments->revcomp) {
        gt_free(seq);
      }
    }
    if (arguments->showflv) {
      for (j=0;j<gt_str_array_size(files);j++) {
        fprintf(stderr, "file "GT_WU" (%s): "GT_WU"/"GT_WU"\n",
               j,
               gt_str_array_get(files, j),
               (GtUword) flv[j].length,
               (GtUword) flv[j].effectivelength);
      }
    }
    if (arguments->verbose)
    {
      gt_progressbar_stop();
    }
    gt_sequence_buffer_delete(sb);
    gt_seq_iterator_delete(seqit);
  }
  gt_str_array_delete(files);
  gt_free(flv);

  return had_err;
}
예제 #24
0
static void showtranslation(GthSplicedSeq *splicedseq,
                            char *frame0_in,
                            char *frame1_in,
                            char *frame2_in,
                            GtArray *exons,
                            bool gen_strand_forward,
                            unsigned long gen_total_length,
                            unsigned long gen_offset,
                            unsigned int indentlevel,
                            GthOutput *out)
{
  char *dotline, *template_out, *frame0_out, *frame1_out, *frame2_out;
  unsigned long i, exonseparatorwidth =  strlen(EXONSEPARATORSTRING),
                outlen = splicedseq->splicedseqlen +
                         ((gt_array_size(exons) - 1) * exonseparatorwidth) +
                         (splicedseq->splicedseqlen / TRANSLATIONLINEWIDTH);
  GtFile *outfp = out->outfp;

  dotline      = gt_malloc(sizeof (unsigned char) * outlen);
  template_out = gt_malloc(sizeof (unsigned char) * outlen);
  frame0_out   = gt_malloc(sizeof (unsigned char) * outlen);
  frame1_out   = gt_malloc(sizeof (unsigned char) * outlen);
  frame2_out   = gt_malloc(sizeof (unsigned char) * outlen);

  createoutputlines(dotline, template_out, frame0_out, frame1_out, frame2_out,
                    (char*) splicedseq->splicedseq, frame0_in, frame1_in,
                    frame2_in, splicedseq, exonseparatorwidth, outlen,
                    out->gs2out);

  if (out->xmlout) {
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<translation>\n");
    indentlevel++;

    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<gDNA_template>");
    for (i = 0; i < outlen; i++) {
      if (template_out[i] != '\n') {
        gt_file_xfputc(template_out[i], outfp);
      }
    }
    gt_file_xprintf(outfp, "</gDNA_template>\n");

    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<first_frame>");
    for (i = 0; i < outlen; i++) {
      if (frame0_out[i] != '\n') {
        gt_file_xfputc(frame0_out[i], outfp);
      }
    }
    gt_file_xprintf(outfp, "</first_frame>\n");

    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<second_frame>");
    for (i = 0; i < outlen; i++) {
      if (frame1_out[i] != '\n') {
        gt_file_xfputc(frame1_out[i], outfp);
      }
    }
    gt_file_xprintf(outfp, "</second_frame>\n");

    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<third_frame>");
    for (i = 0; i < outlen; i++) {
      if (frame2_out[i] != '\n') {
        gt_file_xfputc(frame2_out[i], outfp);
      }
    }
    gt_file_xprintf(outfp, "</third_frame>\n");

    indentlevel--;
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "</translation>\n");
  }
  else {
    showoutputlines(dotline, template_out, frame0_out, frame1_out, frame2_out,
                    outlen, gen_strand_forward, gen_total_length,
                    gen_offset, splicedseq->positionmapping, out);
  }

  gt_free(dotline);
  gt_free(template_out);
  gt_free(frame0_out);
  gt_free(frame1_out);
  gt_free(frame2_out);
}
/* Runs the dynamic programming (DP) step for every chain stored in
   <chain_collection>.

   For each chain the function
   - checks the limit call_info->firstalshown (if > 0) and stops processing
     once match_info->call_number exceeds it,
   - determines the genomic bounds (whole genomic sequence, or the substring
     given by the user if gth_input_use_substring_spec() is true),
   - fetches the reference sequence(s) belonging to the chain,
   - warns once (tracked in match_info->stop_amino_acid_warning) if a protein
     reference does not end with GT_STOP_AMINO,
   - creates a new GthSA, extends the chain borders and calls call_dna_DP()
     or call_protein_DP(), depending on whether reference file <ref_file_num>
     is DNA.

   Returns 0 on success and -1 as soon as a DP call returns a non-zero value
   other than GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED. In the latter case the
   chain is skipped (statistics in <stat> are updated) and processing
   continues with the next chain. */
static int calc_spliced_alignments(GthSACollection *sa_collection,
                                   GthChainCollection *chain_collection,
                                   GthCallInfo *call_info,
                                   GthInput *input,
                                   GthStat *stat,
                                   GtUword gen_file_num,
                                   GtUword ref_file_num,
                                   bool directmatches,
                                   GthMatchInfo *match_info,
                                   GthDNACompletePathMatrixJT
                                   dna_complete_path_matrix_jt,
                                   GthProteinCompletePathMatrixJT
                                   protein_complete_path_matrix_jt)
{
  /* the *_rc pointers are only set (and only used) for DNA references */
  const unsigned char *ref_seq_tran, *ref_seq_orig, *ref_seq_tran_rc = NULL,
                      *ref_seq_orig_rc = NULL;
  GtUword chainctr, gen_offset = GT_UNDEF_UWORD, gen_total_length,
                ref_total_length;
  GtFile *outfp = call_info->out->outfp;
  GtRange gen_seq_bounds, gen_seq_bounds_rc;
  bool refseqisdna;
  GthChain *chain;
  GtRange range;
  GthSA *saA;
  int rval;

  gt_assert(sa_collection && chain_collection);

  refseqisdna = gth_input_ref_file_is_dna(input, ref_file_num);

  for (chainctr = 0;
       chainctr < gth_chain_collection_size(chain_collection);
       chainctr++) {
       chain = gth_chain_collection_get(chain_collection, chainctr);
    /* The call number is incremented for every chain that gets here, also
       for the one which triggers the limit. A limit of 0 disables the
       check. */
    if (++match_info->call_number > call_info->firstalshown &&
        call_info->firstalshown > 0) {
      /* The notice is framed by empty lines in plain text output, wrapped in
         a comment in XML output and suppressed in GFF3 output. */
      if (!(call_info->out->xmlout || call_info->out->gff3out))
        gt_file_xfputc('\n', outfp);
      else if (call_info->out->xmlout)
        gt_file_xprintf(outfp, "<!--\n");

      if (!call_info->out->gff3out) {
        gt_file_xprintf(outfp, "Maximal matching %s count (%u) reached.\n",
                        refseqisdna ? "EST" : "protein",
                        call_info->firstalshown);
        gt_file_xprintf(outfp, "Only the first %u matches will be "
                           "displayed.\n", call_info->firstalshown);
      }

      if (!(call_info->out->xmlout || call_info->out->gff3out))
        gt_file_xfputc('\n', outfp);
      else if (call_info->out->xmlout)
        gt_file_xprintf(outfp, "-->\n");

      match_info->max_call_number_reached = true;
      break; /* break out of loop */
    }

    /* compute considered genomic regions if not set by -frompos */
    if (!gth_input_use_substring_spec(input)) {
      /* the whole genomic sequence of the chain is considered; the same
         bounds are used for both gen_seq_bounds and gen_seq_bounds_rc */
      gen_seq_bounds = gth_input_get_genomic_range(input, chain->gen_file_num,
                                                   chain->gen_seq_num);
      gen_total_length      = gt_range_length(&gen_seq_bounds);
      gen_offset            = gen_seq_bounds.start;
      gen_seq_bounds_rc     = gen_seq_bounds;
    }
    else {
      /* genomic multiseq contains exactly one sequence */
      gt_assert(gth_input_num_of_gen_seqs(input, chain->gen_file_num) == 1);
      gen_total_length = gth_input_genomic_file_total_length(input,
                                                             chain
                                                             ->gen_file_num);
      gen_seq_bounds.start    = gth_input_genomic_substring_from(input);
      gen_seq_bounds.end      = gth_input_genomic_substring_to(input);
      gen_offset              = 0;
      /* mirror the substring bounds at the end of the sequence (presumably
         the coordinates on the reverse complemented strand) */
      gen_seq_bounds_rc.start = gen_total_length - 1 - gen_seq_bounds.end;
      gen_seq_bounds_rc.end   = gen_total_length - 1 - gen_seq_bounds.start;
    }

    /* "retrieving" the reference sequence */
    range = gth_input_get_reference_range(input, chain->ref_file_num,
                                          chain->ref_seq_num);
    ref_seq_tran = gth_input_current_ref_seq_tran(input) + range.start;
    ref_seq_orig = gth_input_current_ref_seq_orig(input) + range.start;
    if (refseqisdna) {
      ref_seq_tran_rc = gth_input_current_ref_seq_tran_rc(input) + range.start;
      ref_seq_orig_rc = gth_input_current_ref_seq_orig_rc(input) + range.start;
    }
    ref_total_length = range.end - range.start + 1;

    /* check if protein sequences have a stop amino acid */
    /* (the warning is issued at most once, see stop_amino_acid_warning) */
    if (!refseqisdna && !match_info->stop_amino_acid_warning &&
       ref_seq_orig[ref_total_length - 1] != GT_STOP_AMINO) {
      GtStr *ref_id = gt_str_new();
      gth_input_save_ref_id(input, ref_id, chain->ref_file_num,
                            chain->ref_seq_num);
      gt_warning("protein sequence '%s' (#" GT_WU " in file %s) does not end "
                 "with a stop amino acid ('%c'). If it is not a protein "
                 "fragment you should add a stop amino acid to improve the "
                 "prediction. For example with `gt seqtransform "
                 "-addstopaminos` (see http://genometools.org for details).",
                 gt_str_get(ref_id), chain->ref_seq_num,
                 gth_input_get_reference_filename(input, chain->ref_file_num),
                 GT_STOP_AMINO);
      match_info->stop_amino_acid_warning = true;
      gt_str_delete(ref_id);
    }

    /* allocating space for alignment */
    saA = gth_sa_new_and_set(directmatches, true, input, chain->gen_file_num,
                             chain->gen_seq_num, chain->ref_file_num,
                             chain->ref_seq_num, match_info->call_number,
                             gen_total_length, gen_offset, ref_total_length);

    /* extend the DP borders to the left and to the right */
    gth_chain_extend_borders(chain, &gen_seq_bounds, &gen_seq_bounds_rc,
                             gen_total_length, gen_offset);

    /* From here on the dp positions always refer to the forward strand of the
       genomic DNA. */

    /* call the Dynamic Programming */
    if (refseqisdna) {
      rval = call_dna_DP(directmatches, call_info, input, stat,
                         sa_collection, saA, gen_file_num, ref_file_num,
                         gen_total_length, gen_offset, &gen_seq_bounds,
                         &gen_seq_bounds_rc, ref_total_length, range.start,
                         chainctr, gth_chain_collection_size(chain_collection),
                         match_info, ref_seq_tran, ref_seq_orig,
                         ref_seq_tran_rc, ref_seq_orig_rc, chain,
                         dna_complete_path_matrix_jt,
                         protein_complete_path_matrix_jt);
    }
    else {
      rval = call_protein_DP(directmatches, call_info, input,
                             stat, sa_collection, saA, gen_file_num,
                             ref_file_num, gen_total_length, gen_offset,
                             &gen_seq_bounds, &gen_seq_bounds_rc,
                             ref_total_length, range.start, chainctr,
                             gth_chain_collection_size(chain_collection),
                             match_info, ref_seq_tran, ref_seq_orig, chain,
                             dna_complete_path_matrix_jt,
                             protein_complete_path_matrix_jt);
    }
    /* check return value */
    if (rval == GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED) {
      /* statistics bookkeeping */
      gth_stat_increment_numoffailedDPparameterallocations(stat);
      gth_stat_increment_numofundeterminedSAs(stat);
      /* free space */
      gth_sa_delete(saA);
      /* undo the increment from the loop head: the skipped chain must not
         count towards the firstalshown limit */
      match_info->call_number--;
      continue; /* continue with the next DP range */
    }
    /* NOTE(review): saA is not deleted on this path; presumably the DP call
       has taken ownership of it -- confirm in call_dna_DP() and
       call_protein_DP(). */
    else if (rval)
      return -1;
  }

  /* In plain text output (neither XML nor GFF3) and unless <directmatches>
     is set, report that nothing was found if no significant match exists and
     the firstalshown limit was not exceeded. */
  if (!call_info->out->xmlout && !call_info->out->gff3out && !directmatches &&
      !match_info->significant_match_found &&
      match_info->call_number <= call_info->firstalshown) {
    show_no_match_line(gth_input_get_alphatype(input, ref_file_num), outfp);
  }

  return 0;
}