Пример #1
0
static int gt_genomediff_arguments_check(int rest_argc,
        void *tool_arguments,
        GtError *err)
{
    GtGenomediffArguments *arguments = tool_arguments;
    bool prepared_index;
    int had_err = 0;
    gt_error_check(err);
    gt_assert(arguments);

    if (rest_argc == 0) {
        gt_error_set(err, "give at least one file (base)name!");
        had_err = -1;
    }
    if (!had_err) {
        if (strcmp("esa", gt_str_get(arguments->indextype)) == 0)
            arguments->with_esa = true;
        else if (strcmp("pck", gt_str_get(arguments->indextype)) == 0)
            arguments->with_pck = true;
    }
    prepared_index = (arguments->with_esa || arguments->with_pck);

    if (!had_err && arguments->user_max_depth != -1 && !arguments->with_pck)
        gt_warning("option -maxdepth does only apply to -indextype pck");

    if (!had_err &&
            prepared_index && gt_encseq_options_mirrored_value(arguments->loadopts))
        gt_warning("option -mirrored is ignored with esa and pck index");

    if (!had_err && prepared_index && rest_argc > 1) {
        gt_error_set(err, "there should be only one basename argument with "
                     "-indextype esa|pck");
        had_err = -1;
    }
    if (rest_argc == 1 && gt_str_length(arguments->indexname) != 0) {
        gt_error_set(err, "Option -indexname is only needed with sequence files, "
                     "if one file is given as argument, this should be an index.");
        had_err = -1;
    }
    if (!had_err && rest_argc > 1 && gt_str_length(arguments->indexname) == 0) {
        gt_error_set(err, "more than one input file given, please use -indexname "
                     "for basename of indices created during run.");
        had_err = -1;
    }

    if (!had_err)
        arguments->with_units = gt_option_is_set(arguments->ref_unitfile);

    return had_err;
}
static int check_boundaries_visitor_check_rec(GtFeatureNode *parent,
                                              GtFeatureNode *child,
                                              GtError *err)
{
  GtFeatureNodeIterator *fni;
  GtFeatureNode *node;
  GtRange range,
          p_range;
  int had_err = 0;

  range = gt_genome_node_get_range((GtGenomeNode*) child);
  p_range = gt_genome_node_get_range((GtGenomeNode*) parent);

  if (range.start < p_range.start || range.end > p_range.end) {
    gt_warning("%s child range " GT_WU "-" GT_WU " (file %s, line %u) not "
               "contained in %s parent range " GT_WU "-" GT_WU " (file %s, "
               "line %u)",
               gt_feature_node_get_type(child),
               range.start, range.end,
               gt_genome_node_get_filename((GtGenomeNode*) child),
               gt_genome_node_get_line_number((GtGenomeNode*) child),
               gt_feature_node_get_type(parent),
               p_range.start, p_range.end,
               gt_genome_node_get_filename((GtGenomeNode*) parent),
               gt_genome_node_get_line_number((GtGenomeNode*) parent));
  }

  fni = gt_feature_node_iterator_new_direct(child);
  while ((node = gt_feature_node_iterator_next(fni))) {
    had_err = check_boundaries_visitor_check_rec(child, node, err);
  }
  gt_feature_node_iterator_delete(fni);

  return had_err;
}
Пример #3
0
static int gt_splicesiteinfo_runner(int argc, const char **argv,
                                    int parsed_args, void *tool_arguments,
                                    GtError *err)
{
  SpliceSiteInfoArguments *arguments = tool_arguments;
  GtNodeStream *gff3_in_stream = NULL,
               *add_introns_stream = NULL,
               *splice_site_info_stream = NULL;
  GtRegionMapping *region_mapping;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  if (!had_err) {
    /* create gff3 input stream */
    gff3_in_stream = gt_gff3_in_stream_new_unsorted(argc - parsed_args,
                                                    argv + parsed_args);

    /* create region mapping */
    region_mapping = gt_seqid2file_region_mapping_new(arguments->s2fi, err);
    if (!region_mapping)
      had_err = -1;
  }

  if (!had_err) {
    /* create addintrons stream (if necessary) */
    if (arguments->addintrons)
      add_introns_stream = gt_add_introns_stream_new(gff3_in_stream);

    /* create extract feature stream */
    splice_site_info_stream = gt_splice_site_info_stream_new(
                                                          arguments->addintrons
                                                          ? add_introns_stream
                                                          : gff3_in_stream,
                                                          region_mapping);

    /* pull the features through the stream and free them afterwards */
    had_err = gt_node_stream_pull(splice_site_info_stream, err);
  }

  if (!had_err) {
    if (!gt_splice_site_info_stream_show(splice_site_info_stream,
                                         arguments->outfp)) {
      gt_warning("input file(s) contained no intron, use option -addintrons to "
                 "add introns automatically");
    }
  }

  /* free */
  gt_node_stream_delete(splice_site_info_stream);
  gt_node_stream_delete(add_introns_stream);
  gt_node_stream_delete(gff3_in_stream);

  return had_err;
}
Пример #4
0
static int toolbox_iterate(void *key, void *value, void *data,
                           GT_UNUSED GtError *err)
{
  const char *name = key;
  GtToolinfo *toolinfo = value;
  IterateInfo *info = data;
  gt_error_check(err);
  gt_assert(key && value && data);
  if (!toolinfo->hidden) {
    if (toolinfo->tool)
      info->func(name, toolinfo->tool, info->data);
    else
      gt_warning("skipping tool '%s' in iterator (not a GtTool object)", name);
  }
  return 0;
}
Пример #5
0
static int gt_speck_arguments_check(GT_UNUSED int rest_argc,
                                    void *tool_arguments,
                                    GT_UNUSED GtError *err)
{
  SpeccheckArguments *arguments = tool_arguments;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->outfp && arguments->colored) {
    gt_warning("file output requested ('-o'), disabling colored output");
    arguments->colored = false;
  }

  return had_err = 0;
}
Пример #6
0
static int gt_speck_arguments_check(GT_UNUSED int rest_argc,
                                    void *tool_arguments,
                                    GT_UNUSED GtError *err)
{
  SpeccheckArguments *arguments = tool_arguments;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(arguments);

  if ((arguments->outfp || (!arguments->outfp && !isatty(STDOUT_FILENO)))
      && arguments->colored) {
    gt_warning("not printing to terminal, disabling colored output");
    arguments->colored = false;
  }

  return had_err = 0;
}
Пример #7
0
static int determine_outfp(void *data, GtError *err)
{
  GtOutputFileInfo *ofi = (GtOutputFileInfo*) data;
  GtFileMode file_mode;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(ofi);
  if (!gt_str_length(ofi->output_filename))
    *ofi->outfp = NULL; /* no output file given -> use stdout */
  else { /* outputfile given -> create generic file pointer */
    gt_assert(!(ofi->gzip && ofi->bzip2));
    if (ofi->gzip)
      file_mode = GT_FILE_MODE_GZIP;
    else if (ofi->bzip2)
      file_mode = GT_FILE_MODE_BZIP2;
    else
      file_mode = GT_FILE_MODE_UNCOMPRESSED;
    if (file_mode != GT_FILE_MODE_UNCOMPRESSED &&
        strcmp(gt_str_get(ofi->output_filename) +
               gt_str_length(ofi->output_filename) -
               strlen(gt_file_mode_suffix(file_mode)),
               gt_file_mode_suffix(file_mode))) {
      gt_warning("output file '%s' doesn't have correct suffix '%s', appending "
                 "it", gt_str_get(ofi->output_filename),
                 gt_file_mode_suffix(file_mode));
      gt_str_append_cstr(ofi->output_filename, gt_file_mode_suffix(file_mode));
    }
    if (!ofi->force && gt_file_exists(gt_str_get(ofi->output_filename))) {
        gt_error_set(err, "file \"%s\" exists already, use option -%s to "
                     "overwrite", gt_str_get(ofi->output_filename),
                     GT_FORCE_OPT_CSTR);
        had_err = -1;
    }
    if (!had_err) {
      *ofi->outfp = gt_file_xopen_file_mode(file_mode,
                                            gt_str_get(ofi->output_filename),
                                            "w");
      gt_assert(*ofi->outfp);
    }
  }
  return had_err;
}
Пример #8
0
static GtStr* make_id_unique(GtGFF3Visitor *gff3_visitor, GtFeatureNode *fn)
{
  GtUword i = 1;
  GtStr *id = gt_str_new_cstr(gt_feature_node_get_attribute(fn, "ID"));

  if (gt_cstr_table_get(gff3_visitor->used_ids, gt_str_get(id))) {
    GtStr *buf = gt_str_new();
    while (!id_string_is_unique(id, buf, gff3_visitor->used_ids, i++));
    gt_warning("feature ID \"%s\" not unique: changing to %s", gt_str_get(id),
                                                               gt_str_get(buf));
    gt_str_set(id, gt_str_get(buf));
    gt_str_delete(buf);
  }
  /* update table with the new id */
  gt_cstr_table_add(gff3_visitor->used_ids, gt_str_get(id));
  /* store (unique) id */
  gt_hashmap_add(gff3_visitor->feature_node_to_unique_id_str, fn, id);

  return id;
}
Пример #9
0
void gt_lib_init(void)
{
  const char *bookkeeping;
  bookkeeping = getenv("GT_MEM_BOOKKEEPING");
  gt_ma_init(bookkeeping && !strcmp(bookkeeping, "on"));
  proc_env_options();
  if (spacepeak && !(bookkeeping && !strcmp(bookkeeping, "on")))
    gt_warning("GT_ENV_OPTIONS=-spacepeak used without GT_MEM_BOOKKEEPING=on");
  gt_fa_init();
  if (spacepeak) {
    gt_spacepeak_init();
    gt_ma_enable_global_spacepeak();
    gt_fa_enable_global_spacepeak();
  }
  gt_log_init();
  if (showtime) gt_showtime_enable();
  gt_symbol_init();
  gt_class_prealloc_run();
  gt_ya_rand_init(0);
}
Пример #10
0
int gt_ltrdigest_arguments_check(GT_UNUSED int rest_argc, void *tool_arguments,
                                 GtError* err)
{
  GtLTRdigestOptions *arguments = tool_arguments;
  int had_err  = 0;

  if (arguments->nthreads > 0) {
    gt_warning("The '-threads' option is deprecated. Please use the '-j'"
               "option of the 'gt' call instead, e.g.:\n"
               "  gt -j %lu ltrdigest ...", arguments->nthreads);
  }

  /* -trnas */
  if (!had_err && arguments->trna_lib && gt_str_length(arguments->trna_lib) > 0)
  {
    if (!gt_file_exists(gt_str_get(arguments->trna_lib)))
    {
      gt_error_set(err, "File '%s' does not exist!",
                        gt_str_get(arguments->trna_lib));
      had_err = -1;
    }
  }

  if (!had_err)
  {
    GtHMM *hmm;
    GtAlphabet *alpha;
    alpha = gt_alphabet_new_dna();
    hmm = gt_ppt_hmm_new(alpha, &arguments->ppt_opts);
    if (!hmm)
    {
      gt_error_set(err, "PPT HMM parameters are not valid!");
      had_err = -1;
    }
    else
      gt_hmm_delete(hmm);
    gt_alphabet_delete(alpha);
  }

  return had_err;
}
Пример #11
0
static int gtf_show_feature_node(GtFeatureNode *fn, void *data, GtError *err)
{
  GtGTFVisitor *gtf_visitor = (GtGTFVisitor*) data;
  int had_err = 0;
  if (gt_feature_node_has_type(fn, gt_ft_gene)) {
      gtf_visitor->gene_id++;
      gtf_visitor->transcript_id = 0;
      had_err = gtf_show_transcript(fn, gtf_visitor, err);
  }
  else if (gt_feature_node_has_type(fn, gt_ft_mRNA)) {
    had_err = gtf_show_transcript(fn, gtf_visitor, err);
  }
  else if (!(gt_feature_node_has_type(fn, gt_ft_CDS) ||
             gt_feature_node_has_type(fn, gt_ft_exon))) {
      gt_warning("skipping GFF3 feature of type \"%s\" (from line %u in file "
                 "\"%s\")",
                 gt_feature_node_get_type(fn),
                 gt_genome_node_get_line_number((GtGenomeNode*) fn),
                 gt_genome_node_get_filename((GtGenomeNode*) fn));
  }
  return had_err;
}
Пример #12
0
int gt_ltrdigest_arguments_check(GT_UNUSED int rest_argc, void *tool_arguments,
                                 GtError* err)
{
  GtLTRdigestOptions *arguments = tool_arguments;
  int had_err  = 0;

  if (arguments->nthreads > 0) {
    gt_warning("The '-threads' option is deprecated. Please use the '-j'"
               "option of the 'gt' call instead, e.g.:\n"
               "  gt -j "GT_WU" ltrdigest ...", arguments->nthreads);
  }

  /* -trnas */
  if (!had_err && arguments->trna_lib && gt_str_length(arguments->trna_lib) > 0)
  {
    if (!gt_file_exists(gt_str_get(arguments->trna_lib)))
    {
      gt_error_set(err, "File '%s' does not exist!",
                        gt_str_get(arguments->trna_lib));
      had_err = -1;
    }
  }
  return had_err;
}
static int gt_xrf_abbr_parse_tree_tag_line(GtIO *xrf_abbr_file, GtStr *tag,
                                           GtStr *value, GtError *err)
{
  int had_err = 0;
  gt_error_check(err);
  gt_log_log("tag");
  gt_assert(xrf_abbr_file && tag && value);
  do {
    had_err = gt_xrf_abbr_parse_tree_proc_any_char(xrf_abbr_file, tag,
                                                   false, err);
  } while (!had_err && gt_xrf_abbr_parse_tree_any_char(xrf_abbr_file, false));
  if (!had_err)
    had_err = gt_io_expect(xrf_abbr_file, XRF_SEPARATOR_CHAR, err);
  while (!had_err && gt_io_peek(xrf_abbr_file) == XRF_BLANK_CHAR)
    gt_io_next(xrf_abbr_file);
  if (!had_err) {
    do {
      had_err = gt_xrf_abbr_parse_tree_proc_any_char(xrf_abbr_file, value,
                                                     true, err);
    } while (!had_err && gt_xrf_abbr_parse_tree_any_char(xrf_abbr_file, true));
  }
  if (!had_err) {
    if (gt_io_peek(xrf_abbr_file) == XRF_COMMENT_CHAR)
      had_err = gt_xrf_abbr_parse_tree_comment_line(xrf_abbr_file, err);
    else
      had_err = gt_io_expect(xrf_abbr_file, GT_END_OF_LINE, err);
  }
  if (!had_err && !gt_xrf_abbr_parse_tree_valid_label(gt_str_get(tag))) {
    gt_warning("file \"%s\": line "GT_WU": unknown label \"%s\"",
                gt_io_get_filename(xrf_abbr_file),
                gt_io_get_line_number(xrf_abbr_file),
                gt_str_get(tag));
  }
  gt_log_log("parsed line %s/%s", gt_str_get(tag), gt_str_get(value));
  return had_err;
}
Пример #14
0
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
   /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* determine the range and the strand of the mRNA */
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);

  /* TODO: support discontinuous start/stop codons */
  for (i = 0; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    if (gt_feature_node_get_attribute((GtFeatureNode*) gn,
        GTF_PARSER_STOP_CODON_FLAG)) {
      GtUword j;
      GtRange stop_codon_rng = gt_genome_node_get_range(gn);
      bool found_cds = false;
      for (j = 0; !had_err && j < gt_array_size(gt_genome_node_array); j++) {
        GtGenomeNode* gn2;
        GtRange this_rng;
        const char *this_type;
        gn2 = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, j);
        if (gn == gn2) continue;
        this_rng = gt_genome_node_get_range(gn2);
        this_type = gt_feature_node_get_type((GtFeatureNode*) gn2);
        if (this_type == gt_symbol(gt_ft_CDS)) {
          if (gt_range_contains(&this_rng, &stop_codon_rng)) {
            if (cinfo->tidy) {
              gt_warning("stop codon on line %u in file %s is contained in "
                         "CDS in line %u",
                         gt_genome_node_get_line_number(gn),
                         gt_genome_node_get_filename(gn),
                         gt_genome_node_get_line_number(gn2));
              found_cds = true;
            } else {
              gt_error_set(err, "stop codon on line %u in file %s is "
                                "contained in CDS in line %u",
                           gt_genome_node_get_line_number(gn),
                           gt_genome_node_get_filename(gn),
                           gt_genome_node_get_line_number(gn2));
              had_err = -1;
            }
            break;
          }
          if (this_rng.end + 1 == stop_codon_rng.start) {
            this_rng.end = stop_codon_rng.end;
            gt_genome_node_set_range(gn2, &this_rng);
            found_cds = true;
            break;
          }
          if (this_rng.start == stop_codon_rng.end + 1) {
            this_rng.start = stop_codon_rng.start;
            gt_genome_node_set_range(gn2, &this_rng);
            found_cds = true;
            break;
          }
        }
      }
      if (!found_cds) {
        if (!had_err) {
          if (cinfo->tidy) {
            gt_warning("found stop codon on line %u in file %s with no "
                       "flanking CDS, ignoring it",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn));
          } else {
            gt_error_set(err, "found stop codon on line %u in file %s with no "
                              "flanking CDS",
                         gt_genome_node_get_line_number(gn),
                         gt_genome_node_get_filename(gn));
            had_err = -1;
            break;
          }
        }
      } else {
        gt_array_rem(gt_genome_node_array, i);
        gt_genome_node_delete(gn);
      }
    }
  }

  for (i = 1; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    GtStrand strand;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    if (strand != mRNA_strand) {
      gt_error_set(err, "feature %s on line %u has strand %c, but the "
                        "parent transcript has strand %c",
                   (const char*) key,
                   gt_genome_node_get_line_number(gn),
                   GT_STRAND_CHARS[strand],
                   GT_STRAND_CHARS[mRNA_strand]);
      had_err = -1;
      break;
    } else {
      mRNA_strand = gt_strand_join(mRNA_strand, strand);
    }
    if (!had_err && gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                "genomic sequences (``seqname''), although they have the same "
                "gene IDs (``gene_id'') which must be globally unique",
                gt_genome_node_get_line_number(first_node),
                gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "ID", key);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "transcript_id",
                                  key);

    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                              (const char*) key)) && strlen(tname) > 0) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                      tname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gt_genome_node_ref(gn));
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}
Пример #15
0
static int add_ids_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                        GtError *err)
{
  AutomaticSequenceRegion *auto_sr;
  GtAddIDsVisitor *aiv;
  const char *seqid;
  bool is_circular;
  aiv = add_ids_visitor_cast(nv);
  seqid = gt_str_get(gt_genome_node_get_seqid((GtGenomeNode*) fn));
  if (aiv->ensure_sorting && !gt_cstr_table_get(aiv->defined_seqids, seqid)) {
    gt_error_set(err, "the file %s is not sorted (seqid \"%s\" on line %u has "
                 "not been previously introduced with a \"%s\" line)",
                 gt_genome_node_get_filename((GtGenomeNode*) fn), seqid,
                 gt_genome_node_get_line_number((GtGenomeNode*) fn),
                 GT_GFF_SEQUENCE_REGION);
    return -1;
  }
  if (!gt_cstr_table_get(aiv->defined_seqids, seqid)) {
    GtFeatureNodeIterator *fni;
    GtFeatureNode *node;
    GtRange range = gt_genome_node_get_range((GtGenomeNode*) fn);
    is_circular = gt_feature_node_get_attribute(fn, GT_GFF_IS_CIRCULAR)
                  ? true : false;
    if (!is_circular) {
      fni = gt_feature_node_iterator_new(fn);
      while ((node = gt_feature_node_iterator_next(fni))) {
        GtRange node_range = gt_genome_node_get_range((GtGenomeNode*) node);
        range = gt_range_join(&range, &node_range);
      }
      gt_feature_node_iterator_delete(fni);
    }
    /* sequence region has not been previously introduced -> check if one has
       already been created automatically */
    auto_sr = gt_hashmap_get(aiv->undefined_sequence_regions, seqid);
    if (!auto_sr) {
      GtStr *seqid_str;
      /* sequence region has not been createad automatically -> do it now */
      gt_warning("seqid \"%s\" on line %u in file \"%s\" has not been "
                 "previously introduced with a \"%s\" line, create such a line "
                 "automatically", seqid,
                 gt_genome_node_get_line_number((GtGenomeNode*) fn),
                 gt_genome_node_get_filename((GtGenomeNode*) fn),
                 GT_GFF_SEQUENCE_REGION);
      auto_sr = automatic_sequence_region_new(is_circular);
      seqid_str = gt_genome_node_get_seqid((GtGenomeNode*) fn);
      auto_sr->sequence_region = gt_region_node_new(seqid_str, range.start,
                                                               range.end);
      gt_hashmap_add(aiv->undefined_sequence_regions, gt_str_get(seqid_str),
                     auto_sr);
    }
    else {
      if (auto_sr->is_circular) {
        gt_assert(!is_circular); /* XXX */
      }
      else if (is_circular) {
        gt_assert(!auto_sr->is_circular); /* XXX */
        auto_sr->is_circular = true;
        gt_genome_node_set_range(auto_sr->sequence_region, &range);
      }
      else {
        GtRange joined_range,
                sr_range = gt_genome_node_get_range(auto_sr->sequence_region);
        /* update the range of the sequence region */
        joined_range = gt_range_join(&range, &sr_range);
        gt_genome_node_set_range(auto_sr->sequence_region, &joined_range);
      }
    }
    gt_array_add(auto_sr->feature_nodes, fn);
  }
  else
    gt_queue_add(aiv->node_buffer, fn);
  return 0;
}
Пример #16
0
static int condenseq_io(GtCondenseq *condenseq,
                        FILE* fp,
                        GtIOFunc io_func,
                        GtError *err)
{
  int had_err = 0;
  int file_format = GT_CONDENSEQ_VERSION;
  GtUword idx;
  had_err = gt_condenseq_io_one(condenseq->orig_length);
  if (!had_err)
    had_err = gt_condenseq_io_one(file_format);
  if (!had_err && file_format != GT_CONDENSEQ_VERSION) {
    gt_error_set(err, "condenseq index is format version %d, current is "
                 "%d -- please re-encode",
                 file_format, GT_CONDENSEQ_VERSION);
    had_err = -1;
  }
  if (!had_err)
    had_err = gt_condenseq_io_one(condenseq->orig_num_seq);
  if (!had_err)
    had_err = gt_condenseq_io_one(condenseq->ldb_nelems);
  if (!had_err) {
    if (condenseq->ldb_nelems == 0) {
      gt_warning("compression of condenseq did not succeed in finding any "
                 "compressable similarities, maybe the input is to small or "
                 "the chosen parameters should be reconsidered.");
    }
    if (condenseq->links == NULL) {
      condenseq->links = gt_calloc((size_t) condenseq->ldb_nelems,
                                   sizeof (*condenseq->links));
      condenseq->ldb_allocated = condenseq->ldb_nelems;
    }

    had_err = gt_condenseq_io_one(condenseq->udb_nelems);
  }

  if (!had_err) {
    gt_assert(condenseq->udb_nelems > 0);

    if (condenseq->uniques == NULL) {
      condenseq->uniques = gt_malloc(sizeof (*condenseq->uniques) *
                                     condenseq->udb_nelems );
      condenseq->udb_allocated = condenseq->udb_nelems;
    }
  }

  for (idx = 0; !had_err && idx < condenseq->ldb_nelems; idx++) {
    had_err = condenseq_linkentry_io(&condenseq->links[idx], fp, io_func, err);
  }

  for (idx = 0; !had_err && idx < condenseq->udb_nelems; idx++) {
    had_err = condenseq_uniqueentry_io(&condenseq->uniques[idx], fp, io_func,
                                       err);
  }
  if (!had_err && condenseq->orig_num_seq > (GtUword) 1) {
    condenseq->ssptab = gt_intset_io(condenseq->ssptab, fp, err);
    if (condenseq->ssptab == NULL)
      had_err = 1;
  }
  if (!had_err)
    had_err = gt_condenseq_io_one(condenseq->id_len);
  if (!had_err) {
    if (condenseq->id_len == GT_UNDEF_UWORD) {
      condenseq->sdstab = gt_intset_io(condenseq->sdstab, fp, err);
      if (condenseq->sdstab == NULL)
        had_err = 1;
    }
  }
  if (!had_err)
    had_err = gt_condenseq_io_one(condenseq->ids_total_len);
  if (!had_err) {
    condenseq->orig_ids = gt_realloc(condenseq->orig_ids,
                                     (size_t) condenseq->ids_total_len);
    had_err = io_func(condenseq->orig_ids, sizeof (*condenseq->orig_ids),
                      (size_t) condenseq->ids_total_len, fp, err);
  }
  return had_err;
}
Пример #17
0
int gt_gtf_parser_parse(GtGTFParser *parser, GtQueue *genome_nodes,
                        GtStr *filenamestr, GtFile *fpin, bool be_tolerant,
                        GtError *err)
{
  GtStr *seqid_str, *source_str, *line_buffer;
  char *line;
  size_t line_length;
  GtUword i, line_number = 0;
  GtGenomeNode *gn;
  GtRange range;
  GtPhase phase_value;
  GtStrand gt_strand_value;
  GtSplitter *splitter, *attribute_splitter;
  float score_value;
  char *seqname,
       *source,
       *feature,
       *start,
       *end,
       *score,
       *strand,
       *frame,
       *attributes,
       *token,
       *gene_id,
       *gene_name = NULL,
       *transcript_id,
       *transcript_name = NULL,
       **tokens;
  GtHashmap *transcript_id_hash; /* map from transcript id to array of genome
                                    nodes */
  GtArray *gt_genome_node_array;
  ConstructionInfo cinfo;
  GTF_feature_type gtf_feature_type;
  GT_UNUSED bool gff_type_is_valid = false;
  const char *type = NULL;
  const char *filename;
  bool score_is_defined;
  int had_err = 0;

  gt_assert(parser && genome_nodes);
  gt_error_check(err);

  filename = gt_str_get(filenamestr);

  /* alloc */
  line_buffer = gt_str_new();
  splitter = gt_splitter_new(),
  attribute_splitter = gt_splitter_new();

#define HANDLE_ERROR                                                \
        if (had_err) {                                              \
          if (be_tolerant) {                                        \
            fprintf(stderr, "skipping line: %s\n", gt_error_get(err)); \
            gt_error_unset(err);                                       \
            gt_str_reset(line_buffer);                                 \
            had_err = 0;                                            \
            continue;                                               \
          }                                                         \
          else {                                                    \
            had_err = -1;                                           \
            break;                                                  \
          }                                                         \
        }

  while (gt_str_read_next_line_generic(line_buffer, fpin) != EOF) {
    line = gt_str_get(line_buffer);
    line_length = gt_str_length(line_buffer);
    line_number++;
    had_err = 0;

    if (line_length == 0) {
      gt_warning("skipping blank line " GT_WU " in file \"%s\"", line_number,
                 filename);
    }
    else if (line[0] == '#') {
      /* storing comment */
      if (line_length >= 2 && line[1] == '#')
        gn = gt_comment_node_new(line+2); /* store '##' line as '#' line */
      else
        gn = gt_comment_node_new(line+1);
      gt_genome_node_set_origin(gn, filenamestr, line_number);
      gt_queue_add(genome_nodes, gn);
    }
    else {
      /* process tab delimited GTF line */
      gt_splitter_reset(splitter);
      gt_splitter_split(splitter, line, line_length, '\t');
      if (gt_splitter_size(splitter) != 9UL) {
        gt_error_set(err, "line " GT_WU " in file \"%s\" contains " GT_WU
                     " tab (\\t) " "separated fields instead of 9", line_number,
                     filename,
                  gt_splitter_size(splitter));
        had_err = -1;
        break;
      }
      tokens = gt_splitter_get_tokens(splitter);
      seqname    = tokens[0];
      source     = tokens[1];
      feature    = tokens[2];
      start      = tokens[3];
      end        = tokens[4];
      score      = tokens[5];
      strand     = tokens[6];
      frame      = tokens[7];
      attributes = tokens[8];

      /* parse feature */
      if (GTF_feature_type_get(&gtf_feature_type, feature) == -1) {
        /* we skip unknown features */
        fprintf(stderr, "skipping line " GT_WU " in file \"%s\": unknown "
                "feature: \"%s\"\n", line_number, filename, feature);
        gt_str_reset(line_buffer);
        continue;
      }

      /* translate into GFF3 feature type */
      switch (gtf_feature_type) {
        case GTF_CDS:
        case GTF_stop_codon:
          gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker,
                                                       gt_ft_CDS);
          type = gt_ft_CDS;
          break;
        case GTF_exon:
          gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker,
                                                       gt_ft_exon);
          type = gt_ft_exon;
      }
      gt_assert(gff_type_is_valid);

      /* parse the range */
      had_err = gt_parse_range(&range, start, end, line_number, filename, err);
      HANDLE_ERROR;

      /* process seqname (we have to do it here because we need the range) */
      gt_region_node_builder_add_region(parser->region_node_builder, seqname,
                                        range);

      /* parse the score */
      had_err = gt_parse_score(&score_is_defined, &score_value, score,
                               line_number, filename, err);
      HANDLE_ERROR;

      /* parse the strand */
      had_err = gt_parse_strand(&gt_strand_value, strand, line_number, filename,
                               err);
      HANDLE_ERROR;

      /* parse the frame */
      had_err = gt_parse_phase(&phase_value, frame, line_number, filename, err);
      HANDLE_ERROR;

      /* parse the attributes */
      gt_splitter_reset(attribute_splitter);
      gene_id = NULL;
      transcript_id = NULL;
      gt_splitter_split(attribute_splitter, attributes, strlen(attributes),
                        ';');
      for (i = 0; i < gt_splitter_size(attribute_splitter); i++) {
        token = gt_splitter_get_token(attribute_splitter, i);
        /* skip leading blanks */
        while (*token == ' ')
          token++;
        /* look for the two mandatory attributes */
        if (strncmp(token, GENE_ID_ATTRIBUTE, strlen(GENE_ID_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(GENE_ID_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", GENE_ID_ATTRIBUTE, line_number,
                         filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          gene_id = token + strlen(GENE_ID_ATTRIBUTE) + 1;
        }
        else if (strncmp(token, TRANSCRIPT_ID_ATTRIBUTE,
                         strlen(TRANSCRIPT_ID_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(TRANSCRIPT_ID_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          transcript_id = token + strlen(TRANSCRIPT_ID_ATTRIBUTE) + 1;
        }
        else if (strncmp(token, GENE_NAME_ATTRIBUTE,
                         strlen(GENE_NAME_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(GENE_NAME_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", GENE_NAME_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          gene_name = token + strlen(GENE_NAME_ATTRIBUTE) + 1;
          /* for output we want to strip quotes */
          if (*gene_name == '"')
            gene_name++;
          if (gene_name[strlen(gene_name)-1] == '"')
            gene_name[strlen(gene_name)-1] = '\0';
        }
        else if (strncmp(token, TRANSCRIPT_NAME_ATTRIBUTE,
                         strlen(TRANSCRIPT_NAME_ATTRIBUTE)) == 0) {
          if (strlen(token) + 2 < strlen(TRANSCRIPT_NAME_ATTRIBUTE)) {
            gt_error_set(err, "missing value to attribute \"%s\" on line "
                         GT_WU "in file \"%s\"", TRANSCRIPT_NAME_ATTRIBUTE,
                         line_number, filename);
            had_err = -1;
          }
          HANDLE_ERROR;
          transcript_name = token + strlen(TRANSCRIPT_NAME_ATTRIBUTE) + 1;
          /* for output we want to strip quotes */
          if (*transcript_name == '"')
            transcript_name++;
          if (transcript_name[strlen(transcript_name)-1] == '"')
            transcript_name[strlen(transcript_name)-1] = '\0';
        }
      }

      /* check for the mandatory attributes */
      if (!gene_id) {
        gt_error_set(err, "missing attribute \"%s\" on line " GT_WU
                     " in file \"%s\"", GENE_ID_ATTRIBUTE, line_number,
                     filename);
        had_err = -1;
      }
      HANDLE_ERROR;
      if (!transcript_id) {
        gt_error_set(err, "missing attribute \"%s\" on line " GT_WU
                     " in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number,
                     filename);
        had_err = -1;
      }
      HANDLE_ERROR;

      /* process the mandatory attributes */
      if (!(transcript_id_hash = gt_hashmap_get(parser->gene_id_hash,
                                             gene_id))) {
        transcript_id_hash = gt_hashmap_new(GT_HASH_STRING, gt_free_func,
                                            (GtFree) gt_array_delete);
        gt_hashmap_add(parser->gene_id_hash, gt_cstr_dup(gene_id),
                    transcript_id_hash);
      }
      gt_assert(transcript_id_hash);

      if (!(gt_genome_node_array = gt_hashmap_get(transcript_id_hash,
                                            transcript_id))) {
        gt_genome_node_array = gt_array_new(sizeof (GtGenomeNode*));
        gt_hashmap_add(transcript_id_hash, gt_cstr_dup(transcript_id),
                    gt_genome_node_array);
      }
      gt_assert(gt_genome_node_array);

      /* save optional gene_name and transcript_name attributes */
      if (transcript_name
            && !gt_hashmap_get(parser->transcript_id_to_name_mapping,
                             transcript_id)) {
        gt_hashmap_add(parser->transcript_id_to_name_mapping,
                    gt_cstr_dup(transcript_id),
                    gt_cstr_dup(transcript_name));
      }
      if (gene_name && !gt_hashmap_get(parser->gene_id_to_name_mapping,
                                    gene_id)) {
        gt_hashmap_add(parser->gene_id_to_name_mapping,
                    gt_cstr_dup(gene_id),
                    gt_cstr_dup(gene_name));
      }

      /* get seqid */
      seqid_str = gt_hashmap_get(parser->seqid_to_str_mapping, seqname);
      if (!seqid_str) {
        seqid_str = gt_str_new_cstr(seqname);
        gt_hashmap_add(parser->seqid_to_str_mapping, gt_str_get(seqid_str),
                       seqid_str);
      }
      gt_assert(seqid_str);

      /* construct the new feature */
      gn = gt_feature_node_new(seqid_str, type, range.start, range.end,
                                 gt_strand_value);
      gt_genome_node_set_origin(gn, filenamestr, line_number);

      /* set source */
      source_str = gt_hashmap_get(parser->source_to_str_mapping, source);
      if (!source_str) {
        source_str = gt_str_new_cstr(source);
        gt_hashmap_add(parser->source_to_str_mapping, gt_str_get(source_str),
                    source_str);
      }
      gt_assert(source_str);
      gt_feature_node_set_source((GtFeatureNode*) gn, source_str);

      if (score_is_defined)
        gt_feature_node_set_score((GtFeatureNode*) gn, score_value);
      if (phase_value != GT_PHASE_UNDEFINED)
        gt_feature_node_set_phase((GtFeatureNode*) gn, phase_value);
      gt_array_add(gt_genome_node_array, gn);
    }

    gt_str_reset(line_buffer);
  }

  /* process all region nodes */
  if (!had_err)
    gt_region_node_builder_build(parser->region_node_builder, genome_nodes);

  /* process all feature nodes */
  cinfo.genome_nodes = genome_nodes;
  cinfo.gene_id_to_name_mapping = parser->gene_id_to_name_mapping;
  cinfo.transcript_id_to_name_mapping = parser->transcript_id_to_name_mapping;
  if (!had_err) {
    had_err = gt_hashmap_foreach(parser->gene_id_hash, construct_genes,
                              &cinfo, err);
  }

  /* free */
  gt_splitter_delete(splitter);
  gt_splitter_delete(attribute_splitter);
  gt_str_delete(line_buffer);

  return had_err;
}
static int inter_feature_in_children(GtFeatureNode *current_feature, void *data,
                                     GT_UNUSED GtError *err)
{
  GtInterFeatureVisitor *aiv = (GtInterFeatureVisitor*) data;
  GtFeatureNode *inter_node;
  GtRange previous_range, current_range, inter_range;
  GtStrand previous_strand, /*current_strand, */inter_strand;
  GtStr *parent_seqid;
  gt_error_check(err);
  gt_assert(current_feature);
  if (gt_feature_node_has_type(current_feature, aiv->outside_type)) {
    if (aiv->previous_feature) {
      /* determine inter range */
      previous_range = gt_genome_node_get_range((GtGenomeNode*)
                                                aiv->previous_feature);
      current_range = gt_genome_node_get_range((GtGenomeNode*) current_feature);
      if (previous_range.end >= current_range.start) {
        gt_warning("overlapping boundary features " GT_WU "-" GT_WU " and "
                   GT_WU "-" GT_WU ", " "not placing '%s' inter-feature",
                   previous_range.start,
                   previous_range.end,
                   current_range.start,
                   current_range.end,
                   aiv->inter_type);
        return 0;
      }
      if (current_range.start - previous_range.end < 2) {
        gt_warning("no space for inter-feature '%s' between " GT_WU " and "
                   GT_WU,
                   aiv->inter_type,
                   previous_range.end,
                   current_range.start);
        return 0;
      }
      inter_range.start = previous_range.end + 1;
      inter_range.end = current_range.start - 1;

      /* determine inter strand */
      previous_strand = gt_feature_node_get_strand(aiv->previous_feature);
      /*current_strand = gt_feature_node_get_strand(current_feature);*/
      gt_assert(previous_strand == gt_feature_node_get_strand(current_feature));
      inter_strand = previous_strand;

      /* determine sequence id */
      parent_seqid =
        gt_genome_node_get_seqid((GtGenomeNode*) aiv->parent_feature);
      gt_assert(!gt_str_cmp(parent_seqid,
                            gt_genome_node_get_seqid((GtGenomeNode*)
                                                     aiv->previous_feature)));
      gt_assert(!gt_str_cmp(parent_seqid,
                            gt_genome_node_get_seqid((GtGenomeNode*)
                                                     current_feature)));

      /* create inter feature */
      inter_node = (GtFeatureNode*)
                   gt_feature_node_new(parent_seqid, aiv->inter_type,
                                       inter_range.start, inter_range.end,
                                       inter_strand);
      gt_feature_node_add_child(aiv->parent_feature, inter_node);
    }
    aiv->previous_feature = current_feature;
  }
  return 0;
}
Пример #19
0
static int parse_range(GtRange *range, const char *start, const char *end,
                       unsigned int line_number, const char *filename,
                       bool tidy, bool correct_negative, GtError *err)
{
  long start_val, end_val;
  char *ep;

  gt_assert(start && end && filename);
  gt_error_check(err);

  range->start = GT_UNDEF_ULONG;
  range->end = GT_UNDEF_ULONG;

  /* parse and check start */
  errno = 0;
  start_val = strtol(start, &ep, 10);
  if (start[0] == '\0' || *ep != '\0') {
    gt_error_set(err, "could not parse number '%s' on line %u in file '%s'",
                 start, line_number, filename);
    return -1;
  }
  if (errno == ERANGE && (start_val == LONG_MAX || start_val == LONG_MIN)) {
    gt_error_set(err, "number '%s' out of range on line %u in file '%s'", start,
                 line_number, filename);
    return -1;
  }
  if (start_val < 0) {
    if (tidy || correct_negative) {
      gt_warning("start '%s' is negative on line %u in file '%s'; reset to 1",
                 start, line_number, filename);
      start_val = 1;
    }
    else {
      gt_error_set(err, "start '%s' is negative on line %u in file '%s'", start,
                   line_number, filename);
      return -1;
    }
  }
  if (start_val == 0 && tidy) {
    gt_warning("start '%s' is zero on line %u in file '%s' (GFF3 files are "
               "1-based); reset to 1", start, line_number, filename);
    start_val = 1;
  }

  /* parse and check end */
  errno = 0;
  end_val = strtol(end, &ep, 10);
  if (end[0] == '\0' || *ep != '\0') {
    gt_error_set(err, "could not parse number '%s' on line %u in file '%s'",
                 end, line_number, filename);
    return -1;
  }
  if (errno == ERANGE && (end_val == LONG_MAX || end_val == LONG_MIN)) {
    gt_error_set(err, "number '%s' out of range on line %u in file '%s'", end,
                 line_number, filename);
    return -1;
  }
  if (end_val < 0) {
    if (tidy || correct_negative) {
      gt_warning("end '%s' is negative on line %u in file '%s'; reset to 1",
                 end, line_number, filename);
      end_val = 1;
    }
    else {
      gt_error_set(err, "end '%s' is negative on line %u in file '%s'", end,
                   line_number, filename);
      return -1;
    }
  }
  if (end_val == 0 && tidy) {
    gt_warning("end '%s' is zero on line %u in file '%s' (GFF3 files are "
               "1-based); reset to 1", end, line_number, filename);
    end_val = 1;
  }

  /* check range */
  if (start_val > end_val) {
    if (tidy) {
      long tmp_val;
      gt_warning("start '%lu' is larger then end '%lu' on line %u in file "
                 "'%s'; swap them", start_val, end_val, line_number, filename);
      tmp_val = start_val;
      start_val = end_val;
      end_val = tmp_val;
    }
    else {
      gt_error_set(err, "start '%lu' is larger then end '%lu' on line %u in "
                   "file '%s'", start_val, end_val, line_number, filename);
      return -1;
    }
  }

  /* set result */
  range->start = start_val;
  range->end = end_val;

  return 0;
}
Пример #20
0
static int gt_seqorder_runner(GT_UNUSED int argc, const char **argv,
    int parsed_args, void *tool_arguments, GtError *err)
{
  GtSeqorderArguments *arguments = tool_arguments;
  int had_err = 0;
  GtEncseq *encseq;
  GtEncseqLoader *loader;
  unsigned long i, nofseqs;

  gt_error_check(err);
  gt_assert(arguments != NULL);

  /* load encseq */
  loader = gt_encseq_loader_new();
  encseq = gt_encseq_loader_load(loader, argv[parsed_args], err);
  if (encseq == NULL)
    had_err = -1;
  if (had_err == 0 && !gt_encseq_has_description_support(encseq))
    gt_warning("%s has no description support", argv[parsed_args]);
  if (!had_err)
  {
    nofseqs = gt_encseq_num_of_sequences(encseq);
    if (arguments->invert)
    {
      for (i = nofseqs; i > 0; i--)
        gt_seqorder_output(i - 1, encseq);
    }
    else if (arguments->shuffle)
    {
      unsigned long *seqnums;
      seqnums = gt_malloc(sizeof (unsigned long) * nofseqs);
      gt_seqorder_get_shuffled_seqnums(nofseqs, seqnums);
      for (i = 0; i < nofseqs; i++)
        gt_seqorder_output(seqnums[i], encseq);
      gt_free(seqnums);
    }
    else
    {
      GtSuffixsortspace *suffixsortspace;
      gt_assert(arguments->sort || arguments->revsort);
      suffixsortspace
        = gt_suffixsortspace_new(nofseqs,
                                 /* Use iterator over sequence separators:
                                    saves a lot of binary searches */
                                 gt_encseq_seqstartpos(encseq, nofseqs-1),
                                 false,NULL);
      gt_seqorder_sort(suffixsortspace, encseq);
      if (arguments->sort)
        for (i = 0; i < nofseqs; i++)
          gt_seqorder_output(gt_encseq_seqnum(encseq,
                gt_suffixsortspace_getdirect(suffixsortspace, i)), encseq);
      else
        for (i = nofseqs; i > 0; i--)
          gt_seqorder_output(gt_encseq_seqnum(encseq,
                gt_suffixsortspace_getdirect(suffixsortspace, i - 1)), encseq);
      gt_suffixsortspace_delete(suffixsortspace, false);
    }
  }

  gt_encseq_loader_delete(loader);
  gt_encseq_delete(encseq);
  return had_err;
}
static int calc_spliced_alignments(GthSACollection *sa_collection,
                                   GthChainCollection *chain_collection,
                                   GthCallInfo *call_info,
                                   GthInput *input,
                                   GthStat *stat,
                                   GtUword gen_file_num,
                                   GtUword ref_file_num,
                                   bool directmatches,
                                   GthMatchInfo *match_info,
                                   GthDNACompletePathMatrixJT
                                   dna_complete_path_matrix_jt,
                                   GthProteinCompletePathMatrixJT
                                   protein_complete_path_matrix_jt)
{
  const unsigned char *ref_seq_tran, *ref_seq_orig, *ref_seq_tran_rc = NULL,
                      *ref_seq_orig_rc = NULL;
  GtUword chainctr, gen_offset = GT_UNDEF_UWORD, gen_total_length,
                ref_total_length;
  GtFile *outfp = call_info->out->outfp;
  GtRange gen_seq_bounds, gen_seq_bounds_rc;
  bool refseqisdna;
  GthChain *chain;
  GtRange range;
  GthSA *saA;
  int rval;

  gt_assert(sa_collection && chain_collection);

  refseqisdna = gth_input_ref_file_is_dna(input, ref_file_num);

  for (chainctr = 0;
       chainctr < gth_chain_collection_size(chain_collection);
       chainctr++) {
       chain = gth_chain_collection_get(chain_collection, chainctr);
    if (++match_info->call_number > call_info->firstalshown &&
        call_info->firstalshown > 0) {
      if (!(call_info->out->xmlout || call_info->out->gff3out))
        gt_file_xfputc('\n', outfp);
      else if (call_info->out->xmlout)
        gt_file_xprintf(outfp, "<!--\n");

      if (!call_info->out->gff3out) {
        gt_file_xprintf(outfp, "Maximal matching %s count (%u) reached.\n",
                        refseqisdna ? "EST" : "protein",
                        call_info->firstalshown);
        gt_file_xprintf(outfp, "Only the first %u matches will be "
                           "displayed.\n", call_info->firstalshown);
      }

      if (!(call_info->out->xmlout || call_info->out->gff3out))
        gt_file_xfputc('\n', outfp);
      else if (call_info->out->xmlout)
        gt_file_xprintf(outfp, "-->\n");

      match_info->max_call_number_reached = true;
      break; /* break out of loop */
    }

    /* compute considered genomic regions if not set by -frompos */
    if (!gth_input_use_substring_spec(input)) {
      gen_seq_bounds = gth_input_get_genomic_range(input, chain->gen_file_num,
                                                   chain->gen_seq_num);
      gen_total_length      = gt_range_length(&gen_seq_bounds);
      gen_offset            = gen_seq_bounds.start;
      gen_seq_bounds_rc     = gen_seq_bounds;
    }
    else {
      /* genomic multiseq contains exactly one sequence */
      gt_assert(gth_input_num_of_gen_seqs(input, chain->gen_file_num) == 1);
      gen_total_length = gth_input_genomic_file_total_length(input,
                                                             chain
                                                             ->gen_file_num);
      gen_seq_bounds.start    = gth_input_genomic_substring_from(input);
      gen_seq_bounds.end      = gth_input_genomic_substring_to(input);
      gen_offset              = 0;
      gen_seq_bounds_rc.start = gen_total_length - 1 - gen_seq_bounds.end;
      gen_seq_bounds_rc.end   = gen_total_length - 1 - gen_seq_bounds.start;
    }

    /* "retrieving" the reference sequence */
    range = gth_input_get_reference_range(input, chain->ref_file_num,
                                          chain->ref_seq_num);
    ref_seq_tran = gth_input_current_ref_seq_tran(input) + range.start;
    ref_seq_orig = gth_input_current_ref_seq_orig(input) + range.start;
    if (refseqisdna) {
      ref_seq_tran_rc = gth_input_current_ref_seq_tran_rc(input) + range.start;
      ref_seq_orig_rc = gth_input_current_ref_seq_orig_rc(input) + range.start;
    }
    ref_total_length = range.end - range.start + 1;

    /* check if protein sequences have a stop amino acid */
    if (!refseqisdna && !match_info->stop_amino_acid_warning &&
       ref_seq_orig[ref_total_length - 1] != GT_STOP_AMINO) {
      GtStr *ref_id = gt_str_new();
      gth_input_save_ref_id(input, ref_id, chain->ref_file_num,
                            chain->ref_seq_num);
      gt_warning("protein sequence '%s' (#" GT_WU " in file %s) does not end "
                 "with a stop amino acid ('%c'). If it is not a protein "
                 "fragment you should add a stop amino acid to improve the "
                 "prediction. For example with `gt seqtransform "
                 "-addstopaminos` (see http://genometools.org for details).",
                 gt_str_get(ref_id), chain->ref_seq_num,
                 gth_input_get_reference_filename(input, chain->ref_file_num),
                 GT_STOP_AMINO);
      match_info->stop_amino_acid_warning = true;
      gt_str_delete(ref_id);
    }

    /* allocating space for alignment */
    saA = gth_sa_new_and_set(directmatches, true, input, chain->gen_file_num,
                             chain->gen_seq_num, chain->ref_file_num,
                             chain->ref_seq_num, match_info->call_number,
                             gen_total_length, gen_offset, ref_total_length);

    /* extend the DP borders to the left and to the right */
    gth_chain_extend_borders(chain, &gen_seq_bounds, &gen_seq_bounds_rc,
                             gen_total_length, gen_offset);

    /* From here on the dp positions always refer to the forward strand of the
       genomic DNA. */

    /* call the Dynamic Programming */
    if (refseqisdna) {
      rval = call_dna_DP(directmatches, call_info, input, stat,
                         sa_collection, saA, gen_file_num, ref_file_num,
                         gen_total_length, gen_offset, &gen_seq_bounds,
                         &gen_seq_bounds_rc, ref_total_length, range.start,
                         chainctr, gth_chain_collection_size(chain_collection),
                         match_info, ref_seq_tran, ref_seq_orig,
                         ref_seq_tran_rc, ref_seq_orig_rc, chain,
                         dna_complete_path_matrix_jt,
                         protein_complete_path_matrix_jt);
    }
    else {
      rval = call_protein_DP(directmatches, call_info, input,
                             stat, sa_collection, saA, gen_file_num,
                             ref_file_num, gen_total_length, gen_offset,
                             &gen_seq_bounds, &gen_seq_bounds_rc,
                             ref_total_length, range.start, chainctr,
                             gth_chain_collection_size(chain_collection),
                             match_info, ref_seq_tran, ref_seq_orig, chain,
                             dna_complete_path_matrix_jt,
                             protein_complete_path_matrix_jt);
    }
    /* check return value */
    if (rval == GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED) {
      /* statistics bookkeeping */
      gth_stat_increment_numoffailedDPparameterallocations(stat);
      gth_stat_increment_numofundeterminedSAs(stat);
      /* free space */
      gth_sa_delete(saA);
      match_info->call_number--;
      continue; /* continue with the next DP range */
    }
    else if (rval)
      return -1;
  }

  if (!call_info->out->xmlout && !call_info->out->gff3out && !directmatches &&
      !match_info->significant_match_found &&
      match_info->call_number <= call_info->firstalshown) {
    show_no_match_line(gth_input_get_alphatype(input, ref_file_num), outfp);
  }

  return 0;
}
Пример #22
0
static int check_cds_phases(GtArray *cds_features, GtCDSCheckVisitor *v,
                            bool is_multi, bool second_pass, GtError *err)
{
  GtPhase current_phase, correct_phase = GT_PHASE_ZERO;
  GtFeatureNode *fn;
  GtStrand strand;
  unsigned long i, current_length;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(cds_features);
  gt_assert(gt_array_size(cds_features));
  fn = *(GtFeatureNode**) gt_array_get_first(cds_features);
  strand = gt_feature_node_get_strand(fn);
  if (strand == GT_STRAND_REVERSE)
    gt_array_reverse(cds_features);
  for (i = 0; !had_err && i < gt_array_size(cds_features); i++) {
    fn = *(GtFeatureNode**) gt_array_get(cds_features, i);
    /* the first phase can be anything (except being undefined), because the
       GFF3 spec says:

       NOTE 4 - CDS features MUST have have a defined phase field. Otherwise it
       is not possible to infer the correct polypeptides corresponding to
       partially annotated genes. */
    if ((!i && gt_feature_node_get_phase(fn) == GT_PHASE_UNDEFINED) ||
        (i && gt_feature_node_get_phase(fn) != correct_phase)) {
      if (gt_hashmap_get(v->cds_features, fn)) {
        if (v->tidy && !is_multi && !gt_feature_node_has_children(fn)) {
          /* we can split the feature */
          gt_warning("%s feature on line %u in file \"%s\" has multiple "
                     "parents which require different phases; split feature",
                     gt_ft_CDS,
                     gt_genome_node_get_line_number((GtGenomeNode*) fn),
                     gt_genome_node_get_filename((GtGenomeNode*) fn));
          gt_hashmap_add(v->cds_features_to_split, fn, fn);
          v->splitting_is_necessary = true; /* split later */
        }
        else {
          gt_error_set(err, "%s feature on line %u in file \"%s\" has multiple "
                       "parents which require different phases",
                       gt_ft_CDS,
                       gt_genome_node_get_line_number((GtGenomeNode*) fn),
                       gt_genome_node_get_filename((GtGenomeNode*) fn));
          had_err = -1;
        }
      }
      else {
        if (v->tidy) {
          if (!second_pass) {
            gt_warning("%s feature on line %u in file \"%s\" has the wrong "
                       "phase %c -> correcting it to %c", gt_ft_CDS,
                       gt_genome_node_get_line_number((GtGenomeNode*) fn),
                       gt_genome_node_get_filename((GtGenomeNode*) fn),
                       GT_PHASE_CHARS[gt_feature_node_get_phase(fn)],
                       GT_PHASE_CHARS[correct_phase]);
          }
          gt_feature_node_set_phase(fn, correct_phase);
        }
        else {
          gt_error_set(err, "%s feature on line %u in file \"%s\" has the "
                       "wrong phase %c (should be %c)", gt_ft_CDS,
                       gt_genome_node_get_line_number((GtGenomeNode*) fn),
                       gt_genome_node_get_filename((GtGenomeNode*) fn),
                       GT_PHASE_CHARS[gt_feature_node_get_phase(fn)],
                       GT_PHASE_CHARS[correct_phase]);
          had_err = -1;
        }
      }
    }
    if (!had_err) {
      current_phase = gt_feature_node_get_phase(fn);
      current_length = gt_genome_node_get_length((GtGenomeNode*) fn);
      correct_phase = (3 - (current_length - current_phase) % 3) % 3;
      gt_hashmap_add(v->cds_features, fn, fn); /* record CDS feature */
    }
  }
  return had_err;
}
Пример #23
0
static int gt_seqtranslate_runner(int argc, const char **argv, int parsed_args,
                              void *tool_arguments, GT_UNUSED GtError *err)
{
  GtTranslateArguments *arguments = tool_arguments;
  GtSeqIterator *si = NULL;
  GtSequenceBuffer *sb = NULL;
  GtStrArray *infiles;
  int had_err = 0,
      rval,
      i;
  GtStr *translations[3];
  translations[0] = gt_str_new();
  translations[1] = gt_str_new();
  translations[2] = gt_str_new();

  gt_error_check(err);
  gt_assert(arguments);

  infiles = gt_str_array_new();
  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(infiles, argv[i]);
  }
  sb = gt_sequence_buffer_new_guess_type(infiles, err);
  if (!sb)
    had_err = -1;
  if (!had_err) {
    si = gt_seq_iterator_sequence_buffer_new_with_buffer(sb);
    if (!si)
      had_err = -1;
  }
  if (!had_err) {
    char *desc;
    const GtUchar *sequence;
    GtUword len;
    while (!had_err && (rval = gt_seq_iterator_next(si,
                                                   &sequence,
                                                   &len, &desc, err))) {
      if (rval < 0) {
        had_err = -1;
        break;
      }
      if (len < GT_CODON_LENGTH) {
        gt_warning("sequence '%s' is shorter than codon length of %d, skipping",
                   desc, GT_CODON_LENGTH);
      } else {
        had_err = gt_seqtranslate_do_translation(arguments, (char*) sequence,
                                                 len, desc,
                                                 translations, false, err);
        if (!had_err && arguments->reverse) {
          char *revseq = gt_cstr_dup_nt((char*) sequence, len);
          had_err = gt_reverse_complement(revseq, len, err);
          if (!had_err) {
            had_err = gt_seqtranslate_do_translation(arguments, revseq, len,
                                                  desc, translations, true,
                                                  err);
          }
          gt_free(revseq);
        }
      }
    }
  }
  gt_str_delete(translations[0]);
  gt_str_delete(translations[1]);
  gt_str_delete(translations[2]);
  gt_str_array_delete(infiles);
  gt_seq_iterator_delete(si);
  gt_sequence_buffer_delete(sb);
  return had_err;
}
Пример #24
0
static int output_sequence(GtEncseq *encseq, GtEncseqDecodeArguments *args,
                           const char *filename, GtError *err)
{
  GtUword i, j, sfrom, sto;
  int had_err = 0;
  bool has_desc;
  GtEncseqReader *esr;
  gt_assert(encseq);

  if (!(has_desc = gt_encseq_has_description_support(encseq)))
    gt_warning("Missing description support for file %s", filename);

  if (strcmp(gt_str_get(args->mode), "fasta") == 0) {
    /* specify a single sequence to extract */
    if (args->seq != GT_UNDEF_UWORD) {
      if (args->seq >= gt_encseq_num_of_sequences(encseq)) {
        gt_error_set(err,
                     "requested sequence "GT_WU" exceeds number of sequences "
                     "("GT_WU")", args->seq,
                     gt_encseq_num_of_sequences(encseq));
        return -1;
      }
      sfrom = args->seq;
      sto = args->seq + 1;
    } else if (args->seqrng.start != GT_UNDEF_UWORD
                 && args->seqrng.end != GT_UNDEF_UWORD) {
      /* specify a sequence range to extract */
      if (args->seqrng.start >= gt_encseq_num_of_sequences(encseq)
            || args->seqrng.end >= gt_encseq_num_of_sequences(encseq)) {
        gt_error_set(err,
                     "range "GT_WU"-"GT_WU" includes a sequence number "
                     "exceeding the total number of sequences ("GT_WU")",
                     args->seqrng.start,
                     args->seqrng.end,
                     gt_encseq_num_of_sequences(encseq));
        return -1;
      }
      sfrom = args->seqrng.start;
      sto = args->seqrng.end + 1;
    } else {
      /* extract all sequences */
      sfrom = 0;
      sto = gt_encseq_num_of_sequences(encseq);
    }
    for (i = sfrom; i < sto; i++) {
      GtUword desclen, startpos, len;
      char buf[BUFSIZ];
      const char *desc = NULL;
      /* XXX: maybe make this distinction in the functions via readmode? */
      if (!GT_ISDIRREVERSE(args->rm)) {
        startpos = gt_encseq_seqstartpos(encseq, i);
        len = gt_encseq_seqlength(encseq, i);
        if (has_desc) {
          desc = gt_encseq_description(encseq, &desclen, i);
        } else {
          (void) snprintf(buf, BUFSIZ, "sequence "GT_WU"", i);
          desclen = strlen(buf);
          desc = buf;
        }
      } else {
        startpos = gt_encseq_seqstartpos(encseq, i);
        len = gt_encseq_seqlength(encseq,
                                  gt_encseq_num_of_sequences(encseq)-1-i);
        startpos = gt_encseq_total_length(encseq)
                     - (gt_encseq_seqstartpos(encseq,
                                              gt_encseq_num_of_sequences(
                                                encseq)-1-i) + len);
        if (has_desc) {
          desc = gt_encseq_description(encseq,
                                       &desclen,
                                       gt_encseq_num_of_sequences(encseq)-1-i);
        } else {
          (void) snprintf(buf, BUFSIZ, "sequence "GT_WU"", i);
          desclen = strlen(buf);
          desc = buf;
        }
      }
      gt_assert(desc);
      /* output description */
      gt_xfputc(GT_FASTA_SEPARATOR, stdout);
      gt_xfwrite(desc, 1, desclen, stdout);
      gt_xfputc('\n', stdout);
      /* XXX: make this more efficient by writing in a buffer first and then
         showing the result */
      if (args->singlechars) {
        for (j = 0; j < len; j++) {
           gt_xfputc(gt_encseq_get_decoded_char(encseq,
                                                startpos + j,
                                                args->rm),
                     stdout);
        }
      } else {
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm, startpos);
        for (j = 0; j < len; j++) {
           gt_xfputc(gt_encseq_reader_next_decoded_char(esr), stdout);
        }
        gt_encseq_reader_delete(esr);
      }
      gt_xfputc('\n', stdout);
    }
  }

  if (strcmp(gt_str_get(args->mode), "concat") == 0) {
    GtUword from = 0,
                  to = gt_encseq_total_length(encseq) - 1;
    if (args->rng.start != GT_UNDEF_UWORD && args->rng.end != GT_UNDEF_UWORD) {
      if (args->rng.end > to) {
        had_err = -1;
        gt_error_set(err,
                     "end of range ("GT_WU") exceeds encoded sequence length "
                     "("GT_WU")", args->rng.end, to);
      }
      if (!had_err) {
        from = args->rng.start;
        to = args->rng.end;
      }
    }
    if (!had_err) {
      if (args->singlechars) {
        for (j = from; j <= to; j++) {
          char cc = gt_encseq_get_decoded_char(encseq, j, args->rm);
          if (cc == (char) SEPARATOR)
            cc = gt_str_get(args->sepchar)[0];
          gt_xfputc(cc, stdout);
        }
      } else {
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm, from);
        if (esr) {
          for (j = from; j <= to; j++) {
            char cc = gt_encseq_reader_next_decoded_char(esr);
            if (cc == (char) SEPARATOR)
              cc = gt_str_get(args->sepchar)[0];
            gt_xfputc(cc, stdout);
          }
          gt_encseq_reader_delete(esr);
        }
      }
      gt_xfputc('\n', stdout);
    }
  }
  return had_err;
}
Пример #25
0
static int gt_sketch_page_runner(GT_UNUSED int argc,
                                 const char **argv,
                                 int parsed_args,
                                 void *tool_arguments,
                                 GtError *err)
{
  SketchPageArguments *arguments = tool_arguments;
  int had_err = 0;
  GtFeatureIndex *features = NULL;
  GtRange qry_range, sequence_region_range;
  GtStyle *sty = NULL;
  GtStr *prog, *gt_style_file;
  GtDiagram *d = NULL;
  GtLayout *l = NULL;
  GtBioseq *bioseq = NULL;
  GtCanvas *canvas = NULL;
  const char *seqid = NULL, *outfile;
  unsigned long start, height, num_pages = 0;
  double offsetpos, usable_height;
  cairo_surface_t *surf = NULL;
  cairo_t *cr = NULL;
  GtTextWidthCalculator *twc;
  gt_error_check(err);

  features = gt_feature_index_memory_new();

  if (cairo_version() < CAIRO_VERSION_ENCODE(1, 8, 6))
    gt_warning("Your cairo library (version %s) is older than version 1.8.6! "
               "These versions contain a bug which may result in "
               "corrupted PDF output!", cairo_version_string());

  /* get style */
  sty = gt_style_new(err);
  if (gt_str_length(arguments->stylefile) == 0)
  {
    prog = gt_str_new();
    gt_str_append_cstr_nt(prog, argv[0],
                          gt_cstr_length_up_to_char(argv[0], ' '));
    gt_style_file = gt_get_gtdata_path(gt_str_get(prog), err);
    gt_str_delete(prog);
    gt_str_append_cstr(gt_style_file, "/sketch/default.style");
  }
  else
  {
    gt_style_file = gt_str_ref(arguments->stylefile);
  }
  had_err = gt_style_load_file(sty, gt_str_get(gt_style_file), err);

  outfile = argv[parsed_args];
  if (!had_err)
  {
    /* get features */
    had_err = gt_feature_index_add_gff3file(features, argv[parsed_args+1], err);
     if (!had_err && gt_str_length(arguments->seqid) == 0) {
      seqid = gt_feature_index_get_first_seqid(features);
      if (seqid == NULL)
      {
        gt_error_set(err, "GFF input file must contain a sequence region!");
        had_err = -1;
      }
    }
    else if (!had_err
               && !gt_feature_index_has_seqid(features,
                                              gt_str_get(arguments->seqid)))
    {
      gt_error_set(err, "sequence region '%s' does not exist in GFF input file",
                   gt_str_get(arguments->seqid));
      had_err = -1;
    }
    else if (!had_err)
      seqid = gt_str_get(arguments->seqid);
  }

  /* set text */
  if (gt_str_length(arguments->text) == 0)
  {
    gt_str_delete(arguments->text);
    arguments->text = gt_str_new_cstr(argv[parsed_args+1]);
  }

  if (!had_err)
  {
    /* set display range */
    gt_feature_index_get_range_for_seqid(features, &sequence_region_range,
                                         seqid);
    qry_range.start = (arguments->range.start == GT_UNDEF_ULONG ?
                         sequence_region_range.start :
                         arguments->range.start);
    qry_range.end   = (arguments->range.end == GT_UNDEF_ULONG ?
                         sequence_region_range.end :
                         arguments->range.end);

    /* set output format */
    if (strcmp(gt_str_get(arguments->format), "pdf") == 0)
    {
      surf = cairo_pdf_surface_create(outfile,
                                      mm_to_pt(arguments->pwidth),
                                      mm_to_pt(arguments->pheight));
    }
    else if (strcmp(gt_str_get(arguments->format), "ps") == 0)
    {
      surf =  cairo_ps_surface_create(outfile,
                                      mm_to_pt(arguments->pwidth),
                                      mm_to_pt(arguments->pheight));
    }
    gt_log_log("created page with %.2f:%.2f dimensions\n",
                                                  mm_to_pt(arguments->pwidth),
                                                  mm_to_pt(arguments->pheight));

    offsetpos = TEXT_SPACER + arguments->theight + TEXT_SPACER;
    usable_height = mm_to_pt(arguments->pheight)
                              - arguments->theight
                              - arguments->theight
                              - 4*TEXT_SPACER;

    if (gt_str_length(arguments->seqfile) > 0) {
      bioseq = gt_bioseq_new(gt_str_get(arguments->seqfile), err);
    }

    cr = cairo_create(surf);
    cairo_set_font_size(cr, 8);
    twc = gt_text_width_calculator_cairo_new(cr, sty);
    for (start = qry_range.start; start <= qry_range.end;
         start += arguments->width)
    {
      GtRange single_range;
      GtCustomTrack *ct = NULL;
      const char *seq;
      single_range.start = start;
      single_range.end = start + arguments->width;

      if (had_err)
        break;

      d = gt_diagram_new(features, seqid, &single_range, sty, err);
      if (!d) {
        had_err = -1;
        break;
      }
      if (bioseq) {
        seq = gt_bioseq_get_sequence(bioseq, 0);
        ct = gt_custom_track_gc_content_new(seq,
                                      gt_bioseq_get_sequence_length(bioseq, 0),
                                      800, 70, 0.4, true);
        gt_diagram_add_custom_track(d, ct);
      }

      l = gt_layout_new_with_twc(d, mm_to_pt(arguments->width), sty, twc, err);
      had_err = gt_layout_get_height(l, &height, err);
      if (!had_err) {
        if (gt_double_smaller_double(usable_height - 10 - 2*TEXT_SPACER
              - arguments->theight, offsetpos + height))
        {
            draw_header(cr, gt_str_get(arguments->text), argv[parsed_args+1],
                        seqid, num_pages, mm_to_pt(arguments->pwidth),
                        mm_to_pt(arguments->pheight),
                        arguments->theight);
          cairo_show_page(cr);
          offsetpos = TEXT_SPACER + arguments->theight + TEXT_SPACER;
          num_pages++;
        }
        canvas = gt_canvas_cairo_context_new(sty,
                                             cr,
                                             offsetpos,
                                             mm_to_pt(arguments->pwidth),
                                             height,
                                             NULL,
                                             err);
        if (!canvas)
          had_err = -1;
        offsetpos += height;
        if (!had_err)
          had_err = gt_layout_sketch(l, canvas, err);
      }
      gt_canvas_delete(canvas);
      gt_layout_delete(l);
      gt_diagram_delete(d);
      if (ct)
        gt_custom_track_delete(ct);
    }
    draw_header(cr, gt_str_get(arguments->text), argv[parsed_args+1], seqid,
                num_pages, mm_to_pt(arguments->pwidth),
                mm_to_pt(arguments->pheight),
                arguments->theight);
    cairo_show_page(cr);
    num_pages++;
    gt_log_log("finished, should be %lu pages\n", num_pages);
    gt_text_width_calculator_delete(twc);
    cairo_destroy(cr);
    cairo_surface_flush(surf);
    cairo_surface_finish(surf);
    cairo_surface_destroy(surf);
    cairo_debug_reset_static_data();
    if (bioseq)
      gt_bioseq_delete(bioseq);
    gt_style_delete(sty);
    gt_str_delete(gt_style_file);
    gt_feature_index_delete(features);
  }
  return had_err;
}