Ejemplo n.º 1
0
/* Appends the leading columns of a GFF3 line for <fn> to <outstr>: sequence
   ID, source, type, start, end, score, strand and phase. Every column,
   including the last one, is terminated by a tab character. */
void gt_gff3_output_leading_str(GtFeatureNode *fn, GtStr *outstr)
{
  GtGenomeNode *node;
  gt_assert(fn && outstr);
  node = (GtGenomeNode*) fn;

  /* sequence ID */
  gt_str_append_str(outstr, gt_genome_node_get_seqid(node));
  gt_str_append_char(outstr, '\t');

  /* source */
  gt_str_append_cstr(outstr, gt_feature_node_get_source(fn));
  gt_str_append_char(outstr, '\t');

  /* type */
  gt_str_append_cstr(outstr, gt_feature_node_get_type(fn));
  gt_str_append_char(outstr, '\t');

  /* start and end position */
  gt_str_append_uword(outstr, gt_genome_node_get_start(node));
  gt_str_append_char(outstr, '\t');
  gt_str_append_uword(outstr, gt_genome_node_get_end(node));
  gt_str_append_char(outstr, '\t');

  /* score: '.' if undefined, otherwise three significant digits */
  if (!gt_feature_node_score_is_defined(fn)) {
    gt_str_append_char(outstr, '.');
  }
  else {
    char score[BUFSIZ];
    (void) snprintf(score, sizeof score, "%.3g",
                    gt_feature_node_get_score(fn));
    gt_str_append_cstr(outstr, score);
  }
  gt_str_append_char(outstr, '\t');

  /* strand */
  gt_str_append_char(outstr, GT_STRAND_CHARS[gt_feature_node_get_strand(fn)]);
  gt_str_append_char(outstr, '\t');

  /* phase */
  gt_str_append_char(outstr, GT_PHASE_CHARS[gt_feature_node_get_phase(fn)]);
  gt_str_append_char(outstr, '\t');
}
Ejemplo n.º 2
0
/* Builds the string stored in <sa>->gff3_target_attribute, which must not be
   set yet. The result has the form
     [<md5 prefix><ref_md5>:]<escaped ref_id> <start> <end> <strand>
   where the bracketed part is only written if <md5ids> is true (in which
   case <sa>->ref_md5 must be set). */
static void set_gff3_target_attribute(GthSA *sa, bool md5ids)
{
  GtStr *target;
  gt_assert(sa && !sa->gff3_target_attribute);

  target = gt_str_new();
  sa->gff3_target_attribute = target;

  /* optional MD5 based sequence ID prefix */
  if (md5ids) {
    gt_assert(sa->ref_md5);
    gt_str_append_cstr(target, GT_MD5_SEQID_PREFIX);
    gt_str_append_str(target, sa->ref_md5);
    gt_str_append_char(target, ':');
  }

  /* escaped reference ID */
  gt_gff3_escape(target, gt_str_get(sa->ref_id), gt_str_length(sa->ref_id));
  gt_str_append_char(target, ' ');

  /* start position, shifted by one
     XXX: use reference dpstartpos */
  gt_str_append_uword(target, gth_sa_referencecutoff_start(sa) + 1);
  gt_str_append_char(target, ' ');

  /* end position: total reference length minus the end cutoff (XXX) */
  gt_str_append_uword(target,
                      gth_sa_ref_total_length(sa) -
                      gth_sa_referencecutoff_end(sa));
  gt_str_append_char(target, ' ');

  /* strand of the reference */
  gt_str_append_char(target,
                     GT_STRAND_CHARS[sa->ref_strand_forward
                                     ? GT_STRAND_FORWARD
                                     : GT_STRAND_REVERSE]);
}
Ejemplo n.º 3
0
/* Passes a progress message of the form
     "process file <filenum + 1>/<numoffiles>: <filename>"
   to the callback <showverbose>. <filenum> is zero-based and is shown
   one-based. */
static void show_parse_file_status(GthShowVerbose showverbose,
                                   GtUword filenum,
                                   GtUword numoffiles,
                                   const char *filename)
{
  GtStr *msg = gt_str_new();
  gt_str_append_cstr(msg, "process file ");
  gt_str_append_uword(msg, filenum + 1);
  /* The delimiter means "file i of n" and is not part of a path, so a
     literal '/' is used instead of GT_PATH_SEPARATOR.
     NOTE(review): assumes GT_PATH_SEPARATOR differs from '/' on some
     platforms; where it is '/', the message is unchanged. */
  gt_str_append_char(msg, '/');
  gt_str_append_uword(msg, numoffiles);
  gt_str_append_cstr(msg, ": ");
  gt_str_append_cstr(msg, filename);
  showverbose(gt_str_get(msg));
  gt_str_delete(msg);
}
Ejemplo n.º 4
0
/* Appends a textual representation of <model> to <str> as a nested table
   named <model_cstr>. The table starts with the fields hypothesis_num,
   window_size_left and window_size_right, followed by one sub-table per
   hypothesis. Each of these holds WINSIZE + 2 sub-tables of 4 x 4 values,
   written with precision BSSM_PRECISION. Only models with 2 or 7 hypotheses
   are supported. */
static void write_model(GtStr *str, const char *model_cstr,
                        const GthBSSMModel *model)
{
  GtUword hyp, pos, row, col;
  bool two_hypotheses;

  gt_assert(str && model_cstr && model);
  gt_assert(model->hypothesis_num == 2 || model->hypothesis_num == 7);

  /* decides which of the two hypothesis tables is read below */
  two_hypotheses = (model->hypothesis_num == 2);

  /* table header and scalar fields */
  gt_str_append_cstr(str, "  ");
  gt_str_append_cstr(str, model_cstr);
  gt_str_append_cstr(str, " = {\n");

  gt_str_append_cstr(str, "    hypothesis_num = ");
  gt_str_append_uword(str, model->hypothesis_num);
  gt_str_append_cstr(str, ",\n");

  gt_str_append_cstr(str, "    window_size_left = ");
  gt_str_append_uword(str, model->window_size_left);
  gt_str_append_cstr(str, ",\n");

  gt_str_append_cstr(str, "    window_size_right = ");
  gt_str_append_uword(str, model->window_size_right);
  gt_str_append_cstr(str, ",\n");

  /* the hypothesis tables */
  for (hyp = 0; hyp < model->hypothesis_num; hyp++) {
    gt_str_append_cstr(str, "    {\n");
    for (pos = 0; pos < WINSIZE + 2; pos++) {
      gt_str_append_cstr(str, "      {\n");
      for (row = 0; row < 4; row++) {
        gt_str_append_cstr(str, "        { ");
        for (col = 0; col < 4; col++) {
          /* values within a row are separated by ", " */
          if (col > 0)
            gt_str_append_cstr(str, ", ");
          gt_str_append_double(str,
                               two_hypotheses
                               ? model->hypotables.hypo2table[hyp][pos][row][col]
                               : model->hypotables.hypo7table[hyp][pos][row][col],
                               BSSM_PRECISION);
        }
        gt_str_append_cstr(str, " },\n");
      }
      gt_str_append_cstr(str, "      },\n");
    }
    gt_str_append_cstr(str, "    },\n");
  }
  gt_str_append_cstr(str, "  }");
}
Ejemplo n.º 5
0
/* Calls remove_indexfile() for every index file suffix handled here, using
   the base name "stdin.<bs as unsigned integer>", i.e. the name that is
   derived from the address of <bs>. */
static void remove_bioseq_files(GtBioseq *bs)
{
  GtStr *base;
  char *basename_cstr;

  base = gt_str_new_cstr("stdin.");
  gt_str_append_uword(base, (GtUword) bs);
  basename_cstr = gt_str_get(base);

  remove_indexfile(GT_ENCSEQFILESUFFIX, basename_cstr);
  remove_indexfile(GT_DESTABFILESUFFIX, basename_cstr);
  remove_indexfile(GT_SSPTABFILESUFFIX, basename_cstr);
  remove_indexfile(GT_SDSTABFILESUFFIX, basename_cstr);
  remove_indexfile(GT_MD5TABFILESUFFIX, basename_cstr);
  remove_indexfile(GT_OISTABFILESUFFIX, basename_cstr);

  gt_str_delete(base);
}
Ejemplo n.º 6
0
/* Writes all unique and link elements of <condenseq> as GFF3 features to the
   file "<basefilename>.gff3".

   Uniques get the ID "U<idx>" and the name "unique<idx>", links get the ID
   "L<idx>", the name "link<idx>" and a Derives_from attribute that refers to
   the unique "U<unique_id>". Coordinates are written one-based and relative
   to the start of the sequence containing the element; the description of
   that sequence is used as sequence ID.

   Returns 0 on success. Returns -1 if the output file cannot be opened
   (gt_file_new() is expected to have set <err> in that case), or the error
   code of gt_genome_node_accept() if writing a feature fails. */
int gt_condenseq_output_to_gff3(const GtCondenseq *condenseq,
                                GtError *err)
{
  int had_err = 0;
  GtUword idx,
          name_len,
          seqnum = 0, seqstart = 0, seqend = 0,
          desclen;
  GtStr *filename = NULL,
        *id = NULL,
        *name = NULL,
        *parent_unique = NULL,
        *seqid = NULL,
        *source = NULL;
  GtFile *outfile = NULL;
  GtGFF3Visitor *gffv = NULL;
  GtNodeVisitor *nodev = NULL;
  GtFeatureNode *fnode = NULL;
  GtGenomeNode *node = NULL;
  GtRange range;

  gt_assert(condenseq != NULL);

  filename = gt_str_new_cstr(gt_condenseq_basefilename(condenseq));
  gt_str_append_cstr(filename, ".gff3");
  outfile = gt_file_new(gt_str_get(filename), "w", err);
  if (outfile == NULL) {
    /* do not hand a NULL file to the visitor and report success */
    gt_str_delete(filename);
    return -1;
  }

  id = gt_str_new_cstr("U");
  name = gt_str_new_cstr("unique");
  parent_unique = gt_str_new_cstr("U");
  seqid = gt_str_new();
  source = gt_str_new_cstr("Condenseq");
  name_len = gt_str_length(name);

  nodev = gt_gff3_visitor_new(outfile);
  gffv = (GtGFF3Visitor *) nodev;
  gt_gff3_visitor_retain_id_attributes(gffv);

  /* placeholder node, replaced as soon as the first sequence is entered */
  node = gt_feature_node_new(seqid, "experimental_feature", (GtUword) 1,
                             (GtUword) 1, GT_STRAND_BOTH);
  fnode = (GtFeatureNode*) node;
  gt_feature_node_set_source(fnode, source);

  /* first pass: unique elements */
  for (idx = 0; !had_err && idx < condenseq->udb_nelems; ++idx) {
    GtCondenseqUnique uq = condenseq->uniques[idx];
    if (seqend <= uq.orig_startpos) {
      /* element starts at or behind the end of the current sequence:
         look up its sequence and create a node with the matching seqid */
      const char *desc;
      gt_genome_node_delete(node);
      seqnum = gt_condenseq_pos2seqnum(condenseq, uq.orig_startpos);
      seqstart = gt_condenseq_seqstartpos(condenseq, seqnum);
      seqend = seqstart + condenseq_seqlength_help(condenseq, seqnum, seqstart);
      desc = gt_condenseq_description(condenseq, &desclen, seqnum);
      gt_str_reset(seqid);
      gt_str_append_cstr_nt(seqid, desc, desclen);
      node = gt_feature_node_new(seqid, "experimental_feature", (GtUword) 1,
                                 (GtUword) 1, GT_STRAND_BOTH);
      fnode = (GtFeatureNode*) node;
      gt_feature_node_set_source(fnode, source);
    }
    /* cut name and ID back to their prefix and append the current index */
    gt_str_set_length(name, name_len);
    gt_str_append_uword(name, idx);
    gt_str_set_length(id, (GtUword) 1);
    gt_str_append_uword(id, idx);
    gt_feature_node_set_attribute(fnode, "Name", gt_str_get(name));
    gt_feature_node_set_attribute(fnode, "ID", gt_str_get(id));
    /* 1 Based coordinates! */
    range.start = uq.orig_startpos + 1 - seqstart;
    range.end = uq.orig_startpos + uq.len - seqstart;
    gt_genome_node_set_range(node, &range);
    had_err = gt_genome_node_accept(node, nodev, err);
  }

  /* second pass: link elements, with "link"/"L" as name/ID prefix */
  gt_str_reset(name);
  gt_str_append_cstr(name, "link");
  gt_str_reset(id);
  gt_str_append_cstr(id, "L");
  name_len = gt_str_length(name);
  seqend = 0;
  for (idx = 0; !had_err && idx < condenseq->ldb_nelems; ++idx) {
    GtCondenseqLink link = condenseq->links[idx];
    if (seqend <= link.orig_startpos) {
      const char *desc;
      gt_genome_node_delete(node);
      seqnum = gt_condenseq_pos2seqnum(condenseq, link.orig_startpos);
      seqstart = gt_condenseq_seqstartpos(condenseq, seqnum);
      seqend = seqstart + condenseq_seqlength_help(condenseq, seqnum, seqstart);
      desc = gt_condenseq_description(condenseq, &desclen, seqnum);
      gt_str_reset(seqid);
      gt_str_append_cstr_nt(seqid, desc, desclen);
      node = gt_feature_node_new(seqid, "experimental_feature", (GtUword) 1,
                                 (GtUword) 1, GT_STRAND_BOTH);
      fnode = (GtFeatureNode*) node;
      gt_feature_node_set_source(fnode, source);
    }
    gt_str_set_length(name, name_len);
    gt_str_append_uword(name, idx);
    gt_str_set_length(id, (GtUword) 1);
    gt_str_append_uword(id, idx);
    gt_feature_node_set_attribute(fnode, "Name", gt_str_get(name));
    gt_feature_node_set_attribute(fnode, "ID", gt_str_get(id));
    /* refer to the unique this link was derived from */
    gt_str_set_length(parent_unique, (GtUword) 1);
    gt_str_append_uword(parent_unique, link.unique_id);
    gt_feature_node_set_attribute(fnode, "Derives_from",
                                  gt_str_get(parent_unique));
    /* 1 Based coordinates! */
    range.start = link.orig_startpos + 1 - seqstart;
    range.end = link.orig_startpos + link.len - seqstart;
    gt_genome_node_set_range(node, &range);
    had_err = gt_genome_node_accept(node, nodev, err);
  }

  gt_file_delete(outfile);
  gt_genome_node_delete(node);
  gt_node_visitor_delete(nodev);
  gt_str_delete(filename);
  gt_str_delete(id);
  gt_str_delete(name);
  gt_str_delete(parent_unique);
  gt_str_delete(seqid);
  gt_str_delete(source);
  return had_err;
}
Ejemplo n.º 7
0
/* Makes sure that a sequence region node exists in <sequence_regions> for
   the genomic sequence <seqnum> of file <filenum> in <input> and registers
   the sequence ID used for it in the seqid store of <srf>.

   The range of the region is the substring given by the user (if any) or
   the relative genomic range of the sequence. It is shifted by the start of
   the range parsed from the sequence description if <srf>->use_desc_ranges
   is set and parsing succeeds, otherwise by 1.

   If <sequenceid> is empty, or has been used before while no description
   offset was parsed, an ID of the form "<file basename>|<seqnum + 1>" is
   made up. Otherwise <sequenceid> is used: a new region is created for an
   unused ID, and the range of the existing region is extended for an ID
   that is already in use. */
static void make_sequence_region(GtHashmap *sequence_regions,
                                 GtStr *sequenceid,
                                 GthRegionFactory *srf,
                                 GthInput *input,
                                 GtUword filenum,
                                 GtUword seqnum)
{
    bool offset_is_defined = false; /* a flag, hence bool and not GtUword */
    GtRange range, descrange;
    GtGenomeNode *sr = NULL;
    gt_assert(sequence_regions && sequenceid && srf && input);

    /* determine the (still zero-based) range of the region */
    if (gth_input_use_substring_spec(input)) {
        range.start = gth_input_genomic_substring_from(input);
        range.end   = gth_input_genomic_substring_to(input);
    }
    else {
        range = gth_input_get_relative_genomic_range(input, filenum, seqnum);
    }

    /* try to parse an offset from the sequence description; a return value
       of 0 is treated as success and means that <descrange> is usable */
    if (srf->use_desc_ranges) {
        GtStr *description = gt_str_new();
        gth_input_get_genomic_description(input, description, filenum, seqnum);
        if (!gt_parse_description_range(gt_str_get(description), &descrange))
            offset_is_defined = true;
        gt_str_delete(description);
    }
    if (offset_is_defined)
        range = gt_range_offset(&range, descrange.start);
    else
        range = gt_range_offset(&range, 1); /* 1-based */

    if (!gt_str_length(sequenceid) ||
            (gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid)) &&
             !offset_is_defined)) {
        /* sequenceid is empty or exists already (and no offset has been parsed)
           -> make one up */
        GtStr *seqid;
        char *base;
        base = gt_basename(gth_input_get_genomic_filename(input, filenum));
        seqid = gt_str_new_cstr(base);
        gt_free(base);
        gt_str_append_char(seqid, '|');
        gt_str_append_uword(seqid, seqnum + 1); /* 1-based */
        seqid_store_add(srf->seqid_store, filenum, seqnum, seqid, GT_UNDEF_UWORD);
        gt_assert(!gt_cstr_table_get(srf->used_seqids, gt_str_get(seqid)));
        gt_cstr_table_add(srf->used_seqids, gt_str_get(seqid));
        sr = gt_region_node_new(seqid_store_get(srf->seqid_store, filenum, seqnum),
                                range.start, range.end);
        /* the hashmap key is the string held by the table of used IDs, not
           the local <seqid>, which is deleted below */
        gt_hashmap_add(sequence_regions,
                       (void*) gt_cstr_table_get(srf->used_seqids,
                               gt_str_get(seqid)),
                       sr);
        gt_str_delete(seqid);
    }
    else {
        /* sequenceid does not exist already (or an offset has been parsed)
           -> use this one */
        if (!gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid))) {
            /* no sequence region with this id exists -> create one */
            gt_cstr_table_add(srf->used_seqids, gt_str_get(sequenceid));
            seqid_store_add(srf->seqid_store, filenum, seqnum, sequenceid,
                            offset_is_defined ? descrange.start : GT_UNDEF_UWORD);
            sr = gt_region_node_new(seqid_store_get(srf->seqid_store, filenum,
                                                    seqnum), range.start, range.end);
            gt_hashmap_add(sequence_regions,
                           (void*) gt_cstr_table_get(srf->used_seqids,
                                   gt_str_get(sequenceid)),
                           sr);
        }
        else {
            GtRange prev_range, new_range;
            /* sequence region with this id exists already -> modify range */
            sr = gt_hashmap_get(sequence_regions, gt_str_get(sequenceid));
            gt_assert(sr);
            prev_range = gt_genome_node_get_range(sr);
            new_range = gt_range_join(&prev_range, &range);
            gt_genome_node_set_range(sr, &new_range);
            seqid_store_add(srf->seqid_store, filenum, seqnum, sequenceid,
                            offset_is_defined ? descrange.start : GT_UNDEF_UWORD);
        }
    }
    gt_assert(sr);
}
/* Reads the table file <table_filename> line by line and runs fine searches
   on the sequences collected for the hits listed in it.

   Lines starting with '#' are skipped. Every other line is split at blanks
   and must consist of exactly 23 columns; column 0 is parsed as an unsigned
   target number, column 3 is the name of the query. The target number is
   passed to gt_condenseq_each_redundant_seq(), which collects sequences in a
   tree via hmmsearch_process_seq. Whenever <arguments>->max_queries
   different consecutive queries have been seen, a fine FASTA file is created
   from the collected sequences, a fine search is run on it, and the
   collection is cleared. A final fine search is run after the last line.

   If <arguments>->outtable_filename is not empty, a running file number is
   appended to it for every fine search.

   Returns 0 on success and a negative value on error: a line with a wrong
   number of columns, an unparsable target number, a return value of 0 from
   gt_condenseq_each_redundant_seq(), or a failing fine search. */
static int hmmsearch_process_coarse_hits(
                                       char *table_filename,
                                       GtCondenseq *ces,
                                       GtCondenseqHmmsearchArguments *arguments,
                                       GtLogger *logger,
                                       GtError *err) {
  int had_err = 0;
  GtStr *line = gt_str_new();
  FILE *table = NULL;
  GtSplitter *splitter = gt_splitter_new();
  GtStr *query = gt_str_new(),
        *fine_fasta_filename = gt_str_new_cstr("condenseq");
  GtRBTree *sequences = NULL;
  GtUword filecount = (GtUword) 1;
  unsigned int querycount = 0;
  /* original lengths, used to cut off what gets appended per fine search */
  const GtUword fine_fasta_name_length = gt_str_length(fine_fasta_filename);
  const GtUword table_name_length = gt_str_length(arguments->outtable_filename);

  table = gt_xfopen(table_filename, "r");

  sequences = gt_rbtree_new(hmmsearch_cmp_seqnum,
                            hmmsearch_tree_free_node, NULL);

  while (!had_err && gt_str_read_next_line(line, table) == 0) {
    char *c_line = gt_str_get(line);
    GtUword uid = 0;
    const GtUword target_column = 0,
          query_column = (GtUword) 3,
          expected_columns = (GtUword) 23;

    if (c_line[0] != '#') {
      gt_splitter_split_non_empty(splitter, c_line, gt_str_length(line), ' ');
      if (gt_splitter_size(splitter) != expected_columns) {
        /* The table is external input: report malformed lines as an error
           instead of asserting, so that no missing token is accessed when
           assertions are disabled. */
        gt_error_set(err, "malformed table line: expected " GT_WU
                     " columns, found " GT_WU, expected_columns,
                     gt_splitter_size(splitter));
        had_err = -1;
      }
      else {
        if (sscanf(gt_splitter_get_token(splitter, target_column),
                   GT_WU, &uid) != 1) {
          gt_error_set(err, "couldn't parse target number: %s",
                       gt_splitter_get_token(splitter, target_column));
          had_err = -1;
        }
        /* a query name different from the previous one starts a new query */
        if (gt_str_length(query) == 0 ||
            strcmp(gt_str_get(query),
                   gt_splitter_get_token(splitter, query_column)) != 0) {
          gt_str_set(query, gt_splitter_get_token(splitter, query_column));
          gt_logger_log(logger, "new query: %s", gt_str_get(query));
          querycount++;
        }
        if (!had_err && querycount == arguments->max_queries) {
          /* enough queries collected: search what has been gathered so far,
             then start over with an empty collection */
          hmmsearch_create_fine_fas(fine_fasta_filename, sequences, ces);
          if (table_name_length != 0)
            gt_str_append_uword(arguments->outtable_filename, filecount++);
          had_err =
            hmmsearch_call_fine_search(table_name_length != 0 ?
                                         arguments->outtable_filename :
                                         NULL,
                                       gt_str_get(fine_fasta_filename),
                                       gt_str_get(arguments->hmmsearch_path),
                                       gt_str_get(arguments->hmm),
                                       logger, err);
          gt_rbtree_clear(sequences);
          gt_str_set_length(fine_fasta_filename, fine_fasta_name_length);
          if (table_name_length != 0)
            gt_str_set_length(arguments->outtable_filename, table_name_length);
          querycount = 0;
        }
        if (!had_err) {
          /* a return value of 0 is treated as failure */
          if (gt_condenseq_each_redundant_seq(ces, uid,
                                              hmmsearch_process_seq,
                                              sequences, err) == 0) {
            had_err = -1;
          }
        }
      }
      gt_splitter_reset(splitter);
    }
    gt_str_reset(line);
  }
  gt_splitter_delete(splitter);
  gt_str_delete(line);
  gt_str_delete(query);
  gt_xfclose(table);

  /* final fine search on the sequences collected since the last one */
  if (!had_err) {
    hmmsearch_create_fine_fas(fine_fasta_filename, sequences, ces);
    if (table_name_length != 0)
      gt_str_append_uword(arguments->outtable_filename, filecount++);
    had_err =
      hmmsearch_call_fine_search(table_name_length != 0 ?
                                 arguments->outtable_filename :
                                 NULL,
                                 gt_str_get(fine_fasta_filename),
                                 gt_str_get(arguments->hmmsearch_path),
                                 gt_str_get(arguments->hmm),
                                 logger, err);
  }
  gt_log_log("created " GT_WU " files", filecount);
  gt_rbtree_delete(sequences);
  gt_str_delete(fine_fasta_filename);
  return had_err;
}
Ejemplo n.º 9
0
/* Formats a given position number for short display in the ruler.

   Writes at most <buflen> bytes (including the terminating '\0') to <txt>.
   Absolute values below 1000 are written as a blank, an optional '-', the
   number and <unitstr>. Larger absolute values are divided by 10^3, 10^6 or
   10^9 and written as an optional '-', the fixed-point quotient, the prefix
   'k', 'M' or 'G', and <unitstr>. The number of decimals is
   floor(log10(|pos|)) minus the number of trailing decimal zeros of |pos|,
   e.g. 1500 gives "1.5k" and 15000 gives "15.0k". */
void gt_format_ruler_label(char *txt, GtWord pos,
                           const char *unitstr, size_t buflen)
{
  const char *sign = "";
  char prefix = '\0';
  int precision = 0;
  GtUword upos, rest, divisor = 0;
  gt_assert(txt);

  if (pos < 0)
  {
    /* negate in unsigned arithmetic, which is also well-defined for the
       most negative GtWord (where -pos would overflow) */
    upos = (GtUword) 0 - (GtUword) pos;
    sign = "-";
  }
  else
    upos = (GtUword) pos;

  /* choose the largest unit prefix that fits */
  if (upos >= 1000000000)
  {
    divisor = 1000000000;
    prefix = 'G';
  }
  else if (upos >= 1000000)
  {
    divisor = 1000000;
    prefix = 'M';
  }
  else if (upos >= 1000)
  {
    divisor = 1000;
    prefix = 'k';
  }

  if (divisor == 0)
  {
    /* small values are shown as plain integers */
    /*@ignore@*/
    (void) snprintf(txt, buflen, " %s"GT_WU"%s", sign, upos, unitstr);
    /*@end@*/
    return;
  }

  /* floor(log10(upos)), computed exactly on integers; upos >= 1000 here, so
     no logarithm of 0 is ever taken */
  for (rest = upos; rest >= 10; rest /= 10)
    precision++;
  /* every trailing zero saves one decimal; this cannot drop below 0, since
     a number has fewer trailing zeros than digits */
  for (rest = upos; rest % 10 == 0; rest /= 10)
    precision--;

  (void) snprintf(txt, buflen, "%s%.*f%c%s", sign, precision,
                  (double) upos / (double) divisor, prefix, unitstr);
}
Ejemplo n.º 10
0
/* Loads the encoded sequence of <bs> into <bs>->encseq, which must not be
   set yet. The index files are (re)constructed first if <recreate> is true,
   if the sequence comes from stdin, if one of the checked index files does
   not exist, or if the sequence file is newer than the encoded sequence
   file. For stdin, the base name "stdin.<bs as unsigned integer>" is used,
   otherwise the name of the sequence file.
   Returns 0 on success; on failure the error code of
   construct_bioseq_files() is returned, or -1 if loading fails. */
static int bioseq_fill(GtBioseq *bs, bool recreate, GtError *err)
{
  int had_err = 0;
  bool rebuild;
  GtStr *base,
        *esq_file,
        *ois_file,
        *sds_file,
        *md5_file,
        *des_file;

  gt_assert(!bs->encseq);

  if (!bs->use_stdin)
    base = bs->sequence_file;
  else {
    /* assign a unique name */
    base = gt_str_new_cstr("stdin.");
    gt_str_append_uword(base, (GtUword) bs);
  }

  /* construct file names */
  esq_file = gt_str_clone(base);
  gt_str_append_cstr(esq_file, GT_ENCSEQFILESUFFIX);
  ois_file = gt_str_clone(base);
  gt_str_append_cstr(ois_file, GT_OISTABFILESUFFIX);
  sds_file = gt_str_clone(base);
  gt_str_append_cstr(sds_file, GT_SDSTABFILESUFFIX);
  md5_file = gt_str_clone(base);
  gt_str_append_cstr(md5_file, GT_MD5TABFILESUFFIX);
  des_file = gt_str_clone(base);
  gt_str_append_cstr(des_file, GT_DESTABFILESUFFIX);

  /* The order of the conditions matters: for stdin the evaluation stops at
     <bs>->use_stdin, so the sequence file is never inspected. */
  rebuild = recreate || bs->use_stdin ||
            !gt_file_exists(gt_str_get(esq_file)) ||
            !gt_file_exists(gt_str_get(ois_file)) ||
            !gt_file_exists(gt_str_get(sds_file)) ||
            !gt_file_exists(gt_str_get(md5_file)) ||
            !gt_file_exists(gt_str_get(des_file)) ||
            gt_file_is_newer(gt_str_get(bs->sequence_file),
                             gt_str_get(esq_file));

  /* construct the bioseq files if necessary */
  if (rebuild)
    had_err = construct_bioseq_files(bs, base, err);

  /* load the encoded sequence with all required tables */
  if (!had_err) {
    GtEncseqLoader *loader = gt_encseq_loader_new();
    gt_encseq_loader_disable_autosupport(loader);
    gt_encseq_loader_require_lossless_support(loader);
    gt_encseq_loader_require_description_support(loader);
    gt_encseq_loader_require_md5_support(loader);
    gt_encseq_loader_require_multiseq_support(loader);
    bs->encseq = gt_encseq_loader_load(loader, gt_str_get(base), err);
    if (bs->encseq == NULL) {
      had_err = -1;
      gt_assert(gt_error_is_set(err));
    }
    gt_encseq_loader_delete(loader);
  }
  if (!had_err) {
    gt_assert(bs->encseq);
  }

  /* free; the base name is only owned here if it was made up for stdin */
  gt_str_delete(des_file);
  gt_str_delete(sds_file);
  gt_str_delete(md5_file);
  gt_str_delete(ois_file);
  gt_str_delete(esq_file);
  if (bs->use_stdin)
    gt_str_delete(base);

  return had_err;
}