/* Decodes the complete sequence with number <id> and returns the buffer
   handed back by gt_condenseq_extract_decoded_range(). The number of
   characters in that sequence is stored in <*length>. */
const char *gt_condenseq_extract_decoded(GtCondenseq *condenseq,
                                         GtUword *length,
                                         GtUword id)
{
  GtRange seq_range;
  const GtUword first_pos = gt_condenseq_seqstartpos(condenseq, id);
  /* Every sequence but the last one is followed by a separator: its final
     character therefore sits two positions before the start of the next
     sequence. The last sequence simply runs to the end of the original
     data. */
  const GtUword last_pos = (id < condenseq->orig_num_seq - 1)
                             ? gt_condenseq_seqstartpos(condenseq, id + 1) - 2
                             : condenseq->orig_length - 1;

  seq_range.start = first_pos;
  seq_range.end = last_pos;
  *length = last_pos - first_pos + 1;
  return gt_condenseq_extract_decoded_range(condenseq, seq_range, '\0');
}
/* Translates <urange>, given relative to the start of the unique element
   <uid>, into coordinates relative to the start of the sequence that contains
   this unique. <urange> is modified in place. Returns the number of that
   sequence. */
GtUword gt_condenseq_unique_range_to_seqrange(GtCondenseq *condenseq,
                                              GtUword uid,
                                              GtRange *urange)
{
  const GtCondenseqUnique *unique;
  GtUword containing_seq,
          offset_in_seq;

  gt_assert(condenseq != NULL);
  gt_assert(uid < condenseq->udb_nelems);

  unique = &condenseq->uniques[uid];
  containing_seq = gt_condenseq_pos2seqnum(condenseq, unique->orig_startpos);
  /* distance of the unique's first position from the start of its sequence */
  offset_in_seq = unique->orig_startpos -
                  gt_condenseq_seqstartpos(condenseq, containing_seq);

  urange->start += offset_in_seq;
  urange->end += offset_in_seq;
  return containing_seq;
}
/* Calls <callback> once for the range <urange> of the unique element <uid>
   itself and once for every link of that unique whose span on the unique
   overlaps <urange>.

   <urange> is relative to the start of the unique. Each range handed to
   <callback> is expressed in absolute positions of the original data, widened
   by <left_extend> and <right_extend> and clamped to the first and last
   position of the sequence that contains it. <callback> additionally receives
   <callback_data>, the number of that sequence and <err>.

   Iteration stops as soon as <callback> returns a non-zero value. Returns the
   number of ranges passed to <callback>, or 0 if any call to <callback>
   reported an error. */
GtUword gt_condenseq_each_redundant_range(
                                      GtCondenseq *condenseq,
                                      GtUword uid,
                                      GtRange urange,
                                      GtUword left_extend,
                                      GtUword right_extend,
                                      GtCondenseqProcessExtractedRange callback,
                                      void *callback_data,
                                      GtError *err)
{
  int had_err = 0;
  /* starts at 1: the unique itself is always reported */
  GtUword num_ranges = (GtUword) 1,
          linkidx,
          orig_seqnum,
          orig_seqstart,
          orig_seqend;
  const GtCondenseqUnique *unique;
  GtRange extract;

  gt_assert(condenseq != NULL);
  gt_assert(uid < condenseq->udb_nelems);

  unique = &condenseq->uniques[uid];

  /* handle unique itself */
  orig_seqnum = gt_condenseq_pos2seqnum(condenseq, unique->orig_startpos);
  orig_seqstart = gt_condenseq_seqstartpos(condenseq, orig_seqnum);
  /* last position belonging to the containing sequence */
  orig_seqend = orig_seqstart + condenseq_seqlength_help(condenseq, orig_seqnum,
                                                         orig_seqstart) - 1;
  /* absolute start of the requested range ... */
  extract.start = unique->orig_startpos + urange.start;
  /* ... moved left by left_extend, guarding against unsigned underflow ... */
  extract.start = extract.start < left_extend ?
    0 : extract.start - left_extend;
  /* ... but never before the first position of the sequence */
  extract.start = extract.start < orig_seqstart ?
    orig_seqstart : extract.start;
  /* absolute end, moved right by right_extend and clamped to the sequence */
  extract.end = unique->orig_startpos + urange.end + right_extend;
  extract.end = extract.end > orig_seqend ?
    orig_seqend : extract.end;

  gt_assert(extract.start <= extract.end);
  had_err = callback(callback_data, orig_seqnum, extract, err);

  /* visit every link registered for this unique; links.spaceuint32_t holds
     indices into condenseq->links */
  for (linkidx = 0;
       !had_err && linkidx < unique->links.nextfreeuint32_t;
       ++linkidx) {
    const GtCondenseqLink *link =
      &condenseq->links[unique->links.spaceuint32_t[linkidx]];

    /* Only links whose span [unique_offset, unique_offset + len - 1] on the
       unique overlaps urange are reported.
       The second part of the condition is a little heuristic: the len of the
       link could be completely comprised of insertions, which would place it
       downstream of urange.start. But we assume ~ similar lengths for links
       and their unique counterpart */
    if (!(urange.end < link->unique_offset ||
        urange.start > link->unique_offset + link->len - 1)) {
      GtUword shift;
      orig_seqnum = gt_condenseq_pos2seqnum(condenseq, link->orig_startpos);
      orig_seqstart = gt_condenseq_seqstartpos(condenseq, orig_seqnum);
      orig_seqend = orig_seqstart + condenseq_seqlength_help(condenseq,
                                                             orig_seqnum,
                                                             orig_seqstart) - 1;
      /* start from the link's first position, widened by left_extend without
         underflowing */
      extract.start = link->orig_startpos < left_extend ?
        0 : link->orig_startpos - left_extend;
      /* move the start by the distance between urange.start and the position
         on the unique where the link begins */
      if (urange.start < link->unique_offset) {
        /* urange begins before the link's span: move further left */
        shift = link->unique_offset - urange.start;
        extract.start = extract.start < shift ?
          0 : extract.start - shift;
      }
      else {
        /* urange begins inside the link's span: move right */
        shift = urange.start - link->unique_offset;
        extract.start += shift;
      }
      /* never before the first position of the link's sequence */
      extract.start = extract.start < orig_seqstart ?
        orig_seqstart : extract.start;
      /* see heuristic note above */
      extract.end = link->orig_startpos + right_extend + link->len;
      /* move the end by the distance between urange.end and the last position
         of the link's span on the unique */
      if (urange.end < link->unique_offset + link->len - 1) {
        /* urange ends inside the link's span: move left */
        shift = (link->unique_offset + link->len - 1) - urange.end;
        extract.end = extract.end < shift ?
          0 : extract.end - shift;
      }
      else {
        /* urange ends at or behind the link's span: move right */
        shift = urange.end - (link->unique_offset + link->len - 1);
        extract.end += shift;
      }
      /* never behind the last position of the link's sequence */
      extract.end = extract.end > orig_seqend ?
        orig_seqend : extract.end;
      gt_assert(extract.start <= extract.end);
      had_err = callback(callback_data, orig_seqnum, extract, err);
      /* counted even if the callback failed; the count is discarded below in
         that case */
      num_ranges++;
    }
  }

  if (!had_err)
    return num_ranges;
  return 0;
}
/* Writes every unique and every link element of <condenseq> as a GFF3 feature
   to the file "<basefilename>.gff3", where <basefilename> is the value of
   gt_condenseq_basefilename().

   All features have type "experimental_feature", source "Condenseq" and
   strand GT_STRAND_BOTH; the sequence description is used as seqid. Uniques
   are named "unique<idx>" with ID "U<idx>", links are named "link<idx>" with
   ID "L<idx>" and carry a "Derives_from" attribute referring to the ID of
   their unique. Feature ranges are 1-based and relative to the start of the
   containing sequence.

   Returns 0 on success. Returns a non-zero value if the output file could not
   be opened or a feature could not be written; <err> is expected to be set by
   the failing call in that case. */
int gt_condenseq_output_to_gff3(const GtCondenseq *condenseq,
                                GtError *err)
{
  int had_err = 0;
  GtUword idx,
          name_len,
          seqnum = 0, seqstart = 0, seqend = 0,
          desclen;
  GtStr *filename = NULL,
        *id = gt_str_new_cstr("U"),
        *name = gt_str_new_cstr("unique"),
        *parent_unique = gt_str_new_cstr("U"),
        *seqid = gt_str_new(),
        *source = gt_str_new_cstr("Condenseq");
  GtFile *outfile = NULL;
  GtGFF3Visitor *gffv = NULL;
  GtNodeVisitor *nodev = NULL;
  GtFeatureNode *fnode = NULL;
  GtGenomeNode *node = NULL;
  GtRange range;

  gt_assert(condenseq != NULL);

  filename = gt_str_new_cstr(gt_condenseq_basefilename(condenseq));

  name_len = gt_str_length(name);
  gt_str_append_cstr(filename, ".gff3");
  outfile = gt_file_new(gt_str_get(filename), "w", err);
  if (outfile == NULL) {
    /* The file could not be opened. Report the failure instead of silently
       continuing with a NULL file handle and returning success. */
    had_err = -1;
  }
  else {
    nodev = gt_gff3_visitor_new(outfile);
    gffv = (GtGFF3Visitor *) nodev;
    gt_gff3_visitor_retain_id_attributes(gffv);

    /* placeholder node; it is replaced as soon as the first element is
       processed, because seqend is still 0 at that point */
    node = gt_feature_node_new(seqid, "experimental_feature", (GtUword) 1,
                               (GtUword) 1, GT_STRAND_BOTH);
    fnode = (GtFeatureNode*) node;
    gt_feature_node_set_source(fnode, source);
  }

  /* first pass: unique elements */
  for (idx = 0; !had_err && idx < condenseq->udb_nelems; ++idx) {
    GtCondenseqUnique uq = condenseq->uniques[idx];
    if (seqend <= uq.orig_startpos) {
      /* the element lies behind the current sequence: switch to a fresh node
         carrying the description of the sequence containing the element */
      const char *desc;
      gt_genome_node_delete(node);
      seqnum = gt_condenseq_pos2seqnum(condenseq, uq.orig_startpos);
      seqstart = gt_condenseq_seqstartpos(condenseq, seqnum);
      seqend = seqstart + condenseq_seqlength_help(condenseq, seqnum, seqstart);
      desc = gt_condenseq_description(condenseq, &desclen, seqnum);
      gt_str_reset(seqid);
      gt_str_append_cstr_nt(seqid, desc, desclen);
      node = gt_feature_node_new(seqid, "experimental_feature", (GtUword) 1,
                                 (GtUword) 1, GT_STRAND_BOTH);
      fnode = (GtFeatureNode*) node;
      gt_feature_node_set_source(fnode, source);
    }
    /* cut name and id back to their fixed prefix, then append the index */
    gt_str_set_length(name, name_len);
    gt_str_append_uword(name, idx);
    gt_str_set_length(id, (GtUword) 1);
    gt_str_append_uword(id, idx);
    gt_feature_node_set_attribute(fnode, "Name", gt_str_get(name));
    gt_feature_node_set_attribute(fnode, "ID", gt_str_get(id));
    /* 1 Based coordinates! */
    range.start = uq.orig_startpos + 1 - seqstart;
    range.end = uq.orig_startpos + uq.len - seqstart;
    gt_genome_node_set_range(node, &range);
    had_err = gt_genome_node_accept(node, nodev, err);
  }

  /* second pass: link elements, using the prefixes "link" and "L" */
  gt_str_reset(name);
  gt_str_append_cstr(name, "link");
  gt_str_reset(id);
  gt_str_append_cstr(id, "L");
  name_len = gt_str_length(name);
  /* force a fresh node for the first link */
  seqend = 0;
  for (idx = 0; !had_err && idx < condenseq->ldb_nelems; ++idx) {
    GtCondenseqLink link = condenseq->links[idx];
    if (seqend <= link.orig_startpos) {
      const char *desc;
      gt_genome_node_delete(node);
      seqnum = gt_condenseq_pos2seqnum(condenseq, link.orig_startpos);
      seqstart = gt_condenseq_seqstartpos(condenseq, seqnum);
      seqend = seqstart + condenseq_seqlength_help(condenseq, seqnum, seqstart);
      desc = gt_condenseq_description(condenseq, &desclen, seqnum);
      gt_str_reset(seqid);
      gt_str_append_cstr_nt(seqid, desc, desclen);
      node = gt_feature_node_new(seqid, "experimental_feature", (GtUword) 1,
                                 (GtUword) 1, GT_STRAND_BOTH);
      fnode = (GtFeatureNode*) node;
      gt_feature_node_set_source(fnode, source);
    }
    gt_str_set_length(name, name_len);
    gt_str_append_uword(name, idx);
    gt_str_set_length(id, (GtUword) 1);
    gt_str_append_uword(id, idx);
    gt_feature_node_set_attribute(fnode, "Name", gt_str_get(name));
    gt_feature_node_set_attribute(fnode, "ID", gt_str_get(id));
    /* reference the unique this link was derived from by its ID "U<n>" */
    gt_str_set_length(parent_unique, (GtUword) 1);
    gt_str_append_uword(parent_unique, link.unique_id);
    gt_feature_node_set_attribute(fnode, "Derives_from",
                                  gt_str_get(parent_unique));
    /* 1 Based coordinates! */
    range.start = link.orig_startpos + 1 - seqstart;
    range.end = link.orig_startpos + link.len - seqstart;
    gt_genome_node_set_range(node, &range);
    had_err = gt_genome_node_accept(node, nodev, err);
  }

  /* Release the visitor before closing the file it writes to. node, nodev
     and outfile may be NULL here if the file could not be opened.
     NOTE(review): relies on the *_delete() functions accepting NULL, as
     other code in this module does -- confirm. */
  gt_genome_node_delete(node);
  gt_node_visitor_delete(nodev);
  gt_file_delete(outfile);
  gt_str_delete(filename);
  gt_str_delete(id);
  gt_str_delete(name);
  gt_str_delete(parent_unique);
  gt_str_delete(seqid);
  gt_str_delete(source);
  return had_err;
}
/* Tool runner: loads the condenseq archive named by <argv[parsed_args]> and
   writes decoded sequence data to <arguments->outfp>.

   What is extracted depends on <tool_arguments> (a
   GtCondenserExtractArguments):
   - <seq> set: treated as the sequence range [seq, seq].
   - neither a position range nor a sequence range given: all sequences.
   - mode "concat" together with a sequence range: the sequence range is
     converted into the position range spanning those sequences.
   - position range defined: the region is written as plain characters with
     the first character of <sepchar> as separator, wrapped after <width>
     characters if <width> is non-zero, and terminated by a newline.
   - otherwise: every sequence of the sequence range is written as a FASTA
     entry.

   Returns 0 on success and -1 on failure, in which case <err> is set either
   here (range exceeds the data) or, presumably, by
   gt_condenseq_new_from_file(). */
static int gt_condenseq_extract_runner(GT_UNUSED int argc,
                                       const char **argv,
                                       int parsed_args,
                                       void *tool_arguments,
                                       GtError *err)
{
  int had_err = 0;
  GtCondenserExtractArguments *arguments = tool_arguments;
  GtCondenseq *condenseq = NULL;
  GtLogger *logger = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  condenseq = gt_condenseq_new_from_file(argv[parsed_args], logger, err);
  if (condenseq == NULL) {
    had_err = -1;
  }

  if (!had_err) {
    const char *buffer = NULL;
    const char *desc = NULL;
    GtUword desclen,
            seqlen,
            rend = gt_condenseq_total_length(condenseq),
            send = gt_condenseq_num_of_sequences(condenseq);
    bool concat = strcmp(gt_str_get(arguments->mode), "concat") == 0;
    /* single sequence to extract = range of length 1 */
    if (arguments->seq != GT_UNDEF_UWORD) {
      arguments->seqrange.start = arguments->seqrange.end = arguments->seq;
    }
    /* no range given at all: extract all seqs */
    if (arguments->range.start == GT_UNDEF_UWORD &&
        arguments->seqrange.start == GT_UNDEF_UWORD) {
      arguments->seqrange.start = 0;
      arguments->seqrange.end = send - 1;
    }
    /* if seqs are specified, and concat is given, switch to posrange */
    if (concat && arguments->seqrange.start != GT_UNDEF_UWORD) {
      if (arguments->seqrange.end >= send) {
        had_err = -1;
        gt_error_set(err, "range end " GT_WU " excedes number of sequences "
                     GT_WU " (ranges are zero based sequence ids)",
                     arguments->seqrange.end, send);
      }
      else {
        /* from the first position of the first sequence to the last position
           of the last sequence */
        arguments->range.start =
          gt_condenseq_seqstartpos(condenseq, arguments->seqrange.start);
        arguments->range.end =
          gt_condenseq_seqstartpos(condenseq, arguments->seqrange.end) +
          gt_condenseq_seqlength(condenseq, arguments->seqrange.end) - 1;
      }
    }
    /* extract sequence region */
    if (!had_err && arguments->range.start != GT_UNDEF_UWORD) {
      /* upper bound for the number of characters decoded per call */
      const GtUword maxbuffsize = ((GtUword) 1) << 17; /* ~ 100000byte */
      GtUword clen,
              rstart,
              current_length = 0, i;
      const char sepchar = gt_str_get(arguments->sepchar)[0];

      if (arguments->range.end >= rend) {
        had_err = -1;
        gt_error_set(err, "range end " GT_WU " excedes length of sequence "
                     GT_WU " (ranges are zero based positions)",
                     arguments->range.end, rend);
      }
      if (!had_err) {
        rstart = arguments->range.start;
        rend = arguments->range.end;
        /* decode the region chunk by chunk */
        while (rstart <= rend) {
          GtRange cur_range;
          if (rend - rstart > maxbuffsize) {
            /* Prefer to end the chunk on the separator in front of the
               sequence that contains the position maxbuffsize ahead. */
            const GtUword seqnum =
                            gt_condenseq_pos2seqnum(condenseq,
                                                    rstart + maxbuffsize),
                          next_seqstart =
                            gt_condenseq_seqstartpos(condenseq, seqnum);
            if (next_seqstart > rstart + 1) {
              /* separator at next_seqstart - 1 lies behind rstart */
              clen = next_seqstart - rstart;
            }
            else {
              /* A single sequence covers the whole window, so there is no
                 separator to stop at (and next_seqstart - 1 would even wrap
                 around for the first sequence). Cut a fixed size chunk out
                 of the sequence instead. */
              clen = maxbuffsize;
            }
          }
          else
            clen = rend - rstart + 1;

          cur_range.start = rstart;
          cur_range.end = rstart + clen - 1;
          buffer = gt_condenseq_extract_decoded_range(condenseq, cur_range,
                                                      sepchar);
          gt_assert(buffer != NULL);
          /* current_length counts the characters on the current output line
             and is carried over from one chunk to the next */
          for (i = 0; i < clen; i++, current_length++) {
            if (arguments->width && current_length == arguments->width) {
              gt_file_xfputc('\n', arguments->outfp);
              current_length = 0;
            }
            gt_file_xfputc(buffer[i], arguments->outfp);
          }
          rstart += clen;
        }
        gt_file_xfputc('\n', arguments->outfp);
      }
    }
    else if (!had_err) { /* extract seqwise and always fasta */
      GtUword seqnum,
              sstart = arguments->seqrange.start;

      if (arguments->seqrange.end >= send) {
        had_err = -1;
        gt_error_set(err, "range end " GT_WU " excedes number of sequences "
                     GT_WU " (ranges are zero based sequence ids)",
                     arguments->seqrange.end, send);
      }
      /* from here on send is the last sequence to show */
      send = arguments->seqrange.end;
      for (seqnum = sstart;
           !had_err && seqnum <= send;
           ++seqnum) {
        buffer = gt_condenseq_extract_decoded(condenseq, &seqlen, seqnum);
        desc = gt_condenseq_description(condenseq, &desclen, seqnum);
        gt_fasta_show_entry_nt(desc, desclen,
                               buffer, seqlen,
                               arguments->width,
                               arguments->outfp);
      }
    }
  }
  gt_condenseq_delete(condenseq);
  gt_logger_delete(logger);
  return had_err;
}
/* Example #6 */
/* 0 */
/* Returns the length of the sequence with number <seqnum>, as computed by
   condenseq_seqlength_help() from the start position of that sequence. */
GtUword gt_condenseq_seqlength(const GtCondenseq *condenseq, GtUword seqnum)
{
  return condenseq_seqlength_help(condenseq, seqnum,
                                  gt_condenseq_seqstartpos(condenseq, seqnum));
}