Exemple #1
0
/* Region node callback of the select visitor. <rn> is appended to the node
   buffer if no seqid filter is set or its seqid equals the filter; when a
   contain range is defined, <rn> is first clipped to it. Nodes which fail the
   seqid filter or do not overlap the contain range are deleted.
   Always returns 0. */
static int select_visitor_region_node(GtNodeVisitor *nv, GtRegionNode *rn,
                                      GT_UNUSED GtError *err)
{
  GtSelectVisitor *sv;
  GtGenomeNode *node = (GtGenomeNode*) rn;
  GtRange clipped;
  gt_error_check(err);
  sv = select_visitor_cast(nv);

  /* a seqid was specified and it differs from the seqid of <rn> */
  if (gt_str_length(sv->seqid) &&
      gt_str_cmp(sv->seqid, gt_genome_node_get_seqid(node))) {
    gt_genome_node_delete(node);
    return 0;
  }

  /* no contain range defined -> keep <rn> as it is */
  if (sv->contain_range.start == GT_UNDEF_ULONG) {
    gt_queue_add(sv->node_buffer, rn);
    return 0;
  }

  clipped = gt_genome_node_get_range(node);
  if (!gt_range_overlap(&clipped, &sv->contain_range)) {
    /* nothing of <rn> lies inside the contain range -> drop it */
    gt_genome_node_delete(node);
    return 0;
  }

  /* shrink the region to the part covered by the contain range */
  clipped.start = MAX(clipped.start, sv->contain_range.start);
  clipped.end = MIN(clipped.end, sv->contain_range.end);
  gt_genome_node_set_range(node, &clipped);
  gt_queue_add(sv->node_buffer, rn);
  return 0;
}
/* Resets <db> if it is marked dirty. With an empty start queue the buffer
   length is simply set to 0. Otherwise, if the offset at the head of the start
   queue is non-zero, that offset is removed from the queue, the buffer content
   from that offset onwards is moved to the front of the buffer and a new start
   offset 0 is queued. In all cases <db> is no longer dirty afterwards. */
void gt_desc_buffer_reset(GtDescBuffer *db)
{
  GtUword offset;
  gt_assert(db);

  if (!db->dirty)
    return;

  if (gt_queue_size(db->startqueue) == 0) {
    db->length = 0;
  }
  else {
    offset = (GtUword) gt_queue_head(db->startqueue);
    if (offset != 0) {
      offset = (GtUword) gt_queue_get(db->startqueue);
      db->length -= offset;
      /* memmove() is correct for both overlapping and disjoint ranges */
      memmove(db->buf, db->buf + offset, db->length * sizeof (char));
      gt_queue_add(db->startqueue, (void*) 0);
    }
  }
  db->dirty = false;
}
/* Loads the test features for the infer-CDS visitor: the fixed GFF3 test file
   is read through an infer-CDS stream into an array, the array is sorted with
   agn_genome_node_compare, reversed, and then emptied element by element into
   <queue>. A failed pull is only reported on stderr. */
static void infer_cds_visitor_test_data(GtQueue *queue)
{
  GtError *err = gt_error_new();
  const char *infile = "data/gff3/grape-codons.gff3";
  GtNodeStream *in, *infer, *collect;
  GtLogger *log;
  GtArray *nodes;

  /* set up the pipeline: GFF3 input -> CDS inference -> array collector */
  in = gt_gff3_in_stream_new_unsorted(1, &infile);
  gt_gff3_in_stream_check_id_attributes((GtGFF3InStream *)in);
  gt_gff3_in_stream_enable_tidy_mode((GtGFF3InStream *)in);
  log = gt_logger_new(true, "", stderr);
  infer = agn_infer_cds_stream_new(in, NULL, log);
  nodes = gt_array_new( sizeof(GtFeatureNode *) );
  collect = gt_array_out_stream_new(infer, nodes, err);

  if(gt_node_stream_pull(collect, err) == -1)
  {
    fprintf(stderr, "[AgnInferCDSVisitor::infer_cds_visitor_test_data] error "
            "processing features: %s\n", gt_error_get(err));
  }

  /* tear the pipeline down */
  gt_node_stream_delete(in);
  gt_node_stream_delete(infer);
  gt_node_stream_delete(collect);
  gt_logger_delete(log);

  /* sort, reverse, then drain the array into the queue by popping */
  gt_array_sort(nodes, (GtCompare)agn_genome_node_compare);
  gt_array_reverse(nodes);
  for(; gt_array_size(nodes) > 0; )
  {
    GtFeatureNode *feature = *(GtFeatureNode **)gt_array_pop(nodes);
    gt_queue_add(queue, feature);
  }

  gt_array_delete(nodes);
  gt_error_delete(err);
}
Exemple #4
0
/* EOF node callback of the select visitor: <eofn> is appended to the node
   buffer without any filtering. Always returns 0. */
static int select_visitor_eof_node(GtNodeVisitor *nv, GtEOFNode *eofn,
                                   GT_UNUSED GtError *err)
{
  GtSelectVisitor *sv;

  gt_error_check(err);

  sv = select_visitor_cast(nv);
  gt_queue_add(sv->node_buffer, eofn);

  return 0;
}
/* Comment node callback of the add-ids visitor: <c> is appended to the node
   buffer unchanged. Always returns 0. */
static int add_ids_visitor_comment_node(GtNodeVisitor *nv, GtCommentNode *c,
                                        GT_UNUSED GtError *err)
{
  GtAddIDsVisitor *aiv;

  gt_error_check(err);

  aiv = add_ids_visitor_cast(nv);
  gt_queue_add(aiv->node_buffer, c);

  return 0;
}
Exemple #6
0
/* Comment node callback of the select visitor: <c> is appended to the node
   buffer without any filtering. Always returns 0. */
static int select_visitor_comment_node(GtNodeVisitor *nv, GtCommentNode *c,
                                       GT_UNUSED GtError *err)
{
  GtSelectVisitor *sv;

  gt_error_check(err);

  sv = select_visitor_cast(nv);
  gt_queue_add(sv->node_buffer, c);

  return 0;
}
/* Meta node callback of the add-ids visitor: <mn> is appended to the node
   buffer unchanged. Always returns 0. */
static int add_ids_visitor_meta_node(GtNodeVisitor *nv, GtMetaNode *mn,
                                     GT_UNUSED GtError *err)
{
  GtAddIDsVisitor *aiv;

  gt_error_check(err);

  aiv = add_ids_visitor_cast(nv);
  gt_queue_add(aiv->node_buffer, mn);

  return 0;
}
/* EOF node callback of the add-ids visitor: the visitor is finalized first,
   then <eofn> is appended to the node buffer. Always returns 0. */
static int add_ids_visitor_eof_node(GtNodeVisitor *nv, GtEOFNode *eofn,
                                    GT_UNUSED GtError *err)
{
  GtAddIDsVisitor *aiv;

  gt_error_check(err);

  aiv = add_ids_visitor_cast(nv);
  /* finalize before the EOF node enters the buffer */
  gt_add_ids_visitor_finalize(nv);
  gt_queue_add(aiv->node_buffer, eofn);

  return 0;
}
Exemple #9
0
/* Delivers the next node of the filter stream in <*gn>.
   Cached features are handed out first. Otherwise nodes are pulled from the
   input stream: errors, the end of the stream (NULL node) and nodes which are
   not feature nodes are passed straight through. For a feature node, every
   feature visited by the iterator whose type is a key in <typestokeep> gets
   an additional reference and is cached; the pulled node itself is then
   deleted. Pulling continues until the cache holds something to return. */
static int filter_stream_next(GtNodeStream *ns, GtGenomeNode **gn,
                              GtError *error)
{
  AgnFilterStream *stream;
  gt_error_check(error);
  stream = filter_stream_cast(ns);

  for(;;)
  {
    GtFeatureNode *top, *feature;
    GtFeatureNodeIterator *iter;
    int rc;

    /* serve from the cache whenever it is not empty */
    if(gt_queue_size(stream->cache) > 0)
    {
      *gn = gt_queue_get(stream->cache);
      return 0;
    }

    rc = gt_node_stream_next(stream->in_stream, gn, error);
    if(rc)
      return rc;
    if(!*gn)
      return 0;

    top = gt_feature_node_try_cast(*gn);
    if(!top)
      return 0;

    iter = gt_feature_node_iterator_new(top);
    while((feature = gt_feature_node_iterator_next(iter)) != NULL)
    {
      const char *type = gt_feature_node_get_type(feature);
      if(gt_hashmap_get(stream->typestokeep, type) == NULL)
        continue;

      /* kept features get their own reference before <top> is deleted */
      gt_genome_node_ref((GtGenomeNode *)feature);
      gt_queue_add(stream->cache, feature);
    }
    gt_feature_node_iterator_delete(iter);
    gt_genome_node_delete((GtGenomeNode *)top);
  }
}
/* Hashmap iteration callback: <value> is an automatic sequence region and
   <data> the queue of genome nodes. If the region has collected feature
   nodes, its sequence region node followed by all of its feature nodes is
   appended to the queue; the region's node pointer is set to NULL and its
   feature array is reset afterwards. Always returns 0. */
static int add_auto_sr_to_queue(GT_UNUSED void *key, void *value, void *data,
                                GT_UNUSED GtError *err)
{
  AutomaticSequenceRegion *region = value;
  GtQueue *out = data;
  unsigned int idx;
  gt_error_check(err);
  gt_assert(key && value && data);

  /* regions without features contribute nothing */
  if (!gt_array_size(region->feature_nodes))
    return 0;

  gt_queue_add(out, region->sequence_region);
  region->sequence_region = NULL;

  for (idx = 0; idx < gt_array_size(region->feature_nodes); idx++) {
    GtGenomeNode *feature = *(GtGenomeNode**)
                            gt_array_get(region->feature_nodes, idx);
    gt_queue_add(out, feature);
  }
  gt_array_reset(region->feature_nodes);

  return 0;
}
/* Sequence node callback of the add-ids visitor: the visitor is finalized
   and <sn> is then appended to the node buffer. Always returns 0. */
static int add_ids_visitor_sequence_node(GtNodeVisitor *nv, GtSequenceNode *sn,
                                         GT_UNUSED GtError *err)
{
  GtAddIDsVisitor *aiv;

  gt_error_check(err);

  aiv = add_ids_visitor_cast(nv);
  /* sequence nodes come last in a stream, so finalize before buffering */
  gt_add_ids_visitor_finalize(nv);
  gt_queue_add(aiv->node_buffer, sn);

  return 0;
}
Exemple #12
0
/* Sequence node callback of the select visitor: <sn> is appended to the node
   buffer if no seqid filter is set or its seqid equals the filter; otherwise
   it is deleted. Always returns 0. */
static int select_visitor_sequence_node(GtNodeVisitor *nv, GtSequenceNode *sn,
                                        GT_UNUSED GtError *err)
{
  GtSelectVisitor *sv;
  GtGenomeNode *node = (GtGenomeNode*) sn;
  gt_error_check(err);
  sv = select_visitor_cast(nv);

  /* a seqid was specified and it differs from the seqid of <sn> */
  if (gt_str_length(sv->seqid) &&
      gt_str_cmp(sv->seqid, gt_genome_node_get_seqid(node))) {
    gt_genome_node_delete(node);
    return 0;
  }

  gt_queue_add(sv->node_buffer, sn);
  return 0;
}
/* Creates a new description buffer: GT_DESC_BUFFER_INIT_SIZE zeroed bytes of
   storage, all lengths and the reference count set to 0, marked dirty, with
   shortening disabled and a start queue that already contains offset 0. */
GtDescBuffer* gt_desc_buffer_new(void)
{
  GtDescBuffer *db = gt_malloc(sizeof *db);

  /* storage */
  db->allocated = GT_DESC_BUFFER_INIT_SIZE;
  db->buf = gt_calloc(GT_DESC_BUFFER_INIT_SIZE, sizeof (char));

  /* length bookkeeping */
  db->length = 0;
  db->curlength = 0;
  db->maxlength = 0;

  /* state flags */
  db->dirty = true;
  db->finished = false;
  db->shorten = false;
  db->seen_whitespace = false;
  db->reference_count = 0;

  /* the first description starts at offset 0 */
  db->startqueue = gt_queue_new();
  gt_queue_add(db->startqueue, (void*) 0);

  return db;
}
Exemple #14
0
/* Delivers the next node of the buffer stream in <*gn>.
   While buffering, nodes are pulled from the input stream and an additional
   reference to every non-NULL node is stored in the node buffer; the result
   of the pull is returned. Otherwise the next buffered node is handed out
   (NULL once the buffer is empty) and 0 is returned. */
static int buffer_stream_next(GtNodeStream *ns, GtGenomeNode **gn, GtError *err)
{
  GtBufferStream *stream;
  int had_err;
  gt_error_check(err);
  stream = buffer_stream_cast(ns);

  if (!stream->buffering) {
    /* replay mode: serve from the buffer */
    if (gt_queue_size(stream->node_buffer))
      *gn = gt_queue_get(stream->node_buffer);
    else
      *gn = NULL;
    return 0;
  }

  /* buffering mode: pass the node on and remember it */
  had_err = gt_node_stream_next(stream->in_stream, gn, err);
  if (!had_err && *gn)
    gt_queue_add(stream->node_buffer, gt_genome_node_ref(*gn));
  return had_err;
}
/* Processes the current gene set of <sas> against the queued SNPs.
   If SNPs are queued, one SNP annotator visitor is created per gene and
   applied to the SNPs; while handling the last gene the SNP queue is drained
   into the output queue. If no SNPs are queued, the genes are just deleted.
   The current gene set is reset in either case.
   Returns 0 on success and a non-zero value if creating a visitor or visiting
   a SNP failed (<err> is passed on to those calls). */
static int snp_annotator_stream_process_current_gene(GtSNPAnnotatorStream *sas,
                                                     GtError *err)
{
  int had_err = 0;
  GtUword i;
  GtUword nof_genes = gt_array_size(sas->cur_gene_set);
  gt_error_check(err);

  if (gt_queue_size(sas->snps) > 0) {
    /* we need to process SNPs for a gene cluster; the output queue is
       expected to be empty at this point */
    gt_assert(gt_queue_size(sas->outqueue) == 0);

    for (i = 0; !had_err && i < nof_genes; i++) {
      GtNodeVisitor *sav;
      GtFeatureNode *gene;
      gene = *(GtFeatureNode**) gt_array_get(sas->cur_gene_set, i);
      /* one visitor per gene */
      sav = gt_snp_annotator_visitor_new(gene, sas->tt, sas->rmap, err);
      if (!sav)
        had_err = -1;
      if (!had_err) {
        if (i < nof_genes-1) {
          /* not the last gene: run the callback over all queued SNPs;
             presumably they stay queued, since the last gene below still
             drains them -- TODO confirm against gt_queue_iterate() */
          had_err = gt_queue_iterate(sas->snps,
                                     snp_annotator_stream_process_snp,
                                     sav, err);
        } else {
          /* last gene: visit each SNP while moving it to the output queue */
          while (!had_err && gt_queue_size(sas->snps) > 0) {
            GtFeatureNode *snp = (GtFeatureNode*) gt_queue_get(sas->snps);
            had_err = gt_genome_node_accept((GtGenomeNode*) snp, sav, err);
            gt_queue_add(sas->outqueue, snp);
            /* NOTE(review): the SNP is deleted right after being queued for
               output; presumably this only drops an extra reference held by
               the SNP queue -- confirm */
            gt_genome_node_delete((GtGenomeNode*) snp);
          }
        }
        gt_node_visitor_delete(sav);
      }
      gt_genome_node_delete((GtGenomeNode*) gene);
    }
  } else {
    /* no SNPs for this gene cluster, delete it */
    for (i = 0; !had_err && i < nof_genes; i++) {
      gt_genome_node_delete(*(GtGenomeNode**) gt_array_get(sas->cur_gene_set,
                                                           i));
    }
  }
  /* NOTE(review): when the loop above stops early because of an error (or
     when the gene set is empty while SNPs are queued), SNPs are still in the
     queue and this assertion does not hold; the remaining genes are not
     deleted in that case either -- confirm the intended error handling */
  gt_assert(gt_queue_size(sas->snps) == 0);
  gt_array_reset(sas->cur_gene_set);
  return had_err;
}
Exemple #16
0
/* Parses the BED file <filename> with <bed_parser>. Afterwards the region
   nodes are built into <genome_nodes>, the region node builder is reset and
   all feature nodes collected by the parser are moved to <genome_nodes>;
   this happens regardless of whether parsing succeeded.
   Returns the result of parsing the file. */
int gt_bed_parser_parse(GtBEDParser *bed_parser, GtQueue *genome_nodes,
                        const char *filename, GtError *err)
{
  GtIO *input;
  int rval;
  gt_error_check(err);
  gt_assert(bed_parser && genome_nodes);

  input = gt_io_new(filename, "r");
  rval = parse_bed_file(bed_parser, input, err);

  /* region nodes go first ... */
  gt_region_node_builder_build(bed_parser->region_node_builder, genome_nodes);
  gt_region_node_builder_reset(bed_parser->region_node_builder);

  /* ... followed by the feature nodes in the order they were collected */
  for (; gt_queue_size(bed_parser->feature_nodes); ) {
    void *feature_node = gt_queue_get(bed_parser->feature_nodes);
    gt_queue_add(genome_nodes, feature_node);
  }

  gt_io_delete(input);
  return rval;
}
/* Adds <orphan>, which must carry a parent attribute, to the orphanage <o>.
   If given, <orphan_id> and every entry of <missing_parents> are recorded in
   the respective string tables unless they are already contained. */
void gt_orphanage_add(GtOrphanage *o, GtGenomeNode *orphan,
                      const char *orphan_id, GtStrArray *missing_parents)
{
  GtUword idx;
  gt_assert(o && orphan);
  gt_assert(gt_feature_node_get_attribute((GtFeatureNode*) orphan,
                                          GT_GFF_PARENT));

  gt_queue_add(o->orphans, orphan);

  /* remember the orphan's id once */
  if (orphan_id && !gt_cstr_table_get(o->orphan_ids, orphan_id))
    gt_cstr_table_add(o->orphan_ids, orphan_id);

  if (!missing_parents)
    return;

  /* remember each missing parent once */
  for (idx = 0; idx < gt_str_array_size(missing_parents); idx++) {
    const char *parent_id = gt_str_array_get(missing_parents, idx);
    if (!gt_cstr_table_get(o->missing_parents, parent_id))
      gt_cstr_table_add(o->missing_parents, parent_id);
  }
}
/* Appends the character <c> to the description buffer <db>.
   In shorten mode everything from the first whitespace character on is
   dropped. If the previous description was marked finished, the current
   buffer length is queued as the start offset of a new description. The
   buffer is grown when fewer than two bytes would remain free. */
void gt_desc_buffer_append_char(GtDescBuffer *db, char c)
{
  gt_assert(db);
  if (db->shorten) {
    if (db->seen_whitespace)
      return;
    /* the <ctype.h> functions require a value representable as unsigned char
       (or EOF); passing a negative plain char is undefined behaviour */
    if (isspace((unsigned char) c)) {
      db->seen_whitespace = true;
      return;
    }
  }
  if (db->finished) {
    /* <c> starts a new description at the current end of the buffer */
    gt_queue_add(db->startqueue, (void*) (db->length));
    db->finished = false;
  }
  if (db->length + 2 > db->allocated) {
    db->buf = gt_dynalloc(db->buf, &db->allocated,
                          (db->length + 2) * sizeof (char));
  }
  db->curlength++;
  db->buf[db->length++] = c;
}
/* Initializes <stream>: fetches the seqids from the feature index, sets the
   seqid cursor to 0 and caches one region node per seqid. The region range
   is the original range if <useorig> is set, and the regular range for the
   seqid otherwise. Errors reported by the feature index are not evaluated;
   the local error object is discarded at the end. */
void feature_in_stream_init(GtFeatureInStream *stream)
{
  GtError *err = gt_error_new();
  GtUword idx;

  stream->seqids = gt_feature_index_get_seqids(stream->fi, err);
  stream->seqindex = 0;

  for (idx = 0; idx < gt_str_array_size(stream->seqids); idx++)
  {
    GtRange range;
    GtStr *seqid_str;
    GtGenomeNode *region;
    const char *seqid = gt_str_array_get(stream->seqids, idx);

    if (!stream->useorig)
      gt_feature_index_get_range_for_seqid(stream->fi, &range, seqid, err);
    else
      gt_feature_index_get_orig_range_for_seqid(stream->fi, &range, seqid,
                                                err);

    seqid_str = gt_str_new_cstr(seqid);
    region = gt_region_node_new(seqid_str, range.start, range.end);
    gt_queue_add(stream->regioncache, region);
    gt_str_delete(seqid_str);
  }

  gt_error_delete(err);
}
/* Delivers the next node of the numerically sorted GFF3 output stream.
   On the first call (no output queue yet) the whole input stream is read into
   the buffer, the buffer is sorted stably with
   gt_genome_node_compare_numeric_seqids and its nodes are queued in that
   order. Every call then takes one node from the queue, applies the GFF3
   visitor to it and returns it in <*gn>. Returns a non-zero value if reading
   the input or visiting the node failed. */
static int gff3_numsorted_out_stream_next(GtNodeStream *ns, GtGenomeNode **gn,
                                           GtError *err)
{
  GtGFF3NumsortedOutStream *s;
  GtGenomeNode *node;
  GtUword idx = 0;
  int had_err = 0;
  gt_error_check(err);
  s = gff3_numsorted_out_stream_cast(ns);

  if (!s->outqueue) {
    s->outqueue = gt_queue_new();

    /* slurp the complete input */
    for (;;) {
      had_err = gt_node_stream_next(s->in_stream, gn, err);
      if (had_err || !*gn)
        break;
      gt_array_add(s->buffer, *gn);
    }

    if (!had_err) {
      gt_genome_nodes_sort_stable_with_func(s->buffer,
                             (GtCompare) gt_genome_node_compare_numeric_seqids);
      for (idx = 0; idx < gt_array_size(s->buffer); idx++) {
        node = *(GtGenomeNode**) gt_array_get(s->buffer, idx);
        gt_queue_add(s->outqueue, node);
      }
    }
  }

  /* nothing to hand out: error, or the queue has run dry */
  if (had_err || !s->outqueue || gt_queue_size(s->outqueue) == 0)
    return had_err;

  node = (GtGenomeNode*) gt_queue_get(s->outqueue);
  gt_assert(node);
  had_err = gt_genome_node_accept(node, s->gff3_visitor, err);
  if (!had_err)
    *gn = node;
  return had_err;
}
/* Region node callback of the add-ids visitor. If the seqid of <rn> is still
   listed among the undefined sequence regions, an error is set and -1 is
   returned. Otherwise the seqid is recorded as defined (once), <rn> is
   appended to the node buffer and 0 is returned. */
static int add_ids_visitor_region_node(GtNodeVisitor *nv, GtRegionNode *rn,
                                       GT_UNUSED GtError *err)
{
  GtAddIDsVisitor *visitor;
  GtGenomeNode *node = (GtGenomeNode*) rn;
  const char *seqid;
  gt_error_check(err);
  visitor = add_ids_visitor_cast(nv);
  seqid = gt_str_get(gt_genome_node_get_seqid(node));

  if (gt_hashmap_get(visitor->undefined_sequence_regions, seqid)) {
    gt_error_set(err, "genome feature with id \"%s\" has been defined before "
                 "the corresponding \"%s\" definition on line %u in file "
                 "\"%s\"", seqid, GT_GFF_SEQUENCE_REGION,
                 gt_genome_node_get_line_number(node),
                 gt_genome_node_get_filename(node));
    return -1;
  }

  if (!gt_cstr_table_get(visitor->defined_seqids, seqid))
    gt_cstr_table_add(visitor->defined_seqids, seqid);
  gt_queue_add(visitor->node_buffer, rn);
  return 0;
}
Exemple #22
0
/* Parses the GTF data read from <fpin> (reported as file <filenamestr>).
   Comment lines are added to <genome_nodes> as comment nodes right away.
   Feature lines are converted into feature nodes which are collected per
   gene id and transcript id in the hashes of <parser>; after the last line
   the region nodes are built into <genome_nodes> and the genes are
   constructed from the collected features.
   If <be_tolerant> is set, a line with an invalid range, score, strand,
   frame or attribute is reported on stderr and skipped; otherwise the first
   such line aborts parsing. Lines with an unknown feature type are always
   skipped, and a line which does not have 9 tab separated fields always
   aborts parsing.
   Returns 0 on success and a non-zero value on failure, in which case <err>
   is set. */
int gt_gtf_parser_parse(GtGTFParser *parser, GtQueue *genome_nodes,
                        GtStr *filenamestr, GtFile *fpin, bool be_tolerant,
                        GtError *err)
{
  GtStr *seqid_str, *source_str, *line_buffer;
  char *line;
  size_t line_length;
  GtUword i, line_number = 0;
  GtGenomeNode *gn;
  GtRange range;
  GtPhase phase_value;
  GtStrand gt_strand_value;
  GtSplitter *splitter, *attribute_splitter;
  float score_value;
  char *seqname,
       *source,
       *feature,
       *start,
       *end,
       *score,
       *strand,
       *frame,
       *attributes,
       *token,
       *gene_id,
       *gene_name,
       *transcript_id,
       *transcript_name,
       **tokens;
  GtHashmap *transcript_id_hash; /* map from transcript id to array of genome
                                    nodes */
  GtArray *gt_genome_node_array;
  ConstructionInfo cinfo;
  GTF_feature_type gtf_feature_type;
  GT_UNUSED bool gff_type_is_valid = false;
  const char *type = NULL;
  const char *filename;
  bool score_is_defined;
  int had_err = 0;

  gt_assert(parser && genome_nodes);
  gt_error_check(err);

  filename = gt_str_get(filenamestr);

  /* alloc */
  line_buffer = gt_str_new();
  splitter = gt_splitter_new();
  attribute_splitter = gt_splitter_new();

/* Must be used directly inside the body of the line loop below (not inside a
   nested loop), because it relies on <continue> and <break> addressing the
   line loop. */
#define HANDLE_ERROR                                                \
        if (had_err) {                                              \
          if (be_tolerant) {                                        \
            fprintf(stderr, "skipping line: %s\n", gt_error_get(err)); \
            gt_error_unset(err);                                       \
            gt_str_reset(line_buffer);                                 \
            had_err = 0;                                            \
            continue;                                               \
          }                                                         \
          else {                                                    \
            had_err = -1;                                           \
            break;                                                  \
          }                                                         \
        }

  while (gt_str_read_next_line_generic(line_buffer, fpin) != EOF) {
    line = gt_str_get(line_buffer);
    line_length = gt_str_length(line_buffer);
    line_number++;
    had_err = 0;

    if (line_length == 0) {
      gt_warning("skipping blank line " GT_WU " in file \"%s\"", line_number,
                 filename);
    }
    else if (line[0] == '#') {
      /* storing comment */
      if (line_length >= 2 && line[1] == '#')
        gn = gt_comment_node_new(line+2); /* store '##' line as '#' line */
      else
        gn = gt_comment_node_new(line+1);
      gt_genome_node_set_origin(gn, filenamestr, line_number);
      gt_queue_add(genome_nodes, gn);
    }
    else {
      /* process tab delimited GTF line */
      gt_splitter_reset(splitter);
      gt_splitter_split(splitter, line, line_length, '\t');
      if (gt_splitter_size(splitter) != 9UL) {
        /* a malformed line layout aborts parsing even in tolerant mode */
        gt_error_set(err, "line " GT_WU " in file \"%s\" contains " GT_WU
                     " tab (\\t) " "separated fields instead of 9", line_number,
                     filename,
                  gt_splitter_size(splitter));
        had_err = -1;
        break;
      }
      tokens = gt_splitter_get_tokens(splitter);
      seqname    = tokens[0];
      source     = tokens[1];
      feature    = tokens[2];
      start      = tokens[3];
      end        = tokens[4];
      score      = tokens[5];
      strand     = tokens[6];
      frame      = tokens[7];
      attributes = tokens[8];

      /* parse feature */
      if (GTF_feature_type_get(&gtf_feature_type, feature) == -1) {
        /* we skip unknown features */
        fprintf(stderr, "skipping line " GT_WU " in file \"%s\": unknown "
                "feature: \"%s\"\n", line_number, filename, feature);
        gt_str_reset(line_buffer);
        continue;
      }

      /* translate into GFF3 feature type */
      switch (gtf_feature_type) {
        case GTF_CDS:
        case GTF_stop_codon:
          gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker,
                                                       gt_ft_CDS);
          type = gt_ft_CDS;
          break;
        case GTF_exon:
          gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker,
                                                       gt_ft_exon);
          type = gt_ft_exon;
          break;
      }
      gt_assert(gff_type_is_valid);

      /* parse the range */
      had_err = gt_parse_range(&range, start, end, line_number, filename, err);
      HANDLE_ERROR;

      /* process seqname (we have to do it here because we need the range) */
      gt_region_node_builder_add_region(parser->region_node_builder, seqname,
                                        range);

      /* parse the score */
      had_err = gt_parse_score(&score_is_defined, &score_value, score,
                               line_number, filename, err);
      HANDLE_ERROR;

      /* parse the strand */
      had_err = gt_parse_strand(&gt_strand_value, strand, line_number, filename,
                               err);
      HANDLE_ERROR;

      /* parse the frame */
      had_err = gt_parse_phase(&phase_value, frame, line_number, filename, err);
      HANDLE_ERROR;

      /* parse the attributes; all four values point into the current line and
         therefore have to be forgotten for every new line */
      gt_splitter_reset(attribute_splitter);
      gene_id = NULL;
      gene_name = NULL;
      transcript_id = NULL;
      transcript_name = NULL;
      gt_splitter_split(attribute_splitter, attributes, strlen(attributes),
                        ';');
      for (i = 0; i < gt_splitter_size(attribute_splitter); i++) {
        const char *attr_name;
        char **target, *value;
        size_t attr_length, value_length;
        bool strip_quotes = false;

        token = gt_splitter_get_token(attribute_splitter, i);
        /* skip leading blanks */
        while (*token == ' ')
          token++;

        /* look for the two mandatory and the two optional attributes */
        if (strncmp(token, GENE_ID_ATTRIBUTE, strlen(GENE_ID_ATTRIBUTE)) == 0) {
          attr_name = GENE_ID_ATTRIBUTE;
          target = &gene_id;
        }
        else if (strncmp(token, TRANSCRIPT_ID_ATTRIBUTE,
                         strlen(TRANSCRIPT_ID_ATTRIBUTE)) == 0) {
          attr_name = TRANSCRIPT_ID_ATTRIBUTE;
          target = &transcript_id;
        }
        else if (strncmp(token, GENE_NAME_ATTRIBUTE,
                         strlen(GENE_NAME_ATTRIBUTE)) == 0) {
          attr_name = GENE_NAME_ATTRIBUTE;
          target = &gene_name;
          strip_quotes = true; /* for output we want to strip quotes */
        }
        else if (strncmp(token, TRANSCRIPT_NAME_ATTRIBUTE,
                         strlen(TRANSCRIPT_NAME_ATTRIBUTE)) == 0) {
          attr_name = TRANSCRIPT_NAME_ATTRIBUTE;
          target = &transcript_name;
          strip_quotes = true; /* for output we want to strip quotes */
        }
        else
          continue; /* all other attributes are ignored */

        /* the token has to hold the name, a separator and at least one value
           character, otherwise the value pointer would leave the token */
        attr_length = strlen(attr_name);
        if (strlen(token) < attr_length + 2) {
          gt_error_set(err, "missing value to attribute \"%s\" on line "
                       GT_WU " in file \"%s\"", attr_name, line_number,
                       filename);
          had_err = -1;
          break; /* leave the attribute loop, the error is handled below */
        }
        value = token + attr_length + 1;

        if (strip_quotes) {
          if (*value == '"')
            value++;
          value_length = strlen(value);
          /* the value may be empty after removing the leading quote */
          if (value_length > 0 && value[value_length - 1] == '"')
            value[value_length - 1] = '\0';
        }
        *target = value;
      }
      HANDLE_ERROR;

      /* check for the mandatory attributes */
      if (!gene_id) {
        gt_error_set(err, "missing attribute \"%s\" on line " GT_WU
                     " in file \"%s\"", GENE_ID_ATTRIBUTE, line_number,
                     filename);
        had_err = -1;
      }
      HANDLE_ERROR;
      if (!transcript_id) {
        gt_error_set(err, "missing attribute \"%s\" on line " GT_WU
                     " in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number,
                     filename);
        had_err = -1;
      }
      HANDLE_ERROR;

      /* process the mandatory attributes */
      if (!(transcript_id_hash = gt_hashmap_get(parser->gene_id_hash,
                                             gene_id))) {
        transcript_id_hash = gt_hashmap_new(GT_HASH_STRING, gt_free_func,
                                            (GtFree) gt_array_delete);
        gt_hashmap_add(parser->gene_id_hash, gt_cstr_dup(gene_id),
                    transcript_id_hash);
      }
      gt_assert(transcript_id_hash);

      if (!(gt_genome_node_array = gt_hashmap_get(transcript_id_hash,
                                            transcript_id))) {
        gt_genome_node_array = gt_array_new(sizeof (GtGenomeNode*));
        gt_hashmap_add(transcript_id_hash, gt_cstr_dup(transcript_id),
                    gt_genome_node_array);
      }
      gt_assert(gt_genome_node_array);

      /* save optional gene_name and transcript_name attributes */
      if (transcript_name
            && !gt_hashmap_get(parser->transcript_id_to_name_mapping,
                             transcript_id)) {
        gt_hashmap_add(parser->transcript_id_to_name_mapping,
                    gt_cstr_dup(transcript_id),
                    gt_cstr_dup(transcript_name));
      }
      if (gene_name && !gt_hashmap_get(parser->gene_id_to_name_mapping,
                                    gene_id)) {
        gt_hashmap_add(parser->gene_id_to_name_mapping,
                    gt_cstr_dup(gene_id),
                    gt_cstr_dup(gene_name));
      }

      /* get seqid */
      seqid_str = gt_hashmap_get(parser->seqid_to_str_mapping, seqname);
      if (!seqid_str) {
        seqid_str = gt_str_new_cstr(seqname);
        gt_hashmap_add(parser->seqid_to_str_mapping, gt_str_get(seqid_str),
                       seqid_str);
      }
      gt_assert(seqid_str);

      /* construct the new feature */
      gn = gt_feature_node_new(seqid_str, type, range.start, range.end,
                                 gt_strand_value);
      gt_genome_node_set_origin(gn, filenamestr, line_number);

      /* set source */
      source_str = gt_hashmap_get(parser->source_to_str_mapping, source);
      if (!source_str) {
        source_str = gt_str_new_cstr(source);
        gt_hashmap_add(parser->source_to_str_mapping, gt_str_get(source_str),
                    source_str);
      }
      gt_assert(source_str);
      gt_feature_node_set_source((GtFeatureNode*) gn, source_str);

      if (score_is_defined)
        gt_feature_node_set_score((GtFeatureNode*) gn, score_value);
      if (phase_value != GT_PHASE_UNDEFINED)
        gt_feature_node_set_phase((GtFeatureNode*) gn, phase_value);
      gt_array_add(gt_genome_node_array, gn);
    }

    gt_str_reset(line_buffer);
  }

  /* process all region nodes */
  if (!had_err)
    gt_region_node_builder_build(parser->region_node_builder, genome_nodes);

  /* process all feature nodes */
  cinfo.genome_nodes = genome_nodes;
  cinfo.gene_id_to_name_mapping = parser->gene_id_to_name_mapping;
  cinfo.transcript_id_to_name_mapping = parser->transcript_id_to_name_mapping;
  if (!had_err) {
    had_err = gt_hashmap_foreach(parser->gene_id_hash, construct_genes,
                              &cinfo, err);
  }

  /* free */
  gt_splitter_delete(splitter);
  gt_splitter_delete(attribute_splitter);
  gt_str_delete(line_buffer);

  return had_err;
}
Exemple #23
0
/*
 * Unit test for GtQueue.
 *
 * Exercises gt_queue_add(), gt_queue_head(), gt_queue_get(), gt_queue_size(),
 * gt_queue_remove(), forward/reverse iteration and both delete functions in
 * the scenarios named by the section comments below. Except for the final
 * section, the stored elements are small integers cast to void*.
 *
 * <err> is forwarded to the checking iterations. The function returns
 * <had_err>, which starts out as 0 and is handed to every gt_ensure() call
 * (gt_ensure() is defined elsewhere; presumably it records a failed condition
 * in <had_err> -- confirm against its definition).
 */
int gt_queue_unit_test(GtError *err)
{
  long check_counter = 0, check_counter_reverse = 1023;
  unsigned long i;
  int had_err = 0;
  GtQueue *q;

  gt_error_check(err);

  /* without wraparound */
  q = gt_queue_new();
  gt_ensure(had_err, !gt_queue_size(q));
  /* fill the queue with the values 0..1023, checking the size after each add */
  for (i = 0; !had_err && i < 1024; i++) {
    gt_queue_add(q, (void*) i);
    gt_ensure(had_err, gt_queue_size(q) == i + 1);
  }
  /* check_queue/check_queue_reverse are defined elsewhere; presumably they
     compare each visited element with the counter passed as user data */
  if (!had_err)
    had_err = gt_queue_iterate(q, check_queue, &check_counter, err);
  if (!had_err) {
    had_err = gt_queue_iterate_reverse(q, check_queue_reverse,
                                    &check_counter_reverse, err);
  }
  /* iterating with fail_func is expected to yield a non-zero result */
  gt_ensure(had_err, gt_queue_iterate(q, fail_func, NULL, NULL));
  gt_ensure(had_err, gt_queue_iterate_reverse(q, fail_func, NULL, NULL));
  /* remove the element 0 explicitly, leaving 1..1023 */
  if (!had_err) {
    gt_queue_remove(q, (void*) 0);
    gt_ensure(had_err, gt_queue_size(q) == 1023);
  }
  /* drain the remaining elements; head and get must agree on each value */
  for (i = 1; !had_err && i < 1024; i++) {
    gt_ensure(had_err, gt_queue_head(q) == (void*) i);
    gt_ensure(had_err, gt_queue_get(q) == (void*) i);
    gt_ensure(had_err, gt_queue_size(q) == 1024 - i - 1);
  }
  gt_ensure(had_err, !gt_queue_size(q));
  gt_queue_delete(q);

  /* with wraparound (without full queue) */
  if (!had_err) {
    q = gt_queue_new();
    gt_ensure(had_err, !gt_queue_size(q));
    for (i = 0; !had_err && i < 1024; i++) {
      gt_queue_add(q, (void*) i);
      gt_ensure(had_err, gt_queue_size(q) == i + 1);
    }
    check_counter = 0;
    check_counter_reverse = 1023;
    if (!had_err)
      had_err = gt_queue_iterate(q, check_queue, &check_counter, err);
    gt_ensure(had_err, gt_queue_iterate(q, fail_func, NULL, NULL));
    gt_ensure(had_err, gt_queue_iterate_reverse(q, fail_func, NULL, NULL));
    if (!had_err) {
      had_err = gt_queue_iterate_reverse(q, check_queue_reverse,
                                         &check_counter_reverse, err);
    }
    /* take out the first 512 elements (0..511) ... */
    for (i = 0; !had_err && i < 512; i++) {
      gt_ensure(had_err, gt_queue_head(q) == (void*) i);
      gt_ensure(had_err, gt_queue_get(q) == (void*) i);
      gt_ensure(had_err, gt_queue_size(q) == 1024 - i - 1);
    }
    /* ... and append 1024..1535, so the queue holds 512..1535 again */
    for (i = 0; !had_err && i < 512; i++) {
      gt_queue_add(q, (void*) (i + 1024));
      gt_ensure(had_err, gt_queue_size(q) == 512 + i + 1);
    }
    /* the iteration checks now start at 512 (forward) and 1535 (reverse) */
    check_counter = 512;
    check_counter_reverse = 1535;
    if (!had_err)
      had_err = gt_queue_iterate(q, check_queue, &check_counter, err);
    if (!had_err) {
      had_err = gt_queue_iterate_reverse(q, check_queue_reverse,
                                         &check_counter_reverse, err);
    }
    gt_ensure(had_err, gt_queue_iterate(q, fail_func, NULL, NULL));
    gt_ensure(had_err, gt_queue_iterate_reverse(q, fail_func, NULL, NULL));
    /* remove the current first element (512) explicitly */
    if (!had_err) {
      gt_queue_remove(q, (void*) 512);
      gt_ensure(had_err, gt_queue_size(q) == 1023);
    }
    /* drain 513..1535 */
    for (i = 1; !had_err && i < 1024; i++) {
      gt_ensure(had_err, gt_queue_head(q) == (void*) (512 + i));
      gt_ensure(had_err, gt_queue_get(q) == (void*) (512 + i));
      gt_ensure(had_err, gt_queue_size(q) == 1024 - i - 1);
    }
    gt_ensure(had_err, !gt_queue_size(q));
    gt_queue_delete(q);
  }

  /* with wraparound (with full queue) */
  if (!had_err) {
    q = gt_queue_new();
    gt_ensure(had_err, !gt_queue_size(q));
    for (i = 0; !had_err && i < 1024; i++) {
      gt_queue_add(q, (void*) i);
      gt_ensure(had_err, gt_queue_size(q) == i + 1);
    }
    check_counter = 0;
    check_counter_reverse = 1023;
    if (!had_err)
      had_err = gt_queue_iterate(q, check_queue, &check_counter, err);
    if (!had_err) {
      had_err = gt_queue_iterate_reverse(q, check_queue_reverse,
                                      &check_counter_reverse, err);
    }
    gt_ensure(had_err, gt_queue_iterate(q, fail_func, NULL, NULL));
    gt_ensure(had_err, gt_queue_iterate_reverse(q, fail_func, NULL, NULL));
    /* take out 0..511 ... */
    for (i = 0; !had_err && i < 512; i++) {
      gt_ensure(had_err, gt_queue_head(q) == (void*) i);
      gt_ensure(had_err, gt_queue_get(q) == (void*) i);
      gt_ensure(had_err, gt_queue_size(q) == 1024 - i - 1);
    }
    /* ... and append 1024 further values (1024..2047): the queue grows to
       1536 elements, i.e. beyond the 1024 it held before */
    for (i = 0; !had_err && i < 1024; i++) {
      gt_queue_add(q, (void*) (i + 1024));
      gt_ensure(had_err, gt_queue_size(q) == 512 + i + 1);
    }
    check_counter = 512;
    check_counter_reverse = 2047;
    if (!had_err)
      had_err = gt_queue_iterate(q, check_queue, &check_counter, err);
    if (!had_err) {
      had_err = gt_queue_iterate_reverse(q, check_queue_reverse,
                                      &check_counter_reverse, err);
    }
    gt_ensure(had_err, gt_queue_iterate(q, fail_func, NULL, NULL));
    gt_ensure(had_err, gt_queue_iterate_reverse(q, fail_func, NULL, NULL));
    if (!had_err) {
      gt_queue_remove(q, (void*) 512);
      gt_ensure(had_err, gt_queue_size(q) == 1535);
    }
    /* drain 513..2047 */
    for (i = 1; !had_err && i < 1536; i++) {
      gt_ensure(had_err, gt_queue_head(q) == (void*) (512 + i));
      gt_ensure(had_err, gt_queue_get(q) == (void*) (512 + i));
      gt_ensure(had_err, gt_queue_size(q) == 1536 - i - 1);
    }
    gt_ensure(had_err, !gt_queue_size(q));
    gt_queue_delete(q);
  }

  /* test a corner case */
  /* alternating add/get on a queue that never holds more than two elements */
  if (!had_err) {
    q = gt_queue_new();
    gt_queue_add(q, (void*) 1);
    gt_ensure(had_err, gt_queue_size(q) == 1);
    if (!had_err)
      gt_queue_add(q, (void*) 1);
    gt_ensure(had_err, gt_queue_size(q) == 2);
    gt_ensure(had_err, gt_queue_get(q));
    gt_ensure(had_err, gt_queue_size(q) == 1);
    if (!had_err)
      gt_queue_add(q, (void*) 1);
    gt_ensure(had_err, gt_queue_size(q) == 2);
    gt_ensure(had_err, gt_queue_get(q));
    gt_ensure(had_err, gt_queue_size(q) == 1);
    if (!had_err)
      gt_queue_add(q, (void*) 1);
    gt_ensure(had_err, gt_queue_size(q) == 2);
    gt_ensure(had_err, gt_queue_get(q));
    gt_ensure(had_err, gt_queue_size(q) == 1);
    gt_ensure(had_err, gt_queue_get(q));
    gt_ensure(had_err, gt_queue_size(q) == 0);
    if (!had_err)
      gt_queue_add(q, (void*) 1);
    gt_ensure(had_err, gt_queue_size(q) == 1);
    gt_ensure(had_err, gt_queue_get(q));
    gt_ensure(had_err, gt_queue_size(q) == 0);
    gt_queue_delete(q);
  }

  /* gt_queue_remove() corner case */
  /* removing the only element must leave an empty queue */
  if (!had_err) {
    q = gt_queue_new();
    gt_queue_add(q, (void*) 1);
    gt_ensure(had_err, gt_queue_size(q) == 1);
    gt_queue_remove(q, (void*) 1);
    gt_ensure(had_err, gt_queue_size(q) == 0);
    gt_queue_delete(q);
  }

  /* gt_queue_remove() corner case */
  /* after two gets and two more adds the queue holds 2,3,4,5; these are then
     removed in the order 4,2,5,3, which differs from the insertion order */
  if (!had_err) {
    q = gt_queue_new();
    gt_queue_add(q, (void*) 0);
    gt_queue_add(q, (void*) 1);
    gt_queue_add(q, (void*) 2);
    gt_queue_add(q, (void*) 3);
    gt_ensure(had_err, gt_queue_get(q) == (void*) 0);
    gt_ensure(had_err, gt_queue_get(q) == (void*) 1);
    gt_queue_add(q, (void*) 4);
    gt_queue_add(q, (void*) 5);
    gt_queue_remove(q, (void*) 4);
    gt_queue_remove(q, (void*) 2);
    gt_queue_remove(q, (void*) 5);
    gt_queue_remove(q, (void*) 3);
    gt_ensure(had_err, gt_queue_size(q) == 0);
    gt_queue_delete(q);
  }

  /* delete with contents */
  /* here the elements are heap blocks from gt_calloc(); they are not freed
     individually but handed to gt_queue_delete_with_contents() */
  if (!had_err) {
    q = gt_queue_new();
    gt_ensure(had_err, !gt_queue_size(q));
    if (!had_err)
      gt_queue_add(q, gt_calloc(1, 16));
    gt_ensure(had_err, gt_queue_size(q) == 1);
    if (!had_err)
      gt_queue_add(q, gt_calloc(1, 32));
    gt_ensure(had_err, gt_queue_size(q) == 2);
    gt_queue_delete_with_contents(q);
  }

  return had_err;
}
Exemple #24
0
// Main method
int main(int argc, char * const *argv)
{
  GtError *error;
  GtLogger *logger;
  GtQueue *streams;
  GtNodeStream *stream, *last_stream;
  CanonGFF3Options options = { NULL, NULL, false };

  gt_lib_init();
  error = gt_error_new();
  canon_gff3_parse_options(argc, argv + 0, &options, error);

  streams = gt_queue_new();
  logger = gt_logger_new(true, "", stderr);

  stream = gt_gff3_in_stream_new_unsorted(argc - optind, (const char **)
                                                          argv+optind);
  gt_gff3_in_stream_check_id_attributes((GtGFF3InStream *)stream);
  gt_gff3_in_stream_enable_tidy_mode((GtGFF3InStream *)stream);
  gt_queue_add(streams, stream);
  last_stream = stream;

  if(options.infer)
  {
    GtHashmap *type_parents = gt_hashmap_new(GT_HASH_STRING, gt_free_func,
                                             gt_free_func);
    gt_hashmap_add(type_parents, gt_cstr_dup("mRNA"), gt_cstr_dup("gene"));
    gt_hashmap_add(type_parents, gt_cstr_dup("tRNA"), gt_cstr_dup("gene"));
    stream = agn_infer_parent_stream_new(last_stream,
                                                 type_parents);
    gt_hashmap_delete(type_parents);
    gt_queue_add(streams, stream);
    last_stream = stream;
  }

  stream = agn_gene_stream_new(last_stream, logger);
  gt_queue_add(streams, stream);
  last_stream = stream;

  if(options.source != NULL)
  {
    GtNodeVisitor *ssv = gt_set_source_visitor_new(options.source);
    stream = gt_visitor_stream_new(last_stream, ssv);
    gt_queue_add(streams, stream);
    last_stream = stream;
  }

  stream = gt_gff3_out_stream_new(last_stream, options.outstream);
  if(!options.infer)
    gt_gff3_out_stream_retain_id_attributes((GtGFF3OutStream *)stream);
  gt_queue_add(streams, stream);
  last_stream = stream;

  if(gt_node_stream_pull(last_stream, error) == -1)
  {
    fprintf(stderr, "[CanonGFF3] error processing node stream: %s",
            gt_error_get(error));
  }

  while(gt_queue_size(streams) > 0)
  {
    stream = gt_queue_get(streams);
    gt_node_stream_delete(stream);
  }
  gt_queue_delete(streams);
  if(options.source != NULL)
    gt_str_delete(options.source);
  if(options.outstream != NULL)
    gt_file_delete(options.outstream);
  gt_error_delete(error);
  gt_logger_delete(logger);
  gt_lib_clean();

  return 0;
}
Exemple #25
0
/*
 * Fill fb->outputbuffer with the next chunk of symbols read from the FASTA
 * files listed in fb->filenametab.
 *
 * The loop handles one event per iteration: output buffer full, opening the
 * next input file, end of the current file, a character inside a description
 * line, or a character outside of one. If fb->filelengthtab is set, the number
 * of characters read and the number of symbols added are accumulated per file.
 * If fb->descptr is set, every completed description line is appended to it
 * as a newly allocated C string.
 *
 * Returns -1 (with <err> set) if a character has no mapping in fb->symbolmap.
 *
 * NOTE(review): the end of this definition (closing of the loop and the final
 * return) is not present in this part of the file.
 */
static int advancefastabufferstate(GtFastaBuffer *fb, GtError *err)
{
  int currentchar;
  unsigned long currentoutpos = 0, currentfileadd = 0, currentfileread = 0;
  GtUchar charcode;

  gt_error_check(err);
  while (true)
  {
    /* output buffer is full: book the counters of this call and stop */
    if (currentoutpos >= (unsigned long) OUTPUTFILEBUFFERSIZE)
    {
      if (fb->filelengthtab != NULL)
      {
        fb->filelengthtab[fb->filenum].length
          += (uint64_t) currentfileread;
        fb->filelengthtab[fb->filenum].effectivelength
          += (uint64_t) currentfileadd;
      }
      break;
    }
    /* the previous iteration requested the next file: reset the per-file
       state and open the file with index fb->filenum */
    if (fb->nextfile)
    {
      if (fb->filelengthtab != NULL)
      {
        fb->filelengthtab[fb->filenum].length = 0;
        fb->filelengthtab[fb->filenum].effectivelength = 0;
      }
      fb->nextfile = false;
      fb->indesc = false;
      fb->firstseqinfile = true;
      currentfileadd = 0;
      currentfileread = 0;
      fb->linenum = (uint64_t) 1;
      fb->inputstream = gt_file_xopen(gt_str_array_get(fb->filenametab,
                                                  (unsigned long) fb->filenum),
                                         "rb");
      fb->currentinpos = 0;
      fb->currentfillpos = 0;
    } else
    {
      currentchar = ownbuffer_genfile_getc(fb,fb->inputstream);
      /* end of the current file: close it, book its counters and either
         finish (last file) or request the next one */
      if (currentchar == EOF)
      {
        gt_file_delete(fb->inputstream);
        fb->inputstream = NULL;
        if (fb->filelengthtab != NULL)
        {
          fb->filelengthtab[fb->filenum].length += currentfileread;
          fb->filelengthtab[fb->filenum].effectivelength += currentfileadd;
        }
        if ((unsigned long) fb->filenum == gt_str_array_size(fb->filenametab)-1)
        {
          fb->complete = true;
          break;
        }
        fb->filenum++;
        fb->nextfile = true;
      } else
      {
        currentfileread++;
        /* inside a description line: a newline ends it; the characters are
           only collected if the caller asked for descriptions */
        if (fb->indesc)
        {
          if (currentchar == NEWLINESYMBOL)
          {
            fb->linenum++;
            fb->indesc = false;
          }
          if (fb->descptr != NULL)
          {
            if (currentchar == NEWLINESYMBOL)
            {
              /* terminate the collected header, queue a copy of it and
                 start over with an empty header buffer */
              GT_STOREINARRAY(&fb->headerbuffer, char, 128, '\0');
              gt_queue_add(fb->descptr,
                           gt_cstr_dup(fb->headerbuffer.spacechar));
              fb->headerbuffer.nextfreechar = 0;
            } else
            {
              GT_STOREINARRAY(&fb->headerbuffer, char, 128, currentchar);
            }
          }
        } else
        {
          /* whitespace outside of description lines is ignored */
          if (!isspace((int) currentchar))
          {
            if (currentchar == FASTASEPARATOR)
            {
              /* start of a new sequence: no SEPARATOR is written before the
                 very first sequence; before the first sequence of a later
                 file a SEPARATOR is written but not counted in
                 currentfileadd */
              if (fb->firstoverallseq)
              {
                fb->firstoverallseq = false;
                fb->firstseqinfile = false;
              } else
              {
                if (fb->firstseqinfile)
                {
                  fb->firstseqinfile = false;
                } else
                {
                  currentfileadd++;
                }
                fb->outputbuffer[currentoutpos++] = (GtUchar) SEPARATOR;
                fb->lastspeciallength++;
              }
              fb->indesc = true;
            } else
            {
              /* sequence character: copied as is without a symbol map,
                 otherwise translated (unmapped characters are an error) */
              if (fb->symbolmap == NULL)
              {
                fb->outputbuffer[currentoutpos++] = (GtUchar) currentchar;
              } else
              {
                charcode = fb->symbolmap[(unsigned int) currentchar];
                if (charcode == (GtUchar) UNDEFCHAR)
                {
                  gt_error_set(err,
                            "illegal character '%c': file \"%s\", line %llu",
                            currentchar,
                            gt_str_array_get(fb->filenametab, fb->filenum),
                            (unsigned long long) fb->linenum);
                  return -1;
                }
                /* lastspeciallength counts the current run of special
                   symbols; any non-special symbol resets it */
                if (ISSPECIAL(charcode))
                {
                  fb->lastspeciallength++;
                } else
                {
                  if (fb->lastspeciallength > 0)
                  {
                    fb->lastspeciallength = 0;
                  }
                  if (fb->characterdistribution != NULL)
                  {
                    fb->characterdistribution[charcode]++;
                  }
                }
                fb->outputbuffer[currentoutpos++] = charcode;
              }
              currentfileadd++;
            }
          }
        }
      }
// Create a new GAEVAL node visitor.
//
// All cDNA_match, EST_match and nucleotide_match features delivered by
// `astream` are pulled through a temporary chain of node streams, which stores
// them in the visitor's in-memory feature index and adds match_gap features
// between them. A warning is printed to stderr if the four weights in
// `gparams` do not sum to 1.0 (within a tolerance of 0.0001).
//
// Returns the new visitor, or NULL if pulling the alignment stream failed (an
// error message is then printed to stderr). The temporary error object, type
// map, stream queue and streams are released on both paths.
GtNodeVisitor*
agn_gaeval_visitor_new(GtNodeStream *astream, AgnGaevalParams gparams)
{
  agn_assert(astream);

  // Create the node visitor
  GtNodeVisitor *nv = gt_node_visitor_create(gaeval_visitor_class());
  AgnGaevalVisitor *v = gaeval_visitor_cast(nv);
  v->alignments = gt_feature_index_memory_new();
  v->tsvout = NULL;
  v->params = gparams;

  // Check that sum of weights is 1.0
  double weights_total = gparams.alpha + gparams.beta +
                         gparams.gamma + gparams.epsilon;
  if(fabs(weights_total - 1.0) > 0.0001)
  {
    fprintf(stderr, "[AgnGaevalVisitor::agn_gaeval_visitor_new] warning: "
            "sum of weights is not 1.0 %.3lf; integrity calculations will be "
            "incorrect\n", weights_total);
  }


  // Set up node stream to load alignment features into memory
  GtQueue *streams = gt_queue_new();
  GtNodeStream *stream, *last_stream;
  GtHashmap *typestokeep = gt_hashmap_new(GT_HASH_STRING, NULL, NULL);
  gt_hashmap_add(typestokeep, "cDNA_match", "cDNA_match");
  gt_hashmap_add(typestokeep, "EST_match", "EST_match");
  gt_hashmap_add(typestokeep, "nucleotide_match", "nucleotide_match");
  stream = agn_filter_stream_new(astream, typestokeep);
  gt_queue_add(streams, stream);
  last_stream = stream;

  stream = gt_feature_out_stream_new(last_stream, v->alignments);
  gt_queue_add(streams, stream);
  last_stream = stream;

  stream = gt_inter_feature_stream_new(last_stream, "cDNA_match", "match_gap");
  gt_queue_add(streams, stream);
  last_stream = stream;

  stream = gt_inter_feature_stream_new(last_stream, "EST_match", "match_gap");
  gt_queue_add(streams, stream);
  last_stream = stream;

  stream = gt_inter_feature_stream_new(last_stream, "nucleotide_match",
                                       "match_gap");
  gt_queue_add(streams, stream);
  last_stream = stream;

  // Process the node stream
  GtError *error = gt_error_new();
  int result = gt_node_stream_pull(last_stream, error);
  if(result == -1)
  {
    fprintf(stderr, "[AEGeAn::AgnGaevalStream] error parsing alignments: %s\n",
            gt_error_get(error));
  }

  // Release the temporary objects on success and on failure alike; the error
  // message has already been printed, so the error object can go as well
  gt_error_delete(error);
  gt_hashmap_delete(typestokeep);
  while(gt_queue_size(streams) > 0)
  {
    stream = gt_queue_get(streams);
    gt_node_stream_delete(stream);
  }
  gt_queue_delete(streams);

  // A visitor without its alignments is of no use to the caller
  if(result == -1)
  {
    gt_node_visitor_delete(nv);
    return NULL;
  }

  return nv;
}
Exemple #27
0
/* Decide whether the feature node <fn> passes the selection criteria stored in
   the select visitor <nv>. A node that passes is appended to the visitor's
   node buffer, any other node is deleted. The per-visitor feature counter is
   advanced for every node seen, the gene counter only for gene nodes that pass
   the gene-specific checks. Always returns 0. */
static int select_visitor_feature_node(GtNodeVisitor *nv,
                                       GtFeatureNode *fn,
                                       GT_UNUSED GtError *err)
{
  GtSelectVisitor *sel;
  bool selected, discard;

  gt_error_check(err);
  sel = select_visitor_cast(nv);
  sel->current_feature++;

  /* an empty seqid/source in the visitor means "do not restrict"; the source
     is only looked at if the seqid already matched */
  selected = !gt_str_length(sel->seqid) ||
             !gt_str_cmp(sel->seqid,
                         gt_genome_node_get_seqid((GtGenomeNode*) fn));
  selected = selected &&
             (!gt_str_length(sel->source) ||
              !strcmp(gt_str_get(sel->source),
                      gt_feature_node_get_source(fn)));

  if (!selected)
    discard = true;
  else {
    GtRange span = gt_genome_node_get_range((GtGenomeNode*) fn);
    discard = false;
    /* gene-specific limits */
    /* XXX: we (spuriously) assume that genes are always root nodes */
    if (fn && gt_feature_node_has_type(fn, gt_ft_gene)) {
      /* the checks are tried in this order and stop at the first hit */
      discard =
        (sel->max_gene_length != GT_UNDEF_ULONG &&
         gt_range_length(&span) > sel->max_gene_length) ||
        (sel->max_gene_num != GT_UNDEF_ULONG &&
         sel->gene_num >= sel->max_gene_num) ||
        (sel->min_gene_score != GT_UNDEF_DOUBLE &&
         gt_feature_node_get_score(fn) < sel->min_gene_score) ||
        (sel->max_gene_score != GT_UNDEF_DOUBLE &&
         gt_feature_node_get_score(fn) > sel->max_gene_score) ||
        (sel->feature_num != GT_UNDEF_ULONG &&
         sel->feature_num != sel->current_feature);
      if (!discard)
        sel->gene_num++; /* this gene counts towards max_gene_num */
    }
  }

  /* remaining filters; each one is only consulted while the node is still
     a candidate */
  discard = discard
            || filter_contain_range(fn, sel->contain_range)
            || filter_overlap_range(fn, sel->overlap_range)
            || filter_strand(fn, sel->strand)
            || filter_targetstrand(fn, sel->targetstrand)
            || filter_has_CDS(fn, sel->has_CDS)
            || filter_min_average_ssp(fn, sel->min_average_splice_site_prob);

  if (discard)
    gt_genome_node_delete((GtGenomeNode*) fn);
  else
    gt_queue_add(sel->node_buffer, fn);

  return 0;
}
/* Refill the output buffer of <sb> with symbols from the FASTA input files.
   Each loop iteration handles exactly one event: output buffer full, opening
   the next file, end of the current file, a character of a description line,
   whitespace, the start of a new sequence, or a sequence character.
   Returns 0 on success, the non-zero result of process_char() if that call
   fails, and -2 (with <err> set) if no sequence start has been seen at all. */
static int gt_sequence_buffer_fasta_advance(GtSequenceBuffer *sb, GtError *err)
{
  unsigned long outpos = 0, added = 0, consumed = 0;
  GtSequenceBufferFasta *fasta;
  GtSequenceBufferMembers *m;
  int cc, rval = 0;

  gt_error_check(err);

  fasta = (GtSequenceBufferFasta*) sb;
  m = sb->pvt;
  for (;;)
  {
    /* output buffer is full: book the counters of this call and stop */
    if (outpos >= (unsigned long) OUTBUFSIZE)
    {
      if (m->filelengthtab != NULL)
      {
        m->filelengthtab[m->filenum].length += (uint64_t) consumed;
        m->filelengthtab[m->filenum].effectivelength += (uint64_t) added;
      }
      break;
    }

    /* the next input file was requested: reset per-file state and open it */
    if (fasta->nextfile)
    {
      if (m->filelengthtab != NULL)
      {
        m->filelengthtab[m->filenum].length = 0;
        m->filelengthtab[m->filenum].effectivelength = 0;
      }
      fasta->nextfile = false;
      fasta->indesc = false;
      fasta->firstseqinfile = true;
      added = 0;
      consumed = 0;
      m->linenum = (uint64_t) 1;
      m->inputstream = gt_file_xopen(
                         gt_str_array_get(m->filenametab,
                                          (unsigned long) m->filenum),
                         "rb");
      m->currentinpos = 0;
      m->currentfillpos = 0;
      continue;
    }

    cc = inlinebuf_getchar(sb, m->inputstream);

    /* end of the current file: close it, book its counters, then either
       finish (last file) or request the next file */
    if (cc == EOF)
    {
      gt_file_delete(m->inputstream);
      m->inputstream = NULL;
      if (m->filelengthtab != NULL)
      {
        m->filelengthtab[m->filenum].length += consumed;
        m->filelengthtab[m->filenum].effectivelength += added;
      }
      if ((unsigned long) m->filenum == gt_str_array_size(m->filenametab)-1)
      {
        m->complete = true;
        break;
      }
      m->filenum++;
      fasta->nextfile = true;
      continue;
    }
    consumed++;

    /* inside a description line: a newline ends it; the text is only
       collected if descriptions were requested */
    if (fasta->indesc)
    {
      if (cc == NEWLINESYMBOL)
      {
        m->linenum++;
        fasta->indesc = false;
        if (m->descptr != NULL)
        {
          gt_queue_add(m->descptr,
                       gt_cstr_dup(gt_str_get(fasta->headerbuffer)));
          gt_str_reset(fasta->headerbuffer);
        }
      }
      else if (m->descptr != NULL)
      {
        gt_str_append_char(fasta->headerbuffer, cc);
      }
      continue;
    }

    /* whitespace between sequence characters is ignored */
    if (isspace((int) cc))
      continue;

    /* start of a new sequence: every sequence but the very first one is
       preceded by a SEPARATOR, which is not counted for the first sequence
       of a file */
    if (cc == FASTASEPARATOR)
    {
      if (fasta->firstoverallseq)
      {
        fasta->firstoverallseq = false;
        fasta->firstseqinfile = false;
      }
      else
      {
        if (fasta->firstseqinfile)
          fasta->firstseqinfile = false;
        else
          added++;
        m->outbuf[outpos++] = (GtUchar) SEPARATOR;
        m->lastspeciallength++;
      }
      fasta->indesc = true;
      continue;
    }

    /* ordinary sequence character */
    rval = process_char(sb, outpos, cc, err);
    if (rval)
      return rval;
    outpos++;
    added++;
  }

  if (fasta->firstoverallseq)
  {
    gt_error_set(err,"no sequences in multiple fasta file(s) %s ...",
              gt_str_array_get(m->filenametab,0));
    return -2;
  }
  m->nextfree = outpos;
  return 0;
}
/* Handle the feature node <fn> in the add-IDs visitor <nv>.
   If the seqid of <fn> is already in the table of defined seqids, <fn> is
   passed on via the node buffer. Otherwise <fn> is stored with an
   automatically created sequence region for that seqid; the region is created
   on first use (with a warning) and its range is updated afterwards.
   If sorting is enforced, an undefined seqid is an error instead: <err> is set
   and -1 is returned. In all other cases 0 is returned. */
static int add_ids_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                        GtError *err)
{
  GtAddIDsVisitor *visitor = add_ids_visitor_cast(nv);
  GtGenomeNode *gn = (GtGenomeNode*) fn;
  const char *seqid = gt_str_get(gt_genome_node_get_seqid(gn));
  AutomaticSequenceRegion *region;
  GtRange span;
  bool circular;

  if (visitor->ensure_sorting &&
      !gt_cstr_table_get(visitor->defined_seqids, seqid)) {
    gt_error_set(err, "the file %s is not sorted (seqid \"%s\" on line %u has "
                 "not been previously introduced with a \"%s\" line)",
                 gt_genome_node_get_filename(gn), seqid,
                 gt_genome_node_get_line_number(gn),
                 GT_GFF_SEQUENCE_REGION);
    return -1;
  }

  /* seqid is known -> nothing to add, just pass the node on */
  if (gt_cstr_table_get(visitor->defined_seqids, seqid)) {
    gt_queue_add(visitor->node_buffer, fn);
    return 0;
  }

  /* determine the range to cover: for non-circular features the node's own
     range joined with the ranges of all nodes the iterator delivers */
  span = gt_genome_node_get_range(gn);
  circular = gt_feature_node_get_attribute(fn, GT_GFF_IS_CIRCULAR) != NULL;
  if (!circular) {
    GtFeatureNodeIterator *it = gt_feature_node_iterator_new(fn);
    GtFeatureNode *child;
    while ((child = gt_feature_node_iterator_next(it)) != NULL) {
      GtRange child_span = gt_genome_node_get_range((GtGenomeNode*) child);
      span = gt_range_join(&span, &child_span);
    }
    gt_feature_node_iterator_delete(it);
  }

  /* has a sequence region for this seqid been created automatically before? */
  region = gt_hashmap_get(visitor->undefined_sequence_regions, seqid);
  if (region == NULL) {
    GtStr *seqid_str;
    /* no -> warn and create one now */
    gt_warning("seqid \"%s\" on line %u in file \"%s\" has not been "
               "previously introduced with a \"%s\" line, create such a line "
               "automatically", seqid,
               gt_genome_node_get_line_number(gn),
               gt_genome_node_get_filename(gn),
               GT_GFF_SEQUENCE_REGION);
    region = automatic_sequence_region_new(circular);
    seqid_str = gt_genome_node_get_seqid(gn);
    region->sequence_region = gt_region_node_new(seqid_str, span.start,
                                                 span.end);
    gt_hashmap_add(visitor->undefined_sequence_regions, gt_str_get(seqid_str),
                   region);
  }
  else if (region->is_circular) {
    gt_assert(!circular); /* XXX */
  }
  else if (circular) {
    gt_assert(!region->is_circular); /* XXX */
    /* the region becomes circular and takes over the range of <fn> */
    region->is_circular = true;
    gt_genome_node_set_range(region->sequence_region, &span);
  }
  else {
    /* extend the existing region so that it covers the new range as well */
    GtRange known = gt_genome_node_get_range(region->sequence_region),
            merged;
    merged = gt_range_join(&span, &known);
    gt_genome_node_set_range(region->sequence_region, &merged);
  }

  gt_array_add(region->feature_nodes, fn);
  return 0;
}
Exemple #30
0
/*
 * Parse the remaining columns of one BED line from <bed_file> and build the
 * corresponding feature node.
 *
 * The seqid (column 1) is obtained via get_seqid(); columns 2 and 3 give the
 * range, for which a region is registered with the region node builder and a
 * feature node is created and appended to bed_parser->feature_nodes. The
 * optional columns 4-12 are then read strictly in order: name, score and
 * strand are stored in the feature node, thickStart/thickEnd yield a thick
 * feature, itemRgb is read but ignored, and blockCount/blockSizes/blockStarts
 * are handed to process_blocks().
 *
 * bed_parser->word and bed_parser->another_word are scratch buffers that are
 * reused from column to column, so the order of the reads below matters.
 *
 * Returns 0 on success; otherwise the non-zero result of the failing helper,
 * or -1 (with <err> set) if blockCount could not be parsed.
 */
static int bed_rest(GtBEDParser *bed_parser, GtIO *bed_file, GtError *err)
{
  GtUword block_count = 0;
  GtGenomeNode *gn = NULL;
  GtRange range;
  GtStr *seqid;
  int had_err;
  gt_error_check(err);
  /* column 1.: chrom */
  seqid = get_seqid(bed_parser);
  had_err = skip_blanks(bed_file, err);
  /* column 2.: chromStart */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    had_err = skip_blanks(bed_file, err);
  }
  /* column 3.: chromEnd */
  if (!had_err) {
    word(bed_parser->another_word, bed_file);
    /* word = chromStart, another_word = chromEnd */
    had_err = parse_bed_range(&range, bed_parser->word,
                              bed_parser->another_word, bed_parser->offset,
                              bed_file, false, err);
  }
  if (!had_err) {
    /* add region */
    gt_region_node_builder_add_region(bed_parser->region_node_builder,
                                      gt_str_get(seqid), range);
    /* create feature */
    /* a feature type set in the parser takes precedence over the default */
    gn = gt_feature_node_new(seqid,
                             bed_parser->feature_type
                             ? bed_parser->feature_type
                             : BED_FEATURE_TYPE,
                             range.start, range.end, GT_STRAND_BOTH);
    gt_queue_add(bed_parser->feature_nodes, gn);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 4.: name */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      gt_feature_node_add_attribute((GtFeatureNode*) gn, GT_GFF_NAME,
                                    gt_str_get(bed_parser->word));
    }
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 5.: score */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      bool score_is_defined;
      float score_value;
      had_err = gt_parse_score(&score_is_defined, &score_value,
                               gt_str_get(bed_parser->word),
                               gt_io_get_line_number(bed_file),
                               gt_io_get_filename(bed_file), err);
      if (!had_err && score_is_defined)
        gt_feature_node_set_score((GtFeatureNode*) gn, score_value);
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);
  /* optional column 6.: strand */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      GtStrand strand;
      had_err = gt_parse_strand(&strand, gt_str_get(bed_parser->word),
                                gt_io_get_line_number(bed_file),
                                gt_io_get_filename(bed_file), err);
      if (!had_err)
        gt_feature_node_set_strand((GtFeatureNode*) gn, strand);
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);
  /* optional column 7.: thickStart */
  /* kept in <word> until thickEnd has been read into <another_word> */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 8.: thickEnd */
  if (!had_err) {
    word(bed_parser->another_word, bed_file);
    if (gt_str_length(bed_parser->another_word)) {
      /* a thickEnd without a thickStart is treated as a programming error */
      gt_assert(gt_str_length(bed_parser->word));
      /* got a thickStart and a thickEnd -> construct corresponding feature */
      had_err = parse_bed_range(&range, bed_parser->word,
                                bed_parser->another_word, bed_parser->offset,
                                bed_file, true, err);
      /* a range with start > end does not yield a thick feature */
      if (!had_err && range.start <= range.end)
        construct_thick_feature(bed_parser, (GtFeatureNode*) gn, range);
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);
  /* optional column 9.: itemRgb */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    /* we do not use the RGB values */
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 10.: blockCount */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      if (gt_parse_uword(&block_count, gt_str_get(bed_parser->word))) {
        gt_error_set(err,
                     "file \"%s\": line "GT_WU": could not parse blockCount",
                     gt_io_get_filename(bed_file),
                     gt_io_get_line_number(bed_file));
        had_err = -1;
      }
      else {
        /* reset to parse/process blockSizes and blockStarts properly */
        gt_str_reset(bed_parser->word);
        gt_str_reset(bed_parser->another_word);
      }
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);
  /* optional column 11.: blockSizes */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 12.: blockStarts */
  if (!had_err) {
    word(bed_parser->another_word, bed_file);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* process blocks if necessary */
  /* word = blockSizes, another_word = blockStarts */
  if (!had_err && block_count) {
    had_err = process_blocks(bed_parser, (GtFeatureNode*) gn, block_count,
                             bed_parser->word, bed_parser->another_word,
                             bed_file, err);
  }
  /* the end of the line should now be reached */
  if (!had_err)
    had_err = gt_io_expect(bed_file, GT_END_OF_LINE, err);
  return had_err;
}