Пример #1
0
/* Append the next character of <obo_file> to <capture>, provided any_char()
   accepts it (<be_permissive> is passed through to any_char() unchanged).
   On acceptance the character is consumed with gt_io_next() and 0 is returned.
   On rejection this function only peeks at the input, sets <err> to a message
   describing what was found instead (end-of-file, a newline, or the offending
   character) and returns -1. */
static int proc_any_char(GtIO *obo_file, GtStr *capture, bool be_permissive,
                         GtError *err)
{
  gt_error_check(err);
  gt_assert(obo_file && capture);

  /* accepted: consume the character and store it */
  if (any_char(obo_file, be_permissive)) {
    gt_str_append_char(capture, gt_io_next(obo_file));
    return 0;
  }

  /* rejected: classify the pending input for the error message */
  if (gt_io_peek(obo_file) == GT_END_OF_FILE) {
    gt_error_set(err, "file \"%s\": line %lu: unexpected end-of-file",
                 gt_io_get_filename(obo_file),
                 gt_io_get_line_number(obo_file));
  }
  else if (gt_io_peek(obo_file) == GT_END_OF_LINE ||
           gt_io_peek(obo_file) == GT_CARRIAGE_RETURN) {
    gt_error_set(err, "file \"%s\": line %lu: unexpected newline",
                 gt_io_get_filename(obo_file),
                 gt_io_get_line_number(obo_file));
  }
  else {
    gt_error_set(err, "file \"%s\": line %lu: unexpected character '%c'",
                 gt_io_get_filename(obo_file),
                 gt_io_get_line_number(obo_file),
                 gt_io_peek(obo_file));
  }
  return -1;
}
Пример #2
0
/* Read the next character from <io> and check that it is <expected_char>.
   Returns 0 if it matches. When a newline is expected, a carriage return is
   accepted as well; a line feed directly following it is consumed too, so
   CR and CR+LF both count as one line ending. In every other case <err> is
   set to a message naming the file, the line and the mismatch, and -1 is
   returned. The character that was read is consumed either way. */
int gt_io_expect(GtIO *io, char expected_char, GtError *err)
{
  char got;
  gt_error_check(err);

  got = gt_io_next(io);
  if (got == expected_char)
    return 0;

  /* tolerate CR and CR+LF where a plain newline was expected */
  if (got == GT_CARRIAGE_RETURN && expected_char == GT_END_OF_LINE) {
    if (gt_io_peek(io) == GT_END_OF_LINE)
      gt_io_next(io);
    return 0;
  }

  /* genuine mismatch: pick the message that reads best */
  if (expected_char == GT_END_OF_FILE) {
    gt_error_set(err, "file \"%s\": line %lu: expected end-of-file, got '%c'",
                 gt_io_get_filename(io), gt_io_get_line_number(io), got);
  }
  else if (got == GT_END_OF_LINE || got == GT_CARRIAGE_RETURN) {
    gt_error_set(err, "file \"%s\": line %lu: expected character '%c', got "
                 "newline", gt_io_get_filename(io), gt_io_get_line_number(io),
                 expected_char);
  }
  else {
    gt_error_set(err, "file \"%s\": line %lu: expected character '%c', got "
                 "'%c'", gt_io_get_filename(io), gt_io_get_line_number(io),
                 expected_char, got);
  }
  return -1;
}
Пример #3
0
/* Parse the textual BED coordinates <start> and <end> into <range>.

   BED positions are 0-based with an exclusive end; the result is converted to
   1-based coordinates by incrementing the start. If <offset> is non-zero it is
   applied to the converted range via gt_range_offset().

   If <thick> is false, a range of length 0 (start > end after conversion) is
   reported as an error. If <thick> is true such a range is returned as is and
   the caller has to deal with it.

   Returns 0 on success. On failure <err> is set, a non-zero value is returned
   and <range> must not be used: it is left exactly as gt_parse_range() left
   it (on a parse error) or 1-based but without the offset applied (on a
   length-0 error). */
static int parse_bed_range(GtRange *range, GtStr *start, GtStr *end,
                           GtWord offset, GtIO *bed_file, bool thick,
                           GtError *err)
{
  int had_err;
  gt_error_check(err);
  had_err = gt_parse_range(range, gt_str_get(start), gt_str_get(end),
                           gt_io_get_line_number(bed_file),
                           gt_io_get_filename(bed_file), err);
  /* Do not post-process a range that could not be parsed: its content is not
     guaranteed here, so incrementing or offsetting it would work on garbage. */
  if (had_err)
    return had_err;
  /* BED has a weird numbering scheme: positions are 0-based, but the end
     position is not part of the feature. Transform to 1-based coordinates. */
  range->start++;
  /* Ranges defining a 'thick' region sometimes come with length 0 to
     designate that there are no thick regions. So do not fail here and
     handle that case later. */
  if (!thick && range->start > range->end) {
    gt_error_set(err, "file \"%s\": line "GT_WU": BED feature has length 0",
                 gt_io_get_filename(bed_file),
                 gt_io_get_line_number(bed_file));
    return -1;
  }
  /* NOTE(review): for a length-0 'thick' range this still passes an inverted
     range to gt_range_offset(), as before -- confirm that this is accepted. */
  if (offset)
    *range = gt_range_offset(range, offset);
  return 0;
}
Пример #4
0
/* Validate the BED columns blockSizes (<block_sizes>) and blockStarts
   (<block_starts>) against <block_count> and, if they are consistent, attach
   the corresponding block features to <fn> via create_block_features().

   Both columns must be non-empty and, after removal of a terminal comma, must
   split into exactly <block_count> comma separated fields. Note that
   <block_sizes> and <block_starts> are modified (terminal comma removed).

   Returns 0 on success; otherwise <err> is set and a non-zero value is
   returned. */
static int process_blocks(GtBEDParser *bed_parser, GtFeatureNode *fn,
                          unsigned long block_count, GtStr *block_sizes,
                          GtStr *block_starts, GtIO *bed_file, GtError *err)
{
  GtSplitter *size_splitter = NULL, *start_splitter = NULL;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(fn && block_count && block_sizes && block_starts);

  if (!gt_str_length(block_sizes)) {
    gt_error_set(err,
                 "file \"%s\": line %lu: blockCount given without blockSizes",
                 gt_io_get_filename(bed_file),
                 gt_io_get_line_number(bed_file));
    had_err = -1;
  }
  else if (!gt_str_length(block_starts)) {
    gt_error_set(err,
                 "file \"%s\": line %lu: blockCount given without blockStarts",
                 gt_io_get_filename(bed_file),
                 gt_io_get_line_number(bed_file));
    had_err = -1;
  }
  else {
    /* remove terminal commas found in real-world BED files */
    remove_terminal_comma(block_sizes);
    remove_terminal_comma(block_starts);

    /* the sizes column is split and checked first ... */
    size_splitter = gt_splitter_new();
    gt_splitter_split(size_splitter, gt_str_get(block_sizes),
                      gt_str_length(block_sizes), ',');
    if (gt_splitter_size(size_splitter) != block_count) {
      gt_error_set(err, "file \"%s\": line %lu: blockSizes column does not "
                        "have blockCount=%lu many comma separated fields",
                   gt_io_get_filename(bed_file),
                   gt_io_get_line_number(bed_file), block_count);
      had_err = -1;
    }
    else {
      /* ... and the starts column is only looked at if the sizes were fine */
      start_splitter = gt_splitter_new();
      gt_splitter_split(start_splitter, gt_str_get(block_starts),
                        gt_str_length(block_starts), ',');
      if (gt_splitter_size(start_splitter) != block_count) {
        gt_error_set(err, "file \"%s\": line %lu: blockStarts column does not "
                          "have blockCount=%lu many comma separated fields",
                     gt_io_get_filename(bed_file),
                     gt_io_get_line_number(bed_file), block_count);
        had_err = -1;
      }
      else {
        had_err = create_block_features(bed_parser, fn, block_count,
                                        size_splitter, start_splitter,
                                        bed_file, err);
      }
    }
  }

  /* either splitter may still be NULL at this point */
  gt_splitter_delete(start_splitter);
  gt_splitter_delete(size_splitter);
  return had_err;
}
Пример #5
0
/* Create one child feature of <fn> for each of the <block_count> BED blocks.

   The i-th token of <size_splitter> is the block size, the i-th token of
   <start_splitter> the block start relative to the start of <fn>. Each block
   gets the sequence id, strand, score and (if present) the Name attribute of
   <fn>; its type is bed_parser->block_type if that is set and BED_BLOCK_TYPE
   otherwise.

   Returns 0 on success. If a token cannot be parsed as an unsigned number, or
   a block has size 0, <err> is set and -1 is returned; blocks created before
   the failing one remain attached to <fn>. */
static int create_block_features(GtBEDParser *bed_parser, GtFeatureNode *fn,
                                 GtUword block_count,
                                 GtSplitter *size_splitter,
                                 GtSplitter *start_splitter, GtIO *bed_file,
                                 GtError *err)
{
  GtUword i;
  int had_err = 0;
  gt_assert(bed_parser && bed_file);
  gt_assert(fn && block_count && size_splitter && start_splitter);
  gt_assert(gt_splitter_size(size_splitter) == block_count);
  gt_assert(gt_splitter_size(start_splitter) == block_count);
  for (i = 0; !had_err && i < block_count; i++) {
    GtUword block_size, block_start, start, end;
    GtGenomeNode *block;
    const char *name;
    if (gt_parse_uword(&block_size, gt_splitter_get_token(size_splitter, i))) {
      gt_error_set(err,
                   "file \"%s\": line "GT_WU": could not parse blockSize '%s'",
                   gt_io_get_filename(bed_file),
                   gt_io_get_line_number(bed_file),
                   gt_splitter_get_token(size_splitter, i));
      had_err = -1;
    }
    /* A block of size 0 would yield end == start - 1, i.e. an inverted range.
       Zero-length BED features are rejected with an error elsewhere in this
       parser, so treat zero-length blocks the same way instead of handing an
       invalid range to gt_feature_node_new(). */
    if (!had_err && block_size == 0) {
      gt_error_set(err, "file \"%s\": line "GT_WU": BED block has length 0",
                   gt_io_get_filename(bed_file),
                   gt_io_get_line_number(bed_file));
      had_err = -1;
    }
    if (!had_err && gt_parse_uword(&block_start,
                                   gt_splitter_get_token(start_splitter, i))) {
      gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockStart "
                   "'%s'", gt_io_get_filename(bed_file),
                   gt_io_get_line_number(bed_file),
                   gt_splitter_get_token(start_splitter, i));
      had_err = -1;
    }
    if (!had_err) {
      /* block starts are relative to the start of the parent feature */
      start = gt_genome_node_get_start((GtGenomeNode*) fn) + block_start;
      end = start + block_size - 1;
      block = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) fn),
                                  bed_parser->block_type
                                  ? bed_parser->block_type
                                  : BED_BLOCK_TYPE,
                                  start, end, gt_feature_node_get_strand(fn));
      /* propagate name, score and strand of the parent to the block */
      if ((name = gt_feature_node_get_attribute(fn, GT_GFF_NAME))) {
        gt_feature_node_add_attribute((GtFeatureNode*) block, GT_GFF_NAME,
                                      name);
      }
      gt_feature_node_set_score((GtFeatureNode*) block,
                                gt_feature_node_get_score(fn));
      gt_feature_node_set_strand((GtFeatureNode*) block,
                                 gt_feature_node_get_strand(fn));
      gt_feature_node_add_child(fn, (GtFeatureNode*) block);
    }
  }
  return had_err;
}
Пример #6
0
/* Parse one stanza from <obo_file> and add it to <obo_parse_tree>.

   The stanza starts with a stanza line (parsed by stanza_line(), which yields
   the stanza type) and is followed by any number of tag lines and comment
   lines. The stanza object is handed to the parse tree right after creation;
   every successfully parsed tag/value pair is then added to it.

   Returns 0 on success; otherwise <err> is set and a non-zero value is
   returned. A tag line which failed to parse is not added to the stanza. */
static int stanza(GtOBOParseTree *obo_parse_tree, GtIO *obo_file, GtError *err)
{
  unsigned long stanza_line_number;
  int had_err;
  GtStr *type, *tag, *value;
  gt_error_check(err);
  gt_assert(obo_parse_tree && obo_file);
  type = gt_str_new();
  tag = gt_str_new();
  value = gt_str_new();
  /* remember where the stanza begins before the stanza line is consumed */
  stanza_line_number = gt_io_get_line_number(obo_file);
  had_err = stanza_line(obo_file, type, err);
  if (!had_err) {
    GtOBOStanza *obo_stanza =
      gt_obo_stanza_new(gt_str_get(type), stanza_line_number,
                        gt_io_get_filename_str(obo_file));
    gt_obo_parse_tree_add_stanza(obo_parse_tree, obo_stanza);
    /* keep going as long as a tag line or a comment line follows */
    while (!had_err &&
           (any_char(obo_file, false) ||
            gt_io_peek(obo_file) == OBO_COMMENT_CHAR)) {
      gt_str_reset(tag);
      gt_str_reset(value);
      if (gt_io_peek(obo_file) == OBO_COMMENT_CHAR)
        had_err = comment_line(obo_file, err);
      else {
        had_err = tag_line(obo_file, tag, value, err);
        /* only store completely parsed pairs; after a failure <tag> and
           <value> may hold partial content */
        if (!had_err)
          gt_obo_stanza_add(obo_stanza, gt_str_get(tag), gt_str_get(value));
      }
    }
  }
  gt_str_delete(value);
  gt_str_delete(tag);
  gt_str_delete(type);
  return had_err;
}
/* Parse one entry from <xrf_abbr_file> and add it to <xrf_abbr_parse_tree>.

   An entry object is created for the current line number and file name and
   handed to the parse tree right away. Then tag lines and comment lines are
   consumed for as long as one of them follows; every successfully parsed
   tag/value pair is added to the entry.

   Returns 0 on success; otherwise <err> is set and a non-zero value is
   returned. A tag line which failed to parse is not added to the entry. */
static int gt_xrf_abbr_parse_tree_entry(GtXRFAbbrParseTree *xrf_abbr_parse_tree,
                 GtIO *xrf_abbr_file, GtError *err)
{
  GtUword entry_line_number;
  int had_err = 0;
  GtStr *tag, *value;
  GtXRFAbbrEntry *xrf_abbr_entry;
  gt_error_check(err);
  gt_assert(xrf_abbr_parse_tree && xrf_abbr_file);
  tag = gt_str_new();
  value = gt_str_new();
  entry_line_number = gt_io_get_line_number(xrf_abbr_file);
  xrf_abbr_entry =
    gt_xrf_abbr_entry_new(entry_line_number,
                          gt_io_get_filename_str(xrf_abbr_file));
  gt_xrf_abbr_parse_tree_add_entry(xrf_abbr_parse_tree, xrf_abbr_entry);
  /* keep going as long as a tag line or a comment line follows */
  while (!had_err &&
         (gt_xrf_abbr_parse_tree_any_char(xrf_abbr_file, false) ||
          gt_io_peek(xrf_abbr_file) == XRF_COMMENT_CHAR)) {
    gt_str_reset(tag);
    gt_str_reset(value);
    if (gt_io_peek(xrf_abbr_file) == XRF_COMMENT_CHAR)
      had_err = gt_xrf_abbr_parse_tree_comment_line(xrf_abbr_file, err);
    else {
      had_err = gt_xrf_abbr_parse_tree_tag_line(xrf_abbr_file, tag, value,
                                                err);
      /* only store completely parsed pairs; after a failure <tag> and
         <value> may hold partial content */
      if (!had_err) {
        gt_xrf_abbr_entry_add(xrf_abbr_entry, gt_str_get(tag),
                              gt_str_get(value));
      }
    }
  }
  gt_str_delete(value);
  gt_str_delete(tag);
  return had_err;
}
Пример #8
0
/* Parse the textual BED coordinates <start> and <end> into <range>.

   BED positions are 0-based with an exclusive end; the result is converted to
   1-based coordinates by incrementing the start. A range of length 0
   (start > end after conversion) is reported as an error. If <offset> is
   non-zero it is applied to the converted range via gt_range_offset().

   Returns 0 on success. On failure <err> is set, a non-zero value is returned
   and <range> must not be used: it is left exactly as gt_parse_range() left
   it (on a parse error) or 1-based but without the offset applied (on a
   length-0 error). */
static int parse_bed_range(GtRange *range, GtStr *start, GtStr *end,
                           long offset, GtIO *bed_file, GtError *err)
{
  int had_err;
  gt_error_check(err);
  had_err = gt_parse_range(range, gt_str_get(start), gt_str_get(end),
                           gt_io_get_line_number(bed_file),
                           gt_io_get_filename(bed_file), err);
  /* Do not post-process a range that could not be parsed: its content is not
     guaranteed here, so incrementing or offsetting it would work on garbage. */
  if (had_err)
    return had_err;
  /* BED has a weird numbering scheme: positions are 0-based, but the end
     position is not part of the feature. Transform to 1-based coordinates. */
  range->start++;
  if (range->start > range->end) {
    gt_error_set(err, "file \"%s\": line %lu: BED feature has length 0",
                 gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file));
    return -1;
  }
  if (offset)
    *range = gt_range_offset(range, offset);
  return 0;
}
Пример #9
0
/* Consume a run of one or more separator characters (as recognized by
   bed_separator()) from <bed_file>. Returns 0 if at least one separator was
   present. If the input does not start with a separator nothing is consumed,
   <err> is set and -1 is returned. */
static int skip_blanks(GtIO *bed_file, GtError *err)
{
  gt_error_check(err);

  if (bed_separator(bed_file)) {
    /* at least one separator is pending: eat it and all that follow */
    do {
      gt_io_next(bed_file);
    } while (bed_separator(bed_file));
    return 0;
  }

  gt_error_set(err, "file \"%s\": line %lu: expected blank or tabulator, got "
                    "'%c'", gt_io_get_filename(bed_file),
                    gt_io_get_line_number(bed_file), gt_io_peek(bed_file));
  return -1;
}
Пример #10
0
/* Parse the remainder of a track line: a blank separated list of
   attribute=value pairs up to the end of the line. Values may be quoted.

   The only attribute that is evaluated is OFFSET_KEYWORD, whose value is
   stored in bed_parser->offset; the offset is reset to 0 first, so a track
   line without it clears a previously set offset. All other pairs are read
   and dropped.

   Returns 0 on success; otherwise <err> is set and a non-zero value is
   returned. */
static int track_rest(GtBEDParser *bed_parser, GtIO *bed_file, GtError *err)
{
  char lookahead;
  int had_err = 0;
  gt_error_check(err);

  bed_parser->offset = 0; /* reset offset for new track line */
  if (bed_separator(bed_file)) /* skip to first attribute=value pair */
    had_err = skip_blanks(bed_file, err);

  while (!had_err) {
    /* stop at the line ending; it is consumed after the loop */
    lookahead = gt_io_peek(bed_file);
    if (lookahead == GT_END_OF_LINE || lookahead == GT_CARRIAGE_RETURN)
      break;

    /* attribute name, followed by the pair separator */
    word(bed_parser->word, bed_file);
    had_err = gt_io_expect(bed_file, PAIR_SEPARATOR, err);
    if (had_err)
      break;

    /* value, quoted or plain */
    if (gt_io_peek(bed_file) == QUOTE_CHAR) {
      had_err = quoted_word(bed_parser->another_word, bed_file, err);
      if (had_err)
        break;
    }
    else
      word(bed_parser->another_word, bed_file);

    /* the offset attribute is the only one that gets evaluated */
    if (strcmp(gt_str_get(bed_parser->word), OFFSET_KEYWORD) == 0 &&
        gt_parse_word(&bed_parser->offset,
                      gt_str_get(bed_parser->another_word))) {
      gt_error_set(err,
                   "file \"%s\": line "GT_WU": could not parse offset value "
                   "'%s'", gt_io_get_filename(bed_file),
                   gt_io_get_line_number(bed_file),
                   gt_str_get(bed_parser->another_word));
      had_err = -1;
      break;
    }

    /* skip blanks up to next attribute or end-of-line */
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }

  if (had_err)
    return had_err;
  /* the end of the line should now be reached */
  return gt_io_expect(bed_file, GT_END_OF_LINE, err);
}
Пример #11
0
/* Read sequence characters from <seqio> into the (initially empty) string
   <sequence> until FASTA_SEPARATOR is read or no further character can be
   read. Newlines, carriage returns and blanks are skipped, so files with
   LF as well as CR+LF line endings yield the same sequence.

   If the sequence ended at a FASTA_SEPARATOR, the separator is pushed back so
   that the next record can be parsed from it. Returns 0 on success. If no
   sequence character was found, <err> is set and -1 is returned (in that case
   a separator that was read is not pushed back). */
static int parse_fasta_sequence(GtStr *sequence, GtIO *seqio, GtError *err)
{
  char cc = '\0';
  int saw_separator = 0;
  gt_error_check(err);
  gt_assert(sequence && seqio);
  gt_assert(!gt_str_length(sequence));
  /* read sequence */
  while (!gt_io_get_char(seqio, &cc)) {
    if (cc == FASTA_SEPARATOR) {
      /* remember this explicitly instead of inspecting <cc> after a failed
         read */
      saw_separator = 1;
      break;
    }
    if (cc != '\n' && cc != '\r' && cc != ' ')
      gt_str_append_char(sequence, cc);
  }
  if (!gt_str_length(sequence)) {
    gt_error_set(err, "empty sequence given in line %lu",
              gt_io_get_line_number(seqio));
    return -1;
  }
  if (saw_separator)
    gt_io_unget_char(seqio, FASTA_SEPARATOR);
  return 0;
}
/* Parse one "tag<separator> value" line from <xrf_abbr_file>.

   The tag (at least one character accepted in non-permissive mode) is
   collected in <tag>; XRF_SEPARATOR_CHAR must follow. Blanks after the
   separator are skipped, then the value (at least one character accepted in
   permissive mode) is collected in <value>. The line has to end with either a
   comment or a line ending. A tag that is not a valid label only triggers a
   warning. The parsed pair is logged at the end, even if parsing failed.

   Returns 0 on success; otherwise <err> is set and a non-zero value is
   returned. */
static int gt_xrf_abbr_parse_tree_tag_line(GtIO *xrf_abbr_file, GtStr *tag,
                                           GtStr *value, GtError *err)
{
  int had_err;
  gt_error_check(err);
  gt_log_log("tag");
  gt_assert(xrf_abbr_file && tag && value);

  /* tag: one mandatory character, then as many as are accepted */
  for (;;) {
    had_err = gt_xrf_abbr_parse_tree_proc_any_char(xrf_abbr_file, tag,
                                                   false, err);
    if (had_err || !gt_xrf_abbr_parse_tree_any_char(xrf_abbr_file, false))
      break;
  }

  /* separator between tag and value */
  if (!had_err)
    had_err = gt_io_expect(xrf_abbr_file, XRF_SEPARATOR_CHAR, err);

  if (!had_err) {
    /* blanks in front of the value do not belong to it */
    while (gt_io_peek(xrf_abbr_file) == XRF_BLANK_CHAR)
      gt_io_next(xrf_abbr_file);

    /* value: one mandatory character, then as many as are accepted */
    for (;;) {
      had_err = gt_xrf_abbr_parse_tree_proc_any_char(xrf_abbr_file, value,
                                                     true, err);
      if (had_err || !gt_xrf_abbr_parse_tree_any_char(xrf_abbr_file, true))
        break;
    }
  }

  /* the line ends with a trailing comment or directly with a line ending */
  if (!had_err) {
    had_err = gt_io_peek(xrf_abbr_file) == XRF_COMMENT_CHAR
              ? gt_xrf_abbr_parse_tree_comment_line(xrf_abbr_file, err)
              : gt_io_expect(xrf_abbr_file, GT_END_OF_LINE, err);
  }

  /* unknown labels are tolerated, but the user is told about them */
  if (!had_err && !gt_xrf_abbr_parse_tree_valid_label(gt_str_get(tag))) {
    gt_warning("file \"%s\": line "GT_WU": unknown label \"%s\"",
                gt_io_get_filename(xrf_abbr_file),
                gt_io_get_line_number(xrf_abbr_file),
                gt_str_get(tag));
  }

  gt_log_log("parsed line %s/%s", gt_str_get(tag), gt_str_get(value));
  return had_err;
}
Пример #13
0
/* Parse the remainder of a BED data line (everything after the first column)
   from <bed_file> and turn it into a feature node.

   Columns 2 and 3 (chromStart, chromEnd) are mandatory; columns 4 to 12 are
   optional and are read in order, each one only being evaluated if it is
   non-empty. The scratch strings bed_parser->word and
   bed_parser->another_word are reused for every column, so the order of the
   statements below matters.

   Effects on success: the feature's range is added to
   bed_parser->region_node_builder, the new feature node is appended to
   bed_parser->feature_nodes, and name, score, strand, a thick feature and
   block features are attached to it as far as the respective columns are
   given.

   Returns 0 on success; otherwise <err> is set and a non-zero value is
   returned. Note that the feature node is queued as soon as columns 1-3 are
   parsed, so it stays in the queue if a later column fails. */
static int bed_rest(GtBEDParser *bed_parser, GtIO *bed_file, GtError *err)
{
  GtUword block_count = 0;
  GtGenomeNode *gn = NULL;
  GtRange range;
  GtStr *seqid;
  int had_err;
  gt_error_check(err);
  /* column 1.: chrom */
  /* (the sequence id is obtained via get_seqid(); presumably the column text
     itself was already read by the caller -- not visible from here) */
  seqid = get_seqid(bed_parser);
  had_err = skip_blanks(bed_file, err);
  /* column 2.: chromStart */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    had_err = skip_blanks(bed_file, err);
  }
  /* column 3.: chromEnd */
  if (!had_err) {
    word(bed_parser->another_word, bed_file);
    /* thick == false: a feature of length 0 is an error here */
    had_err = parse_bed_range(&range, bed_parser->word,
                              bed_parser->another_word, bed_parser->offset,
                              bed_file, false, err);
  }
  if (!had_err) {
    /* add region */
    gt_region_node_builder_add_region(bed_parser->region_node_builder,
                                      gt_str_get(seqid), range);
    /* create feature */
    gn = gt_feature_node_new(seqid,
                             bed_parser->feature_type
                             ? bed_parser->feature_type
                             : BED_FEATURE_TYPE,
                             range.start, range.end, GT_STRAND_BOTH);
    gt_queue_add(bed_parser->feature_nodes, gn);
    /* from here on separators are optional, since all further columns are */
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 4.: name */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      gt_feature_node_add_attribute((GtFeatureNode*) gn, GT_GFF_NAME,
                                    gt_str_get(bed_parser->word));
    }
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 5.: score */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      bool score_is_defined;
      float score_value;
      had_err = gt_parse_score(&score_is_defined, &score_value,
                               gt_str_get(bed_parser->word),
                               gt_io_get_line_number(bed_file),
                               gt_io_get_filename(bed_file), err);
      /* the score is only stored if the parser reports it as defined */
      if (!had_err && score_is_defined)
        gt_feature_node_set_score((GtFeatureNode*) gn, score_value);
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);
  /* optional column 6.: strand */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      GtStrand strand;
      had_err = gt_parse_strand(&strand, gt_str_get(bed_parser->word),
                                gt_io_get_line_number(bed_file),
                                gt_io_get_filename(bed_file), err);
      /* replaces the GT_STRAND_BOTH the feature was created with */
      if (!had_err)
        gt_feature_node_set_strand((GtFeatureNode*) gn, strand);
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);
  /* optional column 7.: thickStart */
  /* (kept in bed_parser->word until thickEnd has been read as well) */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 8.: thickEnd */
  if (!had_err) {
    word(bed_parser->another_word, bed_file);
    if (gt_str_length(bed_parser->another_word)) {
      gt_assert(gt_str_length(bed_parser->word));
      /* got a thickStart and a thickEnd -> construct corresponding feature */
      /* thick == true: a length of 0 is not an error, <range> is reused */
      had_err = parse_bed_range(&range, bed_parser->word,
                                bed_parser->another_word, bed_parser->offset,
                                bed_file, true, err);
      /* a thick range of length 0 (start > end) yields no thick feature */
      if (!had_err && range.start <= range.end)
        construct_thick_feature(bed_parser, (GtFeatureNode*) gn, range);
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);
  /* optional column 9.: itemRgb */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    /* we do not use the RGB values */
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 10.: blockCount */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      if (gt_parse_uword(&block_count, gt_str_get(bed_parser->word))) {
        gt_error_set(err,
                     "file \"%s\": line "GT_WU": could not parse blockCount",
                     gt_io_get_filename(bed_file),
                     gt_io_get_line_number(bed_file));
        had_err = -1;
      }
      else {
        /* reset to parse/process blockSizes and blockStarts properly */
        gt_str_reset(bed_parser->word);
        gt_str_reset(bed_parser->another_word);
      }
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);
  /* optional column 11.: blockSizes */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 12.: blockStarts */
  if (!had_err) {
    word(bed_parser->another_word, bed_file);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* process blocks if necessary */
  /* (a blockCount of 0, given or missing, means there is nothing to do;
     missing blockSizes/blockStarts are diagnosed inside process_blocks()) */
  if (!had_err && block_count) {
    had_err = process_blocks(bed_parser, (GtFeatureNode*) gn, block_count,
                             bed_parser->word, bed_parser->another_word,
                             bed_file, err);
  }
  /* the end of the line should now be reached */
  if (!had_err)
    had_err = gt_io_expect(bed_file, GT_END_OF_LINE, err);
  return had_err;
}