/* Consume one character from <obo_file> and append it to <capture>, provided
   any_char() accepts the upcoming input (<be_permissive> is forwarded to it).
   Returns 0 on success. Otherwise nothing is consumed, <err> is set to a
   message naming what was found instead (end-of-file, newline, or the
   offending character) and -1 is returned. */
static int proc_any_char(GtIO *obo_file, GtStr *capture, bool be_permissive,
                         GtError *err)
{
  gt_error_check(err);
  gt_assert(obo_file && capture);

  /* regular case first: take the character and store it */
  if (any_char(obo_file, be_permissive)) {
    gt_str_append_char(capture, gt_io_next(obo_file));
    return 0;
  }

  /* the upcoming input was rejected; tell the user what we saw */
  if (gt_io_peek(obo_file) == GT_END_OF_FILE) {
    gt_error_set(err, "file \"%s\": line %lu: unexpected end-of-file",
                 gt_io_get_filename(obo_file),
                 gt_io_get_line_number(obo_file));
  }
  else if (gt_io_peek(obo_file) == GT_CARRIAGE_RETURN ||
           gt_io_peek(obo_file) == GT_END_OF_LINE) {
    gt_error_set(err, "file \"%s\": line %lu: unexpected newline",
                 gt_io_get_filename(obo_file),
                 gt_io_get_line_number(obo_file));
  }
  else {
    gt_error_set(err, "file \"%s\": line %lu: unexpected character '%c'",
                 gt_io_get_filename(obo_file),
                 gt_io_get_line_number(obo_file),
                 gt_io_peek(obo_file));
  }
  return -1;
}
/* Read the next character from <io> and require it to be <expected_char>.
   When a newline is expected, a carriage return is accepted as well, and a
   line feed directly following that carriage return is consumed along with
   it. Returns 0 if the expectation was met; otherwise sets <err> and
   returns -1. The character read is consumed in either case. */
int gt_io_expect(GtIO *io, char expected_char, GtError *err)
{
  char got;
  gt_error_check(err);

  got = gt_io_next(io);
  if (got == expected_char)
    return 0;

  /* CR or CR+LF counts as the expected line end */
  if (expected_char == GT_END_OF_LINE && got == GT_CARRIAGE_RETURN) {
    if (gt_io_peek(io) == GT_END_OF_LINE)
      gt_io_next(io);
    return 0;
  }

  /* genuine mismatch: pick the message that fits what was read */
  if (expected_char == GT_END_OF_FILE) {
    gt_error_set(err, "file \"%s\": line %lu: expected end-of-file, got '%c'",
                 gt_io_get_filename(io), gt_io_get_line_number(io), got);
  }
  else if (got == GT_CARRIAGE_RETURN || got == GT_END_OF_LINE) {
    gt_error_set(err, "file \"%s\": line %lu: expected character '%c', got "
                 "newline", gt_io_get_filename(io), gt_io_get_line_number(io),
                 expected_char);
  }
  else {
    gt_error_set(err, "file \"%s\": line %lu: expected character '%c', got "
                 "'%c'", gt_io_get_filename(io), gt_io_get_line_number(io),
                 expected_char, got);
  }
  return -1;
}
/* Parse the BED coordinate pair <start>/<end> into <range>, convert it to
   1-based coordinates and shift it by <offset> (if non-zero).
   If <thick> is false, a range whose start exceeds its end after the
   conversion is reported as an error ("BED feature has length 0"). If <thick>
   is true such a range is returned as is, so the caller can detect and skip
   it.
   Returns 0 on success. On failure a non-zero value is returned and <range>
   is not shifted by <offset>; if the failure came from gt_parse_range() its
   return value is passed through and <range> is not modified here at all. */
static int parse_bed_range(GtRange *range, GtStr *start, GtStr *end,
                           GtWord offset, GtIO *bed_file, bool thick,
                           GtError *err)
{
  int had_err;
  gt_error_check(err);
  had_err = gt_parse_range(range, gt_str_get(start), gt_str_get(end),
                           gt_io_get_line_number(bed_file),
                           gt_io_get_filename(bed_file), err);
  /* Do not touch <range> after a failed parse: the caller may have passed
     storage which has never been initialized, so incrementing or offsetting
     it would operate on an indeterminate value. */
  if (had_err)
    return had_err;
  /* BED has a weird numbering scheme: positions are 0-based, but the end
     position is not part of the feature. Transform to 1-based coordinates. */
  range->start++;
  /* Ranges defining a 'thick' region sometimes come with length 0 to
     designate that there are no thick regions. So do not fail here and let
     the caller handle that case. */
  if (!thick && range->start > range->end) {
    gt_error_set(err, "file \"%s\": line "GT_WU": BED feature has length 0",
                 gt_io_get_filename(bed_file),
                 gt_io_get_line_number(bed_file));
    return -1;
  }
  /* NOTE(review): with <thick> set, an empty range (start > end) is still
     handed to gt_range_offset() when <offset> is non-zero -- confirm that
     helper accepts such ranges. */
  if (offset)
    *range = gt_range_offset(range, offset);
  return 0;
}
static int process_blocks(GtBEDParser *bed_parser, GtFeatureNode *fn, unsigned long block_count, GtStr *block_sizes, GtStr *block_starts, GtIO *bed_file, GtError *err) { GtSplitter *size_splitter = NULL , *start_splitter = NULL; int had_err = 0; gt_error_check(err); gt_assert(fn && block_count && block_sizes && block_starts); if (!gt_str_length(block_sizes)) { gt_error_set(err, "file \"%s\": line %lu: blockCount given without blockSizes", gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file)); had_err = -1; } if (!had_err && !gt_str_length(block_starts)) { gt_error_set(err, "file \"%s\": line %lu: blockCount given without blockStarts", gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file)); had_err = -1; } if (!had_err) { /* remove terminal commas found in real-world BED files */ remove_terminal_comma(block_sizes); remove_terminal_comma(block_starts); } if (!had_err) { size_splitter = gt_splitter_new(); gt_splitter_split(size_splitter, gt_str_get(block_sizes), gt_str_length(block_sizes), ','); if (gt_splitter_size(size_splitter) != block_count) { gt_error_set(err, "file \"%s\": line %lu: blockSizes column does not " "have blockCount=%lu many comma separated fields", gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file), block_count); had_err = -1; } } if (!had_err) { start_splitter = gt_splitter_new(); gt_splitter_split(start_splitter, gt_str_get(block_starts), gt_str_length(block_starts), ','); if (gt_splitter_size(start_splitter) != block_count) { gt_error_set(err, "file \"%s\": line %lu: blockStarts column does not " "have " "blockCount=%lu many comma separated fields", gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file), block_count); had_err = -1; } } if (!had_err) { had_err = create_block_features(bed_parser, fn, block_count, size_splitter, start_splitter, bed_file, err); } gt_splitter_delete(start_splitter); gt_splitter_delete(size_splitter); return had_err; }
static int create_block_features(GtBEDParser *bed_parser, GtFeatureNode *fn, GtUword block_count, GtSplitter *size_splitter, GtSplitter *start_splitter, GtIO *bed_file, GtError *err) { GtUword i; int had_err = 0; gt_assert(fn && block_count && size_splitter && start_splitter); gt_assert(gt_splitter_size(size_splitter) == block_count); gt_assert(gt_splitter_size(start_splitter) == block_count); for (i = 0; !had_err && i < block_count; i++) { GtUword block_size, block_start, start, end; GtGenomeNode *block; const char *name; if (gt_parse_uword(&block_size, gt_splitter_get_token(size_splitter, i))) { gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockSize '%s'", gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file), gt_splitter_get_token(size_splitter, i)); had_err = -1; } if (!had_err && gt_parse_uword(&block_start, gt_splitter_get_token(start_splitter, i))) { gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockStart " "'%s'", gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file), gt_splitter_get_token(start_splitter, i)); had_err = -1; } if (!had_err) { start = gt_genome_node_get_start((GtGenomeNode*) fn) + block_start; end = start + block_size - 1; block = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) fn), bed_parser->block_type ? bed_parser->block_type : BED_BLOCK_TYPE, start, end, gt_feature_node_get_strand(fn)); if ((name = gt_feature_node_get_attribute(fn, GT_GFF_NAME))) { gt_feature_node_add_attribute((GtFeatureNode*) block, GT_GFF_NAME, name); } gt_feature_node_set_score((GtFeatureNode*) block, gt_feature_node_get_score(fn)); gt_feature_node_set_strand((GtFeatureNode*) block, gt_feature_node_get_strand(fn)); gt_feature_node_add_child(fn, (GtFeatureNode*) block); } } return had_err; }
/* Parse the header of <obo_file>: read tag lines one after another, adding
   each tag/value pair to the header of <obo_parse_tree>, for as long as
   any_char() accepts the input following a line. At least one tag line is
   always attempted. Afterwards the collected header is validated.
   Returns 0 on success, a non-zero value (with <err> set by the failing
   step) otherwise. */
static int header(GtOBOParseTree *obo_parse_tree, GtIO *obo_file, GtError *err)
{
  GtStr *tag_buf, *value_buf;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(obo_parse_tree && obo_file);

  tag_buf = gt_str_new();
  value_buf = gt_str_new();

  for (;;) {
    /* both buffers are reused for every line */
    gt_str_reset(tag_buf);
    gt_str_reset(value_buf);
    had_err = tag_line(obo_file, tag_buf, value_buf, err);
    if (had_err)
      break;
    obo_header_add(obo_parse_tree->obo_header, gt_str_get(tag_buf),
                   gt_str_get(value_buf));
    /* stop as soon as no further tag line follows */
    if (!any_char(obo_file, false))
      break;
  }

  if (!had_err) {
    had_err = obo_header_validate(obo_parse_tree->obo_header,
                                  gt_io_get_filename(obo_file), err);
  }

  gt_str_delete(value_buf);
  gt_str_delete(tag_buf);
  return had_err;
}
/* Run the recursive-descent FASTA reader over its input.
   For every entry parsed, the callbacks which are non-NULL are invoked in
   this order: <proc_description> with the description, <proc_sequence_part>
   with the complete sequence of the entry, <proc_sequence_length> with the
   sequence length. <data> is passed through to each callback. At least one
   callback must be given (asserted).
   An empty input is an error. Processing stops at the first step returning
   non-zero, and that value is returned; 0 is returned if all entries were
   processed. */
static int gt_fasta_reader_rec_run(GtFastaReader *fasta_reader,
                                   GtFastaReaderProcDescription
                                   proc_description,
                                   GtFastaReaderProcSequencePart
                                   proc_sequence_part,
                                   GtFastaReaderProcSequenceLength
                                   proc_sequence_length, void *data,
                                   GtError *err)
{
  GtFastaReaderRec *fr = gt_fasta_reader_rec_cast(fasta_reader);
  GtStr *desc_buf, *seq_buf;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(proc_description || proc_sequence_part || proc_sequence_length);

  /* buffers shared by all entries */
  desc_buf = gt_str_new();
  seq_buf = gt_str_new();

  if (!gt_io_has_char(fr->seqio)) {
    /* no input at all */
    gt_error_set(err, "sequence file \"%s\" is empty",
                 gt_io_get_filename(fr->seqio));
    had_err = -1;
  }
  else {
    /* one iteration per FASTA entry */
    while (gt_io_has_char(fr->seqio)) {
      gt_str_reset(desc_buf);
      gt_str_reset(seq_buf);

      had_err = parse_fasta_entry(desc_buf, seq_buf, fr->seqio, err);
      if (had_err)
        break;

      if (proc_description) {
        had_err = proc_description(gt_str_get(desc_buf),
                                   gt_str_length(desc_buf), data, err);
        if (had_err)
          break;
      }
      if (proc_sequence_part) {
        had_err = proc_sequence_part(gt_str_get(seq_buf),
                                     gt_str_length(seq_buf), data, err);
        if (had_err)
          break;
      }
      if (proc_sequence_length) {
        had_err = proc_sequence_length(gt_str_length(seq_buf), data, err);
        if (had_err)
          break;
      }
    }
  }

  gt_str_delete(desc_buf);
  gt_str_delete(seq_buf);
  return had_err;
}
/* Parse the BED coordinate pair <start>/<end> into <range>, convert it to
   1-based coordinates and shift it by <offset> (if non-zero).
   A range whose start exceeds its end after the conversion is reported as an
   error ("BED feature has length 0").
   Returns 0 on success. On failure a non-zero value is returned and <range>
   is not shifted by <offset>; if the failure came from gt_parse_range() its
   return value is passed through and <range> is not modified here at all. */
static int parse_bed_range(GtRange *range, GtStr *start, GtStr *end,
                           long offset, GtIO *bed_file, GtError *err)
{
  int had_err;
  gt_error_check(err);
  had_err = gt_parse_range(range, gt_str_get(start), gt_str_get(end),
                           gt_io_get_line_number(bed_file),
                           gt_io_get_filename(bed_file), err);
  /* Do not touch <range> after a failed parse: the caller may have passed
     storage which has never been initialized, so incrementing or offsetting
     it would operate on an indeterminate value. */
  if (had_err)
    return had_err;
  /* BED has a weird numbering scheme: positions are 0-based, but the end
     position is not part of the feature. Transform to 1-based coordinates. */
  range->start++;
  if (range->start > range->end) {
    gt_error_set(err, "file \"%s\": line %lu: BED feature has length 0",
                 gt_io_get_filename(bed_file),
                 gt_io_get_line_number(bed_file));
    return -1;
  }
  if (offset)
    *range = gt_range_offset(range, offset);
  return 0;
}
/* Skip a run of BED column separators in <bed_file>. At least one separator
   (as recognized by bed_separator()) has to be present: if so, characters
   are consumed until bed_separator() no longer matches and 0 is returned.
   Otherwise nothing is consumed, <err> is set and -1 is returned. */
static int skip_blanks(GtIO *bed_file, GtError *err)
{
  gt_error_check(err);

  if (bed_separator(bed_file)) {
    /* eat the whole run of separators */
    while (bed_separator(bed_file))
      gt_io_next(bed_file);
    return 0;
  }

  gt_error_set(err, "file \"%s\": line %lu: expected blank or tabulator, got "
               "'%c'", gt_io_get_filename(bed_file),
               gt_io_get_line_number(bed_file), gt_io_peek(bed_file));
  return -1;
}
/* Parse the attribute=value pairs making up the rest of a track line.
   The offset stored in <bed_parser> is reset to 0 first; if a pair with the
   key OFFSET_KEYWORD is encountered, its value is parsed into that offset.
   All other pairs are read and ignored. Values may be quoted.
   Returns 0 if the line was parsed up to and including its line end;
   otherwise <err> is set and a non-zero value is returned. */
static int track_rest(GtBEDParser *bed_parser, GtIO *bed_file, GtError *err)
{
  int had_err = 0;
  gt_error_check(err);

  /* every track line starts out without an offset */
  bed_parser->offset = 0;

  /* move to the first pair */
  if (bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);

  while (!had_err) {
    char upcoming = gt_io_peek(bed_file);
    if (upcoming == GT_END_OF_LINE || upcoming == GT_CARRIAGE_RETURN)
      break; /* no more pairs on this line */

    /* key */
    word(bed_parser->word, bed_file);
    had_err = gt_io_expect(bed_file, PAIR_SEPARATOR, err);
    if (had_err)
      break;

    /* value, quoted or plain */
    if (gt_io_peek(bed_file) == QUOTE_CHAR) {
      had_err = quoted_word(bed_parser->another_word, bed_file, err);
      if (had_err)
        break;
    }
    else
      word(bed_parser->another_word, bed_file);

    /* the offset is the only attribute we evaluate */
    if (strcmp(gt_str_get(bed_parser->word), OFFSET_KEYWORD) == 0 &&
        gt_parse_word(&bed_parser->offset,
                      gt_str_get(bed_parser->another_word))) {
      gt_error_set(err,
                   "file \"%s\": line "GT_WU": could not parse offset value "
                   "'%s'", gt_io_get_filename(bed_file),
                   gt_io_get_line_number(bed_file),
                   gt_str_get(bed_parser->another_word));
      had_err = -1;
      break;
    }

    /* advance to the next pair or to the line end */
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }

  /* nothing but the line end may be left */
  if (!had_err)
    had_err = gt_io_expect(bed_file, GT_END_OF_LINE, err);
  return had_err;
}
/* Read a FASTA description line from <seqio> into <description>.
   The first character read has to be FASTA_SEPARATOR; it is not stored.
   Everything following it up to (but excluding) the next '\n', or up to the
   point where gt_io_get_char() reports that no character could be read, is
   appended to <description>.
   Returns 0 on success, -1 (with <err> set) if the separator is missing. */
static int parse_fasta_description(GtStr *description, GtIO *seqio,
                                   GtError *err)
{
  int status;
  char ch;
  gt_error_check(err);
  gt_assert(description && seqio);

  status = gt_io_get_char(seqio, &ch);
  gt_assert(!status); /* the caller has made sure a character is available */

  if (ch == FASTA_SEPARATOR) {
    /* collect the remainder of the line */
    for (;;) {
      if (gt_io_get_char(seqio, &ch))
        break;
      if (ch == '\n')
        break;
      gt_str_append_char(description, ch);
    }
    return 0;
  }

  /* not a proper FASTA description */
  gt_error_set(err,
               "the first character of fasta file \"%s\" has to be '%c'",
               gt_io_get_filename(seqio), FASTA_SEPARATOR);
  return -1;
}
/* Parse one "tag<separator> value" line from <xrf_abbr_file>.
   The tag is collected into <tag> (non-permissive character set), followed
   by XRF_SEPARATOR_CHAR and any number of XRF_BLANK_CHAR, then the value is
   collected into <value> (permissive character set). The line has to end
   either with a comment or with a line end. An unknown tag only triggers a
   warning. The parsed tag/value pair is logged whether or not parsing
   succeeded.
   Returns 0 on success, a non-zero value (with <err> set by the failing
   step) otherwise. */
static int gt_xrf_abbr_parse_tree_tag_line(GtIO *xrf_abbr_file, GtStr *tag,
                                           GtStr *value, GtError *err)
{
  int had_err = 0;
  gt_error_check(err);
  gt_log_log("tag");
  gt_assert(xrf_abbr_file && tag && value);

  /* tag: one character at least */
  for (;;) {
    had_err = gt_xrf_abbr_parse_tree_proc_any_char(xrf_abbr_file, tag, false,
                                                   err);
    if (had_err)
      goto done;
    if (!gt_xrf_abbr_parse_tree_any_char(xrf_abbr_file, false))
      break;
  }

  /* separator and optional blanks */
  had_err = gt_io_expect(xrf_abbr_file, XRF_SEPARATOR_CHAR, err);
  if (had_err)
    goto done;
  while (gt_io_peek(xrf_abbr_file) == XRF_BLANK_CHAR)
    gt_io_next(xrf_abbr_file);

  /* value: one character at least */
  for (;;) {
    had_err = gt_xrf_abbr_parse_tree_proc_any_char(xrf_abbr_file, value, true,
                                                   err);
    if (had_err)
      goto done;
    if (!gt_xrf_abbr_parse_tree_any_char(xrf_abbr_file, true))
      break;
  }

  /* trailing comment or plain line end */
  if (gt_io_peek(xrf_abbr_file) == XRF_COMMENT_CHAR)
    had_err = gt_xrf_abbr_parse_tree_comment_line(xrf_abbr_file, err);
  else
    had_err = gt_io_expect(xrf_abbr_file, GT_END_OF_LINE, err);
  if (had_err)
    goto done;

  /* unknown labels are tolerated, but reported */
  if (!gt_xrf_abbr_parse_tree_valid_label(gt_str_get(tag))) {
    gt_warning("file \"%s\": line "GT_WU": unknown label \"%s\"",
               gt_io_get_filename(xrf_abbr_file),
               gt_io_get_line_number(xrf_abbr_file),
               gt_str_get(tag));
  }

done:
  gt_log_log("parsed line %s/%s", gt_str_get(tag), gt_str_get(value));
  return had_err;
}
/* Parse the remainder of a BED data line from <bed_file> and turn it into a
   feature node, which is appended to <bed_parser>->feature_nodes.
   Columns 1-3 (chrom, chromStart, chromEnd) are mandatory, columns 4-12 are
   optional and are processed strictly in order. The two scratch strings
   <bed_parser>->word and <bed_parser>->another_word are reused from column to
   column, so the order of the statements below matters.
   Returns 0 if the line was processed up to and including its line end;
   otherwise <err> is set by the failing step and a non-zero value is
   returned. Note that the feature node is queued right after column 3, i.e.
   before the optional columns have been checked. */
static int bed_rest(GtBEDParser *bed_parser, GtIO *bed_file, GtError *err)
{
  GtUword block_count = 0; /* stays 0 if no blockCount column is given */
  GtGenomeNode *gn = NULL;
  GtRange range; /* set by parse_bed_range(), not initialized here */
  GtStr *seqid;
  int had_err;
  gt_error_check(err);
  /* column 1.: chrom (obtained via get_seqid(); presumably the column text
     has already been read by the caller -- TODO confirm) */
  seqid = get_seqid(bed_parser);
  had_err = skip_blanks(bed_file, err);
  /* column 2.: chromStart (a separator has to follow) */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    had_err = skip_blanks(bed_file, err);
  }
  /* column 3.: chromEnd; together with chromStart this gives the range of
     the feature (thick == false: a zero-length range is an error) */
  if (!had_err) {
    word(bed_parser->another_word, bed_file);
    had_err = parse_bed_range(&range, bed_parser->word,
                              bed_parser->another_word, bed_parser->offset,
                              bed_file, false, err);
  }
  if (!had_err) {
    /* add region */
    gt_region_node_builder_add_region(bed_parser->region_node_builder,
                                      gt_str_get(seqid), range);
    /* create feature; a type configured in the parser takes precedence over
       the default type */
    gn = gt_feature_node_new(seqid,
                             bed_parser->feature_type
                             ? bed_parser->feature_type
                             : BED_FEATURE_TYPE,
                             range.start, range.end, GT_STRAND_BOTH);
    gt_queue_add(bed_parser->feature_nodes, gn);
    /* from here on separators are optional, since all remaining columns
       are */
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 4.: name (stored as GT_GFF_NAME attribute) */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      gt_feature_node_add_attribute((GtFeatureNode*) gn, GT_GFF_NAME,
                                    gt_str_get(bed_parser->word));
    }
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 5.: score (only set if gt_parse_score() reports it as
     defined) */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      bool score_is_defined;
      float score_value;
      had_err = gt_parse_score(&score_is_defined, &score_value,
                               gt_str_get(bed_parser->word),
                               gt_io_get_line_number(bed_file),
                               gt_io_get_filename(bed_file), err);
      if (!had_err && score_is_defined)
        gt_feature_node_set_score((GtFeatureNode*) gn, score_value);
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);
  /* optional column 6.: strand (replaces the GT_STRAND_BOTH set above) */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      GtStrand strand;
      had_err = gt_parse_strand(&strand, gt_str_get(bed_parser->word),
                                gt_io_get_line_number(bed_file),
                                gt_io_get_filename(bed_file), err);
      if (!had_err)
        gt_feature_node_set_strand((GtFeatureNode*) gn, strand);
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);
  /* optional column 7.: thickStart (kept in word until thickEnd was read) */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 8.: thickEnd */
  if (!had_err) {
    word(bed_parser->another_word, bed_file);
    if (gt_str_length(bed_parser->another_word)) {
      gt_assert(gt_str_length(bed_parser->word));
      /* got a thickStart and a thickEnd -> construct corresponding feature */
      had_err = parse_bed_range(&range, bed_parser->word,
                                bed_parser->another_word, bed_parser->offset,
                                bed_file, true, err);
      /* with thick == true an empty range is not an error; it just means
         that no thick feature is constructed */
      if (!had_err && range.start <= range.end)
        construct_thick_feature(bed_parser, (GtFeatureNode*) gn, range);
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);
  /* optional column 9.: itemRgb */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    /* we do not use the RGB values */
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 10.: blockCount */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      if (gt_parse_uword(&block_count, gt_str_get(bed_parser->word))) {
        gt_error_set(err,
                     "file \"%s\": line "GT_WU": could not parse blockCount",
                     gt_io_get_filename(bed_file),
                     gt_io_get_line_number(bed_file));
        had_err = -1;
      }
      else {
        /* reset to parse/process blockSizes and blockStarts properly */
        gt_str_reset(bed_parser->word);
        gt_str_reset(bed_parser->another_word);
      }
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);
  /* optional column 11.: blockSizes (kept in word for process_blocks()) */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* optional column 12.: blockStarts (kept in another_word for
     process_blocks()) */
  if (!had_err) {
    word(bed_parser->another_word, bed_file);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }
  /* process blocks if necessary, i.e. if a non-zero blockCount was given */
  if (!had_err && block_count) {
    had_err = process_blocks(bed_parser, (GtFeatureNode*) gn, block_count,
                             bed_parser->word, bed_parser->another_word,
                             bed_file, err);
  }
  /* the end of the line should now be reached */
  if (!had_err)
    had_err = gt_io_expect(bed_file, GT_END_OF_LINE, err);
  return had_err;
}