/* Visits region node <rn>: the node is either queued in the visitor's node
   buffer (possibly after clipping its range to the contain range) or deleted.
   Always returns 0. */
static int select_visitor_region_node(GtNodeVisitor *nv, GtRegionNode *rn,
                                      GT_UNUSED GtError *err)
{
  GtSelectVisitor *sv;
  GtRange clipped;
  gt_error_check(err);
  sv = select_visitor_cast(nv);

  /* a seqid was specified and it differs from the seqid of <rn> -> drop */
  if (gt_str_length(sv->seqid) &&
      gt_str_cmp(sv->seqid, gt_genome_node_get_seqid((GtGenomeNode*) rn))) {
    gt_genome_node_delete((GtGenomeNode*) rn);
    return 0;
  }

  /* no contain range was defined -> pass <rn> on unchanged */
  if (sv->contain_range.start == GT_UNDEF_ULONG) {
    gt_queue_add(sv->node_buffer, rn);
    return 0;
  }

  clipped = gt_genome_node_get_range((GtGenomeNode*) rn);
  if (!gt_range_overlap(&clipped, &sv->contain_range)) {
    /* contain range does not overlap with <rn> range -> delete <rn> */
    gt_genome_node_delete((GtGenomeNode*) rn);
    return 0;
  }

  /* an overlapping contain range was defined -> shrink <rn> to the overlap */
  clipped.start = MAX(clipped.start, sv->contain_range.start);
  clipped.end = MIN(clipped.end, sv->contain_range.end);
  gt_genome_node_set_range((GtGenomeNode*) rn, &clipped);
  gt_queue_add(sv->node_buffer, rn);
  return 0;
}
/* Resets a dirty description buffer <db>. If no start position is queued the
   buffer is emptied; otherwise, if the queued head position is non-zero, the
   bytes from that position onwards are moved to the front of the buffer and
   start position 0 is queued. Does nothing if <db> is not dirty. */
void gt_desc_buffer_reset(GtDescBuffer *db)
{
  gt_assert(db);
  if (!db->dirty)
    return;

  if (gt_queue_size(db->startqueue) == 0) {
    db->length = 0;
  }
  else if ((GtUword) gt_queue_head(db->startqueue) != 0) {
    GtUword offset = (GtUword) gt_queue_get(db->startqueue);
    const char *src = db->buf + offset;

    db->length -= offset;
    if (db->length >= offset) {
      /* source and destination may overlap */
      memmove(db->buf, src, db->length * sizeof (char));
    }
    else {
      /* source and destination are disjoint */
      memcpy(db->buf, src, db->length * sizeof (char));
    }
    gt_queue_add(db->startqueue, (void*) 0);
  }
  db->dirty = false;
}
/* Loads "data/gff3/grape-codons.gff3", runs it through the CDS inference
   stream and appends the resulting nodes to <queue>, ordered according to
   agn_genome_node_compare. A stream error is reported on stderr; whatever was
   collected up to that point is still added to <queue>. */
static void infer_cds_visitor_test_data(GtQueue *queue)
{
  const char *infile = "data/gff3/grape-codons.gff3";
  GtError *err = gt_error_new();
  GtNodeStream *in = gt_gff3_in_stream_new_unsorted(1, &infile);
  GtLogger *log;
  GtNodeStream *infer, *collect;
  GtArray *nodes;

  gt_gff3_in_stream_check_id_attributes((GtGFF3InStream *)in);
  gt_gff3_in_stream_enable_tidy_mode((GtGFF3InStream *)in);
  log = gt_logger_new(true, "", stderr);
  infer = agn_infer_cds_stream_new(in, NULL, log);
  nodes = gt_array_new( sizeof(GtFeatureNode *) );
  collect = gt_array_out_stream_new(infer, nodes, err);

  if(gt_node_stream_pull(collect, err) == -1)
  {
    fprintf(stderr, "[AgnInferCDSVisitor::infer_cds_visitor_test_data] error "
            "processing features: %s\n", gt_error_get(err));
  }

  gt_node_stream_delete(in);
  gt_node_stream_delete(infer);
  gt_node_stream_delete(collect);
  gt_logger_delete(log);

  /* sort, then reverse, so that popping from the back yields sorted order */
  gt_array_sort(nodes, (GtCompare)agn_genome_node_compare);
  gt_array_reverse(nodes);
  for(; gt_array_size(nodes) > 0; )
  {
    GtFeatureNode *node = *(GtFeatureNode **)gt_array_pop(nodes);
    gt_queue_add(queue, node);
  }

  gt_array_delete(nodes);
  gt_error_delete(err);
}
/* Visits EOF node <eofn>: it is appended to the visitor's node buffer without
   any filtering. Always returns 0. */
static int select_visitor_eof_node(GtNodeVisitor *nv, GtEOFNode *eofn,
                                   GT_UNUSED GtError *err)
{
  GtSelectVisitor *sv;
  gt_error_check(err);
  sv = select_visitor_cast(nv);
  gt_queue_add(sv->node_buffer, eofn);
  return 0;
}
/* Visits comment node <c>: it is appended unchanged to the visitor's node
   buffer. Always returns 0. */
static int add_ids_visitor_comment_node(GtNodeVisitor *nv, GtCommentNode *c,
                                        GT_UNUSED GtError *err)
{
  GtAddIDsVisitor *aiv;
  gt_error_check(err);
  aiv = add_ids_visitor_cast(nv);
  gt_queue_add(aiv->node_buffer, c);
  return 0;
}
/* Visits comment node <c>: it is appended unchanged to the visitor's node
   buffer. Always returns 0. */
static int select_visitor_comment_node(GtNodeVisitor *nv, GtCommentNode *c,
                                       GT_UNUSED GtError *err)
{
  GtSelectVisitor *sv;
  gt_error_check(err);
  sv = select_visitor_cast(nv);
  gt_queue_add(sv->node_buffer, c);
  return 0;
}
/* Visits meta node <mn>: it is appended unchanged to the visitor's node
   buffer. Always returns 0. */
static int add_ids_visitor_meta_node(GtNodeVisitor *nv, GtMetaNode *mn,
                                     GT_UNUSED GtError *err)
{
  GtAddIDsVisitor *aiv;
  gt_error_check(err);
  aiv = add_ids_visitor_cast(nv);
  gt_queue_add(aiv->node_buffer, mn);
  return 0;
}
/* Visits EOF node <eofn>: the visitor is finalized first, then <eofn> is
   appended to the node buffer. Always returns 0. */
static int add_ids_visitor_eof_node(GtNodeVisitor *nv, GtEOFNode *eofn,
                                    GT_UNUSED GtError *err)
{
  GtAddIDsVisitor *aiv;
  gt_error_check(err);
  aiv = add_ids_visitor_cast(nv);
  /* finalize before the EOF node is queued */
  gt_add_ids_visitor_finalize(nv);
  gt_queue_add(aiv->node_buffer, eofn);
  return 0;
}
static int filter_stream_next(GtNodeStream *ns, GtGenomeNode **gn, GtError *error) { AgnFilterStream *stream; GtFeatureNode *fn; int had_err; gt_error_check(error); stream = filter_stream_cast(ns); if(gt_queue_size(stream->cache) > 0) { *gn = gt_queue_get(stream->cache); return 0; } while(1) { had_err = gt_node_stream_next(stream->in_stream, gn, error); if(had_err) return had_err; if(!*gn) return 0; fn = gt_feature_node_try_cast(*gn); if(!fn) return 0; GtFeatureNode *current; GtFeatureNodeIterator *iter = gt_feature_node_iterator_new(fn); for(current = gt_feature_node_iterator_next(iter); current != NULL; current = gt_feature_node_iterator_next(iter)) { const char *type = gt_feature_node_get_type(current); bool keepfeature = false; if(gt_hashmap_get(stream->typestokeep, type) != NULL) keepfeature = true; if(keepfeature) { gt_genome_node_ref((GtGenomeNode *)current); gt_queue_add(stream->cache, current); } } gt_feature_node_iterator_delete(iter); gt_genome_node_delete((GtGenomeNode *)fn); if(gt_queue_size(stream->cache) > 0) { *gn = gt_queue_get(stream->cache); return 0; } } return 0; }
/* Hashmap iteration callback: <value> is an automatic sequence region and
   <data> the target queue. If the region holds feature nodes, its sequence
   region node followed by all its feature nodes is moved to the queue; the
   region's node pointer is cleared and its feature array reset.
   Always returns 0. */
static int add_auto_sr_to_queue(GT_UNUSED void *key, void *value, void *data,
                                GT_UNUSED GtError *err)
{
  AutomaticSequenceRegion *region = value;
  GtQueue *out = data;
  unsigned int idx = 0;
  gt_error_check(err);
  gt_assert(key && value && data);

  if (!gt_array_size(region->feature_nodes))
    return 0;

  /* the queue takes over the sequence region node */
  gt_queue_add(out, region->sequence_region);
  region->sequence_region = NULL;

  while (idx < gt_array_size(region->feature_nodes)) {
    GtGenomeNode *node = *(GtGenomeNode**)
                         gt_array_get(region->feature_nodes, idx);
    gt_queue_add(out, node);
    idx++;
  }
  gt_array_reset(region->feature_nodes);
  return 0;
}
/* Visits sequence node <sn>: the visitor is finalized and afterwards <sn> is
   appended to the node buffer. Always returns 0. */
static int add_ids_visitor_sequence_node(GtNodeVisitor *nv, GtSequenceNode *sn,
                                         GT_UNUSED GtError *err)
{
  GtAddIDsVisitor *aiv;
  gt_error_check(err);
  aiv = add_ids_visitor_cast(nv);
  /* sequence nodes have to be at the end of a stream -> finalize first */
  gt_add_ids_visitor_finalize(nv);
  /* only now may the sequence node enter the buffer */
  gt_queue_add(aiv->node_buffer, sn);
  return 0;
}
/* Visits sequence node <sn>: it is queued in the node buffer if no seqid
   filter is set or its seqid equals the filter; otherwise it is deleted.
   Always returns 0. */
static int select_visitor_sequence_node(GtNodeVisitor *nv, GtSequenceNode *sn,
                                        GT_UNUSED GtError *err)
{
  GtSelectVisitor *sv;
  gt_error_check(err);
  sv = select_visitor_cast(nv);

  /* a seqid was specified and it differs from the seqid of <sn> -> drop */
  if (gt_str_length(sv->seqid) &&
      gt_str_cmp(sv->seqid, gt_genome_node_get_seqid((GtGenomeNode*) sn))) {
    gt_genome_node_delete((GtGenomeNode*) sn);
    return 0;
  }

  gt_queue_add(sv->node_buffer, sn);
  return 0;
}
GtDescBuffer* gt_desc_buffer_new(void) { GtDescBuffer *db = gt_malloc(sizeof *db); db->buf = gt_calloc(GT_DESC_BUFFER_INIT_SIZE, sizeof (char)); db->length = 0; db->maxlength = db->curlength = 0; db->allocated = GT_DESC_BUFFER_INIT_SIZE; db->finished = false; db->dirty = true; db->shorten = false; db->seen_whitespace = false; db->reference_count = 0; db->startqueue = gt_queue_new(); gt_queue_add(db->startqueue, (void*) 0); return db; }
/* Delivers the next node of the buffer stream in <*gn>.
   While buffering, nodes are pulled from the input stream and an additional
   reference to each non-NULL node is stored in the node buffer. Otherwise
   the stored nodes are handed out one by one, followed by NULL once the
   buffer is empty. Returns the input stream's error code while buffering,
   0 otherwise. */
static int buffer_stream_next(GtNodeStream *ns, GtGenomeNode **gn,
                              GtError *err)
{
  GtBufferStream *stream;
  int rc;
  gt_error_check(err);
  stream = buffer_stream_cast(ns);

  if (!stream->buffering) {
    /* replay mode */
    if (gt_queue_size(stream->node_buffer))
      *gn = gt_queue_get(stream->node_buffer);
    else
      *gn = NULL;
    return 0;
  }

  /* recording mode */
  rc = gt_node_stream_next(stream->in_stream, gn, err);
  if (!rc && *gn)
    gt_queue_add(stream->node_buffer, gt_genome_node_ref(*gn));
  return rc;
}
/* Processes the current gene set of <sas> against the queued SNPs.
   If SNPs are queued, a SNP annotator visitor is created per gene: for all
   but the last gene the SNP queue is iterated in place; for the last gene the
   SNPs are taken out of the queue, visited and appended to the output queue.
   If no SNPs are queued, the genes are simply deleted.
   Every gene of the current gene set is deleted and the set is reset on all
   paths, including error paths, so no gene is leaked when processing fails.
   Returns 0 on success and a non-zero value (with <err> set by the failing
   callee) otherwise. On failure SNPs may remain in the SNP queue. */
static int snp_annotator_stream_process_current_gene(GtSNPAnnotatorStream *sas,
                                                     GtError *err)
{
  int had_err = 0;
  GtUword i;
  GtUword nof_genes = gt_array_size(sas->cur_gene_set);
  gt_error_check(err);

  if (gt_queue_size(sas->snps) > 0) {
    /* we need to process SNPs for a gene cluster*/
    gt_assert(gt_queue_size(sas->outqueue) == 0);
    /* the loop deliberately keeps running after an error: the remaining
       genes are not processed any more but still have to be released */
    for (i = 0; i < nof_genes; i++) {
      GtFeatureNode *gene;
      gene = *(GtFeatureNode**) gt_array_get(sas->cur_gene_set, i);
      if (!had_err) {
        GtNodeVisitor *sav;
        sav = gt_snp_annotator_visitor_new(gene, sas->tt, sas->rmap, err);
        if (!sav)
          had_err = -1;
        else {
          if (i < nof_genes-1) {
            /* more genes follow: SNPs must stay queued for them */
            had_err = gt_queue_iterate(sas->snps,
                                       snp_annotator_stream_process_snp,
                                       sav, err);
          }
          else {
            /* last gene: drain the SNP queue into the output queue */
            while (!had_err && gt_queue_size(sas->snps) > 0) {
              GtFeatureNode *snp = (GtFeatureNode*) gt_queue_get(sas->snps);
              had_err = gt_genome_node_accept((GtGenomeNode*) snp, sav, err);
              gt_queue_add(sas->outqueue, snp);
              gt_genome_node_delete((GtGenomeNode*) snp);
            }
          }
          gt_node_visitor_delete(sav);
        }
      }
      gt_genome_node_delete((GtGenomeNode*) gene);
    }
  }
  else {
    /* no SNPs for this gene cluster, delete it */
    for (i = 0; i < nof_genes; i++) {
      gt_genome_node_delete(*(GtGenomeNode**)
                            gt_array_get(sas->cur_gene_set, i));
    }
  }
  /* the SNP queue is only guaranteed to be drained on success */
  gt_assert(had_err || gt_queue_size(sas->snps) == 0);
  gt_array_reset(sas->cur_gene_set);
  return had_err;
}
/* Parses the BED file <filename> with <bed_parser> and appends the resulting
   nodes to <genome_nodes>: first the region nodes built by the region node
   builder, then all feature nodes collected by the parser. The region node
   builder is reset afterwards. The nodes are transferred regardless of the
   parse result. Returns the result of parsing the file. */
int gt_bed_parser_parse(GtBEDParser *bed_parser, GtQueue *genome_nodes,
                        const char *filename, GtError *err)
{
  GtIO *input;
  int rc;
  gt_error_check(err);
  gt_assert(bed_parser && genome_nodes);

  input = gt_io_new(filename, "r");
  rc = parse_bed_file(bed_parser, input, err);

  /* region nodes go first ... */
  gt_region_node_builder_build(bed_parser->region_node_builder, genome_nodes);
  gt_region_node_builder_reset(bed_parser->region_node_builder);

  /* ... followed by the feature nodes in the order they were collected */
  for (;;) {
    if (!gt_queue_size(bed_parser->feature_nodes))
      break;
    gt_queue_add(genome_nodes, gt_queue_get(bed_parser->feature_nodes));
  }

  gt_io_delete(input);
  return rc;
}
/* Registers <orphan> (which must carry a Parent attribute) with orphanage
   <o>. If <orphan_id> is given it is recorded in the orphan ID table, and
   each entry of the optional <missing_parents> array is recorded in the
   missing parents table; entries already present are not added again. */
void gt_orphanage_add(GtOrphanage *o, GtGenomeNode *orphan,
                      const char *orphan_id, GtStrArray *missing_parents)
{
  GtUword idx;
  gt_assert(o && orphan);
  gt_assert(gt_feature_node_get_attribute((GtFeatureNode*) orphan,
                                          GT_GFF_PARENT));
  gt_queue_add(o->orphans, orphan);

  if (orphan_id && !gt_cstr_table_get(o->orphan_ids, orphan_id))
    gt_cstr_table_add(o->orphan_ids, orphan_id);

  if (!missing_parents)
    return;

  for (idx = 0; idx < gt_str_array_size(missing_parents); idx++) {
    const char *parent_id = gt_str_array_get(missing_parents, idx);
    if (gt_cstr_table_get(o->missing_parents, parent_id))
      continue; /* already known */
    gt_cstr_table_add(o->missing_parents, parent_id);
  }
}
/* Appends character <c> to description buffer <db>.
   In shortening mode everything from the first whitespace character of the
   current description onwards is dropped. If the previous description was
   marked as finished, the current buffer length is queued as the start
   position of the new description. The buffer grows as needed. */
void gt_desc_buffer_append_char(GtDescBuffer *db, char c)
{
  gt_assert(db);
  if (db->shorten) {
    if (db->seen_whitespace)
      return;
    /* <ctype.h> functions require a value representable as unsigned char
       (or EOF); passing a negative plain char is undefined behaviour */
    if (isspace((unsigned char) c)) {
      db->seen_whitespace = true;
      return;
    }
  }
  if (db->finished) {
    /* first character of a new description: remember where it starts */
    gt_queue_add(db->startqueue, (void*) (db->length));
    db->finished = false;
  }
  if (db->length + 2 > db->allocated) {
    db->buf = gt_dynalloc(db->buf, &db->allocated,
                          (db->length + 2) * sizeof (char));
  }
  db->curlength++;
  db->buf[db->length++] = c;
}
/* Initializes <stream>: fetches the sequence IDs of its feature index, resets
   the sequence cursor and queues one region node per sequence ID in the
   region cache. Depending on <useorig> the original or the regular range of
   the sequence is used. Errors reported by the feature index are collected in
   a local error object that is discarded at the end. */
void feature_in_stream_init(GtFeatureInStream *stream)
{
  GtError *err = gt_error_new();
  GtUword idx = 0;

  stream->seqids = gt_feature_index_get_seqids(stream->fi, err);
  stream->seqindex = 0;

  while (idx < gt_str_array_size(stream->seqids)) {
    const char *name = gt_str_array_get(stream->seqids, idx);
    GtRange span;
    GtStr *name_str;
    GtGenomeNode *region;

    if (!stream->useorig)
      gt_feature_index_get_range_for_seqid(stream->fi, &span, name, err);
    else
      gt_feature_index_get_orig_range_for_seqid(stream->fi, &span, name, err);

    name_str = gt_str_new_cstr(name);
    region = gt_region_node_new(name_str, span.start, span.end);
    gt_queue_add(stream->regioncache, region);
    gt_str_delete(name_str);
    idx++;
  }

  gt_error_delete(err);
}
/* Delivers the next node of the numerically sorted GFF3 output stream.
   On the first call the complete input stream is read into the buffer, the
   buffer is sorted stably with gt_genome_node_compare_numeric_seqids and its
   nodes are moved to the output queue. Every call then takes one node from
   the queue, lets the GFF3 visitor process it and returns it in <*gn>.
   When the queue is exhausted <*gn> is set to NULL explicitly, so the end of
   the stream is signalled independently of the value <*gn> had on entry.
   Returns 0 on success, the error code of the failing callee otherwise. */
static int gff3_numsorted_out_stream_next(GtNodeStream *ns, GtGenomeNode **gn,
                                          GtError *err)
{
  GtGFF3NumsortedOutStream *gff3_out_stream;
  int had_err = 0;
  GtUword i = 0;
  gt_error_check(err);
  gff3_out_stream = gff3_numsorted_out_stream_cast(ns);

  if (!gff3_out_stream->outqueue) {
    /* first call: slurp the whole input, then sort it */
    gff3_out_stream->outqueue = gt_queue_new();
    while (!(had_err = gt_node_stream_next(gff3_out_stream->in_stream, gn,
                                           err))) {
      if (!*gn)
        break;
      gt_array_add(gff3_out_stream->buffer, *gn);
    }
    if (!had_err) {
      gt_genome_nodes_sort_stable_with_func(gff3_out_stream->buffer,
                          (GtCompare) gt_genome_node_compare_numeric_seqids);
      for (i = 0; !had_err && i < gt_array_size(gff3_out_stream->buffer);
           i++) {
        GtGenomeNode *mygn = *(GtGenomeNode**)
                             gt_array_get(gff3_out_stream->buffer, i);
        gt_queue_add(gff3_out_stream->outqueue, mygn);
      }
    }
  }

  if (gff3_out_stream->outqueue && !had_err) {
    /* default to "end of stream"; overwritten if a node is available */
    *gn = NULL;
    if (gt_queue_size(gff3_out_stream->outqueue) > 0) {
      GtGenomeNode *mygn = (GtGenomeNode*)
                           gt_queue_get(gff3_out_stream->outqueue);
      gt_assert(mygn);
      had_err = gt_genome_node_accept(mygn, gff3_out_stream->gff3_visitor,
                                      err);
      if (!had_err)
        *gn = mygn;
    }
  }
  return had_err;
}
/* Visits region node <rn>. If its seqid is registered as an undefined
   sequence region, an error is set and -1 returned. Otherwise the seqid is
   recorded as defined (unless it already is), <rn> is appended to the node
   buffer and 0 is returned. */
static int add_ids_visitor_region_node(GtNodeVisitor *nv, GtRegionNode *rn,
                                       GT_UNUSED GtError *err)
{
  GtAddIDsVisitor *visitor;
  const char *region_id;
  gt_error_check(err);
  visitor = add_ids_visitor_cast(nv);
  region_id = gt_str_get(gt_genome_node_get_seqid((GtGenomeNode*) rn));

  if (gt_hashmap_get(visitor->undefined_sequence_regions, region_id)) {
    gt_error_set(err, "genome feature with id \"%s\" has been defined before "
                 "the corresponding \"%s\" definition on line %u in file "
                 "\"%s\"", region_id, GT_GFF_SEQUENCE_REGION,
                 gt_genome_node_get_line_number((GtGenomeNode*) rn),
                 gt_genome_node_get_filename((GtGenomeNode*) rn));
    return -1;
  }

  if (!gt_cstr_table_get(visitor->defined_seqids, region_id))
    gt_cstr_table_add(visitor->defined_seqids, region_id);
  gt_queue_add(visitor->node_buffer, rn);
  return 0;
}
int gt_gtf_parser_parse(GtGTFParser *parser, GtQueue *genome_nodes, GtStr *filenamestr, GtFile *fpin, bool be_tolerant, GtError *err) { GtStr *seqid_str, *source_str, *line_buffer; char *line; size_t line_length; GtUword i, line_number = 0; GtGenomeNode *gn; GtRange range; GtPhase phase_value; GtStrand gt_strand_value; GtSplitter *splitter, *attribute_splitter; float score_value; char *seqname, *source, *feature, *start, *end, *score, *strand, *frame, *attributes, *token, *gene_id, *gene_name = NULL, *transcript_id, *transcript_name = NULL, **tokens; GtHashmap *transcript_id_hash; /* map from transcript id to array of genome nodes */ GtArray *gt_genome_node_array; ConstructionInfo cinfo; GTF_feature_type gtf_feature_type; GT_UNUSED bool gff_type_is_valid = false; const char *type = NULL; const char *filename; bool score_is_defined; int had_err = 0; gt_assert(parser && genome_nodes); gt_error_check(err); filename = gt_str_get(filenamestr); /* alloc */ line_buffer = gt_str_new(); splitter = gt_splitter_new(), attribute_splitter = gt_splitter_new(); #define HANDLE_ERROR \ if (had_err) { \ if (be_tolerant) { \ fprintf(stderr, "skipping line: %s\n", gt_error_get(err)); \ gt_error_unset(err); \ gt_str_reset(line_buffer); \ had_err = 0; \ continue; \ } \ else { \ had_err = -1; \ break; \ } \ } while (gt_str_read_next_line_generic(line_buffer, fpin) != EOF) { line = gt_str_get(line_buffer); line_length = gt_str_length(line_buffer); line_number++; had_err = 0; if (line_length == 0) { gt_warning("skipping blank line " GT_WU " in file \"%s\"", line_number, filename); } else if (line[0] == '#') { /* storing comment */ if (line_length >= 2 && line[1] == '#') gn = gt_comment_node_new(line+2); /* store '##' line as '#' line */ else gn = gt_comment_node_new(line+1); gt_genome_node_set_origin(gn, filenamestr, line_number); gt_queue_add(genome_nodes, gn); } else { /* process tab delimited GTF line */ gt_splitter_reset(splitter); gt_splitter_split(splitter, line, line_length, '\t'); 
if (gt_splitter_size(splitter) != 9UL) { gt_error_set(err, "line " GT_WU " in file \"%s\" contains " GT_WU " tab (\\t) " "separated fields instead of 9", line_number, filename, gt_splitter_size(splitter)); had_err = -1; break; } tokens = gt_splitter_get_tokens(splitter); seqname = tokens[0]; source = tokens[1]; feature = tokens[2]; start = tokens[3]; end = tokens[4]; score = tokens[5]; strand = tokens[6]; frame = tokens[7]; attributes = tokens[8]; /* parse feature */ if (GTF_feature_type_get(>f_feature_type, feature) == -1) { /* we skip unknown features */ fprintf(stderr, "skipping line " GT_WU " in file \"%s\": unknown " "feature: \"%s\"\n", line_number, filename, feature); gt_str_reset(line_buffer); continue; } /* translate into GFF3 feature type */ switch (gtf_feature_type) { case GTF_CDS: case GTF_stop_codon: gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker, gt_ft_CDS); type = gt_ft_CDS; break; case GTF_exon: gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker, gt_ft_exon); type = gt_ft_exon; } gt_assert(gff_type_is_valid); /* parse the range */ had_err = gt_parse_range(&range, start, end, line_number, filename, err); HANDLE_ERROR; /* process seqname (we have to do it here because we need the range) */ gt_region_node_builder_add_region(parser->region_node_builder, seqname, range); /* parse the score */ had_err = gt_parse_score(&score_is_defined, &score_value, score, line_number, filename, err); HANDLE_ERROR; /* parse the strand */ had_err = gt_parse_strand(>_strand_value, strand, line_number, filename, err); HANDLE_ERROR; /* parse the frame */ had_err = gt_parse_phase(&phase_value, frame, line_number, filename, err); HANDLE_ERROR; /* parse the attributes */ gt_splitter_reset(attribute_splitter); gene_id = NULL; transcript_id = NULL; gt_splitter_split(attribute_splitter, attributes, strlen(attributes), ';'); for (i = 0; i < gt_splitter_size(attribute_splitter); i++) { token = gt_splitter_get_token(attribute_splitter, i); /* skip 
leading blanks */ while (*token == ' ') token++; /* look for the two mandatory attributes */ if (strncmp(token, GENE_ID_ATTRIBUTE, strlen(GENE_ID_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(GENE_ID_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", GENE_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; gene_id = token + strlen(GENE_ID_ATTRIBUTE) + 1; } else if (strncmp(token, TRANSCRIPT_ID_ATTRIBUTE, strlen(TRANSCRIPT_ID_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(TRANSCRIPT_ID_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; transcript_id = token + strlen(TRANSCRIPT_ID_ATTRIBUTE) + 1; } else if (strncmp(token, GENE_NAME_ATTRIBUTE, strlen(GENE_NAME_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(GENE_NAME_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", GENE_NAME_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; gene_name = token + strlen(GENE_NAME_ATTRIBUTE) + 1; /* for output we want to strip quotes */ if (*gene_name == '"') gene_name++; if (gene_name[strlen(gene_name)-1] == '"') gene_name[strlen(gene_name)-1] = '\0'; } else if (strncmp(token, TRANSCRIPT_NAME_ATTRIBUTE, strlen(TRANSCRIPT_NAME_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(TRANSCRIPT_NAME_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", TRANSCRIPT_NAME_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; transcript_name = token + strlen(TRANSCRIPT_NAME_ATTRIBUTE) + 1; /* for output we want to strip quotes */ if (*transcript_name == '"') transcript_name++; if (transcript_name[strlen(transcript_name)-1] == '"') transcript_name[strlen(transcript_name)-1] = '\0'; } } /* check for the mandatory attributes */ if (!gene_id) { gt_error_set(err, "missing 
attribute \"%s\" on line " GT_WU " in file \"%s\"", GENE_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; if (!transcript_id) { gt_error_set(err, "missing attribute \"%s\" on line " GT_WU " in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; /* process the mandatory attributes */ if (!(transcript_id_hash = gt_hashmap_get(parser->gene_id_hash, gene_id))) { transcript_id_hash = gt_hashmap_new(GT_HASH_STRING, gt_free_func, (GtFree) gt_array_delete); gt_hashmap_add(parser->gene_id_hash, gt_cstr_dup(gene_id), transcript_id_hash); } gt_assert(transcript_id_hash); if (!(gt_genome_node_array = gt_hashmap_get(transcript_id_hash, transcript_id))) { gt_genome_node_array = gt_array_new(sizeof (GtGenomeNode*)); gt_hashmap_add(transcript_id_hash, gt_cstr_dup(transcript_id), gt_genome_node_array); } gt_assert(gt_genome_node_array); /* save optional gene_name and transcript_name attributes */ if (transcript_name && !gt_hashmap_get(parser->transcript_id_to_name_mapping, transcript_id)) { gt_hashmap_add(parser->transcript_id_to_name_mapping, gt_cstr_dup(transcript_id), gt_cstr_dup(transcript_name)); } if (gene_name && !gt_hashmap_get(parser->gene_id_to_name_mapping, gene_id)) { gt_hashmap_add(parser->gene_id_to_name_mapping, gt_cstr_dup(gene_id), gt_cstr_dup(gene_name)); } /* get seqid */ seqid_str = gt_hashmap_get(parser->seqid_to_str_mapping, seqname); if (!seqid_str) { seqid_str = gt_str_new_cstr(seqname); gt_hashmap_add(parser->seqid_to_str_mapping, gt_str_get(seqid_str), seqid_str); } gt_assert(seqid_str); /* construct the new feature */ gn = gt_feature_node_new(seqid_str, type, range.start, range.end, gt_strand_value); gt_genome_node_set_origin(gn, filenamestr, line_number); /* set source */ source_str = gt_hashmap_get(parser->source_to_str_mapping, source); if (!source_str) { source_str = gt_str_new_cstr(source); gt_hashmap_add(parser->source_to_str_mapping, gt_str_get(source_str), source_str); } 
gt_assert(source_str); gt_feature_node_set_source((GtFeatureNode*) gn, source_str); if (score_is_defined) gt_feature_node_set_score((GtFeatureNode*) gn, score_value); if (phase_value != GT_PHASE_UNDEFINED) gt_feature_node_set_phase((GtFeatureNode*) gn, phase_value); gt_array_add(gt_genome_node_array, gn); } gt_str_reset(line_buffer); } /* process all region nodes */ if (!had_err) gt_region_node_builder_build(parser->region_node_builder, genome_nodes); /* process all feature nodes */ cinfo.genome_nodes = genome_nodes; cinfo.gene_id_to_name_mapping = parser->gene_id_to_name_mapping; cinfo.transcript_id_to_name_mapping = parser->transcript_id_to_name_mapping; if (!had_err) { had_err = gt_hashmap_foreach(parser->gene_id_hash, construct_genes, &cinfo, err); } /* free */ gt_splitter_delete(splitter); gt_splitter_delete(attribute_splitter); gt_str_delete(line_buffer); return had_err; }
/* Unit test of the GtQueue class. The checks are chained through <had_err>:
   gt_ensure() records a failed condition in <had_err>, and the following
   sections and loops are guarded by <had_err> so they are not entered once a
   check has failed. Returns <had_err>, i.e. 0 if all checks passed.
   NOTE(review): check_queue, check_queue_reverse and fail_func are callbacks
   defined elsewhere in this file; from their use here they presumably verify
   ascending/descending element order against the passed counter and always
   report failure, respectively -- confirm against their definitions. */
int gt_queue_unit_test(GtError *err)
{
  long check_counter = 0, check_counter_reverse = 1023;
  unsigned long i;
  int had_err = 0;
  GtQueue *q;

  gt_error_check(err);

  /* without wraparound */
  q = gt_queue_new();
  gt_ensure(had_err, !gt_queue_size(q));
  /* add the values 0..1023, the size has to grow by one per element */
  for (i = 0; !had_err && i < 1024; i++) {
    gt_queue_add(q, (void*) i);
    gt_ensure(had_err, gt_queue_size(q) == i + 1);
  }
  /* forward and reverse iteration over the complete queue */
  if (!had_err)
    had_err = gt_queue_iterate(q, check_queue, &check_counter, err);
  if (!had_err) {
    had_err = gt_queue_iterate_reverse(q, check_queue_reverse,
                                       &check_counter_reverse, err);
  }
  /* iterating with fail_func has to return a non-zero value */
  gt_ensure(had_err, gt_queue_iterate(q, fail_func, NULL, NULL));
  gt_ensure(had_err, gt_queue_iterate_reverse(q, fail_func, NULL, NULL));
  /* remove the first element (value 0) explicitly */
  if (!had_err) {
    gt_queue_remove(q, (void*) 0);
    gt_ensure(had_err, gt_queue_size(q) == 1023);
  }
  /* the remaining values 1..1023 have to come out in insertion order */
  for (i = 1; !had_err && i < 1024; i++) {
    gt_ensure(had_err, gt_queue_head(q) == (void*) i);
    gt_ensure(had_err, gt_queue_get(q) == (void*) i);
    gt_ensure(had_err, gt_queue_size(q) == 1024 - i - 1);
  }
  gt_ensure(had_err, !gt_queue_size(q));
  gt_queue_delete(q);

  /* with wraparound (without full queue) */
  if (!had_err) {
    q = gt_queue_new();
    gt_ensure(had_err, !gt_queue_size(q));
    for (i = 0; !had_err && i < 1024; i++) {
      gt_queue_add(q, (void*) i);
      gt_ensure(had_err, gt_queue_size(q) == i + 1);
    }
    check_counter = 0;
    check_counter_reverse = 1023;
    if (!had_err)
      had_err = gt_queue_iterate(q, check_queue, &check_counter, err);
    gt_ensure(had_err, gt_queue_iterate(q, fail_func, NULL, NULL));
    gt_ensure(had_err, gt_queue_iterate_reverse(q, fail_func, NULL, NULL));
    if (!had_err) {
      had_err = gt_queue_iterate_reverse(q, check_queue_reverse,
                                         &check_counter_reverse, err);
    }
    /* take out the first half (0..511) ... */
    for (i = 0; !had_err && i < 512; i++) {
      gt_ensure(had_err, gt_queue_head(q) == (void*) i);
      gt_ensure(had_err, gt_queue_get(q) == (void*) i);
      gt_ensure(had_err, gt_queue_size(q) == 1024 - i - 1);
    }
    /* ... and add 512 new values (1024..1535), back to 1024 elements */
    for (i = 0; !had_err && i < 512; i++) {
      gt_queue_add(q, (void*) (i + 1024));
      gt_ensure(had_err, gt_queue_size(q) == 512 + i + 1);
    }
    /* the queue now holds 512..1535 */
    check_counter = 512;
    check_counter_reverse = 1535;
    if (!had_err)
      had_err = gt_queue_iterate(q, check_queue, &check_counter, err);
    if (!had_err) {
      had_err = gt_queue_iterate_reverse(q, check_queue_reverse,
                                         &check_counter_reverse, err);
    }
    gt_ensure(had_err, gt_queue_iterate(q, fail_func, NULL, NULL));
    gt_ensure(had_err, gt_queue_iterate_reverse(q, fail_func, NULL, NULL));
    /* remove the current first element (value 512) explicitly */
    if (!had_err) {
      gt_queue_remove(q, (void*) 512);
      gt_ensure(had_err, gt_queue_size(q) == 1023);
    }
    /* the remaining values 513..1535 have to come out in order */
    for (i = 1; !had_err && i < 1024; i++) {
      gt_ensure(had_err, gt_queue_head(q) == (void*) (512 + i));
      gt_ensure(had_err, gt_queue_get(q) == (void*) (512 + i));
      gt_ensure(had_err, gt_queue_size(q) == 1024 - i - 1);
    }
    gt_ensure(had_err, !gt_queue_size(q));
    gt_queue_delete(q);
  }

  /* with wraparound (with full queue) */
  if (!had_err) {
    q = gt_queue_new();
    gt_ensure(had_err, !gt_queue_size(q));
    for (i = 0; !had_err && i < 1024; i++) {
      gt_queue_add(q, (void*) i);
      gt_ensure(had_err, gt_queue_size(q) == i + 1);
    }
    check_counter = 0;
    check_counter_reverse = 1023;
    if (!had_err)
      had_err = gt_queue_iterate(q, check_queue, &check_counter, err);
    if (!had_err) {
      had_err = gt_queue_iterate_reverse(q, check_queue_reverse,
                                         &check_counter_reverse, err);
    }
    gt_ensure(had_err, gt_queue_iterate(q, fail_func, NULL, NULL));
    gt_ensure(had_err, gt_queue_iterate_reverse(q, fail_func, NULL, NULL));
    /* take out the first half (0..511) ... */
    for (i = 0; !had_err && i < 512; i++) {
      gt_ensure(had_err, gt_queue_head(q) == (void*) i);
      gt_ensure(had_err, gt_queue_get(q) == (void*) i);
      gt_ensure(had_err, gt_queue_size(q) == 1024 - i - 1);
    }
    /* ... and add 1024 new values (1024..2047), growing to 1536 elements */
    for (i = 0; !had_err && i < 1024; i++) {
      gt_queue_add(q, (void*) (i + 1024));
      gt_ensure(had_err, gt_queue_size(q) == 512 + i + 1);
    }
    /* the queue now holds 512..2047 */
    check_counter = 512;
    check_counter_reverse = 2047;
    if (!had_err)
      had_err = gt_queue_iterate(q, check_queue, &check_counter, err);
    if (!had_err) {
      had_err = gt_queue_iterate_reverse(q, check_queue_reverse,
                                         &check_counter_reverse, err);
    }
    gt_ensure(had_err, gt_queue_iterate(q, fail_func, NULL, NULL));
    gt_ensure(had_err, gt_queue_iterate_reverse(q, fail_func, NULL, NULL));
    /* remove the current first element (value 512) explicitly */
    if (!had_err) {
      gt_queue_remove(q, (void*) 512);
      gt_ensure(had_err, gt_queue_size(q) == 1535);
    }
    /* the remaining values 513..2047 have to come out in order */
    for (i = 1; !had_err && i < 1536; i++) {
      gt_ensure(had_err, gt_queue_head(q) == (void*) (512 + i));
      gt_ensure(had_err, gt_queue_get(q) == (void*) (512 + i));
      gt_ensure(had_err, gt_queue_size(q) == 1536 - i - 1);
    }
    gt_ensure(had_err, !gt_queue_size(q));
    gt_queue_delete(q);
  }

  /* test a corner case */
  /* alternating add/get on a queue holding at most two elements */
  if (!had_err) {
    q = gt_queue_new();
    gt_queue_add(q, (void*) 1);
    gt_ensure(had_err, gt_queue_size(q) == 1);
    if (!had_err)
      gt_queue_add(q, (void*) 1);
    gt_ensure(had_err, gt_queue_size(q) == 2);
    gt_ensure(had_err, gt_queue_get(q));
    gt_ensure(had_err, gt_queue_size(q) == 1);
    if (!had_err)
      gt_queue_add(q, (void*) 1);
    gt_ensure(had_err, gt_queue_size(q) == 2);
    gt_ensure(had_err, gt_queue_get(q));
    gt_ensure(had_err, gt_queue_size(q) == 1);
    if (!had_err)
      gt_queue_add(q, (void*) 1);
    gt_ensure(had_err, gt_queue_size(q) == 2);
    gt_ensure(had_err, gt_queue_get(q));
    gt_ensure(had_err, gt_queue_size(q) == 1);
    gt_ensure(had_err, gt_queue_get(q));
    gt_ensure(had_err, gt_queue_size(q) == 0);
    if (!had_err)
      gt_queue_add(q, (void*) 1);
    gt_ensure(had_err, gt_queue_size(q) == 1);
    gt_ensure(had_err, gt_queue_get(q));
    gt_ensure(had_err, gt_queue_size(q) == 0);
    gt_queue_delete(q);
  }

  /* gt_queue_remove() corner case */
  /* removing the only element has to leave an empty queue */
  if (!had_err) {
    q = gt_queue_new();
    gt_queue_add(q, (void*) 1);
    gt_ensure(had_err, gt_queue_size(q) == 1);
    gt_queue_remove(q, (void*) 1);
    gt_ensure(had_err, gt_queue_size(q) == 0);
    gt_queue_delete(q);
  }

  /* gt_queue_remove() corner case */
  /* removing elements in an order different from the insertion order, after
     two elements have already been taken from the front */
  if (!had_err) {
    q = gt_queue_new();
    gt_queue_add(q, (void*) 0);
    gt_queue_add(q, (void*) 1);
    gt_queue_add(q, (void*) 2);
    gt_queue_add(q, (void*) 3);
    gt_ensure(had_err, gt_queue_get(q) == (void*) 0);
    gt_ensure(had_err, gt_queue_get(q) == (void*) 1);
    gt_queue_add(q, (void*) 4);
    gt_queue_add(q, (void*) 5);
    gt_queue_remove(q, (void*) 4);
    gt_queue_remove(q, (void*) 2);
    gt_queue_remove(q, (void*) 5);
    gt_queue_remove(q, (void*) 3);
    gt_ensure(had_err, gt_queue_size(q) == 0);
    gt_queue_delete(q);
  }

  /* delete with contents */
  /* the queue holds two heap allocations which are handed to
     gt_queue_delete_with_contents() together with the queue */
  if (!had_err) {
    q = gt_queue_new();
    gt_ensure(had_err, !gt_queue_size(q));
    if (!had_err)
      gt_queue_add(q, gt_calloc(1, 16));
    gt_ensure(had_err, gt_queue_size(q) == 1);
    if (!had_err)
      gt_queue_add(q, gt_calloc(1, 32));
    gt_ensure(had_err, gt_queue_size(q) == 2);
    gt_queue_delete_with_contents(q);
  }

  return had_err;
}
// Main method int main(int argc, char * const *argv) { GtError *error; GtLogger *logger; GtQueue *streams; GtNodeStream *stream, *last_stream; CanonGFF3Options options = { NULL, NULL, false }; gt_lib_init(); error = gt_error_new(); canon_gff3_parse_options(argc, argv + 0, &options, error); streams = gt_queue_new(); logger = gt_logger_new(true, "", stderr); stream = gt_gff3_in_stream_new_unsorted(argc - optind, (const char **) argv+optind); gt_gff3_in_stream_check_id_attributes((GtGFF3InStream *)stream); gt_gff3_in_stream_enable_tidy_mode((GtGFF3InStream *)stream); gt_queue_add(streams, stream); last_stream = stream; if(options.infer) { GtHashmap *type_parents = gt_hashmap_new(GT_HASH_STRING, gt_free_func, gt_free_func); gt_hashmap_add(type_parents, gt_cstr_dup("mRNA"), gt_cstr_dup("gene")); gt_hashmap_add(type_parents, gt_cstr_dup("tRNA"), gt_cstr_dup("gene")); stream = agn_infer_parent_stream_new(last_stream, type_parents); gt_hashmap_delete(type_parents); gt_queue_add(streams, stream); last_stream = stream; } stream = agn_gene_stream_new(last_stream, logger); gt_queue_add(streams, stream); last_stream = stream; if(options.source != NULL) { GtNodeVisitor *ssv = gt_set_source_visitor_new(options.source); stream = gt_visitor_stream_new(last_stream, ssv); gt_queue_add(streams, stream); last_stream = stream; } stream = gt_gff3_out_stream_new(last_stream, options.outstream); if(!options.infer) gt_gff3_out_stream_retain_id_attributes((GtGFF3OutStream *)stream); gt_queue_add(streams, stream); last_stream = stream; if(gt_node_stream_pull(last_stream, error) == -1) { fprintf(stderr, "[CanonGFF3] error processing node stream: %s", gt_error_get(error)); } while(gt_queue_size(streams) > 0) { stream = gt_queue_get(streams); gt_node_stream_delete(stream); } gt_queue_delete(streams); if(options.source != NULL) gt_str_delete(options.source); if(options.outstream != NULL) gt_file_delete(options.outstream); gt_error_delete(error); gt_logger_delete(logger); gt_lib_clean(); return 
0; }
/* Read characters from the files listed in <fb->filenametab> and store the
   resulting symbols in <fb->outputbuffer> until OUTPUTFILEBUFFERSIZE symbols
   have been written or the last file has been read completely (in which case
   <fb->complete> is set).
   Characters of description lines are collected in <fb->headerbuffer>; if
   <fb->descptr> is not NULL, each finished description is duplicated and
   added to that queue.
   If <fb->symbolmap> is not NULL, every sequence character is translated
   through it; a character mapping to UNDEFCHAR sets <err> and makes the
   function return -1.
   NOTE(review): the definition is incomplete in this part of the file -- the
   closing braces of the outer else branch, of the while loop and of the
   function itself, as well as any code following the loop, are missing. */
static int advancefastabufferstate(GtFastaBuffer *fb, GtError *err)
{
  int currentchar;
  unsigned long currentoutpos = 0, currentfileadd = 0, currentfileread = 0;
  GtUchar charcode;
  gt_error_check(err);
  while (true) {
    /* output buffer is full -> account for the characters processed so far
       and stop */
    if (currentoutpos >= (unsigned long) OUTPUTFILEBUFFERSIZE) {
      if (fb->filelengthtab != NULL) {
        fb->filelengthtab[fb->filenum].length += (uint64_t) currentfileread;
        fb->filelengthtab[fb->filenum].effectivelength += (uint64_t) currentfileadd;
      }
      break;
    }
    if (fb->nextfile) {
      /* open the next input file and reset all per-file state */
      if (fb->filelengthtab != NULL) {
        fb->filelengthtab[fb->filenum].length = 0;
        fb->filelengthtab[fb->filenum].effectivelength = 0;
      }
      fb->nextfile = false;
      fb->indesc = false;
      fb->firstseqinfile = true;
      currentfileadd = 0;
      currentfileread = 0;
      fb->linenum = (uint64_t) 1;
      fb->inputstream = gt_file_xopen(gt_str_array_get(fb->filenametab, (unsigned long) fb->filenum), "rb");
      fb->currentinpos = 0;
      fb->currentfillpos = 0;
    } else {
      currentchar = ownbuffer_genfile_getc(fb,fb->inputstream);
      if (currentchar == EOF) {
        /* current file is exhausted -> close it and record its lengths */
        gt_file_delete(fb->inputstream);
        fb->inputstream = NULL;
        if (fb->filelengthtab != NULL) {
          fb->filelengthtab[fb->filenum].length += currentfileread;
          fb->filelengthtab[fb->filenum].effectivelength += currentfileadd;
        }
        /* that was the last file -> everything has been read */
        if ((unsigned long) fb->filenum == gt_str_array_size(fb->filenametab)-1) {
          fb->complete = true;
          break;
        }
        fb->filenum++;
        fb->nextfile = true;
      } else {
        currentfileread++;
        if (fb->indesc) {
          /* inside a description line, which ends at the next newline */
          if (currentchar == NEWLINESYMBOL) {
            fb->linenum++;
            fb->indesc = false;
          }
          if (fb->descptr != NULL) {
            if (currentchar == NEWLINESYMBOL) {
              /* terminate the collected description, hand a copy to the
                 queue and start over with an empty header buffer */
              GT_STOREINARRAY(&fb->headerbuffer, char, 128, '\0');
              gt_queue_add(fb->descptr, gt_cstr_dup(fb->headerbuffer.spacechar));
              fb->headerbuffer.nextfreechar = 0;
            } else {
              GT_STOREINARRAY(&fb->headerbuffer, char, 128, currentchar);
            }
          }
        } else {
          /* whitespace outside of description lines is ignored */
          if (!isspace((int) currentchar)) {
            if (currentchar == FASTASEPARATOR) {
              /* start of a new sequence entry; a SEPARATOR is written for
                 every entry except the very first one */
              if (fb->firstoverallseq) {
                fb->firstoverallseq = false;
                fb->firstseqinfile = false;
              } else {
                /* the separator counts towards the effective length unless
                   it precedes the first sequence of the current file */
                if (fb->firstseqinfile) {
                  fb->firstseqinfile = false;
                } else {
                  currentfileadd++;
                }
                fb->outputbuffer[currentoutpos++] = (GtUchar) SEPARATOR;
                fb->lastspeciallength++;
              }
              fb->indesc = true;
            } else {
              if (fb->symbolmap == NULL) {
                /* no symbol map -> store the character unchanged */
                fb->outputbuffer[currentoutpos++] = (GtUchar) currentchar;
              } else {
                charcode = fb->symbolmap[(unsigned int) currentchar];
                if (charcode == (GtUchar) UNDEFCHAR) {
                  gt_error_set(err, "illegal character '%c': file \"%s\", line %llu", currentchar, gt_str_array_get(fb->filenametab, fb->filenum), (unsigned long long) fb->linenum);
                  return -1;
                }
                /* keep track of the length of the current run of special
                   symbols and, if requested, of the character distribution */
                if (ISSPECIAL(charcode)) {
                  fb->lastspeciallength++;
                } else {
                  if (fb->lastspeciallength > 0) {
                    fb->lastspeciallength = 0;
                  }
                  if (fb->characterdistribution != NULL) {
                    fb->characterdistribution[charcode]++;
                  }
                }
                fb->outputbuffer[currentoutpos++] = charcode;
              }
              currentfileadd++;
            }
          }
        }
      }
// Create a GAEVAL node visitor.
//
// All cDNA_match, EST_match and nucleotide_match features delivered by
// `astream` are loaded into the visitor's in-memory feature index; additional
// inter-feature streams (outside type "match_gap") are chained behind it.
// `gparams` is copied into the visitor. A warning is printed to stderr if the
// weights alpha, beta, gamma and epsilon do not sum to 1.0.
//
// Returns the new visitor, or NULL if the alignment stream could not be
// processed (a message is printed to stderr in that case). All temporary
// objects created here (error object, type map, intermediate streams) are
// released on both the success and the failure path.
GtNodeVisitor* agn_gaeval_visitor_new(GtNodeStream *astream,
                                      AgnGaevalParams gparams)
{
  agn_assert(astream);

  // Create the node visitor
  GtNodeVisitor *nv = gt_node_visitor_create(gaeval_visitor_class());
  AgnGaevalVisitor *v = gaeval_visitor_cast(nv);
  v->alignments = gt_feature_index_memory_new();
  v->tsvout = NULL;
  v->params = gparams;

  // Check that sum of weights is 1.0
  double weights_total = gparams.alpha + gparams.beta +
                         gparams.gamma + gparams.epsilon;
  if(fabs(weights_total - 1.0) > 0.0001)
  {
    fprintf(stderr, "[AgnGaevalVisitor::agn_gaeval_visitor_new] warning: "
            "sum of weights is not 1.0 %.3lf; integrity calculations will be "
            "incorrect\n", weights_total);
  }

  // Set up node stream to load alignment features into memory
  GtQueue *streams = gt_queue_new();
  GtNodeStream *stream, *last_stream;
  GtHashmap *typestokeep = gt_hashmap_new(GT_HASH_STRING, NULL, NULL);
  gt_hashmap_add(typestokeep, "cDNA_match", "cDNA_match");
  gt_hashmap_add(typestokeep, "EST_match", "EST_match");
  gt_hashmap_add(typestokeep, "nucleotide_match", "nucleotide_match");
  stream = agn_filter_stream_new(astream, typestokeep);
  gt_queue_add(streams, stream);
  last_stream = stream;

  stream = gt_feature_out_stream_new(last_stream, v->alignments);
  gt_queue_add(streams, stream);
  last_stream = stream;

  stream = gt_inter_feature_stream_new(last_stream, "cDNA_match", "match_gap");
  gt_queue_add(streams, stream);
  last_stream = stream;

  stream = gt_inter_feature_stream_new(last_stream, "EST_match", "match_gap");
  gt_queue_add(streams, stream);
  last_stream = stream;

  stream = gt_inter_feature_stream_new(last_stream, "nucleotide_match",
                                       "match_gap");
  gt_queue_add(streams, stream);
  last_stream = stream;

  // Process the node stream
  GtError *error = gt_error_new();
  int result = gt_node_stream_pull(last_stream, error);
  if(result == -1)
  {
    // Report before the error object is released below
    fprintf(stderr, "[AEGeAn::AgnGaevalStream] error parsing alignments: %s\n",
            gt_error_get(error));
  }

  // Release temporaries on every path; previously the failure path returned
  // early and leaked the error object, the type map, the queue and its streams
  gt_error_delete(error);
  gt_hashmap_delete(typestokeep);
  while(gt_queue_size(streams) > 0)
  {
    stream = gt_queue_get(streams);
    gt_node_stream_delete(stream);
  }
  gt_queue_delete(streams);

  if(result == -1)
  {
    gt_node_visitor_delete(nv);
    return NULL;
  }
  return nv;
}
/* Decide whether the feature node <fn> passes all filters configured in the
   select visitor <nv>. A node which passes is added to the visitor's node
   buffer, every other node is deleted. Always returns 0. */
static int select_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                       GT_UNUSED GtError *err)
{
  GtSelectVisitor *fv;
  GtGenomeNode *gn = (GtGenomeNode*) fn;
  bool seqid_matches, source_matches, filter_node = false;
  gt_error_check(err);
  fv = select_visitor_cast(nv);
  fv->current_feature++;

  /* no seqid was specified or seqids are equal */
  seqid_matches = !gt_str_length(fv->seqid) ||
                  !gt_str_cmp(fv->seqid, gt_genome_node_get_seqid(gn));
  /* no source was specified or sources are equal (only checked if the seqid
     already matched) */
  source_matches = seqid_matches &&
                   (!gt_str_length(fv->source) ||
                    !strcmp(gt_str_get(fv->source),
                            gt_feature_node_get_source(fn)));

  if (!source_matches) {
    filter_node = true;
  }
  else {
    GtRange range = gt_genome_node_get_range(gn);
    /* gene specific limits */
    /* XXX: we (spuriously) assume that genes are always root nodes */
    if (fn && gt_feature_node_has_type(fn, gt_ft_gene)) {
      filter_node =
        /* enforce maximum gene length */
        (fv->max_gene_length != GT_UNDEF_ULONG &&
         gt_range_length(&range) > fv->max_gene_length) ||
        /* enforce maximum number of genes */
        (fv->max_gene_num != GT_UNDEF_ULONG &&
         fv->gene_num >= fv->max_gene_num) ||
        /* enforce minimum gene score */
        (fv->min_gene_score != GT_UNDEF_DOUBLE &&
         gt_feature_node_get_score(fn) < fv->min_gene_score) ||
        /* enforce maximum gene score */
        (fv->max_gene_score != GT_UNDEF_DOUBLE &&
         gt_feature_node_get_score(fn) > fv->max_gene_score) ||
        /* only the feature with the given number is kept */
        (fv->feature_num != GT_UNDEF_ULONG &&
         fv->feature_num != fv->current_feature);
      if (!filter_node)
        fv->gene_num++; /* gene passed filter */
    }
  }

  /* remaining filters; evaluation stops at the first one that rejects */
  filter_node = filter_node || filter_contain_range(fn, fv->contain_range);
  filter_node = filter_node || filter_overlap_range(fn, fv->overlap_range);
  filter_node = filter_node || filter_strand(fn, fv->strand);
  filter_node = filter_node || filter_targetstrand(fn, fv->targetstrand);
  filter_node = filter_node || filter_has_CDS(fn, fv->has_CDS);
  filter_node = filter_node ||
                filter_min_average_ssp(fn, fv->min_average_splice_site_prob);

  if (!filter_node)
    gt_queue_add(fv->node_buffer, fn);
  else
    gt_genome_node_delete(gn);

  return 0;
}
/* Refill the output buffer of the FASTA sequence buffer <sb>: characters are
   read from the files in the filename table until OUTBUFSIZE symbols have been
   produced or the last file has been read completely. Descriptions are
   collected in the header buffer and, if a description queue is present,
   added to it as duplicated strings.
   Returns 0 on success, the non-zero result of process_char() if that fails,
   and -2 (with <err> set) if no sequence has been seen at all. */
static int gt_sequence_buffer_fasta_advance(GtSequenceBuffer *sb,
                                            GtError *err)
{
  int currentchar, ret = 0;
  unsigned long currentoutpos = 0, currentfileadd = 0, currentfileread = 0;
  GtSequenceBufferMembers *pvt;
  GtSequenceBufferFasta *sbf;

  gt_error_check(err);

  sbf = (GtSequenceBufferFasta*) sb;
  pvt = sb->pvt;

  for (;;) {
    /* output buffer full -> record progress for the current file and stop */
    if (currentoutpos >= (unsigned long) OUTBUFSIZE) {
      if (pvt->filelengthtab != NULL) {
        pvt->filelengthtab[pvt->filenum].length
          += (uint64_t) currentfileread;
        pvt->filelengthtab[pvt->filenum].effectivelength
          += (uint64_t) currentfileadd;
      }
      break;
    }

    /* switch to the next input file and reset the per-file state */
    if (sbf->nextfile) {
      if (pvt->filelengthtab != NULL) {
        pvt->filelengthtab[pvt->filenum].length = 0;
        pvt->filelengthtab[pvt->filenum].effectivelength = 0;
      }
      sbf->nextfile = false;
      sbf->indesc = false;
      sbf->firstseqinfile = true;
      currentfileadd = 0;
      currentfileread = 0;
      pvt->linenum = (uint64_t) 1;
      pvt->inputstream = gt_file_xopen(gt_str_array_get(pvt->filenametab,
                                                (unsigned long) pvt->filenum),
                                       "rb");
      pvt->currentinpos = 0;
      pvt->currentfillpos = 0;
      continue;
    }

    currentchar = inlinebuf_getchar(sb, pvt->inputstream);

    /* end of the current file */
    if (currentchar == EOF) {
      gt_file_delete(pvt->inputstream);
      pvt->inputstream = NULL;
      if (pvt->filelengthtab != NULL) {
        pvt->filelengthtab[pvt->filenum].length += currentfileread;
        pvt->filelengthtab[pvt->filenum].effectivelength += currentfileadd;
      }
      if ((unsigned long) pvt->filenum
            == gt_str_array_size(pvt->filenametab)-1) {
        /* this was the last file */
        pvt->complete = true;
        break;
      }
      pvt->filenum++;
      sbf->nextfile = true;
      continue;
    }

    currentfileread++;

    /* inside a description line, which is terminated by a newline */
    if (sbf->indesc) {
      const bool at_newline = (currentchar == NEWLINESYMBOL);
      if (at_newline) {
        pvt->linenum++;
        sbf->indesc = false;
      }
      if (pvt->descptr != NULL) {
        if (at_newline) {
          gt_queue_add(pvt->descptr,
                       gt_cstr_dup(gt_str_get(sbf->headerbuffer)));
          gt_str_reset(sbf->headerbuffer);
        }
        else {
          gt_str_append_char(sbf->headerbuffer, currentchar);
        }
      }
      continue;
    }

    /* whitespace within sequence data is skipped */
    if (isspace((int) currentchar))
      continue;

    /* ordinary sequence character */
    if (currentchar != FASTASEPARATOR) {
      if ((ret = process_char(sb, currentoutpos, currentchar, err)))
        return ret;
      currentoutpos++;
      currentfileadd++;
      continue;
    }

    /* start of a new entry: all entries except the very first one are
       preceded by a SEPARATOR in the output */
    if (sbf->firstoverallseq) {
      sbf->firstoverallseq = false;
      sbf->firstseqinfile = false;
    }
    else {
      if (sbf->firstseqinfile)
        sbf->firstseqinfile = false;
      else
        currentfileadd++;
      pvt->outbuf[currentoutpos++] = (GtUchar) SEPARATOR;
      pvt->lastspeciallength++;
    }
    sbf->indesc = true;
  }

  if (sbf->firstoverallseq) {
    gt_error_set(err,"no sequences in multiple fasta file(s) %s ...",
                 gt_str_array_get(pvt->filenametab,0));
    return -2;
  }
  pvt->nextfree = currentoutpos;
  return 0;
}
/* Handle the feature node <fn> in the add-IDs visitor <nv>.
   If the seqid of <fn> has already been defined, <fn> is simply added to the
   node buffer. Otherwise, if sorting has to be ensured, <err> is set and -1 is
   returned. In all other cases <fn> is stored with an automatically created
   sequence region for its seqid (which is created or enlarged as necessary).
   Returns 0 unless the sorting error occurred. */
static int add_ids_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                        GtError *err)
{
  AutomaticSequenceRegion *auto_sr;
  GtAddIDsVisitor *aiv;
  GtGenomeNode *gn = (GtGenomeNode*) fn;
  const char *seqid;
  bool is_circular;
  GtRange range;

  aiv = add_ids_visitor_cast(nv);
  seqid = gt_str_get(gt_genome_node_get_seqid(gn));

  /* seqid is known -> nothing to do besides buffering the node */
  if (gt_cstr_table_get(aiv->defined_seqids, seqid)) {
    gt_queue_add(aiv->node_buffer, fn);
    return 0;
  }

  /* unknown seqid in a file which has to be sorted -> error */
  if (aiv->ensure_sorting) {
    gt_error_set(err, "the file %s is not sorted (seqid \"%s\" on line %u has "
                 "not been previously introduced with a \"%s\" line)",
                 gt_genome_node_get_filename(gn), seqid,
                 gt_genome_node_get_line_number(gn), GT_GFF_SEQUENCE_REGION);
    return -1;
  }

  /* determine the range the sequence region has to cover: for non-circular
     features the ranges of all nodes delivered by the iterator are joined */
  range = gt_genome_node_get_range(gn);
  is_circular = gt_feature_node_get_attribute(fn, GT_GFF_IS_CIRCULAR) != NULL;
  if (!is_circular) {
    GtFeatureNode *node;
    GtFeatureNodeIterator *fni = gt_feature_node_iterator_new(fn);
    while ((node = gt_feature_node_iterator_next(fni))) {
      GtRange node_range = gt_genome_node_get_range((GtGenomeNode*) node);
      range = gt_range_join(&range, &node_range);
    }
    gt_feature_node_iterator_delete(fni);
  }

  /* sequence region has not been previously introduced -> check if one has
     already been created automatically */
  auto_sr = gt_hashmap_get(aiv->undefined_sequence_regions, seqid);
  if (auto_sr) {
    if (auto_sr->is_circular) {
      gt_assert(!is_circular); /* XXX */
    }
    else if (is_circular) {
      gt_assert(!auto_sr->is_circular); /* XXX */
      auto_sr->is_circular = true;
      gt_genome_node_set_range(auto_sr->sequence_region, &range);
    }
    else {
      /* update the range of the sequence region */
      GtRange sr_range = gt_genome_node_get_range(auto_sr->sequence_region);
      GtRange joined_range = gt_range_join(&range, &sr_range);
      gt_genome_node_set_range(auto_sr->sequence_region, &joined_range);
    }
  }
  else {
    /* sequence region has not been created automatically -> do it now */
    GtStr *seqid_str;
    gt_warning("seqid \"%s\" on line %u in file \"%s\" has not been "
               "previously introduced with a \"%s\" line, create such a line "
               "automatically", seqid, gt_genome_node_get_line_number(gn),
               gt_genome_node_get_filename(gn), GT_GFF_SEQUENCE_REGION);
    auto_sr = automatic_sequence_region_new(is_circular);
    seqid_str = gt_genome_node_get_seqid(gn);
    auto_sr->sequence_region = gt_region_node_new(seqid_str, range.start,
                                                  range.end);
    gt_hashmap_add(aiv->undefined_sequence_regions, gt_str_get(seqid_str),
                   auto_sr);
  }

  /* the node is kept with its automatically created sequence region */
  gt_array_add(auto_sr->feature_nodes, fn);
  return 0;
}
/* If <bed_file> is positioned at a column separator, skip the following
   blanks and return the result of skip_blanks(); otherwise return 0. */
static int bed_rest_next_column(GtIO *bed_file, GtError *err)
{
  if (bed_separator(bed_file))
    return skip_blanks(bed_file, err);
  return 0;
}

/* Parse the remaining columns of a BED data line from <bed_file>: the three
   mandatory columns (chrom, chromStart, chromEnd) followed by up to nine
   optional ones. A feature node is created for the line and added to the
   feature node queue of <bed_parser>, and its region is registered with the
   region node builder. Parsing stops at the first error.
   Returns 0 on success and a non-zero value (with <err> set by the failing
   step) otherwise. */
static int bed_rest(GtBEDParser *bed_parser, GtIO *bed_file, GtError *err)
{
  GtUword block_count = 0;
  GtGenomeNode *gn = NULL;
  GtRange range;
  GtStr *seqid;
  int had_err;

  gt_error_check(err);

  /* column 1.: chrom */
  seqid = get_seqid(bed_parser);
  had_err = skip_blanks(bed_file, err);
  if (had_err)
    return had_err;

  /* column 2.: chromStart */
  word(bed_parser->word, bed_file);
  had_err = skip_blanks(bed_file, err);
  if (had_err)
    return had_err;

  /* column 3.: chromEnd */
  word(bed_parser->another_word, bed_file);
  had_err = parse_bed_range(&range, bed_parser->word,
                            bed_parser->another_word, bed_parser->offset,
                            bed_file, false, err);
  if (had_err)
    return had_err;

  /* add region */
  gt_region_node_builder_add_region(bed_parser->region_node_builder,
                                    gt_str_get(seqid), range);
  /* create feature */
  gn = gt_feature_node_new(seqid,
                           bed_parser->feature_type
                           ? bed_parser->feature_type
                           : BED_FEATURE_TYPE,
                           range.start, range.end, GT_STRAND_BOTH);
  gt_queue_add(bed_parser->feature_nodes, gn);
  had_err = bed_rest_next_column(bed_file, err);
  if (had_err)
    return had_err;

  /* optional column 4.: name */
  word(bed_parser->word, bed_file);
  if (gt_str_length(bed_parser->word)) {
    gt_feature_node_add_attribute((GtFeatureNode*) gn, GT_GFF_NAME,
                                  gt_str_get(bed_parser->word));
  }
  had_err = bed_rest_next_column(bed_file, err);
  if (had_err)
    return had_err;

  /* optional column 5.: score */
  word(bed_parser->word, bed_file);
  if (gt_str_length(bed_parser->word)) {
    bool score_is_defined;
    float score_value;
    had_err = gt_parse_score(&score_is_defined, &score_value,
                             gt_str_get(bed_parser->word),
                             gt_io_get_line_number(bed_file),
                             gt_io_get_filename(bed_file), err);
    if (had_err)
      return had_err;
    if (score_is_defined)
      gt_feature_node_set_score((GtFeatureNode*) gn, score_value);
  }
  had_err = bed_rest_next_column(bed_file, err);
  if (had_err)
    return had_err;

  /* optional column 6.: strand */
  word(bed_parser->word, bed_file);
  if (gt_str_length(bed_parser->word)) {
    GtStrand strand;
    had_err = gt_parse_strand(&strand, gt_str_get(bed_parser->word),
                              gt_io_get_line_number(bed_file),
                              gt_io_get_filename(bed_file), err);
    if (had_err)
      return had_err;
    gt_feature_node_set_strand((GtFeatureNode*) gn, strand);
  }
  had_err = bed_rest_next_column(bed_file, err);
  if (had_err)
    return had_err;

  /* optional column 7.: thickStart */
  word(bed_parser->word, bed_file);
  had_err = bed_rest_next_column(bed_file, err);
  if (had_err)
    return had_err;

  /* optional column 8.: thickEnd */
  word(bed_parser->another_word, bed_file);
  if (gt_str_length(bed_parser->another_word)) {
    gt_assert(gt_str_length(bed_parser->word));
    /* got a thickStart and a thickEnd -> construct corresponding feature */
    had_err = parse_bed_range(&range, bed_parser->word,
                              bed_parser->another_word, bed_parser->offset,
                              bed_file, true, err);
    if (had_err)
      return had_err;
    if (range.start <= range.end)
      construct_thick_feature(bed_parser, (GtFeatureNode*) gn, range);
  }
  had_err = bed_rest_next_column(bed_file, err);
  if (had_err)
    return had_err;

  /* optional column 9.: itemRgb */
  word(bed_parser->word, bed_file);
  /* we do not use the RGB values */
  had_err = bed_rest_next_column(bed_file, err);
  if (had_err)
    return had_err;

  /* optional column 10.: blockCount */
  word(bed_parser->word, bed_file);
  if (gt_str_length(bed_parser->word)) {
    if (gt_parse_uword(&block_count, gt_str_get(bed_parser->word))) {
      gt_error_set(err,
                   "file \"%s\": line "GT_WU": could not parse blockCount",
                   gt_io_get_filename(bed_file),
                   gt_io_get_line_number(bed_file));
      return -1;
    }
    /* reset to parse/process blockSizes and blockStarts properly */
    gt_str_reset(bed_parser->word);
    gt_str_reset(bed_parser->another_word);
  }
  had_err = bed_rest_next_column(bed_file, err);
  if (had_err)
    return had_err;

  /* optional column 11.: blockSizes */
  word(bed_parser->word, bed_file);
  had_err = bed_rest_next_column(bed_file, err);
  if (had_err)
    return had_err;

  /* optional column 12.: blockStarts */
  word(bed_parser->another_word, bed_file);
  had_err = bed_rest_next_column(bed_file, err);
  if (had_err)
    return had_err;

  /* process blocks if necessary */
  if (block_count) {
    had_err = process_blocks(bed_parser, (GtFeatureNode*) gn, block_count,
                             bed_parser->word, bed_parser->another_word,
                             bed_file, err);
    if (had_err)
      return had_err;
  }

  /* the end of the line should now be reached */
  return gt_io_expect(bed_file, GT_END_OF_LINE, err);
}