/* Region node handler of the select visitor.
   <rn> is deleted if a seqid was specified which differs from the seqid of
   <rn>, or if a contain range was defined which does not overlap the range of
   <rn>. Otherwise <rn> is added to the node buffer; if a contain range was
   defined, the range of <rn> is clipped to it first.
   Always returns 0. */
static int select_visitor_region_node(GtNodeVisitor *nv, GtRegionNode *rn,
                                      GT_UNUSED GtError *err)
{
  GtSelectVisitor *sv;
  GtGenomeNode *region = (GtGenomeNode*) rn;
  GtRange clipped;
  gt_error_check(err);
  sv = select_visitor_cast(nv);

  /* a seqid was specified and it differs from the seqid of <rn> */
  if (gt_str_length(sv->seqid) &&
      gt_str_cmp(sv->seqid, gt_genome_node_get_seqid(region))) {
    gt_genome_node_delete(region);
    return 0;
  }

  /* no contain range was defined -> pass <rn> on unchanged */
  if (sv->contain_range.start == GT_UNDEF_ULONG) {
    gt_queue_add(sv->node_buffer, rn);
    return 0;
  }

  clipped = gt_genome_node_get_range(region);
  if (!gt_range_overlap(&clipped, &sv->contain_range)) {
    /* contain range does not overlap with <rn> range -> delete <rn> */
    gt_genome_node_delete(region);
    return 0;
  }

  /* an overlapping contain range was defined -> update range */
  clipped.start = MAX(clipped.start, sv->contain_range.start);
  clipped.end = MIN(clipped.end, sv->contain_range.end);
  gt_genome_node_set_range(region, &clipped);
  gt_queue_add(sv->node_buffer, rn);
  return 0;
}
/* Next-node callback of the sort stream.
   On the first call all nodes of the input stream are collected (EOF nodes are
   deleted instead of being stored) and sorted. Every call then hands out the
   next stored node via <gn>; consecutive region nodes with the same sequence
   ID are merged into the first one. Once all nodes have been handed out, the
   node array is reset and <*gn> is set to NULL.
   Returns 0 on success, otherwise the error code of the input stream. */
static int gt_sort_stream_next(GtNodeStream *ns, GtGenomeNode **gn,
                               GtError *err)
{
  GtSortStream *ss;
  GtGenomeNode *node, *eofn;
  int had_err = 0;
  gt_error_check(err);
  ss = gt_sort_stream_cast(ns);

  if (!ss->sorted) {
    /* pull in the complete input */
    for (;;) {
      had_err = gt_node_stream_next(ss->in_stream, &node, err);
      if (had_err || !node)
        break;
      eofn = gt_eof_node_try_cast(node);
      if (eofn)
        gt_genome_node_delete(eofn); /* get rid of EOF nodes */
      else
        gt_array_add(ss->nodes, node);
    }
    if (!had_err) {
      gt_genome_nodes_sort_stable(ss->nodes);
      ss->sorted = true;
    }
  }

  if (had_err)
    return had_err;

  gt_assert(ss->sorted);

  /* everything has been handed out already */
  if (ss->idx >= gt_array_size(ss->nodes)) {
    gt_array_reset(ss->nodes);
    *gn = NULL;
    return 0;
  }

  *gn = *(GtGenomeNode**) gt_array_get(ss->nodes, ss->idx);
  ss->idx++;

  /* join region nodes with the same sequence ID */
  if (gt_region_node_try_cast(*gn)) {
    while (ss->idx < gt_array_size(ss->nodes)) {
      GtRange joined, other;
      node = *(GtGenomeNode**) gt_array_get(ss->nodes, ss->idx);
      if (!gt_region_node_try_cast(node) ||
          gt_str_cmp(gt_genome_node_get_seqid(*gn),
                     gt_genome_node_get_seqid(node))) {
        /* the next node is not a region node with the same ID */
        break;
      }
      joined = gt_genome_node_get_range(*gn);
      other = gt_genome_node_get_range(node);
      joined = gt_range_join(&joined, &other);
      gt_genome_node_set_range(*gn, &joined);
      gt_genome_node_delete(node);
      ss->idx++;
    }
  }
  return 0;
}
/* Extend the range of <rn_a> to the join of the ranges of <rn_a> and <rn_b>.
   Both nodes must be non-NULL and have the same sequence ID (asserted).
   <rn_b> is not modified. */
void gt_region_node_consolidate(GtRegionNode *rn_a, GtRegionNode *rn_b)
{
  GtGenomeNode *target, *other;
  GtRange target_range, other_range, joined;
  gt_assert(rn_a);
  gt_assert(rn_b);
  target = (GtGenomeNode*) rn_a;
  other = (GtGenomeNode*) rn_b;
  gt_assert(!gt_str_cmp(gt_genome_node_get_seqid(target),
                        gt_genome_node_get_seqid(other)));
  target_range = gt_genome_node_get_range(target);
  other_range = gt_genome_node_get_range(other);
  joined = gt_range_join(&target_range, &other_range);
  gt_genome_node_set_range(target, &joined);
}
/* Write all unique and link elements of <condenseq> as GFF3 features to the
   file "<basefilename>.gff3".
   Uniques get the ID "U<idx>" and the name "unique<idx>", links get the ID
   "L<idx>", the name "link<idx>" and a "Derives_from" attribute referring to
   the ID of their unique. Coordinates are written 1-based and relative to the
   start of the sequence the element belongs to.
   Returns 0 on success. Returns -1 if the output file could not be opened,
   otherwise the error code of the failing visitor call; <err> is set by the
   failing callee. */
int gt_condenseq_output_to_gff3(const GtCondenseq *condenseq, GtError *err)
{
  int had_err = 0;
  GtUword idx,
          name_len,
          seqnum = 0,
          seqstart = 0,
          seqend = 0,
          desclen;
  GtStr *filename = NULL,
        *id = gt_str_new_cstr("U"),
        *name = gt_str_new_cstr("unique"),
        *parent_unique = gt_str_new_cstr("U"),
        *seqid = gt_str_new(),
        *source = gt_str_new_cstr("Condenseq");
  GtFile *outfile = NULL;
  GtGFF3Visitor *gffv = NULL;
  GtNodeVisitor *nodev = NULL;
  GtFeatureNode *fnode = NULL;
  GtGenomeNode *node = NULL;
  GtRange range;

  gt_assert(condenseq != NULL);

  filename = gt_str_new_cstr(gt_condenseq_basefilename(condenseq));
  name_len = gt_str_length(name);
  gt_str_append_cstr(filename, ".gff3");

  /* gt_file_new() reports failure by returning NULL (with <err> set); do not
     continue with a NULL file in that case */
  outfile = gt_file_new(gt_str_get(filename), "w", err);
  if (outfile == NULL)
    had_err = -1;

  if (!had_err) {
    nodev = gt_gff3_visitor_new(outfile);
    gffv = (GtGFF3Visitor *) nodev;
    gt_gff3_visitor_retain_id_attributes(gffv);

    node = gt_feature_node_new(seqid, "experimental_feature",
                               (GtUword) 1, (GtUword) 1, GT_STRAND_BOTH);
    fnode = (GtFeatureNode*) node;
    gt_feature_node_set_source(fnode, source);
  }

  /* uniques */
  for (idx = 0; !had_err && idx < condenseq->udb_nelems; ++idx) {
    GtCondenseqUnique uq = condenseq->uniques[idx];
    if (seqend <= uq.orig_startpos) {
      /* element lies beyond the current sequence -> switch to the sequence
         containing it and start a fresh node carrying its seqid */
      const char *desc;
      gt_genome_node_delete(node);
      seqnum = gt_condenseq_pos2seqnum(condenseq, uq.orig_startpos);
      seqstart = gt_condenseq_seqstartpos(condenseq, seqnum);
      seqend = seqstart + condenseq_seqlength_help(condenseq, seqnum,
                                                   seqstart);
      desc = gt_condenseq_description(condenseq, &desclen, seqnum);
      gt_str_reset(seqid);
      gt_str_append_cstr_nt(seqid, desc, desclen);
      node = gt_feature_node_new(seqid, "experimental_feature",
                                 (GtUword) 1, (GtUword) 1, GT_STRAND_BOTH);
      fnode = (GtFeatureNode*) node;
      gt_feature_node_set_source(fnode, source);
    }
    /* cut name and id back to their prefix and append the current index */
    gt_str_set_length(name, name_len);
    gt_str_append_uword(name, idx);
    gt_str_set_length(id, (GtUword) 1);
    gt_str_append_uword(id, idx);
    gt_feature_node_set_attribute(fnode, "Name", gt_str_get(name));
    gt_feature_node_set_attribute(fnode, "ID", gt_str_get(id));
    /* 1 Based coordinates! */
    range.start = uq.orig_startpos + 1 - seqstart;
    range.end = uq.orig_startpos + uq.len - seqstart;
    gt_genome_node_set_range(node, &range);
    had_err = gt_genome_node_accept(node, nodev, err);
  }

  /* links: switch name/id prefixes and restart the sequence tracking */
  gt_str_reset(name);
  gt_str_append_cstr(name, "link");
  gt_str_reset(id);
  gt_str_append_cstr(id, "L");
  name_len = gt_str_length(name);
  seqend = 0;
  for (idx = 0; !had_err && idx < condenseq->ldb_nelems; ++idx) {
    GtCondenseqLink link = condenseq->links[idx];
    if (seqend <= link.orig_startpos) {
      const char *desc;
      gt_genome_node_delete(node);
      seqnum = gt_condenseq_pos2seqnum(condenseq, link.orig_startpos);
      seqstart = gt_condenseq_seqstartpos(condenseq, seqnum);
      seqend = seqstart + condenseq_seqlength_help(condenseq, seqnum,
                                                   seqstart);
      desc = gt_condenseq_description(condenseq, &desclen, seqnum);
      gt_str_reset(seqid);
      gt_str_append_cstr_nt(seqid, desc, desclen);
      node = gt_feature_node_new(seqid, "experimental_feature",
                                 (GtUword) 1, (GtUword) 1, GT_STRAND_BOTH);
      fnode = (GtFeatureNode*) node;
      gt_feature_node_set_source(fnode, source);
    }
    gt_str_set_length(name, name_len);
    gt_str_append_uword(name, idx);
    gt_str_set_length(id, (GtUword) 1);
    gt_str_append_uword(id, idx);
    gt_feature_node_set_attribute(fnode, "Name", gt_str_get(name));
    gt_feature_node_set_attribute(fnode, "ID", gt_str_get(id));
    gt_str_set_length(parent_unique, (GtUword) 1);
    gt_str_append_uword(parent_unique, link.unique_id);
    gt_feature_node_set_attribute(fnode, "Derives_from",
                                  gt_str_get(parent_unique));
    /* 1 Based coordinates! */
    range.start = link.orig_startpos + 1 - seqstart;
    range.end = link.orig_startpos + link.len - seqstart;
    gt_genome_node_set_range(node, &range);
    had_err = gt_genome_node_accept(node, nodev, err);
  }

  /* <outfile>, <node> and <nodev> are still NULL if opening the file failed */
  gt_file_delete(outfile);
  gt_genome_node_delete(node);
  gt_node_visitor_delete(nodev);
  gt_str_delete(filename);
  gt_str_delete(id);
  gt_str_delete(name);
  gt_str_delete(parent_unique);
  gt_str_delete(seqid);
  gt_str_delete(source);
  return had_err;
}
static void make_sequence_region(GtHashmap *sequence_regions, GtStr *sequenceid, GthRegionFactory *srf, GthInput *input, GtUword filenum, GtUword seqnum) { GtUword offset_is_defined = false; GtRange range, descrange; GtGenomeNode *sr = NULL; gt_assert(sequence_regions && sequenceid && srf && input); if (gth_input_use_substring_spec(input)) { range.start = gth_input_genomic_substring_from(input); range.end = gth_input_genomic_substring_to(input); } else { range = gth_input_get_relative_genomic_range(input, filenum, seqnum); } if (srf->use_desc_ranges) { GtStr *description = gt_str_new(); gth_input_get_genomic_description(input, description, filenum, seqnum); if (!gt_parse_description_range(gt_str_get(description), &descrange)) offset_is_defined = true; gt_str_delete(description); } if (offset_is_defined) range = gt_range_offset(&range, descrange.start); else range = gt_range_offset(&range, 1); /* 1-based */ if (!gt_str_length(sequenceid) || (gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid)) && !offset_is_defined)) { /* sequenceid is empty or exists already (and no offset has been parsed) -> make one up */ GtStr *seqid; char *base; base = gt_basename(gth_input_get_genomic_filename(input, filenum)); seqid = gt_str_new_cstr(base); gt_free(base); gt_str_append_char(seqid, '|'); gt_str_append_uword(seqid, seqnum + 1); /* 1-based */ seqid_store_add(srf->seqid_store, filenum, seqnum, seqid, GT_UNDEF_UWORD); gt_assert(!gt_cstr_table_get(srf->used_seqids, gt_str_get(seqid))); gt_cstr_table_add(srf->used_seqids, gt_str_get(seqid)); sr = gt_region_node_new(seqid_store_get(srf->seqid_store, filenum, seqnum), range.start, range.end); gt_hashmap_add(sequence_regions, (void*) gt_cstr_table_get(srf->used_seqids, gt_str_get(seqid)), sr); gt_str_delete(seqid); } else { /* sequenceid does not exists already (or an offset has been parsed) -> use this one */ if (!gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid))) { /* no sequence region with this id exists -> create 
one */ gt_cstr_table_add(srf->used_seqids, gt_str_get(sequenceid)); seqid_store_add(srf->seqid_store, filenum, seqnum, sequenceid, offset_is_defined ? descrange.start : GT_UNDEF_UWORD); sr = gt_region_node_new(seqid_store_get(srf->seqid_store, filenum, seqnum), range.start, range.end); gt_hashmap_add(sequence_regions, (void*) gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid)), sr); } else { GtRange prev_range, new_range; /* sequence region with this id exists already -> modify range */ sr = gt_hashmap_get(sequence_regions, gt_str_get(sequenceid)); gt_assert(sr); prev_range = gt_genome_node_get_range(sr); new_range = gt_range_join(&prev_range, &range); gt_genome_node_set_range(sr, &new_range); seqid_store_add(srf->seqid_store, filenum, seqnum, sequenceid, offset_is_defined ? descrange.start : GT_UNDEF_UWORD); } } gt_assert(sr); }
/* Feature node handler of the add IDs visitor.
   If the seqid of <fn> is among the defined seqids, <fn> is added to the node
   buffer. Otherwise, with sorting being enforced, <err> is set and -1 is
   returned. Without enforced sorting an automatic sequence region is created
   for the seqid (or the existing one is updated) and <fn> is stored with it.
   Returns 0 on success, -1 on error. */
static int add_ids_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                        GtError *err)
{
  AutomaticSequenceRegion *auto_sr;
  GtAddIDsVisitor *aiv;
  GtGenomeNode *feature = (GtGenomeNode*) fn;
  GtFeatureNodeIterator *fni;
  GtFeatureNode *node;
  GtRange range;
  const char *seqid;
  bool is_circular;

  aiv = add_ids_visitor_cast(nv);
  seqid = gt_str_get(gt_genome_node_get_seqid(feature));

  /* seqid has been introduced already -> nothing to do but buffering */
  if (gt_cstr_table_get(aiv->defined_seqids, seqid)) {
    gt_queue_add(aiv->node_buffer, fn);
    return 0;
  }

  if (aiv->ensure_sorting) {
    gt_error_set(err, "the file %s is not sorted (seqid \"%s\" on line %u has "
                 "not been previously introduced with a \"%s\" line)",
                 gt_genome_node_get_filename(feature), seqid,
                 gt_genome_node_get_line_number(feature),
                 GT_GFF_SEQUENCE_REGION);
    return -1;
  }

  /* determine the range the sequence region has to cover: for non-circular
     features the ranges of all nodes delivered by the iterator are joined in */
  range = gt_genome_node_get_range(feature);
  is_circular = gt_feature_node_get_attribute(fn, GT_GFF_IS_CIRCULAR) ? true
                                                                      : false;
  if (!is_circular) {
    fni = gt_feature_node_iterator_new(fn);
    while ((node = gt_feature_node_iterator_next(fni))) {
      GtRange node_range = gt_genome_node_get_range((GtGenomeNode*) node);
      range = gt_range_join(&range, &node_range);
    }
    gt_feature_node_iterator_delete(fni);
  }

  /* sequence region has not been previously introduced -> check if one has
     already been created automatically */
  auto_sr = gt_hashmap_get(aiv->undefined_sequence_regions, seqid);
  if (!auto_sr) {
    GtStr *seqid_str;
    /* sequence region has not been created automatically -> do it now */
    gt_warning("seqid \"%s\" on line %u in file \"%s\" has not been "
               "previously introduced with a \"%s\" line, create such a line "
               "automatically", seqid,
               gt_genome_node_get_line_number(feature),
               gt_genome_node_get_filename(feature),
               GT_GFF_SEQUENCE_REGION);
    auto_sr = automatic_sequence_region_new(is_circular);
    seqid_str = gt_genome_node_get_seqid(feature);
    auto_sr->sequence_region = gt_region_node_new(seqid_str, range.start,
                                                  range.end);
    gt_hashmap_add(aiv->undefined_sequence_regions, gt_str_get(seqid_str),
                   auto_sr);
  }
  else if (auto_sr->is_circular) {
    gt_assert(!is_circular); /* XXX */
  }
  else if (is_circular) {
    gt_assert(!auto_sr->is_circular); /* XXX */
    auto_sr->is_circular = true;
    gt_genome_node_set_range(auto_sr->sequence_region, &range);
  }
  else {
    GtRange joined_range,
            sr_range = gt_genome_node_get_range(auto_sr->sequence_region);
    /* update the range of the sequence region */
    joined_range = gt_range_join(&range, &sr_range);
    gt_genome_node_set_range(auto_sr->sequence_region, &joined_range);
  }

  gt_array_add(auto_sr->feature_nodes, fn);
  return 0;
}
/* Hashmap iteration callback which builds one mRNA feature node out of the
   feature nodes collected for a transcript.
   <key> is the transcript ID (a C string), <value> the GtArray of genome nodes
   belonging to the transcript and <data> the ConstructionInfo.
   Nodes carrying the stop codon flag are merged into a directly flanking CDS
   and removed from the array. If a stop codon is contained in a CDS or has no
   flanking CDS, this is an error, unless tidy mode is enabled (then a warning
   is given; a contained stop codon is removed, one without a flanking CDS is
   kept). Afterwards the mRNA range is computed as the join of all node
   ranges, strand and seqid consistency is checked, and the new mRNA node
   (with all remaining nodes as children) is appended to the mRNA array.
   Returns 0 on success, -1 on error (<err> is set). */
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);

  /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* determine the range and the strand of the mRNA.
     An own reference to the first node is held until the end of this
     function: the node may be a stop codon which gets deleted below (the
     array may hold the only other reference), but the node and its seqid are
     still needed for the consistency checks afterwards. */
  first_node = gt_genome_node_ref(*(GtGenomeNode**)
                                  gt_array_get(gt_genome_node_array, 0));
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);

  /* TODO: support discontinuous start/stop codons */
  i = 0;
  while (!had_err && i < gt_array_size(gt_genome_node_array)) {
    GtUword j;
    GtRange stop_codon_rng;
    bool found_cds = false;

    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    if (!gt_feature_node_get_attribute((GtFeatureNode*) gn,
                                       GTF_PARSER_STOP_CODON_FLAG)) {
      i++;
      continue;
    }

    /* look for a CDS which contains or directly flanks the stop codon */
    stop_codon_rng = gt_genome_node_get_range(gn);
    for (j = 0; !had_err && j < gt_array_size(gt_genome_node_array); j++) {
      GtGenomeNode* gn2;
      GtRange this_rng;
      const char *this_type;
      gn2 = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, j);
      if (gn == gn2)
        continue;
      this_rng = gt_genome_node_get_range(gn2);
      this_type = gt_feature_node_get_type((GtFeatureNode*) gn2);
      if (this_type != gt_symbol(gt_ft_CDS))
        continue;
      if (gt_range_contains(&this_rng, &stop_codon_rng)) {
        if (cinfo->tidy) {
          gt_warning("stop codon on line %u in file %s is contained in "
                     "CDS in line %u",
                     gt_genome_node_get_line_number(gn),
                     gt_genome_node_get_filename(gn),
                     gt_genome_node_get_line_number(gn2));
          found_cds = true;
        }
        else {
          gt_error_set(err, "stop codon on line %u in file %s is "
                       "contained in CDS in line %u",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn),
                       gt_genome_node_get_line_number(gn2));
          had_err = -1;
        }
        break;
      }
      if (this_rng.end + 1 == stop_codon_rng.start) {
        /* CDS ends directly before the stop codon -> extend it */
        this_rng.end = stop_codon_rng.end;
        gt_genome_node_set_range(gn2, &this_rng);
        found_cds = true;
        break;
      }
      if (this_rng.start == stop_codon_rng.end + 1) {
        /* CDS starts directly after the stop codon -> extend it */
        this_rng.start = stop_codon_rng.start;
        gt_genome_node_set_range(gn2, &this_rng);
        found_cds = true;
        break;
      }
    }

    if (found_cds) {
      /* the stop codon is covered by a CDS now -> drop it.
         The index must not be advanced here: the removal moves the following
         element to position <i>, which has to be examined as well. */
      gt_array_rem(gt_genome_node_array, i);
      gt_genome_node_delete(gn);
      continue;
    }

    if (!had_err) {
      if (cinfo->tidy) {
        gt_warning("found stop codon on line %u in file %s with no "
                   "flanking CDS, ignoring it",
                   gt_genome_node_get_line_number(gn),
                   gt_genome_node_get_filename(gn));
      }
      else {
        gt_error_set(err, "found stop codon on line %u in file %s with no "
                     "flanking CDS",
                     gt_genome_node_get_line_number(gn),
                     gt_genome_node_get_filename(gn));
        had_err = -1; /* terminates the loop */
      }
    }
    i++;
  }

  /* join the ranges and check strand and seqid consistency */
  for (i = 1; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    GtStrand strand;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    if (strand != mRNA_strand) {
      gt_error_set(err, "feature %s on line %u has strand %c, but the "
                   "parent transcript has strand %c",
                   (const char*) key,
                   gt_genome_node_get_line_number(gn),
                   GT_STRAND_CHARS[strand],
                   GT_STRAND_CHARS[mRNA_strand]);
      had_err = -1;
      break;
    }
    else {
      mRNA_strand = gt_strand_join(mRNA_strand, strand);
    }
    if (!had_err && gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                   "genomic sequences (``seqname''), although they have the same "
                   "gene IDs (``gene_id'') which must be globally unique",
                   gt_genome_node_get_line_number(first_node),
                   gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "ID", key);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node),
                                  "transcript_id", key);
    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                                (const char*) key)) &&
        strlen(tname) > 0) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                    tname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gt_genome_node_ref(gn));
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  /* release the reference taken above */
  gt_genome_node_delete(first_node);

  return had_err;
}