/* Make sure that <rm>->bioseq holds the sequence file which <seqid> maps to.
   The cached state is reused as long as a sequence file is loaded and the
   cached <rm>->sequence_name equals <seqid>; otherwise the file name is
   looked up again with region_mapping_map() and the bioseq is rebuilt.
   Returns 0 on success and -1 on failure (<err> is handed to the failing
   call). */
static int update_bioseq_if_necessary(GtRegionMapping *rm, GtStr *seqid,
                                      GtError *err)
{
  gt_error_check(err);
  gt_assert(rm && seqid);

  /* cache hit: a file is loaded and it belongs to the requested seqid */
  if (rm->sequence_file && !gt_str_cmp(rm->sequence_name, seqid))
    return 0;

  gt_str_delete(rm->sequence_file);
  rm->sequence_file = region_mapping_map(rm, gt_str_get(seqid), err);
  if (!rm->sequence_file)
    return -1;

  /* remember which seqid the cached file belongs to */
  if (!rm->sequence_name)
    rm->sequence_name = gt_str_new();
  else
    gt_str_reset(rm->sequence_name);
  gt_str_append_str(rm->sequence_name, seqid);

  gt_bioseq_delete(rm->bioseq);
  rm->bioseq = gt_bioseq_new_str(rm->sequence_file, err);
  if (!rm->bioseq) {
    /* Invalidate the cache: with <sequence_file> and <sequence_name> left in
       place, a later call for the same <seqid> would be treated as a cache
       hit and report success although no bioseq is loaded. */
    gt_str_delete(rm->sequence_file);
    rm->sequence_file = NULL;
    return -1;
  }
  return 0;
}
/* Visitor callback for region nodes. A region node is kept (appended to the
   visitor's node buffer) if no seqid filter is set or its seqid equals the
   filter. If a contain range is defined, a kept node is additionally clipped
   to that range; nodes which do not overlap it are deleted. Nodes failing the
   seqid filter are deleted as well. Always returns 0. */
static int select_visitor_region_node(GtNodeVisitor *nv, GtRegionNode *rn,
                                      GT_UNUSED GtError *err)
{
  GtSelectVisitor *sv;
  GtGenomeNode *region = (GtGenomeNode*) rn;
  GtRange clipped;
  gt_error_check(err);
  sv = select_visitor_cast(nv);

  /* a seqid was specified and it differs from the node's seqid */
  if (gt_str_length(sv->seqid) &&
      gt_str_cmp(sv->seqid, gt_genome_node_get_seqid(region))) {
    gt_genome_node_delete(region);
    return 0;
  }

  /* no contain range defined -> pass the node on unchanged */
  if (sv->contain_range.start == GT_UNDEF_ULONG) {
    gt_queue_add(sv->node_buffer, rn);
    return 0;
  }

  clipped = gt_genome_node_get_range(region);
  if (!gt_range_overlap(&clipped, &sv->contain_range)) {
    /* contain range does not overlap with <rn> range -> delete <rn> */
    gt_genome_node_delete(region);
    return 0;
  }

  /* an overlapping contain range was defined -> shrink the node's range */
  clipped.start = MAX(clipped.start, sv->contain_range.start);
  clipped.end = MIN(clipped.end, sv->contain_range.end);
  gt_genome_node_set_range(region, &clipped);
  gt_queue_add(sv->node_buffer, rn);
  return 0;
}
/* Deliver the next node of the sort stream in <*gn>.
   On the first call the complete input stream is read (EOF nodes are
   dropped), and the collected nodes are sorted with
   gt_genome_nodes_sort_stable(). Afterwards the nodes are handed out one by
   one; consecutive region nodes with the same sequence ID are joined into the
   first one. When all nodes have been delivered, the node array is reset and
   <*gn> is set to NULL.
   Returns 0 on success or the error code of the input stream. */
static int gt_sort_stream_next(GtNodeStream *ns, GtGenomeNode **gn,
                               GtError *err)
{
  GtSortStream *ss;
  GtGenomeNode *node, *eofn;
  int had_err = 0;
  gt_error_check(err);
  ss = gt_sort_stream_cast(ns);

  if (!ss->sorted) {
    /* slurp the whole input stream */
    for (;;) {
      had_err = gt_node_stream_next(ss->in_stream, &node, err);
      if (had_err || !node)
        break;
      eofn = gt_eof_node_try_cast(node);
      if (eofn)
        gt_genome_node_delete(eofn); /* get rid of EOF nodes */
      else
        gt_array_add(ss->nodes, node);
    }
    if (had_err)
      return had_err;
    gt_genome_nodes_sort_stable(ss->nodes);
    ss->sorted = true;
  }
  gt_assert(ss->sorted);

  /* everything has been delivered already */
  if (ss->idx >= gt_array_size(ss->nodes)) {
    gt_array_reset(ss->nodes);
    *gn = NULL;
    return 0;
  }

  *gn = *(GtGenomeNode**) gt_array_get(ss->nodes, ss->idx);
  ss->idx++;
  if (!gt_region_node_try_cast(*gn))
    return 0;

  /* join region nodes with the same sequence ID */
  while (ss->idx < gt_array_size(ss->nodes)) {
    GtRange joined, next_range;
    node = *(GtGenomeNode**) gt_array_get(ss->nodes, ss->idx);
    if (!gt_region_node_try_cast(node) ||
        gt_str_cmp(gt_genome_node_get_seqid(*gn),
                   gt_genome_node_get_seqid(node))) {
      /* the next node is not a region node with the same ID */
      break;
    }
    joined = gt_genome_node_get_range(*gn);
    next_range = gt_genome_node_get_range(node);
    joined = gt_range_join(&joined, &next_range);
    gt_genome_node_set_range(*gn, &joined);
    gt_genome_node_delete(node);
    ss->idx++;
  }
  return 0;
}
/* Extend the range of <rn_a> so that it also covers the range of <rn_b>.
   Both region nodes must be non-NULL and refer to the same sequence ID;
   <rn_b> itself is not modified. */
void gt_region_node_consolidate(GtRegionNode *rn_a, GtRegionNode *rn_b)
{
  GtGenomeNode *target = (GtGenomeNode*) rn_a,
               *other = (GtGenomeNode*) rn_b;
  GtRange merged, extra;
  gt_assert(rn_a);
  gt_assert(rn_b);
  gt_assert(!gt_str_cmp(gt_genome_node_get_seqid(target),
                        gt_genome_node_get_seqid(other)));
  merged = gt_genome_node_get_range(target);
  extra = gt_genome_node_get_range(other);
  merged = gt_range_join(&merged, &extra);
  gt_genome_node_set_range(target, &merged);
}
/* Parse one FASTQ block from <seqit>: the '@' description line, the
   sequence, the '+' separator line (with optional repeated description) and
   the qualities.
   Returns 0 on success, -2 if the two descriptions differ or if sequence and
   qualities have different lengths, and otherwise the non-zero code of the
   failing parsing step. */
static inline int parse_fastq_block(GtSeqIteratorFastQ *seqit, GtError *err)
{
  int had_err = 0;
  gt_assert(seqit);
  gt_error_check(err);

  /* parse @<seqname> */
  had_err = parse_fastq_seqname(seqit, seqit->descbuffer,
                                GT_FASTQ_BLOCK_START_CHAR, err);
  if (had_err)
    return had_err;

  /* parse sequence */
  had_err = parse_fastq_sequence(seqit, err);
  gt_fastq_premature_end_check(had_err, seqit);
  if (had_err)
    return had_err;

  /* parse +[seqname] */
  had_err = parse_fastq_seqname(seqit, seqit->qdescbuffer,
                                GT_FASTQ_QUAL_SEPARATOR_CHAR, err);
  gt_fastq_premature_end_check(had_err, seqit);
  if (had_err)
    return had_err;

  /* a repeated description is optional, but if given it has to match */
  if (gt_str_length(seqit->qdescbuffer) &&
      gt_str_cmp(seqit->descbuffer, seqit->qdescbuffer)) {
    gt_error_set(err, "sequence description '%s' is not equal to "
                      "qualities description '%s' in line %lu",
                      gt_str_get(seqit->descbuffer),
                      gt_str_get(seqit->qdescbuffer),
                      seqit->curline-1);
    return -2;
  }

  /* parse qualities; the length check is performed regardless of the
     result of the parsing step */
  had_err = parse_fastq_qualities(seqit, err);
  if (gt_str_length(seqit->qualsbuffer) !=
      gt_str_length(seqit->sequencebuffer)) {
    gt_error_set(err, "lengths of character sequence and qualities "
                      "sequence differ (%lu <-> %lu)",
                      gt_str_length(seqit->qualsbuffer),
                      gt_str_length(seqit->sequencebuffer));
    return -2;
  }
  return had_err;
}
/* Visitor callback for sequence nodes. The node is appended to the visitor's
   node buffer if no seqid filter is set or its seqid equals the filter;
   otherwise it is deleted. Always returns 0. */
static int select_visitor_sequence_node(GtNodeVisitor *nv, GtSequenceNode *sn,
                                        GT_UNUSED GtError *err)
{
  GtSelectVisitor *sv;
  GtGenomeNode *seqnode = (GtGenomeNode*) sn;
  gt_error_check(err);
  sv = select_visitor_cast(nv);

  /* a seqid was specified and it differs from the node's seqid */
  if (gt_str_length(sv->seqid) &&
      gt_str_cmp(sv->seqid, gt_genome_node_get_seqid(seqnode))) {
    gt_genome_node_delete(seqnode);
    return 0;
  }

  gt_queue_add(sv->node_buffer, sn);
  return 0;
}
/* Child callback of the inter-feature visitor (<data> is the
   GtInterFeatureVisitor). For every child of the "outside" type, a new
   feature of the "inter" type is created which fills the gap between the
   previously seen outside feature and <current_feature>, and is added as a
   child of the visitor's parent feature.
   If the two boundary features overlap or leave no room between them, a
   warning is issued and the callback returns without remembering
   <current_feature> as the previous feature. Always returns 0. */
static int inter_feature_in_children(GtFeatureNode *current_feature,
                                     void *data, GT_UNUSED GtError *err)
{
  GtInterFeatureVisitor *aiv = (GtInterFeatureVisitor*) data;
  gt_error_check(err);
  gt_assert(current_feature);

  /* only features of the outside type act as boundaries */
  if (!gt_feature_node_has_type(current_feature, aiv->outside_type))
    return 0;

  if (aiv->previous_feature) {
    GtFeatureNode *gap_node;
    GtRange prev_rng, cur_rng;
    GtStrand gap_strand;
    GtStr *seqid;

    /* determine inter range */
    prev_rng = gt_genome_node_get_range((GtGenomeNode*)
                                        aiv->previous_feature);
    cur_rng = gt_genome_node_get_range((GtGenomeNode*) current_feature);
    if (prev_rng.end >= cur_rng.start) {
      gt_warning("overlapping boundary features " GT_WU "-" GT_WU " and "
                 GT_WU "-" GT_WU ", " "not placing '%s' inter-feature",
                 prev_rng.start, prev_rng.end, cur_rng.start, cur_rng.end,
                 aiv->inter_type);
      return 0;
    }
    if (cur_rng.start - prev_rng.end < 2) {
      gt_warning("no space for inter-feature '%s' between " GT_WU " and "
                 GT_WU, aiv->inter_type, prev_rng.end, cur_rng.start);
      return 0;
    }

    /* determine inter strand: both boundaries are expected to agree */
    gap_strand = gt_feature_node_get_strand(aiv->previous_feature);
    gt_assert(gap_strand == gt_feature_node_get_strand(current_feature));

    /* determine sequence id */
    seqid = gt_genome_node_get_seqid((GtGenomeNode*) aiv->parent_feature);
    gt_assert(!gt_str_cmp(seqid,
                          gt_genome_node_get_seqid((GtGenomeNode*)
                                                   aiv->previous_feature)));
    gt_assert(!gt_str_cmp(seqid,
                          gt_genome_node_get_seqid((GtGenomeNode*)
                                                   current_feature)));

    /* create inter feature spanning exactly the gap */
    gap_node = (GtFeatureNode*) gt_feature_node_new(seqid, aiv->inter_type,
                                                    prev_rng.end + 1,
                                                    cur_rng.start - 1,
                                                    gap_strand);
    gt_feature_node_add_child(aiv->parent_feature, gap_node);
  }

  aiv->previous_feature = current_feature;
  return 0;
}
/* Visitor callback for feature nodes. The feature counter of the visitor is
   incremented, then the node is checked against the seqid and source
   filters, the gene specific limits (length, number, score, feature number;
   only applied to nodes of type gene) and the remaining range, strand, CDS
   and splice site filters. Nodes which are filtered out are deleted, all
   others are appended to the visitor's node buffer. Always returns 0. */
static int select_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                       GT_UNUSED GtError *err)
{
  GtSelectVisitor *fv;
  GtGenomeNode *node = (GtGenomeNode*) fn;
  bool selected, filter_node = false;
  gt_error_check(err);
  fv = select_visitor_cast(nv);
  fv->current_feature++;

  selected =
    /* no seqid was specified or seqids are equal */
    (!gt_str_length(fv->seqid) ||
     !gt_str_cmp(fv->seqid, gt_genome_node_get_seqid(node))) &&
    /* no source was specified or sources are equal */
    (!gt_str_length(fv->source) ||
     !strcmp(gt_str_get(fv->source), gt_feature_node_get_source(fn)));

  if (!selected)
    filter_node = true;
  else {
    GtRange range = gt_genome_node_get_range(node);
    /* enforce the gene limits */
    /* XXX: we (spuriously) assume that genes are always root nodes */
    if (fn && gt_feature_node_has_type(fn, gt_ft_gene)) {
      filter_node =
        (fv->max_gene_length != GT_UNDEF_ULONG &&
         gt_range_length(&range) > fv->max_gene_length) ||
        (fv->max_gene_num != GT_UNDEF_ULONG &&
         fv->gene_num >= fv->max_gene_num) ||
        (fv->min_gene_score != GT_UNDEF_DOUBLE &&
         gt_feature_node_get_score(fn) < fv->min_gene_score) ||
        (fv->max_gene_score != GT_UNDEF_DOUBLE &&
         gt_feature_node_get_score(fn) > fv->max_gene_score) ||
        (fv->feature_num != GT_UNDEF_ULONG &&
         fv->feature_num != fv->current_feature);
      if (!filter_node)
        fv->gene_num++; /* gene passed filter */
    }
  }

  /* the remaining filters are only consulted until the first one hits */
  filter_node = filter_node
                || filter_contain_range(fn, fv->contain_range)
                || filter_overlap_range(fn, fv->overlap_range)
                || filter_strand(fn, fv->strand)
                || filter_targetstrand(fn, fv->targetstrand)
                || filter_has_CDS(fn, fv->has_CDS)
                || filter_min_average_ssp(fn,
                                          fv->min_average_splice_site_prob);

  if (filter_node)
    gt_genome_node_delete(node);
  else
    gt_queue_add(fv->node_buffer, fn);
  return 0;
}
/* Unit test for the GtBlock class: exercises element insertion, range,
   caption and strand accessors and gt_block_get_max_height().
   Failed checks are recorded by gt_ensure(); the function returns <had_err>
   (0 if all checks passed). */
int gt_block_unit_test(GtError *err)
{
  GtRange r1, r2, r_temp, b_range;
  GtStrand s;
  GtGenomeNode *gn1, *gn2;
  GtElement *e1, *e2;
  double height;
  GtBlock *b;
  GtStr *seqid, *caption1, *caption2;
  int had_err = 0;
  GtStyle *sty;
  GtError *testerr;
  gt_error_check(err);

  seqid = gt_str_new_cstr("seqid");
  caption1 = gt_str_new_cstr("foo");
  caption2 = gt_str_new_cstr("bar");
  testerr = gt_error_new();

  r1.start = 10UL;
  r1.end = 50UL;

  r2.start = 40UL;
  r2.end = 50UL;

  gn1 = gt_feature_node_new(seqid, gt_ft_gene, r1.start, r1.end,
                            GT_STRAND_FORWARD);
  gn2 = gt_feature_node_new(seqid, gt_ft_exon, r2.start, r2.end,
                            GT_STRAND_FORWARD);

  e1 = gt_element_new((GtFeatureNode*) gn1);
  e2 = gt_element_new((GtFeatureNode*) gn2);

  b = gt_block_new();

  /* test gt_block_insert_element */
  gt_ensure((0UL == gt_block_get_size(b)));
  gt_block_insert_element(b, (GtFeatureNode*) gn1);
  gt_ensure((1UL == gt_block_get_size(b)));
  gt_block_insert_element(b, (GtFeatureNode*) gn2);
  gt_ensure((2UL == gt_block_get_size(b)));

  /* test gt_block_set_range & gt_block_get_range */
  r_temp = gt_range_join(&r1, &r2);
  gt_block_set_range(b, r_temp);
  b_range = gt_block_get_range(b);
  gt_ensure((0 == gt_range_compare(&b_range, &r_temp)));
  gt_ensure((1 == gt_range_compare(&r2, &r_temp)));

  /* test gt_block_set_caption & gt_block_get_caption */
  /* NOTE(review): <caption1> is not deleted below, presumably because the
     block takes ownership of its caption -- confirm */
  gt_block_set_caption(b, caption1);
  gt_ensure((0 == gt_str_cmp(gt_block_get_caption(b), caption1)));
  gt_ensure((0 != gt_str_cmp(gt_block_get_caption(b), caption2)));

  /* test gt_block_set_strand & gt_block_get_strand */
  s = gt_block_get_strand(b);
  gt_ensure((GT_STRAND_UNKNOWN == s));
  gt_block_set_strand(b, GT_STRAND_FORWARD);
  s = gt_block_get_strand(b);
  gt_ensure((GT_STRAND_FORWARD == s));

  /* test gt_block_get_max_height(); the calls under test report into
     <testerr>, so that the gt_error_is_set(testerr) checks actually observe
     errors raised by them */
  sty = gt_style_new(err);
  gt_ensure(gt_block_get_max_height(b, &height, sty, testerr) == 0);
  gt_ensure(!gt_error_is_set(testerr));
  gt_ensure(height == BAR_HEIGHT_DEFAULT);
  gt_style_set_num(sty, "exon", "bar_height", 42);
  gt_ensure(gt_block_get_max_height(b, &height, sty, testerr) == 0);
  gt_ensure(!gt_error_is_set(testerr));
  gt_ensure(height == 42);
  gt_style_set_num(sty, "gene", "bar_height", 23);
  gt_ensure(gt_block_get_max_height(b, &height, sty, testerr) == 0);
  gt_ensure(!gt_error_is_set(testerr));
  gt_ensure(height == 42);
  gt_style_unset(sty, "exon", "bar_height");
  gt_ensure(gt_block_get_max_height(b, &height, sty, testerr) == 0);
  gt_ensure(!gt_error_is_set(testerr));
  gt_ensure(height == 23);

  gt_str_delete(caption2);
  gt_str_delete(seqid);
  gt_element_delete(e1);
  gt_element_delete(e2);
  gt_block_delete(b);
  gt_style_delete(sty);
  gt_error_delete(testerr);
  gt_genome_node_delete(gn1);
  gt_genome_node_delete(gn2);

  return had_err;
}
bool gth_sas_are_equal(const GthSA *saA, const GthSA *saB) { Exoninfo *exoninfoA, *exoninfoB; Introninfo *introninfoA, *introninfoB; GtUword i; /* compare element 0 */ if (gth_sa_alphatype(saA) != gth_sa_alphatype(saB)) return false; /* compare element 1 */ if (gth_backtrace_path_length(saA->backtrace_path) != gth_backtrace_path_length(saB->backtrace_path)) { return false; } for (i = 0; i < gth_backtrace_path_length(saA->backtrace_path); i++) { if (((Editoperation*) gth_backtrace_path_get(saA->backtrace_path))[i] != ((Editoperation*) gth_backtrace_path_get(saB->backtrace_path))[i]) { return false; } } /* element 2 has been removed (indelcount) */ /* compare element 3 */ if (gth_sa_gen_dp_length(saA) != gth_sa_gen_dp_length(saB)) return false; /* compare element 4 */ if (saA->gen_total_length != saB->gen_total_length) return false; /* compare element 5 */ if (saA->gen_offset != saB->gen_offset) return false; /* compare element 6 */ if (gth_sa_ref_total_length(saA) != gth_sa_ref_total_length(saB)) return false; /* compare element 7 */ if (gth_sa_gen_dp_start(saA) != gth_sa_gen_dp_start(saB)) return false; /* element 8 has been removed (gen_dp_end) */ /* compare element 9 */ if (saA->gen_file_num != saB->gen_file_num) return false; /* compare element 10 */ if (saA->gen_seq_num != saB->gen_seq_num) return false; /* compare element 11 */ if (saA->ref_file_num != saB->ref_file_num) return false; /* compare element 12 */ if (saA->ref_seq_num != saB->ref_seq_num) return false; /* compare element 13 */ if (gt_str_cmp(saA->gen_id, saB->gen_id)) return false; /* compare element 14 */ if (gt_str_cmp(saA->ref_id, saB->ref_id)) return false; /* compare element 15 */ if (saA->gen_strand_forward != saB->gen_strand_forward) return false; /* compare element 16 */ if (saA->ref_strand_forward != saB->ref_strand_forward) return false; /* compare element 17 */ if (gth_sa_genomiccutoff_start(saA) != gth_sa_genomiccutoff_start(saB)) return false; if (gth_sa_referencecutoff_start(saA) != 
gth_sa_referencecutoff_start(saB)) return false; if (gth_sa_eopcutoff_start(saA) != gth_sa_eopcutoff_start(saB)) return false; if (gth_sa_genomiccutoff_end(saA) != gth_sa_genomiccutoff_end(saB)) return false; if (gth_sa_referencecutoff_end(saA) != gth_sa_referencecutoff_end(saB)) return false; if (gth_sa_eopcutoff_end(saA) != gth_sa_eopcutoff_end(saB)) return false; /* compare element 18 */ if (gt_array_size(saA->exons) != gt_array_size(saB->exons)) return false; for (i = 0; i < gt_array_size(saA->exons); i++) { exoninfoA = (Exoninfo*) gt_array_get(saA->exons, i); exoninfoB = (Exoninfo*) gt_array_get(saB->exons, i); if (exoninfoA->leftgenomicexonborder != exoninfoB->leftgenomicexonborder) return false; if (exoninfoA->rightgenomicexonborder != exoninfoB->rightgenomicexonborder) return false; if (exoninfoA->leftreferenceexonborder != exoninfoB->leftreferenceexonborder) { return false; } if (exoninfoA->rightreferenceexonborder != exoninfoB->rightreferenceexonborder) { return false; } if (!gt_double_equals_double(exoninfoA->exonscore, exoninfoB->exonscore)) { return false; } } /* compare element 19 */ if (gt_array_size(saA->introns) != gt_array_size(saB->introns)) return false; for (i = 0; i < gt_array_size(saA->introns); i++) { introninfoA = (Introninfo*) gt_array_get(saA->introns, i); introninfoB = (Introninfo*) gt_array_get(saB->introns, i); if (!gt_double_equals_double(introninfoA->donorsiteprobability, introninfoB->donorsiteprobability)) { return false; } if (!gt_double_equals_double(introninfoA->acceptorsiteprobability, introninfoB->acceptorsiteprobability)) { return false; } if (!gt_double_equals_double(introninfoA->donorsitescore, introninfoB->donorsitescore)) { return false; } if (!gt_double_equals_double(introninfoA->acceptorsitescore, introninfoB->acceptorsitescore)) { return false; } } /* compare element 20 */ if (saA->polyAtailpos.start != saB->polyAtailpos.start) return false; if (saA->polyAtailpos.end != saB->polyAtailpos.end) return false; /* compare 
element 21 */ if (saA->alignmentscore != saB->alignmentscore) return false; /* compare element 22 */ if (saA->coverage != saB->coverage) return false; /* compare element 23 */ if (saA->genomic_cov_is_highest != saB->genomic_cov_is_highest) return false; /* compare element 24 */ if (saA->cumlen_scored_exons != saB->cumlen_scored_exons) return false; return true; }
/* Make sure that <rm>->seq_col is usable for <seqid>.
   With a mapping (<rm>->mapping set), the sequence file belonging to <seqid>
   is looked up and a new sequence collection is loaded whenever no file is
   cached or the cached <rm>->sequence_name differs from <seqid>.
   Without a mapping, the sequence collection is created once from the encoded
   sequence or from <rm>->sequence_filenames; if <rm>->usedesc is set, the
   seqid-to-seqnum mapping is (re)built from the sequence collection.
   Returns 0 on success and -1 on failure (<err> is handed to the failing
   call). */
static int update_seq_col_if_necessary(GtRegionMapping *rm, GtStr *seqid,
                                       GtError *err)
{
  int had_err = 0;
  gt_error_check(err);
  gt_assert(rm && seqid);

  /* for mappings, we need to load the changed sequence, if needed... */
  if (rm->mapping) {
    if (!rm->sequence_file || gt_str_cmp(rm->sequence_name, seqid)) {
      gt_str_delete(rm->sequence_file);
      /* ignore MD5 hashes when using region mappings */
      if (gt_md5_seqid_has_prefix(gt_str_get(seqid))) {
        rm->sequence_file = region_mapping_map(rm, gt_str_get(seqid)
                                                   +GT_MD5_SEQID_TOTAL_LEN,
                                               err);
      }
      else
        rm->sequence_file = region_mapping_map(rm, gt_str_get(seqid), err);
      if (!rm->sequence_file)
        had_err = -1;
      else {
        /* load new seqcol */
        if (!rm->sequence_filenames)
          rm->sequence_filenames = gt_str_array_new();
        else
          gt_str_array_reset(rm->sequence_filenames);
        gt_str_array_add(rm->sequence_filenames, rm->sequence_file);
        if (!rm->sequence_name)
          rm->sequence_name = gt_str_new();
        else
          gt_str_reset(rm->sequence_name);
        gt_str_append_str(rm->sequence_name, seqid);
        gt_seq_col_delete(rm->seq_col);
        rm->seq_col = gt_bioseq_col_new(rm->sequence_filenames, err);
        if (!rm->seq_col) {
          /* Invalidate the cache: with <sequence_file> and <sequence_name>
             left in place, a later call for the same <seqid> would skip
             loading and report success although no seqcol is available. */
          gt_str_delete(rm->sequence_file);
          rm->sequence_file = NULL;
          had_err = -1;
        }
      }
    }
  }
  else {
    /* ...otherwise, just make sure the seqcol is loaded */
    if (!rm->seq_col) {
      if (rm->encseq) {
        if (!(rm->seq_col = gt_encseq_col_new(rm->encseq, err)))
          had_err = -1;
      }
      else {
        gt_assert(rm->sequence_filenames);
        if (!(rm->seq_col = gt_bioseq_col_new(rm->sequence_filenames, err)))
          had_err = -1;
      }
    }
    if (!had_err && rm->usedesc) {
      /* replace a previously built mapping */
      if (rm->seqid2seqnum_mapping)
        gt_seqid2seqnum_mapping_delete(rm->seqid2seqnum_mapping);
      rm->seqid2seqnum_mapping =
        gt_seqid2seqnum_mapping_new_seqcol(rm->seq_col, err);
      if (!rm->seqid2seqnum_mapping)
        had_err = -1;
    }
  }
  return had_err;
}
/* Hashmap callback which builds one gene feature.
   <key> is the gene ID, <value> the hashmap of the gene's transcripts and
   <data> the ConstructionInfo. All transcripts are turned into mRNA nodes via
   construct_mRNAs(); on success a gene node spanning all mRNAs is created,
   the mRNAs are added as its children and the gene is appended to
   <cinfo>->genome_nodes.
   Returns 0 on success or the error code of gt_hashmap_foreach(). */
static int construct_genes(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  GtHashmap *transcript_id_hash = (GtHashmap*) value;
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtQueue *genome_nodes = cinfo->genome_nodes;
  const char *gname;
  GtArray *mRNAs = gt_array_new(sizeof (GtGenomeNode*));
  GtGenomeNode *gene_node, *gn;
  GtStrand gene_strand;
  GtRange gene_range;
  GtStr *gene_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);

  /* construct_mRNAs() collects its results in <cinfo>->mRNAs */
  cinfo->mRNAs = mRNAs;
  had_err = gt_hashmap_foreach(transcript_id_hash, construct_mRNAs, cinfo,
                               err);
  if (!had_err) {
    gt_assert(gt_array_size(mRNAs)); /* at least one mRNA constructed */

    /* determine the range and the strand of the gene */
    gn = *(GtGenomeNode**) gt_array_get(mRNAs, 0);
    gene_range = gt_genome_node_get_range(gn);
    gene_strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    gene_seqid = gt_genome_node_get_seqid(gn);
    for (i = 1; i < gt_array_size(mRNAs); i++) {
      GtRange range;
      gn = *(GtGenomeNode**) gt_array_get(mRNAs, i);
      range = gt_genome_node_get_range(gn);
      gene_range = gt_range_join(&gene_range, &range);
      gene_strand = gt_strand_join(gene_strand,
                          gt_feature_node_get_strand((GtFeatureNode*) gn));
      gt_assert(gt_str_cmp(gene_seqid, gt_genome_node_get_seqid(gn)) == 0);
    }

    gene_node = gt_feature_node_new(gene_seqid, gt_ft_gene, gene_range.start,
                                    gene_range.end, gene_strand);
    if ((gname = gt_hashmap_get(cinfo->gene_id_to_name_mapping,
                                (const char*) key))) {
      gt_feature_node_add_attribute((GtFeatureNode*) gene_node, GT_GFF_NAME,
                                    gname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(mRNAs); i++) {
      gn = *(GtGenomeNode**) gt_array_get(mRNAs, i);
      gt_feature_node_add_child((GtFeatureNode*) gene_node,
                                (GtFeatureNode*) gn);
    }

    /* store the gene */
    gt_queue_add(genome_nodes, gene_node);
  }

  /* free the temporary array on every path, i.e. also if constructing the
     mRNAs failed (only the array itself is released here, not the nodes
     stored in it) */
  gt_array_delete(mRNAs);

  return had_err;
}
/* Hashmap callback which builds one mRNA feature.
   <key> is the transcript ID, <value> the (non-empty) array of genome nodes
   belonging to the transcript and <data> the ConstructionInfo. The mRNA spans
   the joined range of all nodes, which become its children; the new node is
   appended to <cinfo>->mRNAs.
   Returns 0 on success and -1 (with <err> set) if the nodes lie on opposite
   strands or refer to different sequences. */
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
  /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* determine the range and the strand of the mRNA */
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);
  for (i = 1; i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    GtStrand strand;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    /* Report contradicting strands as a regular error instead of handing
       them to gt_strand_join(), which can fail an assertion on conflicting
       input. NOTE(review): forward vs. reverse is presumably the conflicting
       combination -- confirm against gt_strand_join(). */
    if ((mRNA_strand == GT_STRAND_FORWARD && strand == GT_STRAND_REVERSE) ||
        (mRNA_strand == GT_STRAND_REVERSE && strand == GT_STRAND_FORWARD)) {
      gt_error_set(err, "The features on lines %u and %u are located on "
                   "opposite strands, although they belong to the same "
                   "transcript",
                   gt_genome_node_get_line_number(first_node),
                   gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
    mRNA_strand = gt_strand_join(mRNA_strand, strand);
    if (gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                   "genomic sequences (``seqname''), although they have the "
                   "same gene IDs (``gene_id'') which must be globally "
                   "unique", gt_genome_node_get_line_number(first_node),
                   gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);

    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                                (const char*) key))) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                    tname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gn);
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}
/* Hashmap callback which builds one mRNA feature.
   <key> is the transcript ID, <value> the (non-empty) array of genome nodes
   belonging to the transcript and <data> the ConstructionInfo.
   In a first pass, every node flagged as stop codon is merged into a directly
   adjacent CDS node and then removed from the array. A stop codon which is
   contained in a CDS, or has no flanking CDS at all, is an error unless
   <cinfo>->tidy is set, in which case only a warning is issued.
   Afterwards an mRNA node spanning all remaining nodes is created, gets the
   ID, transcript_id and (if known and non-empty) Name attributes, receives
   the remaining nodes as children and is appended to <cinfo>->mRNAs.
   Returns 0 on success and -1 (with <err> set) on failure. */
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
  /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* TODO: support discontinuous start/stop codons */
  i = 0;
  while (!had_err && i < gt_array_size(gt_genome_node_array)) {
    GtUword j;
    GtRange stop_codon_rng;
    bool found_cds = false;

    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    if (!gt_feature_node_get_attribute((GtFeatureNode*) gn,
                                       GTF_PARSER_STOP_CODON_FLAG)) {
      i++;
      continue;
    }

    /* look for a CDS which contains or directly flanks the stop codon */
    stop_codon_rng = gt_genome_node_get_range(gn);
    for (j = 0; j < gt_array_size(gt_genome_node_array); j++) {
      GtGenomeNode *gn2;
      GtRange this_rng;
      const char *this_type;
      gn2 = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, j);
      if (gn == gn2)
        continue;
      this_rng = gt_genome_node_get_range(gn2);
      this_type = gt_feature_node_get_type((GtFeatureNode*) gn2);
      if (this_type != gt_symbol(gt_ft_CDS))
        continue;
      if (gt_range_contains(&this_rng, &stop_codon_rng)) {
        if (cinfo->tidy) {
          gt_warning("stop codon on line %u in file %s is contained in "
                     "CDS in line %u",
                     gt_genome_node_get_line_number(gn),
                     gt_genome_node_get_filename(gn),
                     gt_genome_node_get_line_number(gn2));
          found_cds = true;
        }
        else {
          gt_error_set(err, "stop codon on line %u in file %s is "
                       "contained in CDS in line %u",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn),
                       gt_genome_node_get_line_number(gn2));
          had_err = -1;
        }
        break;
      }
      if (this_rng.end + 1 == stop_codon_rng.start) {
        /* stop codon follows the CDS -> extend the CDS to the right */
        this_rng.end = stop_codon_rng.end;
        gt_genome_node_set_range(gn2, &this_rng);
        found_cds = true;
        break;
      }
      if (this_rng.start == stop_codon_rng.end + 1) {
        /* stop codon precedes the CDS -> extend the CDS to the left */
        this_rng.start = stop_codon_rng.start;
        gt_genome_node_set_range(gn2, &this_rng);
        found_cds = true;
        break;
      }
    }
    if (had_err)
      break;

    if (found_cds) {
      /* The stop codon is now represented by a CDS, drop it. <i> is
         deliberately not advanced: after the removal, slot <i> holds a node
         which has not been examined yet. */
      gt_array_rem(gt_genome_node_array, i);
      gt_genome_node_delete(gn);
    }
    else if (cinfo->tidy) {
      gt_warning("found stop codon on line %u in file %s with no "
                 "flanking CDS, ignoring it",
                 gt_genome_node_get_line_number(gn),
                 gt_genome_node_get_filename(gn));
      i++;
    }
    else {
      gt_error_set(err, "found stop codon on line %u in file %s with no "
                   "flanking CDS",
                   gt_genome_node_get_line_number(gn),
                   gt_genome_node_get_filename(gn));
      had_err = -1;
    }
  }
  if (had_err)
    return had_err;

  /* Determine the range, the strand and the sequence ID of the mRNA. This
     has to happen after the stop codon pass, because that pass may remove
     and delete the node which was first in the array; a node is only removed
     if another (CDS) node was found, so the array cannot be empty here. */
  gt_assert(gt_array_size(gt_genome_node_array));
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);

  for (i = 1; i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    GtStrand strand;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    if (strand != mRNA_strand) {
      gt_error_set(err, "feature %s on line %u has strand %c, but the "
                   "parent transcript has strand %c",
                   (const char*) key,
                   gt_genome_node_get_line_number(gn),
                   GT_STRAND_CHARS[strand],
                   GT_STRAND_CHARS[mRNA_strand]);
      had_err = -1;
      break;
    }
    mRNA_strand = gt_strand_join(mRNA_strand, strand);
    if (gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                   "genomic sequences (``seqname''), although they have the "
                   "same gene IDs (``gene_id'') which must be globally "
                   "unique", gt_genome_node_get_line_number(first_node),
                   gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "ID", key);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node),
                                  "transcript_id", key);

    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                                (const char*) key)) && strlen(tname) > 0) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                    tname);
    }

    /* register children; the mRNA gets its own reference to every node */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gt_genome_node_ref(gn));
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}