/* Attach the best-chain hits of one protein domain model to the element held
   by the stream passed in <data> (a GtLTRdigestStream). One GT_PDOM_TYPE
   child feature is created per single hit and added to the element's main
   node. Hits whose best strand differs from the strand of the main node are
   not reported. <err> is unused; the function always returns 0. */
static int pdom_hit_attach_gff3(GtPdomModel *model, GtPdomModelHit *hit,
                                void *data, GT_UNUSED GtError *err)
{
  GtLTRdigestStream *ls = (GtLTRdigestStream *) data;
  GtStrand strand;
  unsigned long hitnum;

  gt_assert(model && hit);
  strand = gt_pdom_model_hit_get_best_strand(hit);

  /* do not use the hits on the non-predicted strand
     -- maybe identify nested elements ? */
  if (strand != gt_feature_node_get_strand(ls->element.mainnode))
    return 0;

  for (hitnum = 0;
       hitnum < gt_pdom_model_hit_best_chain_length(hit);
       hitnum++) {
    GtPdomSingleHit *singlehit;
    GtStr *alignment, *aaseq;
    GtPhase frame;
    GtRange coords;
    GtGenomeNode *node;
    GtFeatureNode *feature;

    singlehit = gt_pdom_model_hit_best_single_hit(hit, hitnum);
    alignment = gt_str_new();
    aaseq = gt_str_new();
    frame = gt_pdom_single_hit_get_phase(singlehit);
    coords = gt_pdom_single_hit_get_range(singlehit);
    gt_pdom_single_hit_format_alignment(singlehit, GT_ALIWIDTH, alignment);
    gt_pdom_single_hit_get_aaseq(singlehit, aaseq);

    /* GFF3 is 1-based */
    coords.start++;
    coords.end++;

    node = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                      ls->element.mainnode),
                               GT_PDOM_TYPE,
                               coords.start,
                               coords.end,
                               strand);
    feature = (GtFeatureNode*) node;

    /* the strings are handed over to the node together with their
       destructor */
    gt_genome_node_add_user_data(node, "pdom_alignment", alignment,
                                 (GtFree) gt_str_delete);
    gt_genome_node_add_user_data(node, "pdom_aaseq", aaseq,
                                 (GtFree) gt_str_delete);

    gt_feature_node_set_source(feature, ls->ltrdigest_tag);
    gt_feature_node_set_score(feature,
                              gt_pdom_single_hit_get_evalue(singlehit));
    gt_feature_node_set_phase(feature, frame);

    /* model name and accession are optional */
    if (gt_pdom_model_get_name(model)) {
      gt_feature_node_add_attribute(feature, "name",
                                    gt_pdom_model_get_name(model));
    }
    if (gt_pdom_model_get_acc(model)) {
      gt_feature_node_add_attribute(feature, "id",
                                    gt_pdom_model_get_acc(model));
    }

    gt_feature_node_add_child(ls->element.mainnode, feature);
  }
  return 0;
}
/* Attach <singlehit> (a hit of the model described by <modelhit>) as a
   gt_ft_protein_match child feature to the retrotransposon node kept in
   <lv>. A feature is only created if the hit is part of at least one chain
   or if the visitor is configured to output all chains. In either case the
   chain array of <singlehit> is deleted and reset to NULL afterwards.
   Always returns 0. */
static int gt_ltrdigest_pdom_visitor_attach_hit(GtLTRdigestPdomVisitor *lv,
                                                GtHMMERModelHit *modelhit,
                                                GtHMMERSingleHit *singlehit)
{
  GtRange rrng;
  gt_assert(lv && singlehit);

  rrng = gt_ltrdigest_pdom_visitor_coords(lv, singlehit);

  if (gt_array_size(singlehit->chains) > 0 || lv->output_all_chains) {
    char buf[32];
    GtGenomeNode *gf;
    GtFeatureNode *feature;

    gf = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                      lv->ltr_retrotrans),
                             gt_ft_protein_match,
                             rrng.start,
                             rrng.end,
                             singlehit->strand);
    feature = (GtFeatureNode*) gf;

    /* the node gets its own references to the alignment and the amino acid
       sequence, released again via gt_str_delete() */
    gt_genome_node_add_user_data(gf, "pdom_alignment",
                                 gt_str_ref(singlehit->alignment),
                                 (GtFree) gt_str_delete);
    gt_genome_node_add_user_data(gf, "pdom_aaseq",
                                 gt_str_ref(singlehit->aastring),
                                 (GtFree) gt_str_delete);
    gt_feature_node_set_source(feature, lv->tag);
    gt_feature_node_set_score(feature, (float) singlehit->evalue);

    (void) snprintf(buf, sizeof buf, "%d", (int) singlehit->frame);
    gt_feature_node_add_attribute(feature, "reading_frame", buf);

    if (modelhit->modelname != NULL) {
      gt_feature_node_add_attribute(feature, "name", modelhit->modelname);
    }

    if (gt_array_size(singlehit->chains) > 1UL && lv->output_all_chains) {
      GtStr *buffer;
      GtUword j;
      gt_assert(singlehit->chains != NULL);
      buffer = gt_str_new();
      /* comma-separated list of "<modelname>:<chain number>" entries */
      for (j = 0UL; j < gt_array_size(singlehit->chains); j++) {
        /* the model name is optional (see the check above), so it must not
           be passed on unchecked; without a name only ":<chain number>" is
           written */
        if (modelhit->modelname != NULL)
          gt_str_append_cstr(buffer, modelhit->modelname);
        gt_str_append_char(buffer, ':');
        gt_str_append_ulong(buffer,
                            *(GtUword*) gt_array_get(singlehit->chains, j));
        if (j != gt_array_size(singlehit->chains) - 1) {
          gt_str_append_char(buffer, ',');
        }
      }
      gt_feature_node_set_attribute(feature, "chains", gt_str_get(buffer));
      gt_str_delete(buffer);
    }
    gt_feature_node_add_child(lv->ltr_retrotrans, feature);
  }

  gt_array_delete(singlehit->chains);
  singlehit->chains = NULL;
  return 0;
}
/* Report one PBS hit from <results> as a GT_PBS_TYPE child feature of the
   main node of <element>, using <tag> as the feature source.
   If <*canonical_strand> is GT_STRAND_UNKNOWN, the top-ranked hit is used and
   its strand is stored in <*canonical_strand>. Otherwise the ranked hits are
   scanned in order for the first one on the canonical strand; if there is
   none, nothing is attached. The strand of the main node is set to the strand
   of the reported hit. */
static void pbs_attach_results_to_gff3(GtPBSResults *results,
                                       GtLTRElement *element,
                                       GtStrand *canonical_strand,
                                       GtStr *tag)
{
  char numbuf[BUFSIZ];
  unsigned long rank = 0;
  GtRange coords;
  GtGenomeNode *node;
  GtFeatureNode *feature;
  GtPBSHit *hit;

  hit = gt_pbs_results_get_ranked_hit(results, rank++);

  if (*canonical_strand != GT_STRAND_UNKNOWN) {
    /* do we have to satisfy a strand constraint?
       then find best-scoring PBS on the given canonical strand */
    while (gt_pbs_hit_get_strand(hit) != *canonical_strand
             && rank < gt_pbs_results_get_number_of_hits(results)) {
      gt_log_log("dropping PBS because of nonconsistent strand: %s\n",
                 gt_feature_node_get_attribute(element->mainnode, "ID"));
      hit = gt_pbs_results_get_ranked_hit(results, rank++);
    }
    /* if there is none, do not report a PBS */
    if (gt_pbs_hit_get_strand(hit) != *canonical_strand)
      return;
  }
  else {
    /* no constraint yet: the best hit defines the canonical strand */
    *canonical_strand = gt_pbs_hit_get_strand(hit);
  }

  coords = gt_pbs_hit_get_coords(hit);
  /* GFF3 is 1-based */
  coords.start++;
  coords.end++;

  node = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                      element->mainnode),
                             GT_PBS_TYPE,
                             coords.start,
                             coords.end,
                             gt_pbs_hit_get_strand(hit));
  feature = (GtFeatureNode*) node;

  gt_feature_node_set_source(feature, tag);
  gt_feature_node_set_score(feature, (float) gt_pbs_hit_get_score(hit));
  if (gt_pbs_hit_get_trna(hit) != NULL) {
    gt_feature_node_add_attribute(feature, "trna", gt_pbs_hit_get_trna(hit));
  }
  gt_feature_node_set_strand(element->mainnode, gt_pbs_hit_get_strand(hit));

  /* numeric hit properties are stored as string attributes */
  (void) snprintf(numbuf, BUFSIZ-1, "%lu", gt_pbs_hit_get_tstart(hit));
  gt_feature_node_add_attribute(feature, "trnaoffset", numbuf);

  (void) snprintf(numbuf, BUFSIZ-1, "%lu", gt_pbs_hit_get_offset(hit));
  gt_feature_node_add_attribute(feature, "pbsoffset", numbuf);

  (void) snprintf(numbuf, BUFSIZ-1, "%lu", gt_pbs_hit_get_edist(hit));
  gt_feature_node_add_attribute(feature, "edist", numbuf);

  gt_feature_node_add_child(element->mainnode, feature);
}
/* If the visitor has collected more than one CDS segment in v->cds, turn the
   segments into a single multi-feature: the first segment becomes the
   multi-feature representative (receiving a generated "CDS<n>" ID if it has
   none, which increments v->cdscounter), and every segment that is not yet
   part of a multi-feature is linked to that representative. With zero or one
   segment nothing is done. */
static void infer_cds_visitor_check_cds_multi(AgnInferCDSVisitor *v)
{
  if(gt_array_size(v->cds) <= 1)
  {
    return;
  }

  GtFeatureNode **firstsegment = gt_array_get(v->cds, 0);
  const char *id = gt_feature_node_get_attribute(*firstsegment, "ID");
  if(id == NULL)
  {
    char newid[64];
    /* bounded write: never runs past the end of newid */
    snprintf(newid, sizeof newid, "CDS%lu", v->cdscounter++);
    gt_feature_node_add_attribute(*firstsegment, "ID", newid);
  }
  gt_feature_node_make_multi_representative(*firstsegment);

  GtUword i;
  for(i = 0; i < gt_array_size(v->cds); i++)
  {
    GtFeatureNode **segment = gt_array_get(v->cds, i);
    /* the representative itself is already multi and therefore skipped */
    if(!gt_feature_node_is_multi(*segment))
    {
      gt_feature_node_set_multi_representative(*segment, *firstsegment);
    }
  }
}
static int gaeval_visitor_visit_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn, GtError *error) { AgnGaevalVisitor *v = gaeval_visitor_cast(nv); gt_error_check(error); GtFeatureNodeIterator *feats = gt_feature_node_iterator_new(fn); GtFeatureNode *tempfeat; for(tempfeat = gt_feature_node_iterator_next(feats); tempfeat != NULL; tempfeat = gt_feature_node_iterator_next(feats)) { if(agn_typecheck_mrna(tempfeat) == false) continue; double coverage = gaeval_visitor_calculate_coverage(v, tempfeat, error); char covstr[16]; sprintf(covstr, "%.3lf", coverage); gt_feature_node_add_attribute(tempfeat, "gaeval_coverage", covstr); double integrity_components[5]; double integrity = gaeval_visitor_calculate_integrity( v, tempfeat, coverage, integrity_components, error ); char intstr[16]; sprintf(intstr, "%.3lf", integrity); gt_feature_node_add_attribute(tempfeat, "gaeval_integrity", intstr); if(v->tsvout) { const char *mrnaid = gt_feature_node_get_attribute(tempfeat, "ID"); const char *mrnalabel = agn_feature_node_get_label(tempfeat); GtUword num_introns = agn_typecheck_count(tempfeat, agn_typecheck_intron); fprintf(v->tsvout, "%s\t%s\t%s\t%s\t%lu\t%.3lf\t%.3lf\t%.3lf\t%.3lf\n", mrnaid, mrnalabel, intstr, covstr, num_introns, integrity_components[0], integrity_components[1], integrity_components[2], integrity_components[3]); } } gt_feature_node_iterator_delete(feats); return 0; }
static int create_block_features(GtBEDParser *bed_parser, GtFeatureNode *fn, GtUword block_count, GtSplitter *size_splitter, GtSplitter *start_splitter, GtIO *bed_file, GtError *err) { GtUword i; int had_err = 0; gt_assert(fn && block_count && size_splitter && start_splitter); gt_assert(gt_splitter_size(size_splitter) == block_count); gt_assert(gt_splitter_size(start_splitter) == block_count); for (i = 0; !had_err && i < block_count; i++) { GtUword block_size, block_start, start, end; GtGenomeNode *block; const char *name; if (gt_parse_uword(&block_size, gt_splitter_get_token(size_splitter, i))) { gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockSize '%s'", gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file), gt_splitter_get_token(size_splitter, i)); had_err = -1; } if (!had_err && gt_parse_uword(&block_start, gt_splitter_get_token(start_splitter, i))) { gt_error_set(err, "file \"%s\": line "GT_WU": could not parse blockStart " "'%s'", gt_io_get_filename(bed_file), gt_io_get_line_number(bed_file), gt_splitter_get_token(start_splitter, i)); had_err = -1; } if (!had_err) { start = gt_genome_node_get_start((GtGenomeNode*) fn) + block_start; end = start + block_size - 1; block = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*) fn), bed_parser->block_type ? bed_parser->block_type : BED_BLOCK_TYPE, start, end, gt_feature_node_get_strand(fn)); if ((name = gt_feature_node_get_attribute(fn, GT_GFF_NAME))) { gt_feature_node_add_attribute((GtFeatureNode*) block, GT_GFF_NAME, name); } gt_feature_node_set_score((GtFeatureNode*) block, gt_feature_node_get_score(fn)); gt_feature_node_set_strand((GtFeatureNode*) block, gt_feature_node_get_strand(fn)); gt_feature_node_add_child(fn, (GtFeatureNode*) block); } } return had_err; }
/* Add a child feature spanning <range> to <fn>. The child gets the type
   bed_parser->thick_feature_type if that is set (BED_THICK_FEATURE_TYPE
   otherwise), lies on the same sequence as <fn> and copies the "Name"
   attribute (if present), the score and the strand of <fn>. */
static void construct_thick_feature(GtBEDParser *bed_parser, GtFeatureNode *fn,
                                    GtRange range)
{
  GtFeatureNode *thick;
  GtGenomeNode *thick_node;
  const char *parent_name;

  gt_assert(fn);

  thick_node = gt_feature_node_new(gt_genome_node_get_seqid((GtGenomeNode*)
                                                            fn),
                                   bed_parser->thick_feature_type
                                     ? bed_parser->thick_feature_type
                                     : BED_THICK_FEATURE_TYPE,
                                   range.start,
                                   range.end,
                                   gt_feature_node_get_strand(fn));
  thick = (GtFeatureNode*) thick_node;

  /* the name is optional, score and strand are always copied */
  parent_name = gt_feature_node_get_attribute(fn, "Name");
  if (parent_name) {
    gt_feature_node_add_attribute(thick, "Name", parent_name);
  }
  gt_feature_node_set_score(thick, gt_feature_node_get_score(fn));
  gt_feature_node_set_strand(thick, gt_feature_node_get_strand(fn));

  gt_feature_node_add_child(fn, thick);
}
/* Parse one BED data line column by column and queue the resulting feature.
   The sequence ID comes from get_seqid(); chromStart and chromEnd are
   mandatory, columns 4 to 12 (name, score, strand, thickStart, thickEnd,
   itemRgb, blockCount, blockSizes, blockStarts) are optional.
   The region is registered with the region node builder and the new feature
   is appended to bed_parser->feature_nodes as soon as the range has been
   parsed, i.e. before the optional columns are looked at. After the last
   column the end of the line is expected.
   Returns 0 on success, otherwise the error code of the failing step (with
   <err> set by that step or by this function). */
static int bed_rest(GtBEDParser *bed_parser, GtIO *bed_file, GtError *err)
{
  GtGenomeNode *feature = NULL;
  GtUword block_count = 0;
  GtStr *seqid;
  GtRange range;
  int had_err;

  gt_error_check(err);

  /* column 1.: chrom */
  seqid = get_seqid(bed_parser);
  had_err = skip_blanks(bed_file, err);

  /* column 2.: chromStart */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    had_err = skip_blanks(bed_file, err);
  }

  /* column 3.: chromEnd */
  if (!had_err) {
    word(bed_parser->another_word, bed_file);
    had_err = parse_bed_range(&range, bed_parser->word,
                              bed_parser->another_word, bed_parser->offset,
                              bed_file, false, err);
  }

  if (!had_err) {
    /* add region */
    gt_region_node_builder_add_region(bed_parser->region_node_builder,
                                      gt_str_get(seqid), range);
    /* create feature */
    feature = gt_feature_node_new(seqid,
                                  bed_parser->feature_type
                                    ? bed_parser->feature_type
                                    : BED_FEATURE_TYPE,
                                  range.start, range.end, GT_STRAND_BOTH);
    gt_queue_add(bed_parser->feature_nodes, feature);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }

  /* optional column 4.: name */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      gt_feature_node_add_attribute((GtFeatureNode*) feature, GT_GFF_NAME,
                                    gt_str_get(bed_parser->word));
    }
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }

  /* optional column 5.: score */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      float score_value;
      bool score_is_defined;
      had_err = gt_parse_score(&score_is_defined, &score_value,
                               gt_str_get(bed_parser->word),
                               gt_io_get_line_number(bed_file),
                               gt_io_get_filename(bed_file), err);
      if (!had_err && score_is_defined)
        gt_feature_node_set_score((GtFeatureNode*) feature, score_value);
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);

  /* optional column 6.: strand */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      GtStrand strand;
      had_err = gt_parse_strand(&strand, gt_str_get(bed_parser->word),
                                gt_io_get_line_number(bed_file),
                                gt_io_get_filename(bed_file), err);
      if (!had_err)
        gt_feature_node_set_strand((GtFeatureNode*) feature, strand);
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);

  /* optional column 7.: thickStart */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }

  /* optional column 8.: thickEnd */
  if (!had_err) {
    word(bed_parser->another_word, bed_file);
    if (gt_str_length(bed_parser->another_word)) {
      gt_assert(gt_str_length(bed_parser->word));
      /* got a thickStart and a thickEnd -> construct corresponding feature */
      had_err = parse_bed_range(&range, bed_parser->word,
                                bed_parser->another_word, bed_parser->offset,
                                bed_file, true, err);
      /* an inverted thick range yields no thick feature */
      if (!had_err && range.start <= range.end)
        construct_thick_feature(bed_parser, (GtFeatureNode*) feature, range);
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);

  /* optional column 9.: itemRgb */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    /* we do not use the RGB values */
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }

  /* optional column 10.: blockCount */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (gt_str_length(bed_parser->word)) {
      if (gt_parse_uword(&block_count, gt_str_get(bed_parser->word))) {
        gt_error_set(err,
                     "file \"%s\": line " GT_WU ": could not parse blockCount",
                     gt_io_get_filename(bed_file),
                     gt_io_get_line_number(bed_file));
        had_err = -1;
      }
      else {
        /* reset to parse/process blockSizes and blockStarts properly */
        gt_str_reset(bed_parser->word);
        gt_str_reset(bed_parser->another_word);
      }
    }
  }
  if (!had_err && bed_separator(bed_file))
    had_err = skip_blanks(bed_file, err);

  /* optional column 11.: blockSizes */
  if (!had_err) {
    word(bed_parser->word, bed_file);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }

  /* optional column 12.: blockStarts */
  if (!had_err) {
    word(bed_parser->another_word, bed_file);
    if (bed_separator(bed_file))
      had_err = skip_blanks(bed_file, err);
  }

  /* process blocks if necessary */
  if (!had_err && block_count) {
    had_err = process_blocks(bed_parser, (GtFeatureNode*) feature,
                             block_count, bed_parser->word,
                             bed_parser->another_word, bed_file, err);
  }

  /* the end of the line should now be reached */
  if (!had_err)
    had_err = gt_io_expect(bed_file, GT_END_OF_LINE, err);

  return had_err;
}
/* Unit test for GtTrack. Four blocks (each holding a gene feature and its
   exon child) are inserted into a track using a base-wise line breaker. After
   each step the number of lines and the track height are compared with the
   values expected from the style (or the built-in defaults when the style
   does not set a value), including the effect of setting and unsetting
   "bar_height" for the "exon", "gene" and "format" sections.
   Returns 0 if all checks pass, a non-zero value otherwise. */
int gt_track_unit_test(GtError *err)
{
  int had_err = 0;
  GtBlock *b[4];
  GtRange spans[4];
  GtGenomeNode *genes[4],
               *exons[4];
  GtTrack *track;
  GtStr *title;
  GtStyle *sty;
  GtLineBreaker *breaker;
  double height,
         value,
         t_rest = 0,   /* height of everything in a track except its lines */
         l_rest = 0;   /* vertical space added per line */
  unsigned long k;
  gt_error_check(err);

  title = gt_str_new_cstr("test");

  /* 0 and 1 do not overlap; 2 overlaps both; 3 overlaps only 0 */
  spans[0].start = 100UL;   spans[0].end = 1000UL;
  spans[1].start = 1001UL;  spans[1].end = 1500UL;
  spans[2].start = 700UL;   spans[2].end = 1200UL;
  spans[3].start = 10UL;    spans[3].end = 200UL;

  for (k = 0; k < 4; k++) {
    genes[k] = gt_feature_node_new(title, gt_ft_gene, spans[k].start,
                                   spans[k].end, GT_STRAND_FORWARD);
    exons[k] = gt_feature_node_new(title, gt_ft_exon, spans[k].start,
                                   spans[k].end, GT_STRAND_FORWARD);
    gt_feature_node_add_child((GtFeatureNode*) genes[k],
                              (GtFeatureNode*) exons[k]);
    gt_feature_node_add_attribute((GtFeatureNode*) genes[k], GT_GFF_NAME,
                                  "parent");
    gt_feature_node_add_attribute((GtFeatureNode*) exons[k], GT_GFF_NAME,
                                  "child");
  }

  for (k = 0; k < 4; k++) {
    b[k] = gt_block_new();
    gt_block_set_range(b[k], spans[k]);
    gt_block_insert_element(b[k], (GtFeatureNode*) genes[k]);
    gt_block_insert_element(b[k], (GtFeatureNode*) exons[k]);
  }

  breaker = gt_line_breaker_bases_new();
  sty = gt_style_new(err);

  /* collect the line-independent part of the expected height */
  if (gt_style_get_num(sty, "format", "track_caption_font_size", &value, NULL,
                       err) == GT_STYLE_QUERY_NOT_SET) {
    value = TEXT_SIZE_DEFAULT;
  }
  t_rest += value;
  if (gt_style_get_num(sty, "format", "track_caption_space", &value, NULL,
                       err) == GT_STYLE_QUERY_NOT_SET) {
    value = CAPTION_BAR_SPACE_DEFAULT;
  }
  t_rest += value;
  if (gt_style_get_num(sty, "format", "track_vspace", &value, NULL, err)
        == GT_STYLE_QUERY_NOT_SET) {
    value = TRACK_VSPACE_DEFAULT;
  }
  t_rest += value;
  if (gt_style_get_num(sty, "format", "bar_vspace", &l_rest, NULL, err)
        == GT_STYLE_QUERY_NOT_SET) {
    l_rest = BAR_VSPACE_DEFAULT;
  }

  /* empty track */
  track = gt_track_new(title, GT_UNDEF_ULONG, true, breaker);
  gt_ensure(had_err, track);
  gt_ensure(had_err, gt_track_get_title(track) == title);
  gt_ensure(had_err, gt_track_get_number_of_lines(track) == 0);
  gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
  gt_ensure(had_err, height == t_rest);
  gt_ensure(had_err, !gt_error_is_set(err));

  /* first block opens the first line */
  gt_ensure(had_err, gt_track_insert_block(track, b[0], err) == 0);
  gt_ensure(had_err, !gt_error_is_set(err));
  gt_ensure(had_err, gt_track_get_number_of_lines(track) == 1);
  gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
  gt_ensure(had_err, height == t_rest + l_rest + BAR_HEIGHT_DEFAULT);
  gt_ensure(had_err, !gt_error_is_set(err));

  /* second block fits into the same line */
  gt_ensure(had_err, gt_track_insert_block(track, b[1], err) == 0);
  gt_ensure(had_err, !gt_error_is_set(err));
  gt_ensure(had_err, gt_track_get_number_of_lines(track) == 1);
  gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
  gt_ensure(had_err, height == t_rest + l_rest + BAR_HEIGHT_DEFAULT);
  gt_ensure(had_err, !gt_error_is_set(err));

  /* third block needs a second line, the fourth one fits into it */
  gt_ensure(had_err, gt_track_insert_block(track, b[2], err) == 0);
  gt_ensure(had_err, !gt_error_is_set(err));
  gt_ensure(had_err, gt_track_get_number_of_lines(track) == 2);
  gt_ensure(had_err, gt_track_insert_block(track, b[3], err) == 0);
  gt_ensure(had_err, !gt_error_is_set(err));
  gt_ensure(had_err, gt_track_get_number_of_lines(track) == 2);
  gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
  gt_ensure(had_err, height == t_rest + 2*(l_rest + BAR_HEIGHT_DEFAULT));
  gt_ensure(had_err, !gt_error_is_set(err));

  /* bar height set for exons only */
  gt_style_set_num(sty, "exon", "bar_height", 42);
  gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
  gt_ensure(had_err, height == t_rest + 2*(l_rest+42));
  gt_ensure(had_err, !gt_error_is_set(err));

  /* smaller bar height for genes: expected height is unchanged */
  gt_style_set_num(sty, "gene", "bar_height", 23);
  gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
  gt_ensure(had_err, height == t_rest + 2*(l_rest+42));
  gt_ensure(had_err, !gt_error_is_set(err));

  /* without the exon setting the gene setting determines the height */
  gt_style_unset(sty, "exon", "bar_height");
  gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
  gt_ensure(had_err, height == t_rest + 2*(l_rest+23));
  gt_ensure(had_err, !gt_error_is_set(err));

  /* only the global "format" setting left */
  gt_style_unset(sty, "gene", "bar_height");
  gt_style_set_num(sty, "format", "bar_height", 99);
  gt_ensure(had_err, gt_track_get_height(track, &height, sty, err) == 0);
  gt_ensure(had_err, height == t_rest + 2*(l_rest+99));
  gt_ensure(had_err, !gt_error_is_set(err));

  gt_ensure(had_err, gt_track_get_number_of_discarded_blocks(track) == 0);

  /* cleanup; only the gene nodes are deleted explicitly */
  gt_track_delete(track);
  gt_str_delete(title);
  gt_style_delete(sty);
  for (k = 0; k < 4; k++) {
    gt_block_delete(b[k]);
    gt_genome_node_delete(genes[k]);
  }

  return had_err;
}
/* Hashmap callback, called once per gene: <key> is the gene ID, <value> the
   hashmap from transcript IDs to feature arrays of that gene and <data> the
   ConstructionInfo.
   All mRNAs of the gene are built via construct_mRNAs(); then a gene feature
   spanning the joined range and strand of these mRNAs is created, named
   according to cinfo->gene_id_to_name_mapping (if an entry exists), given the
   mRNAs as children and appended to cinfo->genome_nodes.
   Returns 0 on success or the error code of construct_mRNAs(); in the latter
   case no gene feature is created. */
static int construct_genes(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  GtHashmap *transcript_id_hash = (GtHashmap*) value;
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtQueue *genome_nodes = cinfo->genome_nodes;
  const char *gname;
  GtArray *mRNAs = gt_array_new(sizeof (GtGenomeNode*));
  GtGenomeNode *gene_node, *gn;
  GtStrand gene_strand;
  GtRange gene_range;
  GtStr *gene_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);

  /* construct_mRNAs() collects its results in this array */
  cinfo->mRNAs = mRNAs;
  had_err = gt_hashmap_foreach(transcript_id_hash, construct_mRNAs, cinfo,
                               err);
  if (!had_err) {
    gt_assert(gt_array_size(mRNAs)); /* at least one mRNA constructed */

    /* determine the range and the strand of the gene */
    gn = *(GtGenomeNode**) gt_array_get(mRNAs, 0);
    gene_range = gt_genome_node_get_range(gn);
    gene_strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    gene_seqid = gt_genome_node_get_seqid(gn);
    for (i = 1; i < gt_array_size(mRNAs); i++) {
      GtRange range;
      gn = *(GtGenomeNode**) gt_array_get(mRNAs, i);
      range = gt_genome_node_get_range(gn);
      gene_range = gt_range_join(&gene_range, &range);
      gene_strand = gt_strand_join(gene_strand,
                          gt_feature_node_get_strand((GtFeatureNode*) gn));
      gt_assert(gt_str_cmp(gene_seqid, gt_genome_node_get_seqid(gn)) == 0);
    }

    gene_node = gt_feature_node_new(gene_seqid, gt_ft_gene, gene_range.start,
                                    gene_range.end, gene_strand);
    if ((gname = gt_hashmap_get(cinfo->gene_id_to_name_mapping,
                                (const char*) key))) {
      gt_feature_node_add_attribute((GtFeatureNode*) gene_node, GT_GFF_NAME,
                                    gname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(mRNAs); i++) {
      gn = *(GtGenomeNode**) gt_array_get(mRNAs, i);
      gt_feature_node_add_child((GtFeatureNode*) gene_node,
                                (GtFeatureNode*) gn);
    }

    /* store the gene */
    gt_queue_add(genome_nodes, gene_node);
  }

  /* free the temporary array on the error path as well (it used to be
     released only on success).
     NOTE(review): mRNA nodes already stored in the array when an error
     occurs are not deleted here -- confirm who owns them in that case */
  gt_array_delete(mRNAs);

  return had_err;
}
/* Hashmap callback, called once per transcript: <key> is the transcript ID,
   <value> the (non-empty) array of feature nodes belonging to the transcript
   and <data> the ConstructionInfo.
   An mRNA feature spanning the joined range of all features is created, named
   according to cinfo->transcript_id_to_name_mapping (if an entry exists),
   given the features as children and appended to cinfo->mRNAs.
   Returns 0 on success. Returns -1 and sets <err> if the features of the
   transcript do not all have the same strand or do not all refer to the same
   sequence; no mRNA feature is created then. */
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
  /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* determine the range and the strand of the mRNA */
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);
  for (i = 1; i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    GtStrand strand;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    /* report inconsistent strands as an input error instead of running into
       a failed assertion in gt_strand_join() */
    strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    if (strand != mRNA_strand) {
      gt_error_set(err, "The features on lines %u and %u have different "
                   "strands, although they belong to the same transcript "
                   "\"%s\"",
                   gt_genome_node_get_line_number(first_node),
                   gt_genome_node_get_line_number(gn),
                   (const char*) key);
      had_err = -1;
      break;
    }
    mRNA_strand = gt_strand_join(mRNA_strand, strand);
    if (gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                   "genomic sequences (``seqname''), although they have the "
                   "same gene IDs (``gene_id'') which must be globally unique",
                   gt_genome_node_get_line_number(first_node),
                   gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);
    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                                (const char*) key))) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                    tname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gn);
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}
int gt_gtf_parser_parse(GtGTFParser *parser, GtQueue *genome_nodes, GtStr *filenamestr, GtFile *fpin, bool be_tolerant, GtError *err) { GtStr *seqid_str, *source_str, *line_buffer; char *line; size_t line_length; GtUword i, line_number = 0; GtGenomeNode *gn; GtRange range; GtPhase phase_value; GtStrand gt_strand_value; GtSplitter *splitter, *attribute_splitter; float score_value; char *seqname, *source, *feature, *start, *end, *score, *strand, *frame, *attributes, *token, *gene_id, *gene_name = NULL, *transcript_id, *transcript_name = NULL, **tokens; GtHashmap *transcript_id_hash; /* map from transcript id to array of genome nodes */ GtArray *gt_genome_node_array; ConstructionInfo cinfo; GTF_feature_type gtf_feature_type; GT_UNUSED bool gff_type_is_valid = false; const char *type = NULL; const char *filename; bool score_is_defined; int had_err = 0; gt_assert(parser && genome_nodes); gt_error_check(err); filename = gt_str_get(filenamestr); /* alloc */ line_buffer = gt_str_new(); splitter = gt_splitter_new(), attribute_splitter = gt_splitter_new(); #define HANDLE_ERROR \ if (had_err) { \ if (be_tolerant) { \ fprintf(stderr, "skipping line: %s\n", gt_error_get(err)); \ gt_error_unset(err); \ gt_str_reset(line_buffer); \ had_err = 0; \ continue; \ } \ else { \ had_err = -1; \ break; \ } \ } while (gt_str_read_next_line_generic(line_buffer, fpin) != EOF) { line = gt_str_get(line_buffer); line_length = gt_str_length(line_buffer); line_number++; gene_name = gene_id = transcript_id = transcript_name = NULL; had_err = 0; if (line_length == 0) { gt_warning("skipping blank line " GT_WU " in file \"%s\"", line_number, filename); } else if (line[0] == '#') { /* storing comment */ if (line_length >= 2 && line[1] == '#') gn = gt_comment_node_new(line+2); /* store '##' line as '#' line */ else gn = gt_comment_node_new(line+1); gt_genome_node_set_origin(gn, filenamestr, line_number); gt_queue_add(genome_nodes, gn); } else { bool stop_codon = false; char *tokendup, *attrkey; 
GtStrArray *attrkeys, *attrvals; /* process tab delimited GTF line */ gt_splitter_reset(splitter); gt_splitter_split(splitter, line, line_length, '\t'); if (gt_splitter_size(splitter) != 9UL) { gt_error_set(err, "line " GT_WU " in file \"%s\" contains " GT_WU " tab (\\t) " "separated fields instead of 9", line_number, filename, gt_splitter_size(splitter)); had_err = -1; break; } tokens = gt_splitter_get_tokens(splitter); seqname = tokens[0]; source = tokens[1]; feature = tokens[2]; start = tokens[3]; end = tokens[4]; score = tokens[5]; strand = tokens[6]; frame = tokens[7]; attributes = tokens[8]; /* parse feature */ if (GTF_feature_type_get(>f_feature_type, feature) == -1) { /* we skip unknown features */ fprintf(stderr, "skipping line " GT_WU " in file \"%s\": unknown " "feature: \"%s\"\n", line_number, filename, feature); gt_str_reset(line_buffer); continue; } /* translate into GFF3 feature type */ switch (gtf_feature_type) { case GTF_stop_codon: stop_codon = true; case GTF_CDS: gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker, gt_ft_CDS); type = gt_ft_CDS; break; case GTF_exon: gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker, gt_ft_exon); type = gt_ft_exon; break; case GTF_start_codon: /* we can skip the start codons, they are part of the CDS anyway */ gt_str_reset(line_buffer); continue; } gt_assert(gff_type_is_valid); /* parse the range */ had_err = gt_parse_range(&range, start, end, line_number, filename, err); HANDLE_ERROR; /* process seqname (we have to do it here because we need the range) */ gt_region_node_builder_add_region(parser->region_node_builder, seqname, range); /* parse the score */ had_err = gt_parse_score(&score_is_defined, &score_value, score, line_number, filename, err); HANDLE_ERROR; /* parse the strand */ had_err = gt_parse_strand(>_strand_value, strand, line_number, filename, err); HANDLE_ERROR; /* parse the frame */ had_err = gt_parse_phase(&phase_value, frame, line_number, filename, err); HANDLE_ERROR; 
/* parse the attributes */ attrkeys = gt_str_array_new(); attrvals = gt_str_array_new(); gt_splitter_reset(attribute_splitter); gene_id = NULL; transcript_id = NULL; gt_splitter_split(attribute_splitter, attributes, strlen(attributes), ';'); for (i = 0; i < gt_splitter_size(attribute_splitter); i++) { token = gt_splitter_get_token(attribute_splitter, i); /* skip leading blanks */ while (*token == ' ') token++; tokendup = gt_cstr_dup(token); attrkey = strtok(tokendup, " "); if (attrkey) { char *attrval = strtok(NULL, " "); if (attrval == NULL || strcmp(attrval, "") == 0 || strcmp(attrval, "\"\"") == 0) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU " in file \"%s\"", attrkey,line_number,filename); had_err = -1; } HANDLE_ERROR; if (*attrval == '"') attrval++; if (attrval[strlen(attrval)-1] == '"') attrval[strlen(attrval)-1] = '\0'; gt_assert(attrkey && strlen(attrkey) > 0); gt_assert(attrval && strlen(attrval) > 0); gt_str_array_add_cstr(attrkeys, attrkey); gt_str_array_add_cstr(attrvals, attrval); } gt_free(tokendup); /* look for the two mandatory attributes */ if (strncmp(token, GENE_ID_ATTRIBUTE, strlen(GENE_ID_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(GENE_ID_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", GENE_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; gene_id = token + strlen(GENE_ID_ATTRIBUTE) + 1; if (*gene_id == '"') gene_id++; if (gene_id[strlen(gene_id)-1] == '"') gene_id[strlen(gene_id)-1] = '\0'; } else if (strncmp(token, TRANSCRIPT_ID_ATTRIBUTE, strlen(TRANSCRIPT_ID_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(TRANSCRIPT_ID_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; transcript_id = token + strlen(TRANSCRIPT_ID_ATTRIBUTE) + 1; if (*transcript_id == '"') transcript_id++; if 
(transcript_id[strlen(transcript_id)-1] == '"') transcript_id[strlen(transcript_id)-1] = '\0'; } else if (strncmp(token, GENE_NAME_ATTRIBUTE, strlen(GENE_NAME_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(GENE_NAME_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", GENE_NAME_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; gene_name = token + strlen(GENE_NAME_ATTRIBUTE) + 1; /* for output we want to strip quotes */ if (*gene_name == '"') gene_name++; if (gene_name[strlen(gene_name)-1] == '"') gene_name[strlen(gene_name)-1] = '\0'; } else if (strncmp(token, TRANSCRIPT_NAME_ATTRIBUTE, strlen(TRANSCRIPT_NAME_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(TRANSCRIPT_NAME_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", TRANSCRIPT_NAME_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; transcript_name = token + strlen(TRANSCRIPT_NAME_ATTRIBUTE) + 1; /* for output we want to strip quotes */ if (*transcript_name == '"') transcript_name++; if (transcript_name[strlen(transcript_name)-1] == '"') transcript_name[strlen(transcript_name)-1] = '\0'; } } /* check for the mandatory attributes */ if (!gene_id) { gt_error_set(err, "missing attribute \"%s\" on line " GT_WU " in file \"%s\"", GENE_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; if (!transcript_id) { gt_error_set(err, "missing attribute \"%s\" on line " GT_WU " in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; /* process the mandatory attributes */ if (!(transcript_id_hash = gt_hashmap_get(parser->gene_id_hash, gene_id))) { transcript_id_hash = gt_hashmap_new(GT_HASH_STRING, gt_free_func, (GtFree) gt_array_delete); gt_hashmap_add(parser->gene_id_hash, gt_cstr_dup(gene_id), transcript_id_hash); } gt_assert(transcript_id_hash); if (!(gt_genome_node_array = gt_hashmap_get(transcript_id_hash, transcript_id))) { 
gt_genome_node_array = gt_array_new(sizeof (GtGenomeNode*)); gt_hashmap_add(transcript_id_hash, gt_cstr_dup(transcript_id), gt_genome_node_array); } gt_assert(gt_genome_node_array); /* save optional gene_name and transcript_name attributes */ if (transcript_name && strlen(transcript_name) > 0 && !gt_hashmap_get(parser->transcript_id_to_name_mapping, transcript_id)) { gt_hashmap_add(parser->transcript_id_to_name_mapping, gt_cstr_dup(transcript_id), gt_cstr_dup(transcript_name)); } if (gene_name && strlen(gene_name) > 0 && !gt_hashmap_get(parser->gene_id_to_name_mapping, gene_id)) { gt_hashmap_add(parser->gene_id_to_name_mapping, gt_cstr_dup(gene_id), gt_cstr_dup(gene_name)); } /* get seqid */ seqid_str = gt_hashmap_get(parser->seqid_to_str_mapping, seqname); if (!seqid_str) { seqid_str = gt_str_new_cstr(seqname); gt_hashmap_add(parser->seqid_to_str_mapping, gt_str_get(seqid_str), seqid_str); } gt_assert(seqid_str); /* construct the new feature */ gn = gt_feature_node_new(seqid_str, type, range.start, range.end, gt_strand_value); gt_genome_node_set_origin(gn, filenamestr, line_number); if (stop_codon) { gt_feature_node_add_attribute((GtFeatureNode*) gn, GTF_PARSER_STOP_CODON_FLAG, "true"); } for (i = 0; i < gt_str_array_size(attrkeys); i++) { GtFeatureNode *fn = (GtFeatureNode *)gn; const char *key = gt_str_array_get(attrkeys, i); const char *val = gt_str_array_get(attrvals, i); /* Not a comprehensive solution to ensure correct encoding, just bare minimum required to get Cufflinks output parsed */ if (strcmp(val, "=") == 0) val = "%26"; if (gt_feature_node_get_attribute(fn, key) != NULL) { const char *oldval = gt_feature_node_get_attribute(fn, key); GtStr *newval = gt_str_new_cstr(oldval); gt_str_append_char(newval, ','); gt_str_append_cstr(newval, val); gt_feature_node_set_attribute(fn, key, gt_str_get(newval)); gt_str_delete(newval); } else gt_feature_node_add_attribute(fn, key, val); } gt_str_array_delete(attrkeys); gt_str_array_delete(attrvals); /* set source */ 
source_str = gt_hashmap_get(parser->source_to_str_mapping, source); if (!source_str) { source_str = gt_str_new_cstr(source); gt_hashmap_add(parser->source_to_str_mapping, gt_str_get(source_str), source_str); } gt_assert(source_str); gt_feature_node_set_source((GtFeatureNode*) gn, source_str); if (score_is_defined) gt_feature_node_set_score((GtFeatureNode*) gn, score_value); if (phase_value != GT_PHASE_UNDEFINED) gt_feature_node_set_phase((GtFeatureNode*) gn, phase_value); gt_array_add(gt_genome_node_array, gn); } gt_str_reset(line_buffer); } /* process all region nodes */ if (!had_err) gt_region_node_builder_build(parser->region_node_builder, genome_nodes); /* process all feature nodes */ cinfo.genome_nodes = genome_nodes; cinfo.tidy = be_tolerant; cinfo.gene_id_to_name_mapping = parser->gene_id_to_name_mapping; cinfo.transcript_id_to_name_mapping = parser->transcript_id_to_name_mapping; if (!had_err) { had_err = gt_hashmap_foreach(parser->gene_id_hash, construct_genes, &cinfo, err); } gt_hashmap_foreach(parser->gene_id_hash, delete_genes, NULL, err); /* free */ gt_splitter_delete(splitter); gt_splitter_delete(attribute_splitter); gt_str_delete(line_buffer); return had_err; }
/*
 * Build one mRNA feature node from the features collected for one transcript.
 *
 * The signature matches the callbacks passed to gt_hashmap_foreach()
 * elsewhere in this file (presumably this function is invoked the same way).
 *
 *   key    transcript ID (C string); becomes the "ID" and "transcript_id"
 *          attributes of the new mRNA node
 *   value  GtArray of GtGenomeNode* holding the features of this transcript;
 *          must contain at least one node
 *   data   ConstructionInfo*; its <mRNAs> array receives the new node, its
 *          <tidy> flag downgrades stop codon problems to warnings
 *   err    error object, set when -1 is returned
 *
 * Processing steps:
 *   1. Every node flagged with GTF_PARSER_STOP_CODON_FLAG is merged into a
 *      directly adjacent CDS of the same transcript (the CDS range is
 *      extended to cover the stop codon) and then removed from the array and
 *      deleted.
 *   2. The mRNA range is the join of the ranges of all remaining nodes; all
 *      of them must agree in strand and seqid.
 *   3. The remaining nodes are added as children of the new mRNA node, which
 *      is appended to cinfo->mRNAs.
 *
 * Returns 0 on success and -1 on error.
 */
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);
  /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* TODO: support discontinuous start/stop codons */

  /* Step 1: merge stop codons into their flanking CDS.
     The index is advanced manually: after gt_array_rem() a not yet examined
     node occupies slot <i>, so <i> must stay put in that case. */
  i = 0;
  while (!had_err && i < gt_array_size(gt_genome_node_array)) {
    GtUword j;
    GtRange stop_codon_rng;
    bool found_cds = false;

    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    if (!gt_feature_node_get_attribute((GtFeatureNode*) gn,
                                       GTF_PARSER_STOP_CODON_FLAG)) {
      i++;
      continue;
    }

    stop_codon_rng = gt_genome_node_get_range(gn);
    for (j = 0; !had_err && j < gt_array_size(gt_genome_node_array); j++) {
      GtGenomeNode *gn2;
      GtRange this_rng;
      const char *this_type;

      gn2 = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, j);
      if (gn == gn2)
        continue;
      this_rng = gt_genome_node_get_range(gn2);
      this_type = gt_feature_node_get_type((GtFeatureNode*) gn2);
      if (this_type != gt_symbol(gt_ft_CDS))
        continue;

      if (gt_range_contains(&this_rng, &stop_codon_rng)) {
        /* the stop codon lies inside the CDS */
        if (cinfo->tidy) {
          gt_warning("stop codon on line %u in file %s is contained in "
                     "CDS in line %u",
                     gt_genome_node_get_line_number(gn),
                     gt_genome_node_get_filename(gn),
                     gt_genome_node_get_line_number(gn2));
          found_cds = true;
        }
        else {
          gt_error_set(err, "stop codon on line %u in file %s is "
                       "contained in CDS in line %u",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn),
                       gt_genome_node_get_line_number(gn2));
          had_err = -1;
        }
        break;
      }
      if (this_rng.end + 1 == stop_codon_rng.start) {
        /* stop codon directly follows the CDS: extend the CDS end */
        this_rng.end = stop_codon_rng.end;
        gt_genome_node_set_range(gn2, &this_rng);
        found_cds = true;
        break;
      }
      if (this_rng.start == stop_codon_rng.end + 1) {
        /* stop codon directly precedes the CDS: extend the CDS start */
        this_rng.start = stop_codon_rng.start;
        gt_genome_node_set_range(gn2, &this_rng);
        found_cds = true;
        break;
      }
    }

    if (found_cds) {
      /* the stop codon is now represented by the CDS; drop its own node and
         re-examine slot <i>, which holds the next unvisited node */
      gt_array_rem(gt_genome_node_array, i);
      gt_genome_node_delete(gn);
    }
    else {
      if (!had_err) {
        if (cinfo->tidy) {
          gt_warning("found stop codon on line %u in file %s with no "
                     "flanking CDS, ignoring it",
                     gt_genome_node_get_line_number(gn),
                     gt_genome_node_get_filename(gn));
        }
        else {
          gt_error_set(err, "found stop codon on line %u in file %s with no "
                       "flanking CDS",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn));
          had_err = -1;
        }
      }
      i++;
    }
  }

  if (had_err)
    return had_err;

  /* A node is only removed when another node (the CDS it was merged into)
     stays in the array, hence the array cannot have become empty. */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* Step 2: determine the range and the strand of the mRNA.
     This is done after the stop codon handling on purpose: the node at
     index 0 may have been deleted or may have had its range extended. */
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);

  for (i = 1; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    GtStrand strand;

    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    if (strand != mRNA_strand) {
      gt_error_set(err, "feature %s on line %u has strand %c, but the "
                   "parent transcript has strand %c",
                   (const char*) key,
                   gt_genome_node_get_line_number(gn),
                   GT_STRAND_CHARS[strand],
                   GT_STRAND_CHARS[mRNA_strand]);
      had_err = -1;
      break;
    }
    else {
      mRNA_strand = gt_strand_join(mRNA_strand, strand);
    }
    if (gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                   "genomic sequences (``seqname''), although they have the "
                   "same gene IDs (``gene_id'') which must be globally unique",
                   gt_genome_node_get_line_number(first_node),
                   gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  /* Step 3: create the mRNA node and attach the transcript's features */
  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "ID", key);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node),
                                  "transcript_id", key);

    /* optional transcript name saved by the parser */
    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                                (const char*) key))
          && strlen(tname) > 0) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                    tname);
    }

    /* register children; each child gets an additional reference */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gt_genome_node_ref(gn));
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}