/* Check the parsed options of the genomediff tool against the number of
   remaining command line arguments <rest_argc>.
   Sets the <with_esa>/<with_pck> flags from the <indextype> option, emits
   warnings for options which have no effect and stores whether a unit file
   was given in <with_units>.
   Returns 0 if the arguments are consistent, otherwise -1 (<err> is set). */
static int gt_genomediff_arguments_check(int rest_argc, void *tool_arguments,
                                         GtError *err)
{
  GtGenomediffArguments *arguments = tool_arguments;
  const char *index_type;
  bool uses_prepared_index;

  gt_error_check(err);
  gt_assert(arguments);

  if (rest_argc == 0) {
    gt_error_set(err, "give at least one file (base)name!");
    return -1;
  }

  /* translate the index type string into flags */
  index_type = gt_str_get(arguments->indextype);
  if (strcmp("esa", index_type) == 0)
    arguments->with_esa = true;
  else if (strcmp("pck", index_type) == 0)
    arguments->with_pck = true;
  uses_prepared_index = arguments->with_esa || arguments->with_pck;

  /* options without effect only trigger a warning */
  if (arguments->user_max_depth != -1 && !arguments->with_pck)
    gt_warning("option -maxdepth does only apply to -indextype pck");
  if (uses_prepared_index &&
      gt_encseq_options_mirrored_value(arguments->loadopts))
    gt_warning("option -mirrored is ignored with esa and pck index");

  /* a prepared index is addressed by exactly one basename */
  if (uses_prepared_index && rest_argc > 1) {
    gt_error_set(err, "there should be only one basename argument with "
                      "-indextype esa|pck");
    return -1;
  }
  /* a single argument is treated as an index, -indexname makes no sense */
  if (rest_argc == 1 && gt_str_length(arguments->indexname) != 0) {
    gt_error_set(err, "Option -indexname is only needed with sequence files, "
                      "if one file is given as argument, this should be an "
                      "index.");
    return -1;
  }
  /* several sequence files need a name for the index built from them */
  if (rest_argc > 1 && gt_str_length(arguments->indexname) == 0) {
    gt_error_set(err, "more than one input file given, please use -indexname "
                      "for basename of indices created during run.");
    return -1;
  }

  arguments->with_units = gt_option_is_set(arguments->ref_unitfile);
  return 0;
}
/* Warn if the range of <child> is not contained in the range of <parent>,
   then apply the same check recursively to the direct children of <child>
   (with <child> acting as their parent).
   The value returned is the result of the last recursive call (0 if <child>
   has no children). */
static int check_boundaries_visitor_check_rec(GtFeatureNode *parent,
                                              GtFeatureNode *child,
                                              GtError *err)
{
  GtGenomeNode *parent_gn = (GtGenomeNode*) parent,
               *child_gn = (GtGenomeNode*) child;
  GtFeatureNodeIterator *direct_children;
  GtFeatureNode *grandchild;
  GtRange child_range, parent_range;
  int result = 0;

  child_range = gt_genome_node_get_range(child_gn);
  parent_range = gt_genome_node_get_range(parent_gn);

  if (!(child_range.start >= parent_range.start &&
        child_range.end <= parent_range.end)) {
    gt_warning("%s child range " GT_WU "-" GT_WU " (file %s, line %u) not "
               "contained in %s parent range " GT_WU "-" GT_WU " (file %s, "
               "line %u)",
               gt_feature_node_get_type(child),
               child_range.start, child_range.end,
               gt_genome_node_get_filename(child_gn),
               gt_genome_node_get_line_number(child_gn),
               gt_feature_node_get_type(parent),
               parent_range.start, parent_range.end,
               gt_genome_node_get_filename(parent_gn),
               gt_genome_node_get_line_number(parent_gn));
  }

  /* descend one level: <child> becomes the parent of its direct children */
  direct_children = gt_feature_node_iterator_new_direct(child);
  for (grandchild = gt_feature_node_iterator_next(direct_children);
       grandchild != NULL;
       grandchild = gt_feature_node_iterator_next(direct_children)) {
    result = check_boundaries_visitor_check_rec(child, grandchild, err);
  }
  gt_feature_node_iterator_delete(direct_children);

  return result;
}
/* Runner of the splicesiteinfo tool: builds the stream chain
   GFF3 input -> (optional) add introns -> splice site info, pulls all nodes
   through it and shows the collected information on <arguments->outfp>.
   Returns 0 on success, a non-zero value otherwise (<err> is set).
   NOTE(review): <region_mapping> is not deleted in this function; presumably
   the splice site info stream takes ownership of it -- confirm. */
static int gt_splicesiteinfo_runner(int argc, const char **argv,
                                    int parsed_args, void *tool_arguments,
                                    GtError *err)
{
  SpliceSiteInfoArguments *arguments = tool_arguments;
  GtNodeStream *input_stream = NULL,
               *intron_stream = NULL,
               *info_stream = NULL;
  GtRegionMapping *region_mapping;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  /* the GFF3 input stream reads the files remaining on the command line */
  input_stream = gt_gff3_in_stream_new_unsorted(argc - parsed_args,
                                                argv + parsed_args);

  /* the region mapping gives access to the sequences */
  region_mapping = gt_seqid2file_region_mapping_new(arguments->s2fi, err);
  if (region_mapping == NULL)
    had_err = -1;

  if (!had_err) {
    GtNodeStream *source = input_stream;

    /* insert the addintrons stream only if requested */
    if (arguments->addintrons) {
      intron_stream = gt_add_introns_stream_new(input_stream);
      source = intron_stream;
    }
    info_stream = gt_splice_site_info_stream_new(source, region_mapping);

    /* pull the features through the stream and free them afterwards */
    had_err = gt_node_stream_pull(info_stream, err);
  }

  if (!had_err &&
      !gt_splice_site_info_stream_show(info_stream, arguments->outfp)) {
    gt_warning("input file(s) contained no intron, use option -addintrons to "
               "add introns automatically");
  }

  /* free */
  gt_node_stream_delete(info_stream);
  gt_node_stream_delete(intron_stream);
  gt_node_stream_delete(input_stream);

  return had_err;
}
/* Hashmap iteration callback: <key> is the tool name, <value> its GtToolinfo
   and <data> an IterateInfo holding the user callback and its data.
   Hidden tools are skipped silently, entries without a tool object are
   skipped with a warning. Always returns 0. */
static int toolbox_iterate(void *key, void *value, void *data,
                           GT_UNUSED GtError *err)
{
  const char *tool_name = key;
  GtToolinfo *entry = value;
  IterateInfo *iterate_info = data;

  gt_error_check(err);
  gt_assert(key && value && data);

  if (entry->hidden)
    return 0;

  if (!entry->tool) {
    gt_warning("skipping tool '%s' in iterator (not a GtTool object)",
               tool_name);
    return 0;
  }

  iterate_info->func(tool_name, entry->tool, iterate_info->data);
  return 0;
}
/* Check the arguments of the speck tool: colored output is switched off
   (with a warning) if the output goes to a file. Always returns 0. */
static int gt_speck_arguments_check(GT_UNUSED int rest_argc,
                                    void *tool_arguments,
                                    GT_UNUSED GtError *err)
{
  SpeccheckArguments *arguments = tool_arguments;

  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->outfp) {
    if (arguments->colored) {
      gt_warning("file output requested ('-o'), disabling colored output");
      arguments->colored = false;
    }
  }

  return 0;
}
/* Check the arguments of the speck tool: colored output is switched off
   (with a warning) unless the output goes to a terminal, i.e. if an output
   file was given or standard output is not a tty. Always returns 0. */
static int gt_speck_arguments_check(GT_UNUSED int rest_argc,
                                    void *tool_arguments,
                                    GT_UNUSED GtError *err)
{
  SpeccheckArguments *arguments = tool_arguments;
  bool prints_to_terminal;

  gt_error_check(err);
  gt_assert(arguments);

  /* isatty() is only consulted if no output file was requested */
  prints_to_terminal = !arguments->outfp && isatty(STDOUT_FILENO);

  if (!prints_to_terminal && arguments->colored) {
    gt_warning("not printing to terminal, disabling colored output");
    arguments->colored = false;
  }

  return 0;
}
/* Determine the output file pointer described by the GtOutputFileInfo <data>.
   If no output filename was given, <*ofi->outfp> is set to NULL (meaning
   stdout). Otherwise the file mode is derived from the gzip/bzip2 flags, the
   matching suffix is appended to the filename if it is missing, and the file
   is opened for writing.
   Returns 0 on success and -1 (with <err> set) if the file exists already and
   overwriting was not forced. */
static int determine_outfp(void *data, GtError *err)
{
  GtOutputFileInfo *ofi = (GtOutputFileInfo*) data;
  GtFileMode file_mode;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(ofi);
  if (!gt_str_length(ofi->output_filename))
    *ofi->outfp = NULL; /* no output file given -> use stdout */
  else { /* outputfile given -> create generic file pointer */
    gt_assert(!(ofi->gzip && ofi->bzip2));
    if (ofi->gzip)
      file_mode = GT_FILE_MODE_GZIP;
    else if (ofi->bzip2)
      file_mode = GT_FILE_MODE_BZIP2;
    else
      file_mode = GT_FILE_MODE_UNCOMPRESSED;
    if (file_mode != GT_FILE_MODE_UNCOMPRESSED) {
      const char *suffix = gt_file_mode_suffix(file_mode);
      size_t suffix_len = strlen(suffix),
             name_len = (size_t) gt_str_length(ofi->output_filename);
      /* a name shorter than the suffix cannot end with it; comparing anyway
         would read memory in front of the filename buffer */
      if (name_len < suffix_len ||
          strcmp(gt_str_get(ofi->output_filename) + name_len - suffix_len,
                 suffix)) {
        gt_warning("output file '%s' doesn't have correct suffix '%s', "
                   "appending it", gt_str_get(ofi->output_filename), suffix);
        gt_str_append_cstr(ofi->output_filename, suffix);
      }
    }
    /* the existence check uses the (possibly extended) filename */
    if (!ofi->force && gt_file_exists(gt_str_get(ofi->output_filename))) {
      gt_error_set(err, "file \"%s\" exists already, use option -%s to "
                        "overwrite", gt_str_get(ofi->output_filename),
                        GT_FORCE_OPT_CSTR);
      had_err = -1;
    }
    if (!had_err) {
      *ofi->outfp = gt_file_xopen_file_mode(file_mode,
                                            gt_str_get(ofi->output_filename),
                                            "w");
      gt_assert(*ofi->outfp);
    }
  }
  return had_err;
}
/* Create the ID string to be used for <fn> from its "ID" attribute. If that
   ID has been used already, id_string_is_unique() is called with increasing
   counters (starting at 1) until it reports success; the resulting string
   replaces the ID and a warning is shown.
   The final ID is registered in <used_ids> and stored for <fn> in
   <feature_node_to_unique_id_str>. Returns the ID string. */
static GtStr* make_id_unique(GtGFF3Visitor *gff3_visitor, GtFeatureNode *fn)
{
  GtStr *id = gt_str_new_cstr(gt_feature_node_get_attribute(fn, "ID"));

  if (gt_cstr_table_get(gff3_visitor->used_ids, gt_str_get(id))) {
    GtStr *candidate = gt_str_new();
    GtUword counter = 1;

    /* try counters 1, 2, 3, ... until an unused ID has been produced */
    for (;;) {
      if (id_string_is_unique(id, candidate, gff3_visitor->used_ids,
                              counter++))
        break;
    }
    gt_warning("feature ID \"%s\" not unique: changing to %s",
               gt_str_get(id), gt_str_get(candidate));
    gt_str_set(id, gt_str_get(candidate));
    gt_str_delete(candidate);
  }

  /* update table with the new id */
  gt_cstr_table_add(gff3_visitor->used_ids, gt_str_get(id));
  /* store (unique) id */
  gt_hashmap_add(gff3_visitor->feature_node_to_unique_id_str, fn, id);

  return id;
}
void gt_lib_init(void) { const char *bookkeeping; bookkeeping = getenv("GT_MEM_BOOKKEEPING"); gt_ma_init(bookkeeping && !strcmp(bookkeeping, "on")); proc_env_options(); if (spacepeak && !(bookkeeping && !strcmp(bookkeeping, "on"))) gt_warning("GT_ENV_OPTIONS=-spacepeak used without GT_MEM_BOOKKEEPING=on"); gt_fa_init(); if (spacepeak) { gt_spacepeak_init(); gt_ma_enable_global_spacepeak(); gt_fa_enable_global_spacepeak(); } gt_log_init(); if (showtime) gt_showtime_enable(); gt_symbol_init(); gt_class_prealloc_run(); gt_ya_rand_init(0); }
/* Check the arguments of the ltrdigest tool.
   - warns if the deprecated '-threads' option was used,
   - fails if a tRNA library was given (-trnas) which does not exist,
   - fails if no PPT HMM can be built from the given PPT options.
   Returns 0 on success, -1 otherwise (<err> is set). */
int gt_ltrdigest_arguments_check(GT_UNUSED int rest_argc, void *tool_arguments,
                                 GtError* err)
{
  GtLTRdigestOptions *arguments = tool_arguments;
  int had_err = 0;

  if (arguments->nthreads > 0) {
    /* the literals are concatenated, the first one has to end with a blank
       (otherwise the message reads "'-j'option") */
    gt_warning("The '-threads' option is deprecated. Please use the '-j' "
               "option of the 'gt' call instead, e.g.:\n"
               " gt -j %lu ltrdigest ...", arguments->nthreads);
  }

  /* -trnas */
  if (!had_err && arguments->trna_lib
        && gt_str_length(arguments->trna_lib) > 0) {
    if (!gt_file_exists(gt_str_get(arguments->trna_lib))) {
      gt_error_set(err, "File '%s' does not exist!",
                   gt_str_get(arguments->trna_lib));
      had_err = -1;
    }
  }

  /* build the PPT HMM once to validate its parameters */
  if (!had_err) {
    GtHMM *hmm;
    GtAlphabet *alpha;
    alpha = gt_alphabet_new_dna();
    hmm = gt_ppt_hmm_new(alpha, &arguments->ppt_opts);
    if (!hmm) {
      gt_error_set(err, "PPT HMM parameters are not valid!");
      had_err = -1;
    }
    else
      gt_hmm_delete(hmm);
    gt_alphabet_delete(alpha);
  }

  return had_err;
}
/* Show the feature node <fn> via the GtGTFVisitor passed as <data>.
   Genes start a new gene id (and reset the transcript id) before their
   transcript is shown, mRNAs are shown directly. CDS and exon features are
   ignored silently, all other types are skipped with a warning.
   Returns the result of gtf_show_transcript() or 0 if nothing was shown. */
static int gtf_show_feature_node(GtFeatureNode *fn, void *data, GtError *err)
{
  GtGTFVisitor *visitor = (GtGTFVisitor*) data;

  if (gt_feature_node_has_type(fn, gt_ft_gene)) {
    visitor->gene_id++;
    visitor->transcript_id = 0;
    return gtf_show_transcript(fn, visitor, err);
  }

  if (gt_feature_node_has_type(fn, gt_ft_mRNA))
    return gtf_show_transcript(fn, visitor, err);

  if (gt_feature_node_has_type(fn, gt_ft_CDS) ||
      gt_feature_node_has_type(fn, gt_ft_exon)) {
    return 0;
  }

  gt_warning("skipping GFF3 feature of type \"%s\" (from line %u in file "
             "\"%s\")",
             gt_feature_node_get_type(fn),
             gt_genome_node_get_line_number((GtGenomeNode*) fn),
             gt_genome_node_get_filename((GtGenomeNode*) fn));
  return 0;
}
/* Check the arguments of the ltrdigest tool.
   - warns if the deprecated '-threads' option was used,
   - fails if a tRNA library was given (-trnas) which does not exist.
   Returns 0 on success, -1 otherwise (<err> is set). */
int gt_ltrdigest_arguments_check(GT_UNUSED int rest_argc, void *tool_arguments,
                                 GtError* err)
{
  GtLTRdigestOptions *arguments = tool_arguments;
  int had_err = 0;

  if (arguments->nthreads > 0) {
    /* the literals are concatenated, the first one has to end with a blank
       (otherwise the message reads "'-j'option") */
    gt_warning("The '-threads' option is deprecated. Please use the '-j' "
               "option of the 'gt' call instead, e.g.:\n"
               " gt -j "GT_WU" ltrdigest ...", arguments->nthreads);
  }

  /* -trnas */
  if (!had_err && arguments->trna_lib
        && gt_str_length(arguments->trna_lib) > 0) {
    if (!gt_file_exists(gt_str_get(arguments->trna_lib))) {
      gt_error_set(err, "File '%s' does not exist!",
                   gt_str_get(arguments->trna_lib));
      had_err = -1;
    }
  }

  return had_err;
}
/* Parse one "tag<separator> value" line from <xrf_abbr_file>.
   The tag characters are collected in <tag>, then the separator is expected,
   blanks are skipped and the value characters are collected in <value>. The
   line has to end with a comment or an end-of-line. A tag which is not a
   valid label only leads to a warning.
   Returns 0 on success, a non-zero value otherwise (<err> is set). */
static int gt_xrf_abbr_parse_tree_tag_line(GtIO *xrf_abbr_file, GtStr *tag,
                                           GtStr *value, GtError *err)
{
  int had_err = 0;

  gt_error_check(err);
  gt_log_log("tag");
  gt_assert(xrf_abbr_file && tag && value);

  /* tag: at least one character is processed */
  do {
    had_err = gt_xrf_abbr_parse_tree_proc_any_char(xrf_abbr_file, tag, false,
                                                   err);
  } while (!had_err && gt_xrf_abbr_parse_tree_any_char(xrf_abbr_file, false));

  /* separator between tag and value */
  if (!had_err)
    had_err = gt_io_expect(xrf_abbr_file, XRF_SEPARATOR_CHAR, err);

  if (!had_err) {
    /* skip blanks in front of the value */
    while (gt_io_peek(xrf_abbr_file) == XRF_BLANK_CHAR)
      gt_io_next(xrf_abbr_file);

    /* value: at least one character is processed */
    do {
      had_err = gt_xrf_abbr_parse_tree_proc_any_char(xrf_abbr_file, value,
                                                     true, err);
    } while (!had_err && gt_xrf_abbr_parse_tree_any_char(xrf_abbr_file, true));
  }

  /* the rest of the line is either a comment or empty */
  if (!had_err) {
    had_err = (gt_io_peek(xrf_abbr_file) == XRF_COMMENT_CHAR)
              ? gt_xrf_abbr_parse_tree_comment_line(xrf_abbr_file, err)
              : gt_io_expect(xrf_abbr_file, GT_END_OF_LINE, err);
  }

  if (!had_err) {
    if (!gt_xrf_abbr_parse_tree_valid_label(gt_str_get(tag))) {
      gt_warning("file \"%s\": line "GT_WU": unknown label \"%s\"",
                 gt_io_get_filename(xrf_abbr_file),
                 gt_io_get_line_number(xrf_abbr_file),
                 gt_str_get(tag));
    }
  }

  /* logged in the error case as well */
  gt_log_log("parsed line %s/%s", gt_str_get(tag), gt_str_get(value));
  return had_err;
}
/* Hashmap iteration callback which constructs the mRNA feature for one
   transcript: <key> is the transcript id, <value> the array of genome nodes
   parsed for it and <data> the ConstructionInfo.

   First, nodes flagged as stop codons are merged into the CDS they flank (or
   are contained in) and removed from the array. With <cinfo->tidy> set,
   problematic stop codons only lead to warnings, otherwise to an error.
   Afterwards the range, strand and sequence id of the mRNA are derived from
   the remaining nodes, the mRNA node is created, all remaining nodes are
   added as its children and it is stored in <cinfo->mRNAs>.

   Returns 0 on success, -1 otherwise (<err> is set). */
static int construct_mRNAs(GT_UNUSED void *key, void *value, void *data,
                           GtError *err)
{
  ConstructionInfo *cinfo = (ConstructionInfo*) data;
  GtArray *gt_genome_node_array = (GtArray*) value,
          *mRNAs = (GtArray*) cinfo->mRNAs;
  GtGenomeNode *mRNA_node, *first_node, *gn;
  const char *tname;
  GtStrand mRNA_strand;
  GtRange mRNA_range;
  GtStr *mRNA_seqid;
  GtUword i;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(key && value && data);

  /* at least one node in array */
  gt_assert(gt_array_size(gt_genome_node_array));

  /* TODO: support discontinuous start/stop codons */
  i = 0;
  while (!had_err && i < gt_array_size(gt_genome_node_array)) {
    GtUword j;
    GtRange stop_codon_rng;
    bool found_cds = false;

    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    if (!gt_feature_node_get_attribute((GtFeatureNode*) gn,
                                       GTF_PARSER_STOP_CODON_FLAG)) {
      i++;
      continue;
    }

    /* look for a CDS which contains or directly flanks the stop codon */
    stop_codon_rng = gt_genome_node_get_range(gn);
    for (j = 0; !had_err && j < gt_array_size(gt_genome_node_array); j++) {
      GtGenomeNode* gn2;
      GtRange this_rng;
      const char *this_type;
      gn2 = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, j);
      if (gn == gn2) continue;
      this_rng = gt_genome_node_get_range(gn2);
      this_type = gt_feature_node_get_type((GtFeatureNode*) gn2);
      if (this_type == gt_symbol(gt_ft_CDS)) {
        if (gt_range_contains(&this_rng, &stop_codon_rng)) {
          if (cinfo->tidy) {
            gt_warning("stop codon on line %u in file %s is contained in "
                       "CDS in line %u",
                       gt_genome_node_get_line_number(gn),
                       gt_genome_node_get_filename(gn),
                       gt_genome_node_get_line_number(gn2));
            found_cds = true;
          }
          else {
            gt_error_set(err, "stop codon on line %u in file %s is "
                              "contained in CDS in line %u",
                         gt_genome_node_get_line_number(gn),
                         gt_genome_node_get_filename(gn),
                         gt_genome_node_get_line_number(gn2));
            had_err = -1;
          }
          break;
        }
        /* stop codon directly follows the CDS -> extend CDS end */
        if (this_rng.end + 1 == stop_codon_rng.start) {
          this_rng.end = stop_codon_rng.end;
          gt_genome_node_set_range(gn2, &this_rng);
          found_cds = true;
          break;
        }
        /* stop codon directly precedes the CDS -> extend CDS start */
        if (this_rng.start == stop_codon_rng.end + 1) {
          this_rng.start = stop_codon_rng.start;
          gt_genome_node_set_range(gn2, &this_rng);
          found_cds = true;
          break;
        }
      }
    }

    if (found_cds) {
      /* the stop codon is represented by the CDS now. Removing it moves the
         following nodes one position to the front, therefore <i> must not be
         advanced here (otherwise the next node would be skipped). */
      gt_array_rem(gt_genome_node_array, i);
      gt_genome_node_delete(gn);
      continue;
    }

    if (!had_err) {
      if (cinfo->tidy) {
        gt_warning("found stop codon on line %u in file %s with no "
                   "flanking CDS, ignoring it",
                   gt_genome_node_get_line_number(gn),
                   gt_genome_node_get_filename(gn));
      }
      else {
        gt_error_set(err, "found stop codon on line %u in file %s with no "
                          "flanking CDS",
                     gt_genome_node_get_line_number(gn),
                     gt_genome_node_get_filename(gn));
        had_err = -1;
      }
    }
    i++;
  }

  /* determine the range and the strand of the mRNA. This is done after the
     stop codon handling on purpose: the first node may have been removed
     (and deleted) or its range may have been extended by it. A stop codon is
     only removed if another node remains, so the array is not empty here. */
  gt_assert(gt_array_size(gt_genome_node_array));
  first_node = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, 0);
  mRNA_range = gt_genome_node_get_range(first_node);
  mRNA_strand = gt_feature_node_get_strand((GtFeatureNode*) first_node);
  mRNA_seqid = gt_genome_node_get_seqid(first_node);

  for (i = 1; !had_err && i < gt_array_size(gt_genome_node_array); i++) {
    GtRange range;
    GtStrand strand;
    gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
    range = gt_genome_node_get_range(gn);
    mRNA_range = gt_range_join(&mRNA_range, &range);
    strand = gt_feature_node_get_strand((GtFeatureNode*) gn);
    if (strand != mRNA_strand) {
      gt_error_set(err, "feature %s on line %u has strand %c, but the "
                        "parent transcript has strand %c",
                   (const char*) key,
                   gt_genome_node_get_line_number(gn),
                   GT_STRAND_CHARS[strand],
                   GT_STRAND_CHARS[mRNA_strand]);
      had_err = -1;
      break;
    }
    else {
      mRNA_strand = gt_strand_join(mRNA_strand, strand);
    }
    if (!had_err && gt_str_cmp(mRNA_seqid, gt_genome_node_get_seqid(gn))) {
      gt_error_set(err, "The features on lines %u and %u refer to different "
                   "genomic sequences (``seqname''), although they have the "
                   "same gene IDs (``gene_id'') which must be globally "
                   "unique",
                   gt_genome_node_get_line_number(first_node),
                   gt_genome_node_get_line_number(gn));
      had_err = -1;
      break;
    }
  }

  if (!had_err) {
    mRNA_node = gt_feature_node_new(mRNA_seqid, gt_ft_mRNA, mRNA_range.start,
                                    mRNA_range.end, mRNA_strand);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node), "ID", key);
    gt_feature_node_add_attribute(((GtFeatureNode*) mRNA_node),
                                  "transcript_id", key);

    /* add the transcript name, if a non-empty one is known */
    if ((tname = gt_hashmap_get(cinfo->transcript_id_to_name_mapping,
                                (const char*) key))
          && strlen(tname) > 0) {
      gt_feature_node_add_attribute((GtFeatureNode*) mRNA_node, GT_GFF_NAME,
                                    tname);
    }

    /* register children */
    for (i = 0; i < gt_array_size(gt_genome_node_array); i++) {
      gn = *(GtGenomeNode**) gt_array_get(gt_genome_node_array, i);
      gt_feature_node_add_child((GtFeatureNode*) mRNA_node,
                                (GtFeatureNode*) gt_genome_node_ref(gn));
    }

    /* store the mRNA */
    gt_array_add(mRNAs, mRNA_node);
  }

  return had_err;
}
/* Visit the feature node <fn>.
   If its seqid has been defined already, the node is appended to the node
   buffer. Otherwise (and unless sorting is enforced, which is an error) the
   node is collected in an automatically created sequence region for its
   seqid; the range of that region is kept up to date with the range covered
   by <fn> (and, for non-circular features, its children).
   Returns 0 on success, -1 if the file is not sorted (<err> is set). */
static int add_ids_visitor_feature_node(GtNodeVisitor *nv, GtFeatureNode *fn,
                                        GtError *err)
{
  GtAddIDsVisitor *aiv = add_ids_visitor_cast(nv);
  GtGenomeNode *gn = (GtGenomeNode*) fn;
  const char *seqid = gt_str_get(gt_genome_node_get_seqid(gn));
  AutomaticSequenceRegion *auto_sr;
  GtRange range;
  bool is_circular;

  /* the common case: the sequence region is known, just buffer the node */
  if (gt_cstr_table_get(aiv->defined_seqids, seqid)) {
    gt_queue_add(aiv->node_buffer, fn);
    return 0;
  }

  if (aiv->ensure_sorting) {
    gt_error_set(err, "the file %s is not sorted (seqid \"%s\" on line %u has "
                 "not been previously introduced with a \"%s\" line)",
                 gt_genome_node_get_filename(gn), seqid,
                 gt_genome_node_get_line_number(gn), GT_GFF_SEQUENCE_REGION);
    return -1;
  }

  /* determine the range covered by the feature; for non-circular features
     the ranges of all nodes delivered by the iterator are joined */
  range = gt_genome_node_get_range(gn);
  is_circular = gt_feature_node_get_attribute(fn, GT_GFF_IS_CIRCULAR) != NULL;
  if (!is_circular) {
    GtFeatureNodeIterator *fni = gt_feature_node_iterator_new(fn);
    GtFeatureNode *node;
    while ((node = gt_feature_node_iterator_next(fni)) != NULL) {
      GtRange node_range = gt_genome_node_get_range((GtGenomeNode*) node);
      range = gt_range_join(&range, &node_range);
    }
    gt_feature_node_iterator_delete(fni);
  }

  /* sequence region has not been previously introduced -> check if one has
     already been created automatically */
  auto_sr = gt_hashmap_get(aiv->undefined_sequence_regions, seqid);
  if (auto_sr == NULL) {
    GtStr *seqid_str;
    /* sequence region has not been created automatically -> do it now */
    gt_warning("seqid \"%s\" on line %u in file \"%s\" has not been "
               "previously introduced with a \"%s\" line, create such a line "
               "automatically", seqid, gt_genome_node_get_line_number(gn),
               gt_genome_node_get_filename(gn), GT_GFF_SEQUENCE_REGION);
    auto_sr = automatic_sequence_region_new(is_circular);
    seqid_str = gt_genome_node_get_seqid(gn);
    auto_sr->sequence_region = gt_region_node_new(seqid_str, range.start,
                                                  range.end);
    gt_hashmap_add(aiv->undefined_sequence_regions, gt_str_get(seqid_str),
                   auto_sr);
  }
  else if (auto_sr->is_circular) {
    gt_assert(!is_circular); /* XXX */
  }
  else if (is_circular) {
    gt_assert(!auto_sr->is_circular); /* XXX */
    /* the circular feature determines the range of the region */
    auto_sr->is_circular = true;
    gt_genome_node_set_range(auto_sr->sequence_region, &range);
  }
  else {
    /* update the range of the sequence region */
    GtRange region_range = gt_genome_node_get_range(auto_sr->sequence_region),
            joined_range = gt_range_join(&range, &region_range);
    gt_genome_node_set_range(auto_sr->sequence_region, &joined_range);
  }

  gt_array_add(auto_sr->feature_nodes, fn);
  return 0;
}
/* Transfer the members of <condenseq> from/to <fp> using <io_func>; the same
   routine handles both directions, the order of the statements below is the
   order of the items in the file.
   Arrays which are still NULL (<links>, <uniques>) are allocated with the
   element counts processed before them.
   Returns 0 on success and a non-zero value otherwise: -1 for a format
   version mismatch, 1 if gt_intset_io() returns NULL, otherwise whatever the
   I/O helpers return.
   NOTE(review): gt_condenseq_io_one() is not defined in this block;
   presumably it is a macro which uses <fp>, <io_func> and <err> of the
   enclosing scope -- confirm before renaming any of them. */
static int condenseq_io(GtCondenseq *condenseq, FILE* fp, GtIOFunc io_func,
                        GtError *err)
{
  int had_err = 0;
  int file_format = GT_CONDENSEQ_VERSION;
  GtUword idx;
  had_err = gt_condenseq_io_one(condenseq->orig_length);
  if (!had_err)
    had_err = gt_condenseq_io_one(file_format);
  /* <file_format> can only differ from the current version if the I/O call
     above changed it */
  if (!had_err && file_format != GT_CONDENSEQ_VERSION) {
    gt_error_set(err, "condenseq index is format version %d, current is "
                 "%d -- please re-encode",
                 file_format, GT_CONDENSEQ_VERSION);
    had_err = -1;
  }
  if (!had_err)
    had_err = gt_condenseq_io_one(condenseq->orig_num_seq);
  if (!had_err)
    had_err = gt_condenseq_io_one(condenseq->ldb_nelems);
  if (!had_err) {
    /* no link entries at all is only reported, not treated as an error */
    if (condenseq->ldb_nelems == 0) {
      gt_warning("compression of condenseq did not succeed in finding any "
                 "compressable similarities, maybe the input is to small or "
                 "the chosen parameters should be reconsidered.");
    }
    /* allocate (zero-initialized) space for the link entries if necessary */
    if (condenseq->links == NULL) {
      condenseq->links = gt_calloc((size_t) condenseq->ldb_nelems,
                                   sizeof (*condenseq->links));
      condenseq->ldb_allocated = condenseq->ldb_nelems;
    }
    had_err = gt_condenseq_io_one(condenseq->udb_nelems);
  }
  if (!had_err) {
    /* at least one unique entry is required */
    gt_assert(condenseq->udb_nelems > 0);
    /* allocate space for the unique entries if necessary */
    if (condenseq->uniques == NULL) {
      condenseq->uniques = gt_malloc(sizeof (*condenseq->uniques) *
                                     condenseq->udb_nelems );
      condenseq->udb_allocated = condenseq->udb_nelems;
    }
  }
  /* all link entries, followed by all unique entries */
  for (idx = 0; !had_err && idx < condenseq->ldb_nelems; idx++) {
    had_err = condenseq_linkentry_io(&condenseq->links[idx], fp, io_func, err);
  }
  for (idx = 0; !had_err && idx < condenseq->udb_nelems; idx++) {
    had_err = condenseq_uniqueentry_io(&condenseq->uniques[idx], fp, io_func,
                                       err);
  }
  /* <ssptab> is only processed if there is more than one sequence */
  if (!had_err && condenseq->orig_num_seq > (GtUword) 1) {
    condenseq->ssptab = gt_intset_io(condenseq->ssptab, fp, err);
    if (condenseq->ssptab == NULL)
      had_err = 1;
  }
  if (!had_err)
    had_err = gt_condenseq_io_one(condenseq->id_len);
  if (!had_err) {
    /* <sdstab> is only processed if <id_len> is undefined */
    if (condenseq->id_len == GT_UNDEF_UWORD) {
      condenseq->sdstab = gt_intset_io(condenseq->sdstab, fp, err);
      if (condenseq->sdstab == NULL)
        had_err = 1;
    }
  }
  if (!had_err)
    had_err = gt_condenseq_io_one(condenseq->ids_total_len);
  if (!had_err) {
    /* resize the ID buffer to the total length before transferring it */
    condenseq->orig_ids = gt_realloc(condenseq->orig_ids,
                                     (size_t) condenseq->ids_total_len);
    had_err = io_func(condenseq->orig_ids, sizeof (*condenseq->orig_ids),
                      (size_t) condenseq->ids_total_len, fp, err);
  }
  return had_err;
}
int gt_gtf_parser_parse(GtGTFParser *parser, GtQueue *genome_nodes, GtStr *filenamestr, GtFile *fpin, bool be_tolerant, GtError *err) { GtStr *seqid_str, *source_str, *line_buffer; char *line; size_t line_length; GtUword i, line_number = 0; GtGenomeNode *gn; GtRange range; GtPhase phase_value; GtStrand gt_strand_value; GtSplitter *splitter, *attribute_splitter; float score_value; char *seqname, *source, *feature, *start, *end, *score, *strand, *frame, *attributes, *token, *gene_id, *gene_name = NULL, *transcript_id, *transcript_name = NULL, **tokens; GtHashmap *transcript_id_hash; /* map from transcript id to array of genome nodes */ GtArray *gt_genome_node_array; ConstructionInfo cinfo; GTF_feature_type gtf_feature_type; GT_UNUSED bool gff_type_is_valid = false; const char *type = NULL; const char *filename; bool score_is_defined; int had_err = 0; gt_assert(parser && genome_nodes); gt_error_check(err); filename = gt_str_get(filenamestr); /* alloc */ line_buffer = gt_str_new(); splitter = gt_splitter_new(), attribute_splitter = gt_splitter_new(); #define HANDLE_ERROR \ if (had_err) { \ if (be_tolerant) { \ fprintf(stderr, "skipping line: %s\n", gt_error_get(err)); \ gt_error_unset(err); \ gt_str_reset(line_buffer); \ had_err = 0; \ continue; \ } \ else { \ had_err = -1; \ break; \ } \ } while (gt_str_read_next_line_generic(line_buffer, fpin) != EOF) { line = gt_str_get(line_buffer); line_length = gt_str_length(line_buffer); line_number++; had_err = 0; if (line_length == 0) { gt_warning("skipping blank line " GT_WU " in file \"%s\"", line_number, filename); } else if (line[0] == '#') { /* storing comment */ if (line_length >= 2 && line[1] == '#') gn = gt_comment_node_new(line+2); /* store '##' line as '#' line */ else gn = gt_comment_node_new(line+1); gt_genome_node_set_origin(gn, filenamestr, line_number); gt_queue_add(genome_nodes, gn); } else { /* process tab delimited GTF line */ gt_splitter_reset(splitter); gt_splitter_split(splitter, line, line_length, '\t'); 
if (gt_splitter_size(splitter) != 9UL) { gt_error_set(err, "line " GT_WU " in file \"%s\" contains " GT_WU " tab (\\t) " "separated fields instead of 9", line_number, filename, gt_splitter_size(splitter)); had_err = -1; break; } tokens = gt_splitter_get_tokens(splitter); seqname = tokens[0]; source = tokens[1]; feature = tokens[2]; start = tokens[3]; end = tokens[4]; score = tokens[5]; strand = tokens[6]; frame = tokens[7]; attributes = tokens[8]; /* parse feature */ if (GTF_feature_type_get(>f_feature_type, feature) == -1) { /* we skip unknown features */ fprintf(stderr, "skipping line " GT_WU " in file \"%s\": unknown " "feature: \"%s\"\n", line_number, filename, feature); gt_str_reset(line_buffer); continue; } /* translate into GFF3 feature type */ switch (gtf_feature_type) { case GTF_CDS: case GTF_stop_codon: gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker, gt_ft_CDS); type = gt_ft_CDS; break; case GTF_exon: gff_type_is_valid = gt_type_checker_is_valid(parser->type_checker, gt_ft_exon); type = gt_ft_exon; } gt_assert(gff_type_is_valid); /* parse the range */ had_err = gt_parse_range(&range, start, end, line_number, filename, err); HANDLE_ERROR; /* process seqname (we have to do it here because we need the range) */ gt_region_node_builder_add_region(parser->region_node_builder, seqname, range); /* parse the score */ had_err = gt_parse_score(&score_is_defined, &score_value, score, line_number, filename, err); HANDLE_ERROR; /* parse the strand */ had_err = gt_parse_strand(>_strand_value, strand, line_number, filename, err); HANDLE_ERROR; /* parse the frame */ had_err = gt_parse_phase(&phase_value, frame, line_number, filename, err); HANDLE_ERROR; /* parse the attributes */ gt_splitter_reset(attribute_splitter); gene_id = NULL; transcript_id = NULL; gt_splitter_split(attribute_splitter, attributes, strlen(attributes), ';'); for (i = 0; i < gt_splitter_size(attribute_splitter); i++) { token = gt_splitter_get_token(attribute_splitter, i); /* skip 
leading blanks */ while (*token == ' ') token++; /* look for the two mandatory attributes */ if (strncmp(token, GENE_ID_ATTRIBUTE, strlen(GENE_ID_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(GENE_ID_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", GENE_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; gene_id = token + strlen(GENE_ID_ATTRIBUTE) + 1; } else if (strncmp(token, TRANSCRIPT_ID_ATTRIBUTE, strlen(TRANSCRIPT_ID_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(TRANSCRIPT_ID_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; transcript_id = token + strlen(TRANSCRIPT_ID_ATTRIBUTE) + 1; } else if (strncmp(token, GENE_NAME_ATTRIBUTE, strlen(GENE_NAME_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(GENE_NAME_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", GENE_NAME_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; gene_name = token + strlen(GENE_NAME_ATTRIBUTE) + 1; /* for output we want to strip quotes */ if (*gene_name == '"') gene_name++; if (gene_name[strlen(gene_name)-1] == '"') gene_name[strlen(gene_name)-1] = '\0'; } else if (strncmp(token, TRANSCRIPT_NAME_ATTRIBUTE, strlen(TRANSCRIPT_NAME_ATTRIBUTE)) == 0) { if (strlen(token) + 2 < strlen(TRANSCRIPT_NAME_ATTRIBUTE)) { gt_error_set(err, "missing value to attribute \"%s\" on line " GT_WU "in file \"%s\"", TRANSCRIPT_NAME_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; transcript_name = token + strlen(TRANSCRIPT_NAME_ATTRIBUTE) + 1; /* for output we want to strip quotes */ if (*transcript_name == '"') transcript_name++; if (transcript_name[strlen(transcript_name)-1] == '"') transcript_name[strlen(transcript_name)-1] = '\0'; } } /* check for the mandatory attributes */ if (!gene_id) { gt_error_set(err, "missing 
attribute \"%s\" on line " GT_WU " in file \"%s\"", GENE_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; if (!transcript_id) { gt_error_set(err, "missing attribute \"%s\" on line " GT_WU " in file \"%s\"", TRANSCRIPT_ID_ATTRIBUTE, line_number, filename); had_err = -1; } HANDLE_ERROR; /* process the mandatory attributes */ if (!(transcript_id_hash = gt_hashmap_get(parser->gene_id_hash, gene_id))) { transcript_id_hash = gt_hashmap_new(GT_HASH_STRING, gt_free_func, (GtFree) gt_array_delete); gt_hashmap_add(parser->gene_id_hash, gt_cstr_dup(gene_id), transcript_id_hash); } gt_assert(transcript_id_hash); if (!(gt_genome_node_array = gt_hashmap_get(transcript_id_hash, transcript_id))) { gt_genome_node_array = gt_array_new(sizeof (GtGenomeNode*)); gt_hashmap_add(transcript_id_hash, gt_cstr_dup(transcript_id), gt_genome_node_array); } gt_assert(gt_genome_node_array); /* save optional gene_name and transcript_name attributes */ if (transcript_name && !gt_hashmap_get(parser->transcript_id_to_name_mapping, transcript_id)) { gt_hashmap_add(parser->transcript_id_to_name_mapping, gt_cstr_dup(transcript_id), gt_cstr_dup(transcript_name)); } if (gene_name && !gt_hashmap_get(parser->gene_id_to_name_mapping, gene_id)) { gt_hashmap_add(parser->gene_id_to_name_mapping, gt_cstr_dup(gene_id), gt_cstr_dup(gene_name)); } /* get seqid */ seqid_str = gt_hashmap_get(parser->seqid_to_str_mapping, seqname); if (!seqid_str) { seqid_str = gt_str_new_cstr(seqname); gt_hashmap_add(parser->seqid_to_str_mapping, gt_str_get(seqid_str), seqid_str); } gt_assert(seqid_str); /* construct the new feature */ gn = gt_feature_node_new(seqid_str, type, range.start, range.end, gt_strand_value); gt_genome_node_set_origin(gn, filenamestr, line_number); /* set source */ source_str = gt_hashmap_get(parser->source_to_str_mapping, source); if (!source_str) { source_str = gt_str_new_cstr(source); gt_hashmap_add(parser->source_to_str_mapping, gt_str_get(source_str), source_str); } 
gt_assert(source_str); gt_feature_node_set_source((GtFeatureNode*) gn, source_str); if (score_is_defined) gt_feature_node_set_score((GtFeatureNode*) gn, score_value); if (phase_value != GT_PHASE_UNDEFINED) gt_feature_node_set_phase((GtFeatureNode*) gn, phase_value); gt_array_add(gt_genome_node_array, gn); } gt_str_reset(line_buffer); } /* process all region nodes */ if (!had_err) gt_region_node_builder_build(parser->region_node_builder, genome_nodes); /* process all feature nodes */ cinfo.genome_nodes = genome_nodes; cinfo.gene_id_to_name_mapping = parser->gene_id_to_name_mapping; cinfo.transcript_id_to_name_mapping = parser->transcript_id_to_name_mapping; if (!had_err) { had_err = gt_hashmap_foreach(parser->gene_id_hash, construct_genes, &cinfo, err); } /* free */ gt_splitter_delete(splitter); gt_splitter_delete(attribute_splitter); gt_str_delete(line_buffer); return had_err; }
/* Callback applied to the children of the parent feature stored in the
   GtInterFeatureVisitor <data>.
   Only features of the "outside" type are considered. For each such feature
   following another one, a new feature of the "inter" type covering the gap
   between the two is added as child of the parent feature. No inter-feature
   is placed (and the previous feature is kept) if both features overlap or
   if there is no space between them; a warning is shown instead.
   Always returns 0. */
static int inter_feature_in_children(GtFeatureNode *current_feature,
                                     void *data, GT_UNUSED GtError *err)
{
  GtInterFeatureVisitor *aiv = (GtInterFeatureVisitor*) data;
  GtFeatureNode *inter_node;
  GtRange previous_range, current_range;
  GtStrand inter_strand;
  GtStr *parent_seqid;

  gt_error_check(err);
  gt_assert(current_feature);

  if (!gt_feature_node_has_type(current_feature, aiv->outside_type))
    return 0;

  if (aiv->previous_feature) {
    previous_range =
      gt_genome_node_get_range((GtGenomeNode*) aiv->previous_feature);
    current_range = gt_genome_node_get_range((GtGenomeNode*) current_feature);

    if (previous_range.end >= current_range.start) {
      gt_warning("overlapping boundary features " GT_WU "-" GT_WU " and "
                 GT_WU "-" GT_WU ", " "not placing '%s' inter-feature",
                 previous_range.start, previous_range.end,
                 current_range.start, current_range.end,
                 aiv->inter_type);
      return 0;
    }
    if (current_range.start - previous_range.end < 2) {
      gt_warning("no space for inter-feature '%s' between " GT_WU " and "
                 GT_WU, aiv->inter_type, previous_range.end,
                 current_range.start);
      return 0;
    }

    /* the inter-feature inherits the strand of the boundary features */
    inter_strand = gt_feature_node_get_strand(aiv->previous_feature);
    gt_assert(inter_strand == gt_feature_node_get_strand(current_feature));

    /* parent and both boundary features have to share the sequence id */
    parent_seqid = gt_genome_node_get_seqid((GtGenomeNode*)
                                            aiv->parent_feature);
    gt_assert(!gt_str_cmp(parent_seqid,
              gt_genome_node_get_seqid((GtGenomeNode*)
                                       aiv->previous_feature)));
    gt_assert(!gt_str_cmp(parent_seqid,
              gt_genome_node_get_seqid((GtGenomeNode*) current_feature)));

    /* the inter-feature fills exactly the gap between both features */
    inter_node = (GtFeatureNode*)
                 gt_feature_node_new(parent_seqid, aiv->inter_type,
                                     previous_range.end + 1,
                                     current_range.start - 1,
                                     inter_strand);
    gt_feature_node_add_child(aiv->parent_feature, inter_node);
  }

  aiv->previous_feature = current_feature;
  return 0;
}
/* Parse the textual coordinates <start> and <end> (base 10) into <range>.

   <line_number> and <filename> are only used in diagnostics.

   Negative values are an error unless <tidy> or <correct_negative> is set, in
   which case a warning is issued and the value is reset to 1. With <tidy>, a
   value of zero is also reset to 1 and a start larger than the end is swapped
   with it (each with a warning); without <tidy>, start > end is an error.

   Returns 0 on success and -1 (with <err> set) on failure. On failure both
   members of <range> are left at GT_UNDEF_ULONG. */
static int parse_range(GtRange *range, const char *start, const char *end,
                       unsigned int line_number, const char *filename,
                       bool tidy, bool correct_negative, GtError *err)
{
  long start_val, end_val;
  char *ep;

  gt_assert(start && end && filename);
  gt_error_check(err);

  range->start = GT_UNDEF_ULONG;
  range->end = GT_UNDEF_ULONG;

  /* parse and check start */
  errno = 0; /* strtol() only ever sets errno, so clear it beforehand */
  start_val = strtol(start, &ep, 10);
  if (start[0] == '\0' || *ep != '\0') {
    gt_error_set(err, "could not parse number '%s' on line %u in file '%s'",
                 start, line_number, filename);
    return -1;
  }
  if (errno == ERANGE && (start_val == LONG_MAX || start_val == LONG_MIN)) {
    gt_error_set(err, "number '%s' out of range on line %u in file '%s'",
                 start, line_number, filename);
    return -1;
  }
  if (start_val < 0) {
    if (tidy || correct_negative) {
      gt_warning("start '%s' is negative on line %u in file '%s'; reset to 1",
                 start, line_number, filename);
      start_val = 1;
    }
    else {
      gt_error_set(err, "start '%s' is negative on line %u in file '%s'",
                   start, line_number, filename);
      return -1;
    }
  }
  if (start_val == 0 && tidy) {
    gt_warning("start '%s' is zero on line %u in file '%s' (GFF3 files are "
               "1-based); reset to 1", start, line_number, filename);
    start_val = 1;
  }

  /* parse and check end */
  errno = 0;
  end_val = strtol(end, &ep, 10);
  if (end[0] == '\0' || *ep != '\0') {
    gt_error_set(err, "could not parse number '%s' on line %u in file '%s'",
                 end, line_number, filename);
    return -1;
  }
  if (errno == ERANGE && (end_val == LONG_MAX || end_val == LONG_MIN)) {
    gt_error_set(err, "number '%s' out of range on line %u in file '%s'",
                 end, line_number, filename);
    return -1;
  }
  if (end_val < 0) {
    if (tidy || correct_negative) {
      gt_warning("end '%s' is negative on line %u in file '%s'; reset to 1",
                 end, line_number, filename);
      end_val = 1;
    }
    else {
      gt_error_set(err, "end '%s' is negative on line %u in file '%s'",
                   end, line_number, filename);
      return -1;
    }
  }
  if (end_val == 0 && tidy) {
    gt_warning("end '%s' is zero on line %u in file '%s' (GFF3 files are "
               "1-based); reset to 1", end, line_number, filename);
    end_val = 1;
  }

  /* check range; the values are of type long, hence the %ld conversions */
  if (start_val > end_val) {
    if (tidy) {
      long tmp_val;
      gt_warning("start '%ld' is larger then end '%ld' on line %u in file "
                 "'%s'; swap them", start_val, end_val, line_number, filename);
      tmp_val = start_val;
      start_val = end_val;
      end_val = tmp_val;
    }
    else {
      gt_error_set(err, "start '%ld' is larger then end '%ld' on line %u in "
                   "file '%s'", start_val, end_val, line_number, filename);
      return -1;
    }
  }

  /* set result */
  range->start = start_val;
  range->end = end_val;

  return 0;
}
/* Tool runner: loads the encoded sequence named by argv[parsed_args] and
   outputs its sequences in the order selected in the tool arguments
   (inverted, shuffled, sorted or reverse sorted). Returns 0 on success and -1
   if the encoded sequence could not be loaded. */
static int gt_seqorder_runner(GT_UNUSED int argc, const char **argv,
                              int parsed_args, void *tool_arguments,
                              GtError *err)
{
  GtSeqorderArguments *arguments = tool_arguments;
  const char *indexname;
  GtEncseqLoader *loader;
  GtEncseq *encseq;
  unsigned long idx, nofseqs;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments != NULL);

  /* load encseq */
  indexname = argv[parsed_args];
  loader = gt_encseq_loader_new();
  encseq = gt_encseq_loader_load(loader, indexname, err);

  if (encseq == NULL) {
    had_err = -1;
  }
  else {
    if (!gt_encseq_has_description_support(encseq))
      gt_warning("%s has no description support", indexname);

    nofseqs = gt_encseq_num_of_sequences(encseq);

    if (arguments->invert) {
      /* last sequence first */
      idx = nofseqs;
      while (idx > 0) {
        idx--;
        gt_seqorder_output(idx, encseq);
      }
    }
    else if (arguments->shuffle) {
      unsigned long *order = gt_malloc(sizeof (unsigned long) * nofseqs);

      gt_seqorder_get_shuffled_seqnums(nofseqs, order);
      for (idx = 0; idx < nofseqs; idx++)
        gt_seqorder_output(order[idx], encseq);
      gt_free(order);
    }
    else {
      GtSuffixsortspace *sortspace;
      unsigned long suffixpos;

      gt_assert(arguments->sort || arguments->revsort);
      /* note carried over from the original code:
         Use iterator over sequence separators:
         saves a lot of binary searches */
      sortspace = gt_suffixsortspace_new(nofseqs,
                                         gt_encseq_seqstartpos(encseq,
                                                               nofseqs-1),
                                         false, NULL);
      gt_seqorder_sort(sortspace, encseq);

      if (arguments->sort) {
        for (idx = 0; idx < nofseqs; idx++) {
          suffixpos = gt_suffixsortspace_getdirect(sortspace, idx);
          gt_seqorder_output(gt_encseq_seqnum(encseq, suffixpos), encseq);
        }
      }
      else {
        /* reverse sort: walk the sorted positions backwards */
        idx = nofseqs;
        while (idx > 0) {
          idx--;
          suffixpos = gt_suffixsortspace_getdirect(sortspace, idx);
          gt_seqorder_output(gt_encseq_seqnum(encseq, suffixpos), encseq);
        }
      }
      gt_suffixsortspace_delete(sortspace, false);
    }
  }

  gt_encseq_loader_delete(loader);
  gt_encseq_delete(encseq);
  return had_err;
}
/* Run the dynamic programming step for every chain in <chain_collection>.

   For each chain the genomic bounds and the reference sequence are looked up
   via <input>, a new GthSA is created, the chain borders are extended and
   either call_dna_DP() or call_protein_DP() is invoked, depending on whether
   reference file <ref_file_num> is DNA.

   Processing stops early once match_info->call_number exceeds a non-zero
   call_info->firstalshown; in that case a notice is written to the output
   file (unless GFF3 output is active) and
   match_info->max_call_number_reached is set.

   Returns 0 on success and -1 if a DP call returns a non-zero value other
   than GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED. The latter is only counted
   in <stat> and the chain is skipped. */
static int calc_spliced_alignments(GthSACollection *sa_collection,
                                   GthChainCollection *chain_collection,
                                   GthCallInfo *call_info,
                                   GthInput *input,
                                   GthStat *stat,
                                   GtUword gen_file_num,
                                   GtUword ref_file_num,
                                   bool directmatches,
                                   GthMatchInfo *match_info,
                                   GthDNACompletePathMatrixJT
                                   dna_complete_path_matrix_jt,
                                   GthProteinCompletePathMatrixJT
                                   protein_complete_path_matrix_jt)
{
  const unsigned char *ref_seq_tran, *ref_seq_orig, *ref_seq_tran_rc = NULL,
                      *ref_seq_orig_rc = NULL;
  GtUword chainctr, gen_offset = GT_UNDEF_UWORD, gen_total_length,
          ref_total_length;
  GtFile *outfp = call_info->out->outfp;
  GtRange gen_seq_bounds, gen_seq_bounds_rc;
  bool refseqisdna;
  GthChain *chain;
  GtRange range;
  GthSA *saA;
  int rval;

  gt_assert(sa_collection && chain_collection);

  refseqisdna = gth_input_ref_file_is_dna(input, ref_file_num);

  for (chainctr = 0;
       chainctr < gth_chain_collection_size(chain_collection);
       chainctr++) {
    chain = gth_chain_collection_get(chain_collection, chainctr);

    /* the call number is incremented for every chain; a firstalshown value
       of 0 disables the limit */
    if (++match_info->call_number > call_info->firstalshown &&
        call_info->firstalshown > 0) {
      /* in XML output the notice is wrapped in a comment, in plain text
         output it is surrounded by blank lines, in GFF3 output it is
         omitted */
      if (!(call_info->out->xmlout || call_info->out->gff3out))
        gt_file_xfputc('\n', outfp);
      else if (call_info->out->xmlout)
        gt_file_xprintf(outfp, "<!--\n");

      if (!call_info->out->gff3out) {
        gt_file_xprintf(outfp, "Maximal matching %s count (%u) reached.\n",
                        refseqisdna ? "EST" : "protein",
                        call_info->firstalshown);
        gt_file_xprintf(outfp, "Only the first %u matches will be "
                        "displayed.\n", call_info->firstalshown);
      }

      if (!(call_info->out->xmlout || call_info->out->gff3out))
        gt_file_xfputc('\n', outfp);
      else if (call_info->out->xmlout)
        gt_file_xprintf(outfp, "-->\n");

      match_info->max_call_number_reached = true;
      break; /* break out of loop */
    }

    /* compute considered genomic regions if not set by -frompos */
    if (!gth_input_use_substring_spec(input)) {
      gen_seq_bounds = gth_input_get_genomic_range(input, chain->gen_file_num,
                                                   chain->gen_seq_num);
      gen_total_length = gt_range_length(&gen_seq_bounds);
      gen_offset = gen_seq_bounds.start;
      gen_seq_bounds_rc = gen_seq_bounds;
    }
    else {
      /* genomic multiseq contains exactly one sequence */
      gt_assert(gth_input_num_of_gen_seqs(input, chain->gen_file_num) == 1);
      gen_total_length = gth_input_genomic_file_total_length(input,
                                                          chain->gen_file_num);
      gen_seq_bounds.start = gth_input_genomic_substring_from(input);
      gen_seq_bounds.end = gth_input_genomic_substring_to(input);
      gen_offset = 0;
      /* mirror the substring bounds for the reverse complement strand */
      gen_seq_bounds_rc.start = gen_total_length - 1 - gen_seq_bounds.end;
      gen_seq_bounds_rc.end = gen_total_length - 1 - gen_seq_bounds.start;
    }

    /* "retrieving" the reference sequence */
    range = gth_input_get_reference_range(input, chain->ref_file_num,
                                          chain->ref_seq_num);
    ref_seq_tran = gth_input_current_ref_seq_tran(input) + range.start;
    ref_seq_orig = gth_input_current_ref_seq_orig(input) + range.start;
    /* the reverse complement pointers are only set for DNA references and
       stay NULL otherwise */
    if (refseqisdna) {
      ref_seq_tran_rc = gth_input_current_ref_seq_tran_rc(input) + range.start;
      ref_seq_orig_rc = gth_input_current_ref_seq_orig_rc(input) + range.start;
    }
    ref_total_length = range.end - range.start + 1;

    /* check if protein sequences have a stop amino acid; the warning is
       issued at most once per <match_info> */
    if (!refseqisdna && !match_info->stop_amino_acid_warning &&
        ref_seq_orig[ref_total_length - 1] != GT_STOP_AMINO) {
      GtStr *ref_id = gt_str_new();
      gth_input_save_ref_id(input, ref_id, chain->ref_file_num,
                            chain->ref_seq_num);
      gt_warning("protein sequence '%s' (#" GT_WU " in file %s) does not end "
                 "with a stop amino acid ('%c'). If it is not a protein "
                 "fragment you should add a stop amino acid to improve the "
                 "prediction. For example with `gt seqtransform "
                 "-addstopaminos` (see http://genometools.org for details).",
                 gt_str_get(ref_id), chain->ref_seq_num,
                 gth_input_get_reference_filename(input, chain->ref_file_num),
                 GT_STOP_AMINO);
      match_info->stop_amino_acid_warning = true;
      gt_str_delete(ref_id);
    }

    /* allocating space for alignment */
    saA = gth_sa_new_and_set(directmatches, true, input, chain->gen_file_num,
                             chain->gen_seq_num, chain->ref_file_num,
                             chain->ref_seq_num, match_info->call_number,
                             gen_total_length, gen_offset, ref_total_length);

    /* extend the DP borders to the left and to the right */
    gth_chain_extend_borders(chain, &gen_seq_bounds, &gen_seq_bounds_rc,
                             gen_total_length, gen_offset);

    /* From here on the dp positions always refer to the forward strand of
       the genomic DNA. */

    /* call the Dynamic Programming */
    if (refseqisdna) {
      rval = call_dna_DP(directmatches, call_info, input, stat,
                         sa_collection, saA, gen_file_num, ref_file_num,
                         gen_total_length, gen_offset, &gen_seq_bounds,
                         &gen_seq_bounds_rc, ref_total_length, range.start,
                         chainctr,
                         gth_chain_collection_size(chain_collection),
                         match_info, ref_seq_tran, ref_seq_orig,
                         ref_seq_tran_rc, ref_seq_orig_rc, chain,
                         dna_complete_path_matrix_jt,
                         protein_complete_path_matrix_jt);
    }
    else {
      rval = call_protein_DP(directmatches, call_info, input, stat,
                             sa_collection, saA, gen_file_num, ref_file_num,
                             gen_total_length, gen_offset, &gen_seq_bounds,
                             &gen_seq_bounds_rc, ref_total_length, range.start,
                             chainctr,
                             gth_chain_collection_size(chain_collection),
                             match_info, ref_seq_tran, ref_seq_orig, chain,
                             dna_complete_path_matrix_jt,
                             protein_complete_path_matrix_jt);
    }

    /* check return value */
    if (rval == GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED) {
      /* statistics bookkeeping */
      gth_stat_increment_numoffailedDPparameterallocations(stat);
      gth_stat_increment_numofundeterminedSAs(stat);
      /* free space */
      gth_sa_delete(saA);
      /* undo the increment from the top of the loop: this chain does not
         count as a call */
      match_info->call_number--;
      continue; /* continue with the next DP range */
    }
    else if (rval)
      /* NOTE(review): saA is not deleted here on this path -- presumably the
         DP call has taken care of it; confirm against call_dna_DP() and
         call_protein_DP() */
      return -1;
  }

  /* plain text output only: report that nothing significant was found */
  if (!call_info->out->xmlout && !call_info->out->gff3out && !directmatches &&
      !match_info->significant_match_found &&
      match_info->call_number <= call_info->firstalshown) {
    show_no_match_line(gth_input_get_alphatype(input, ref_file_num), outfp);
  }

  return 0;
}
/* Check (and in tidy mode correct) the phases of the CDS features stored in
   <cds_features>, which must not be empty. If the first feature lies on the
   reverse strand, the array is reversed in place before it is processed.

   The first feature merely needs a defined phase; every following feature
   must have the phase computed from the length and phase of its predecessor.

   A feature with a wrong phase which has already been recorded in
   v->cds_features (i.e. it has been seen before) is scheduled for splitting
   if v->tidy is set, <is_multi> is false and the feature has no children;
   otherwise this is an error. A feature which has not been recorded yet gets
   its phase corrected in tidy mode (the warning is suppressed on the
   <second_pass>); without tidy mode this is an error.

   Returns 0 on success and -1 (with <err> set) on failure. */
static int check_cds_phases(GtArray *cds_features, GtCDSCheckVisitor *v,
                            bool is_multi, bool second_pass, GtError *err)
{
  GtPhase expected_phase = GT_PHASE_ZERO, phase_now;
  GtFeatureNode *cds;
  unsigned long idx, cds_len;
  bool phase_ok;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(cds_features);
  gt_assert(gt_array_size(cds_features));

  cds = *(GtFeatureNode**) gt_array_get_first(cds_features);
  if (gt_feature_node_get_strand(cds) == GT_STRAND_REVERSE)
    gt_array_reverse(cds_features);

  for (idx = 0; !had_err && idx < gt_array_size(cds_features); idx++) {
    cds = *(GtFeatureNode**) gt_array_get(cds_features, idx);

    /* the first phase can be anything (except being undefined), because the
       GFF3 spec says:

       NOTE 4 - CDS features MUST have have a defined phase field. Otherwise
       it is not possible to infer the correct polypeptides corresponding to
       partially annotated genes. */
    if (idx == 0)
      phase_ok = gt_feature_node_get_phase(cds) != GT_PHASE_UNDEFINED;
    else
      phase_ok = gt_feature_node_get_phase(cds) == expected_phase;

    if (!phase_ok) {
      if (gt_hashmap_get(v->cds_features, cds)) {
        /* feature was recorded before, so it is shared with another parent */
        if (v->tidy && !is_multi && !gt_feature_node_has_children(cds)) {
          /* we can split the feature */
          gt_warning("%s feature on line %u in file \"%s\" has multiple "
                     "parents which require different phases; split feature",
                     gt_ft_CDS,
                     gt_genome_node_get_line_number((GtGenomeNode*) cds),
                     gt_genome_node_get_filename((GtGenomeNode*) cds));
          gt_hashmap_add(v->cds_features_to_split, cds, cds);
          v->splitting_is_necessary = true; /* split later */
        }
        else {
          gt_error_set(err, "%s feature on line %u in file \"%s\" has multiple "
                       "parents which require different phases", gt_ft_CDS,
                       gt_genome_node_get_line_number((GtGenomeNode*) cds),
                       gt_genome_node_get_filename((GtGenomeNode*) cds));
          had_err = -1;
        }
      }
      else if (v->tidy) {
        if (!second_pass) {
          gt_warning("%s feature on line %u in file \"%s\" has the wrong "
                     "phase %c -> correcting it to %c", gt_ft_CDS,
                     gt_genome_node_get_line_number((GtGenomeNode*) cds),
                     gt_genome_node_get_filename((GtGenomeNode*) cds),
                     GT_PHASE_CHARS[gt_feature_node_get_phase(cds)],
                     GT_PHASE_CHARS[expected_phase]);
        }
        gt_feature_node_set_phase(cds, expected_phase);
      }
      else {
        gt_error_set(err, "%s feature on line %u in file \"%s\" has the "
                     "wrong phase %c (should be %c)", gt_ft_CDS,
                     gt_genome_node_get_line_number((GtGenomeNode*) cds),
                     gt_genome_node_get_filename((GtGenomeNode*) cds),
                     GT_PHASE_CHARS[gt_feature_node_get_phase(cds)],
                     GT_PHASE_CHARS[expected_phase]);
        had_err = -1;
      }
    }

    if (had_err)
      continue; /* the loop condition terminates the iteration */

    /* derive the phase required for the next feature; the phase is re-read
       because it may just have been corrected */
    phase_now = gt_feature_node_get_phase(cds);
    cds_len = gt_genome_node_get_length((GtGenomeNode*) cds);
    expected_phase = (3 - (cds_len - phase_now) % 3) % 3;
    gt_hashmap_add(v->cds_features, cds, cds); /* record CDS feature */
  }

  return had_err;
}
/* Tool runner: iterates over all sequences in the input files
   argv[parsed_args..argc-1] and passes each of them to
   gt_seqtranslate_do_translation(); if arguments->reverse is set, the same is
   done for the reverse complement. Sequences shorter than GT_CODON_LENGTH are
   skipped with a warning. Returns 0 on success and -1 on failure. */
static int gt_seqtranslate_runner(int argc, const char **argv, int parsed_args,
                                  void *tool_arguments, GT_UNUSED GtError *err)
{
  GtTranslateArguments *arguments = tool_arguments;
  GtStr *translations[3];
  GtStrArray *infiles;
  GtSequenceBuffer *sb = NULL;
  GtSeqIterator *si = NULL;
  int had_err = 0, rval, i;

  /* string buffers handed to every translation call */
  for (i = 0; i < 3; i++)
    translations[i] = gt_str_new();

  gt_error_check(err);
  gt_assert(arguments);

  /* collect the input file names */
  infiles = gt_str_array_new();
  i = parsed_args;
  while (i < argc) {
    gt_str_array_add_cstr(infiles, argv[i]);
    i++;
  }

  /* set up the sequence iterator */
  sb = gt_sequence_buffer_new_guess_type(infiles, err);
  if (sb)
    si = gt_seq_iterator_sequence_buffer_new_with_buffer(sb);
  if (!sb || !si)
    had_err = -1;

  while (!had_err) {
    const GtUchar *sequence;
    GtUword len;
    char *desc, *revseq;

    rval = gt_seq_iterator_next(si, &sequence, &len, &desc, err);
    if (rval == 0)
      break; /* no more sequences */
    if (rval < 0) {
      had_err = -1;
      break;
    }

    if (len < GT_CODON_LENGTH) {
      gt_warning("sequence '%s' is shorter than codon length of %d, skipping",
                 desc, GT_CODON_LENGTH);
      continue;
    }

    /* forward strand */
    had_err = gt_seqtranslate_do_translation(arguments, (char*) sequence, len,
                                             desc, translations, false, err);
    if (had_err || !arguments->reverse)
      continue;

    /* reverse strand: translate a reverse complemented copy */
    revseq = gt_cstr_dup_nt((char*) sequence, len);
    had_err = gt_reverse_complement(revseq, len, err);
    if (!had_err) {
      had_err = gt_seqtranslate_do_translation(arguments, revseq, len, desc,
                                               translations, true, err);
    }
    gt_free(revseq);
  }

  for (i = 0; i < 3; i++)
    gt_str_delete(translations[i]);
  gt_str_array_delete(infiles);
  gt_seq_iterator_delete(si);
  gt_sequence_buffer_delete(sb);

  return had_err;
}
/* Decode <encseq> to stdout according to <args>.

   In mode "fasta" either a single sequence (args->seq), a range of sequences
   (args->seqrng) or all sequences are written as FASTA entries. If the
   encoded sequence has no description support, "sequence <n>" is used as the
   description. For reverse read modes the i-th entry written is built from
   the sequence with number (number of sequences - 1 - i).

   In mode "concat" the positions args->rng (default: the whole encoded
   sequence) are written as one line, with every separator replaced by the
   first character of args->sepchar.

   <filename> is only used in the warning about missing description support.
   Returns 0 on success and -1 (with <err> set) if a requested sequence number
   or position lies outside of the encoded sequence. */
static int output_sequence(GtEncseq *encseq, GtEncseqDecodeArguments *args,
                           const char *filename, GtError *err)
{
  GtUword i, j, sfrom, sto;
  int had_err = 0;
  bool has_desc;
  GtEncseqReader *esr;

  gt_assert(encseq);

  if (!(has_desc = gt_encseq_has_description_support(encseq)))
    gt_warning("Missing description support for file %s", filename);

  if (strcmp(gt_str_get(args->mode), "fasta") == 0) {
    /* both values are loop invariant, determine them only once */
    const GtUword numofseqs = gt_encseq_num_of_sequences(encseq);
    const bool reversed = GT_ISDIRREVERSE(args->rm);

    if (args->seq != GT_UNDEF_UWORD) {
      /* specify a single sequence to extract */
      if (args->seq >= numofseqs) {
        gt_error_set(err,
                     "requested sequence "GT_WU" exceeds number of sequences "
                     "("GT_WU")", args->seq, numofseqs);
        return -1;
      }
      sfrom = args->seq;
      sto = args->seq + 1;
    }
    else if (args->seqrng.start != GT_UNDEF_UWORD
               && args->seqrng.end != GT_UNDEF_UWORD) {
      /* specify a sequence range to extract */
      if (args->seqrng.start >= numofseqs || args->seqrng.end >= numofseqs) {
        gt_error_set(err,
                     "range "GT_WU"-"GT_WU" includes a sequence number "
                     "exceeding the total number of sequences ("GT_WU")",
                     args->seqrng.start, args->seqrng.end, numofseqs);
        return -1;
      }
      sfrom = args->seqrng.start;
      sto = args->seqrng.end + 1;
    }
    else {
      /* extract all sequences */
      sfrom = 0;
      sto = numofseqs;
    }

    for (i = sfrom; i < sto; i++) {
      GtUword desclen, startpos, len, seqnum;
      char buf[BUFSIZ];
      const char *desc = NULL;

      /* XXX: maybe make this distinction in the functions via readmode? */
      /* for reverse read modes the sequences are taken from the end */
      seqnum = reversed ? numofseqs - 1 - i : i;
      len = gt_encseq_seqlength(encseq, seqnum);
      startpos = gt_encseq_seqstartpos(encseq, seqnum);
      if (reversed) {
        /* mirror the start position at the end of the encoded sequence */
        startpos = gt_encseq_total_length(encseq) - (startpos + len);
      }

      if (has_desc) {
        desc = gt_encseq_description(encseq, &desclen, seqnum);
      }
      else {
        /* the generated description carries the output index, not seqnum */
        (void) snprintf(buf, BUFSIZ, "sequence "GT_WU"", i);
        desclen = strlen(buf);
        desc = buf;
      }
      gt_assert(desc);

      /* output description */
      gt_xfputc(GT_FASTA_SEPARATOR, stdout);
      gt_xfwrite(desc, 1, desclen, stdout);
      gt_xfputc('\n', stdout);

      /* XXX: make this more efficient by writing in a buffer first and then
         showing the result */
      if (args->singlechars) {
        for (j = 0; j < len; j++) {
          gt_xfputc(gt_encseq_get_decoded_char(encseq, startpos + j, args->rm),
                    stdout);
        }
      }
      else {
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm,
                                                    startpos);
        for (j = 0; j < len; j++) {
          gt_xfputc(gt_encseq_reader_next_decoded_char(esr), stdout);
        }
        gt_encseq_reader_delete(esr);
      }
      gt_xfputc('\n', stdout);
    }
  }

  if (strcmp(gt_str_get(args->mode), "concat") == 0) {
    GtUword from = 0, to = gt_encseq_total_length(encseq) - 1;

    if (args->rng.start != GT_UNDEF_UWORD && args->rng.end != GT_UNDEF_UWORD) {
      if (args->rng.end > to) {
        had_err = -1;
        gt_error_set(err,
                     "end of range ("GT_WU") exceeds encoded sequence length "
                     "("GT_WU")", args->rng.end, to);
      }
      if (!had_err) {
        from = args->rng.start;
        to = args->rng.end;
      }
    }

    if (!had_err) {
      if (args->singlechars) {
        for (j = from; j <= to; j++) {
          char cc = gt_encseq_get_decoded_char(encseq, j, args->rm);
          if (cc == (char) SEPARATOR)
            cc = gt_str_get(args->sepchar)[0];
          gt_xfputc(cc, stdout);
        }
      }
      else {
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm, from);
        if (esr) {
          for (j = from; j <= to; j++) {
            char cc = gt_encseq_reader_next_decoded_char(esr);
            if (cc == (char) SEPARATOR)
              cc = gt_str_get(args->sepchar)[0];
            gt_xfputc(cc, stdout);
          }
          gt_encseq_reader_delete(esr);
        }
      }
      gt_xfputc('\n', stdout);
    }
  }

  return had_err;
}
/* Tool runner: draws the features of the GFF3 file argv[parsed_args+1] into
   the multi-page PDF or PostScript file argv[parsed_args].

   The displayed range is processed in chunks of arguments->width; each chunk
   is laid out separately and a new page is started whenever the next layout
   does not fit onto the current one. If a sequence file was given, a GC
   content custom track is added to every diagram.

   Returns 0 on success and -1 (with <err> set) on failure. The style, the
   style file name and the feature index are released on all paths. */
static int gt_sketch_page_runner(GT_UNUSED int argc, const char **argv,
                                 int parsed_args, void *tool_arguments,
                                 GtError *err)
{
  SketchPageArguments *arguments = tool_arguments;
  int had_err = 0;
  GtFeatureIndex *features = NULL;
  GtRange qry_range, sequence_region_range;
  GtStyle *sty = NULL;
  GtStr *prog, *gt_style_file = NULL;
  GtDiagram *d = NULL;
  GtLayout *l = NULL;
  GtBioseq *bioseq = NULL;
  GtCanvas *canvas = NULL;
  const char *seqid = NULL, *outfile;
  unsigned long start, height, num_pages = 0;
  double offsetpos, usable_height;
  cairo_surface_t *surf = NULL;
  cairo_t *cr = NULL;
  GtTextWidthCalculator *twc;

  gt_error_check(err);

  features = gt_feature_index_memory_new();

  if (cairo_version() < CAIRO_VERSION_ENCODE(1, 8, 6))
    gt_warning("Your cairo library (version %s) is older than version 1.8.6! "
               "These versions contain a bug which may result in "
               "corrupted PDF output!", cairo_version_string());

  /* get style */
  sty = gt_style_new(err);
  if (!sty)
    had_err = -1;
  if (!had_err) {
    if (gt_str_length(arguments->stylefile) == 0) {
      /* no style file given: fall back to the default style in the gtdata
         directory, located relative to the program name */
      prog = gt_str_new();
      gt_str_append_cstr_nt(prog, argv[0],
                            gt_cstr_length_up_to_char(argv[0], ' '));
      gt_style_file = gt_get_gtdata_path(gt_str_get(prog), err);
      gt_str_delete(prog);
      if (gt_style_file)
        gt_str_append_cstr(gt_style_file, "/sketch/default.style");
      else
        had_err = -1; /* gtdata directory could not be determined */
    }
    else {
      gt_style_file = gt_str_ref(arguments->stylefile);
    }
  }
  if (!had_err)
    had_err = gt_style_load_file(sty, gt_str_get(gt_style_file), err);

  outfile = argv[parsed_args];

  if (!had_err) {
    /* get features */
    had_err = gt_feature_index_add_gff3file(features, argv[parsed_args+1],
                                            err);
    if (!had_err && gt_str_length(arguments->seqid) == 0) {
      seqid = gt_feature_index_get_first_seqid(features);
      if (seqid == NULL) {
        gt_error_set(err, "GFF input file must contain a sequence region!");
        had_err = -1;
      }
    }
    else if (!had_err && !gt_feature_index_has_seqid(features,
                                               gt_str_get(arguments->seqid))) {
      gt_error_set(err, "sequence region '%s' does not exist in GFF input file",
                   gt_str_get(arguments->seqid));
      had_err = -1;
    }
    else if (!had_err)
      seqid = gt_str_get(arguments->seqid);
  }

  /* set text */
  if (gt_str_length(arguments->text) == 0) {
    gt_str_delete(arguments->text);
    arguments->text = gt_str_new_cstr(argv[parsed_args+1]);
  }

  if (!had_err) {
    /* set display range */
    gt_feature_index_get_range_for_seqid(features, &sequence_region_range,
                                         seqid);
    qry_range.start = (arguments->range.start == GT_UNDEF_ULONG ?
                         sequence_region_range.start :
                         arguments->range.start);
    qry_range.end   = (arguments->range.end == GT_UNDEF_ULONG ?
                         sequence_region_range.end :
                         arguments->range.end);

    /* set output format */
    /* NOTE(review): surf stays NULL for any other format value -- presumably
       the option parser restricts the format to "pdf" and "ps"; confirm */
    if (strcmp(gt_str_get(arguments->format), "pdf") == 0) {
      surf = cairo_pdf_surface_create(outfile,
                                      mm_to_pt(arguments->pwidth),
                                      mm_to_pt(arguments->pheight));
    }
    else if (strcmp(gt_str_get(arguments->format), "ps") == 0) {
      surf = cairo_ps_surface_create(outfile,
                                     mm_to_pt(arguments->pwidth),
                                     mm_to_pt(arguments->pheight));
    }
    gt_log_log("created page with %.2f:%.2f dimensions\n",
               mm_to_pt(arguments->pwidth),
               mm_to_pt(arguments->pheight));

    offsetpos = TEXT_SPACER + arguments->theight + TEXT_SPACER;
    usable_height = mm_to_pt(arguments->pheight)
                      - arguments->theight
                      - arguments->theight
                      - 4*TEXT_SPACER;

    if (gt_str_length(arguments->seqfile) > 0) {
      bioseq = gt_bioseq_new(gt_str_get(arguments->seqfile), err);
      if (!bioseq)
        had_err = -1; /* the loop below is left immediately in this case */
    }

    cr = cairo_create(surf);
    cairo_set_font_size(cr, 8);
    twc = gt_text_width_calculator_cairo_new(cr, sty);

    for (start = qry_range.start; start <= qry_range.end;
         start += arguments->width) {
      GtRange single_range;
      GtCustomTrack *ct = NULL;
      const char *seq;

      single_range.start = start;
      single_range.end = start + arguments->width;

      if (had_err)
        break;

      d = gt_diagram_new(features, seqid, &single_range, sty, err);
      if (!d) {
        had_err = -1;
        break;
      }
      if (bioseq) {
        seq = gt_bioseq_get_sequence(bioseq, 0);
        ct = gt_custom_track_gc_content_new(seq,
                                      gt_bioseq_get_sequence_length(bioseq, 0),
                                      800, 70, 0.4, true);
        gt_diagram_add_custom_track(d, ct);
      }

      l = gt_layout_new_with_twc(d, mm_to_pt(arguments->width), sty, twc, err);
      if (!l)
        had_err = -1;
      if (!had_err)
        had_err = gt_layout_get_height(l, &height, err);
      if (!had_err) {
        /* start a new page if this layout does not fit onto the current one */
        if (gt_double_smaller_double(usable_height - 10 - 2*TEXT_SPACER
                                       - arguments->theight,
                                     offsetpos + height)) {
          draw_header(cr, gt_str_get(arguments->text), argv[parsed_args+1],
                      seqid, num_pages, mm_to_pt(arguments->pwidth),
                      mm_to_pt(arguments->pheight), arguments->theight);
          cairo_show_page(cr);
          offsetpos = TEXT_SPACER + arguments->theight + TEXT_SPACER;
          num_pages++;
        }
        canvas = gt_canvas_cairo_context_new(sty, cr, offsetpos,
                                             mm_to_pt(arguments->pwidth),
                                             height, NULL, err);
        if (!canvas)
          had_err = -1;
        offsetpos += height;
        if (!had_err)
          had_err = gt_layout_sketch(l, canvas, err);
      }

      /* reset the pointers after deletion, so that a failure in the next
         iteration cannot delete an object from this one a second time */
      gt_canvas_delete(canvas);
      canvas = NULL;
      gt_layout_delete(l);
      l = NULL;
      gt_diagram_delete(d);
      d = NULL;
      if (ct)
        gt_custom_track_delete(ct);
    }

    /* finish the last page */
    draw_header(cr, gt_str_get(arguments->text), argv[parsed_args+1], seqid,
                num_pages, mm_to_pt(arguments->pwidth),
                mm_to_pt(arguments->pheight), arguments->theight);
    cairo_show_page(cr);
    num_pages++;
    gt_log_log("finished, should be %lu pages\n", num_pages);

    gt_text_width_calculator_delete(twc);
    cairo_destroy(cr);
    cairo_surface_flush(surf);
    cairo_surface_finish(surf);
    cairo_surface_destroy(surf);
    cairo_debug_reset_static_data();
    if (bioseq)
      gt_bioseq_delete(bioseq);
  }

  /* free everything which has been allocated before the drawing block,
     independent of success or failure */
  gt_style_delete(sty);
  gt_str_delete(gt_style_file);
  gt_feature_index_delete(features);

  return had_err;
}