/* Tool runner: walks over all sequences of all input files given after the
   parsed options, passes each one to gt_mutate_seq() together with
   <arguments->rate>, and prints the resulting sequence as a FASTA entry to
   <arguments->outfp> using line width <arguments->width>.
   Returns 0 on success; otherwise the non-zero value reported by the bioseq
   iterator (the error is stored in <err>). */
static int gt_seqmutate_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GtError *err)
{
  MutateArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  GtBioseq *bioseq;
  int had_err;

  gt_error_check(err);
  gt_assert(arguments);

  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  for (;;) {
    unsigned long seqnum;

    /* stop on iterator error or when no further bioseq is delivered */
    had_err = gt_bioseq_iterator_next(bsi, &bioseq, err);
    if (had_err || !bioseq)
      break;

    for (seqnum = 0;
         seqnum < gt_bioseq_number_of_sequences(bioseq);
         seqnum++) {
      GtSeq *mutated = gt_mutate_seq(gt_bioseq_get_description(bioseq, seqnum),
                                     gt_bioseq_get_sequence(bioseq, seqnum),
                                     gt_bioseq_get_sequence_length(bioseq,
                                                                   seqnum),
                                     gt_bioseq_get_alphabet(bioseq),
                                     arguments->rate);
      gt_fasta_show_entry(gt_seq_get_description(mutated),
                          gt_seq_get_orig(mutated),
                          gt_seq_length(mutated),
                          arguments->width, arguments->outfp);
      gt_seq_delete(mutated);
    }
    gt_bioseq_delete(bioseq);
  }

  gt_bioseq_iterator_delete(bsi);
  return had_err;
}
/* Allocates a new GtBioseq for <sequence_file>. The name "-" selects standard
   input; any other name must refer to an existing file. The object is filled
   via bioseq_fill() (passing on <recreate>) and gets a zero-initialized
   description array with one slot per sequence of its encoded sequence.
   Returns the new object, or NULL if the file is missing or filling failed
   (in which case <err> is set). */
static GtBioseq* bioseq_new_with_recreate_and_type(GtStr *sequence_file,
                                                   bool recreate,
                                                   GtError *err)
{
  const char *path;
  GtBioseq *bs;
  int had_err = 0;

  gt_error_check(err);

  bs = gt_calloc(1, sizeof *bs);
  path = gt_str_get(sequence_file);
  if (strcmp(path, "-") == 0)
    bs->use_stdin = true;

  if (bs->use_stdin || gt_file_exists(path)) {
    /* keep our own reference to the file name before filling the object */
    bs->sequence_file = gt_str_ref(sequence_file);
    had_err = bioseq_fill(bs, recreate, err);
  }
  else {
    gt_error_set(err, "sequence file \"%s\" does not exist or is not readable",
                 path);
    had_err = -1;
  }

  if (had_err) {
    gt_bioseq_delete(bs);
    return NULL;
  }

  gt_assert(bs->encseq);
  bs->descriptions = gt_calloc(gt_encseq_num_of_sequences(bs->encseq),
                               sizeof (char*));
  return bs;
}
/* Tool runner for sequence extraction. If a FASTA key file was given, the
   whole job is delegated to process_fastakeyfile(). Otherwise every input
   file is processed in turn: by position (extractseq_pos()) when
   <arguments->frompos> is set, by pattern (extractseq_match()) otherwise.
   Returns 0 on success and a non-zero value on error (see <err>). */
static int gt_extractseq_runner(int argc, const char **argv, int parsed_args,
                                void *tool_arguments, GtError *err)
{
  ExtractSeqArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  GtBioseq *bs;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  if (gt_str_length(arguments->fastakeyfile)) {
    return process_fastakeyfile(arguments->fastakeyfile, argc - parsed_args,
                                argv + parsed_args, arguments->width,
                                arguments->outfp, err);
  }

  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);
  while (!had_err) {
    had_err = gt_bioseq_iterator_next(bsi, &bs, err);
    if (had_err || !bs)
      break;

    if (arguments->frompos) {
      had_err = extractseq_pos(arguments->outfp, bs, arguments->frompos,
                               arguments->topos, arguments->width, err);
    }
    else {
      had_err = extractseq_match(arguments->outfp, bs,
                                 gt_str_get(arguments->pattern),
                                 arguments->width, err);
    }
    gt_bioseq_delete(bs);
  }
  gt_bioseq_iterator_delete(bsi);

  return had_err;
}
/* Makes sure that <rm->bioseq> holds the sequence file belonging to <seqid>.
   Nothing is done if a sequence file is already cached for the same sequence
   name. Otherwise the file name is looked up via region_mapping_map(), the
   cached sequence name is replaced by <seqid> and the bioseq is reloaded.
   Returns 0 on success and -1 on error (in which case <err> is set by the
   failing callee). */
static int update_bioseq_if_necessary(GtRegionMapping *rm, GtStr *seqid,
                                      GtError *err)
{
  int had_err = 0;
  gt_error_check(err);
  gt_assert(rm && seqid);
  if (!rm->sequence_file || gt_str_cmp(rm->sequence_name, seqid)) {
    gt_str_delete(rm->sequence_file);
    rm->sequence_file = region_mapping_map(rm, gt_str_get(seqid), err);
    if (!rm->sequence_file)
      had_err = -1;
    else {
      if (!rm->sequence_name)
        rm->sequence_name = gt_str_new();
      else
        gt_str_reset(rm->sequence_name);
      gt_str_append_str(rm->sequence_name, seqid);
      gt_bioseq_delete(rm->bioseq);
      rm->bioseq = gt_bioseq_new_str(rm->sequence_file, err);
      if (!rm->bioseq) {
        /* Invalidate the cache key: with <sequence_file> and <sequence_name>
           still set, the next call for the same <seqid> would be treated as
           a cache hit and report success although no bioseq is loaded. */
        gt_str_delete(rm->sequence_file);
        rm->sequence_file = NULL;
        had_err = -1;
      }
    }
  }
  return had_err;
}
/* Releases the members of the bioseq collection behind <sc>: the grep cache,
   every contained bioseq and the array holding them. Does nothing if the
   cast to GtBioseqCol yields NULL. The collection object itself is not freed
   here. */
static void gt_bioseq_col_delete(GtSeqCol *sc)
{
  GtBioseqCol *bsc = gt_bioseq_col_cast(sc);
  GtUword idx;

  if (bsc == NULL)
    return;

  gt_seq_info_cache_delete(bsc->grep_cache);

  idx = 0;
  while (idx < bsc->num_of_seqfiles) {
    gt_bioseq_delete(bsc->bioseqs[idx]);
    idx++;
  }
  gt_free(bsc->bioseqs);
}
/* Processes a comma-separated <target> string. For each entry, the part
   before the first blank is GFF3-unescaped and then searched (via
   gt_string_matching_bmh()) in the descriptions of all sequences of all
   files in <seqfiles>; show_target() is passed as the match callback.
   Returns 0 on success and a non-zero value if unescaping or opening a
   sequence file fails (see <err>). */
static int extracttarget_from_seqfiles(const char *target,
                                       GtStrArray *seqfiles, GtError *err)
{
  GtSplitter *comma_splitter;
  GtStr *unescaped_target;
  char *target_copy;
  unsigned long entry_idx;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(target && seqfiles);

  comma_splitter = gt_splitter_new();
  unescaped_target = gt_str_new();
  /* the splitter works on a private, writable copy of the target string */
  target_copy = gt_cstr_dup(target);
  gt_splitter_split(comma_splitter, target_copy, strlen(target_copy), ',');

  entry_idx = 0;
  while (!had_err && entry_idx < gt_splitter_size(comma_splitter)) {
    char *entry = gt_splitter_get_token(comma_splitter, entry_idx);
    GtSplitter *blank_splitter = gt_splitter_new();

    gt_splitter_split(blank_splitter, entry, strlen(entry), ' ');
    /* NOTE(review): <unescaped_target> is not reset between entries; if
       gt_gff3_unescape() appends to its output string, later entries would
       be searched with the concatenation of all previous ones -- confirm. */
    had_err = gt_gff3_unescape(unescaped_target,
                               gt_splitter_get_token(blank_splitter, 0),
                               strlen(gt_splitter_get_token(blank_splitter,
                                                            0)),
                               err);
    if (!had_err) {
      unsigned long file_idx;
      for (file_idx = 0; file_idx < gt_str_array_size(seqfiles); file_idx++) {
        unsigned long seq_idx;
        GtBioseq *bioseq = gt_bioseq_new(gt_str_array_get(seqfiles, file_idx),
                                         err);
        if (bioseq == NULL) {
          had_err = -1;
          break;
        }
        for (seq_idx = 0;
             seq_idx < gt_bioseq_number_of_sequences(bioseq);
             seq_idx++) {
          TargetInfo target_info;
          const char *desc = gt_bioseq_get_description(bioseq, seq_idx);
          target_info.bioseq = bioseq;
          target_info.seqnum = seq_idx;
          gt_string_matching_bmh(desc, strlen(desc),
                                 gt_str_get(unescaped_target),
                                 gt_str_length(unescaped_target),
                                 show_target, &target_info);
        }
        gt_bioseq_delete(bioseq);
      }
    }
    gt_splitter_delete(blank_splitter);
    entry_idx++;
  }

  gt_free(target_copy);
  gt_str_delete(unescaped_target);
  gt_splitter_delete(comma_splitter);
  return had_err;
}
/* Drops one reference to <rm>. While the reference count is non-zero it is
   merely decremented; once it has reached zero, all owned members and the
   object itself are freed. Passing NULL is allowed and does nothing. */
void gt_region_mapping_delete(GtRegionMapping *rm)
{
  if (rm == NULL)
    return;

  if (rm->reference_count != 0) {
    /* still shared: only give up this reference */
    --rm->reference_count;
    return;
  }

  gt_str_delete(rm->sequence_filename);
  gt_str_delete(rm->sequence_file);
  gt_str_delete(rm->sequence_name);
  gt_mapping_delete(rm->mapping);
  gt_bioseq_delete(rm->bioseq);
  gt_free(rm);
}
/* Tool runner: prints every input sequence that satisfies the configured
   minimum length, maximum length and maximum number of sequences as a FASTA
   entry to <arguments->outfp>; all other sequences are counted as filtered.
   A limit equal to GT_UNDEF_ULONG is treated as "not set". On success a
   one-line statistic is written to stderr.
   Returns 0 on success; otherwise the non-zero value reported by the bioseq
   iterator (the error is stored in <err>). */
static int gt_seqfilter_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GtError *err)
{
  SeqFilterArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  GtBioseq *bioseq;
  unsigned long i;
  unsigned long long passed = 0, filtered = 0, num_of_sequences = 0;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(tool_arguments);

  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  while (!(had_err = gt_bioseq_iterator_next(bsi, &bioseq, err)) && bioseq) {
    for (i = 0; i < gt_bioseq_number_of_sequences(bioseq); i++) {
      if ((arguments->minlength == GT_UNDEF_ULONG ||
           gt_bioseq_get_sequence_length(bioseq, i) >= arguments->minlength) &&
          (arguments->maxlength == GT_UNDEF_ULONG ||
           gt_bioseq_get_sequence_length(bioseq, i) <= arguments->maxlength) &&
          (arguments->maxseqnum == GT_UNDEF_ULONG ||
           passed + 1 <= arguments->maxseqnum)) {
        gt_fasta_show_entry(gt_bioseq_get_description(bioseq, i),
                            gt_bioseq_get_sequence(bioseq, i),
                            gt_bioseq_get_sequence_length(bioseq, i),
                            arguments->width, arguments->outfp);
        passed++;
      }
      else
        filtered++;
      num_of_sequences++;
    }
    gt_bioseq_delete(bioseq);
  }

  /* show statistics */
  if (!had_err) {
    double removed_percent = 0.0;
    gt_assert(passed + filtered == num_of_sequences);
    /* without any input sequence the ratio would be 0/0 (printed as NaN) */
    if (num_of_sequences > 0)
      removed_percent = ((double) filtered / num_of_sequences) * 100.0;
    fprintf(stderr, "# %llu out of %llu sequences have been removed (%.3f%%)\n",
            filtered, num_of_sequences, removed_percent);
  }

  gt_bioseq_iterator_delete(bsi);

  return had_err;
}
/* Tool runner: for every input file, <arguments->coverage> shredders are run
   one after another. Each shredder is configured with the minimum/maximum
   length, overlap and sample probability from <arguments>; every fragment it
   delivers is printed as a FASTA entry whose description gets the suffix
   " [shreddered fragment]".
   Returns 0 on success; otherwise the non-zero value reported by the bioseq
   iterator (the error is stored in <err>). */
static int gt_shredder_runner(GT_UNUSED int argc, const char **argv,
                              int parsed_args, void *tool_arguments,
                              GtError *err)
{
  GtShredderArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  GtBioseq *bioseq;
  GtStr *desc;
  int had_err;

  gt_error_check(err);
  gt_assert(arguments);

  /* init */
  desc = gt_str_new();
  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  /* shredder */
  for (;;) {
    unsigned long round;

    had_err = gt_bioseq_iterator_next(bsi, &bioseq, err);
    if (had_err || !bioseq)
      break;

    for (round = 0; round < arguments->coverage; round++) {
      const char *fragment;
      unsigned long fragment_length;
      GtShredder *shredder = gt_shredder_new(bioseq, arguments->minlength,
                                             arguments->maxlength);

      gt_shredder_set_overlap(shredder, arguments->overlap);
      gt_shredder_set_sample_probability(shredder,
                                         arguments->sample_probability);

      fragment = gt_shredder_shred(shredder, &fragment_length, desc);
      while (fragment) {
        gt_str_append_cstr(desc, " [shreddered fragment]");
        gt_fasta_show_entry(gt_str_get(desc), fragment, fragment_length, 0);
        fragment = gt_shredder_shred(shredder, &fragment_length, desc);
      }
      gt_shredder_delete(shredder);
    }
    gt_bioseq_delete(bioseq);
  }

  /* free */
  gt_bioseq_iterator_delete(bsi);
  gt_str_delete(desc);

  return had_err;
}
/* Tool runner: re-emits every sequence of every input file as a FASTA entry.
   If <arguments->addstopaminos> is set, the file has a protein alphabet and
   a non-empty sequence does not already end with GT_STOP_AMINO, then
   GT_STOP_AMINO_CSTR is passed as suffix for the entry.
   Returns 0 on success; otherwise the non-zero value reported by the bioseq
   iterator (the error is stored in <err>). */
static int gt_seqtransform_runner(int argc, const char **argv,
                                  int parsed_args, void *tool_arguments,
                                  GtError *err)
{
  SeqtransformArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  GtBioseq *bioseq;
  int had_err;

  gt_error_check(err);
  gt_assert(arguments);

  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  for (;;) {
    GtAlphabet *alphabet;
    bool is_protein;
    unsigned long seqnum;

    had_err = gt_bioseq_iterator_next(bsi, &bioseq, err);
    if (had_err || !bioseq)
      break;

    alphabet = gt_bioseq_get_alphabet(bioseq);
    is_protein = gt_alphabet_is_protein(alphabet);

    for (seqnum = 0;
         seqnum < gt_bioseq_number_of_sequences(bioseq);
         seqnum++) {
      const char *desc = gt_bioseq_get_description(bioseq, seqnum);
      char *seq = gt_bioseq_get_sequence(bioseq, seqnum);
      unsigned long seqlen = gt_bioseq_get_sequence_length(bioseq, seqnum);
      bool needs_stop = arguments->addstopaminos && is_protein && seqlen &&
                        seq[seqlen-1] != GT_STOP_AMINO;
      const char *suffix = needs_stop ? GT_STOP_AMINO_CSTR : NULL;

      gt_fasta_show_entry_with_suffix(desc, seq, seqlen, suffix,
                                      arguments->width, arguments->outfp);
      /* the sequence string is released by this function after printing */
      gt_free(seq);
    }
    gt_bioseq_delete(bioseq);
  }

  gt_bioseq_iterator_delete(bsi);

  return had_err;
}
/* Writes every sequence of <filename> into a file of its own. The output
   name is built as <splitdesc> + '/' + sequence description + the suffix of
   <filename>; the file is opened via gt_output_file_xopen_forcecheck() with
   the given <force> flag and receives a single FASTA entry with line width
   <width>.
   Returns 0 on success and -1 if the input could not be opened or an output
   file could not be created (see <err>). */
static int split_description(const char *filename, GtStr *splitdesc,
                             unsigned long width, bool force, GtError *err)
{
  GtBioseq *bioseq;
  GtStr *outname;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(filename && splitdesc && gt_str_length(splitdesc));

  outname = gt_str_new();
  bioseq = gt_bioseq_new(filename, err);

  if (bioseq == NULL) {
    had_err = -1;
  }
  else {
    unsigned long seqnum;
    for (seqnum = 0;
         seqnum < gt_bioseq_number_of_sequences(bioseq);
         seqnum++) {
      GtFile *outfp;
      char *seq;

      /* assemble the output file name for this sequence */
      gt_str_reset(outname);
      gt_str_append_str(outname, splitdesc);
      gt_str_append_char(outname, '/');
      gt_str_append_cstr(outname, gt_bioseq_get_description(bioseq, seqnum));
      gt_str_append_cstr(outname, gt_file_suffix(filename));

      outfp = gt_output_file_xopen_forcecheck(gt_str_get(outname), "w", force,
                                              err);
      if (outfp == NULL) {
        had_err = -1;
        break;
      }

      seq = gt_bioseq_get_sequence(bioseq, seqnum);
      gt_fasta_show_entry(gt_bioseq_get_description(bioseq, seqnum), seq,
                          gt_bioseq_get_sequence_length(bioseq, seqnum),
                          width, outfp);
      gt_free(seq);
      gt_file_delete(outfp);
    }
  }

  gt_bioseq_delete(bioseq);
  gt_str_delete(outname);

  return had_err;
}
int main(int argc, char *argv[]) { const char *style_file, *png_file, *gff3_file; char *seqid; GtStyle *style; GtBioseq *bioseq; GtFeatureIndex *feature_index; GtRange range; GtDiagram *diagram; GtLayout *layout; GtCanvas *canvas; GtCustomTrack *custom; GtUword height, windowsize; GtError *err; if (argc != 9) { fprintf(stderr, "Usage: %s style_file PNG_file GFF3_file Seq_file seqid" " start end windowsize\n", argv[0]); return EXIT_FAILURE; } style_file = argv[1]; png_file = argv[2]; gff3_file = argv[3]; /* initialize */ gt_lib_init(); /* create error object */ err = gt_error_new(); /* create style */ if (!(style = gt_style_new(err))) handle_error(err); /* load style file */ if (gt_style_load_file(style, style_file, err)) handle_error(err); /* create feature index */ feature_index = gt_feature_index_memory_new(); /* add GFF3 file to index */ if (gt_feature_index_add_gff3file(feature_index, gff3_file, err)) handle_error(err); /* create diagram for first sequence ID in feature index */ seqid = argv[5]; if (gt_feature_index_get_range_for_seqid(feature_index, &range, seqid, err)) handle_error(err); sscanf(argv[6], "%lu", &range.start); sscanf(argv[7], "%lu", &range.end); sscanf(argv[8], "%lu", &windowsize); diagram = gt_diagram_new(feature_index, seqid, &range, style, err); if (gt_error_is_set(err)) handle_error(err); /* load sequence for GC plot */ bioseq = gt_bioseq_new(argv[4], err); if (gt_error_is_set(err)) handle_error(err); /* create custom track with GC plot for first sequence in file, window size 1000, 40px height and average line at 16.5% */ custom = gt_custom_track_gc_content_new(gt_bioseq_get_sequence(bioseq, 0), gt_bioseq_get_sequence_length(bioseq, 0), windowsize, 70, 0.165, true); gt_diagram_add_custom_track(diagram, custom); /* create layout with given width, determine resulting image height */ layout = gt_layout_new(diagram, 600, style, err); if (gt_error_is_set(err)) handle_error(err); if (gt_layout_get_height(layout, &height, err)) handle_error(err); 
/* create PNG canvas */ canvas = gt_canvas_cairo_file_new(style, GT_GRAPHICS_PNG, 600, height, NULL, err); if (!canvas) handle_error(err); /* sketch layout on canvas */ if (gt_layout_sketch(layout, canvas, err)) handle_error(err); /* write canvas to file */ if (gt_canvas_cairo_file_to_file((GtCanvasCairoFile*) canvas, png_file, err)) handle_error(err); /* free */ gt_custom_track_delete(custom); gt_bioseq_delete(bioseq); gt_canvas_delete(canvas); gt_layout_delete(layout); gt_diagram_delete(diagram); gt_feature_index_delete(feature_index); gt_style_delete(style); gt_error_delete(err); /* perform static data cleanup */ gt_lib_clean(); return EXIT_SUCCESS; }
/* Tool runner for LTRdigest (encseq based variant).
   argv[parsed_args] is the GFF3 input file, argv[parsed_args+1] the name of
   the encoded sequence index. The runner loads the index, optionally opens
   the tRNA library (enables the PBS search) and, if compiled with HMMER
   support, evaluates the protein domain cutoff option. It then builds the
   stream chain GFF3 input -> LTRdigest -> [tabular output] -> GFF3 output
   and pulls all nodes through it.
   Returns 0 on success and a non-zero value on error (see <err>). */
static int gt_ltrdigest_runner(GT_UNUSED int argc, const char **argv,
                               int parsed_args, void *tool_arguments,
                               GtError *err)
{
  GtLTRdigestOptions *arguments = tool_arguments;
  GtNodeStream *gff3_in_stream   = NULL,
               *gff3_out_stream  = NULL,
               *ltrdigest_stream = NULL,
               *tab_out_stream   = NULL,
               *last_stream      = NULL;
  int had_err      = 0,
      tests_to_run = 0,
      arg          = parsed_args;
  const char *indexname = argv[arg+1];
  GtLogger *logger;
  GtEncseqLoader *el;
  GtEncseq *encseq;

  gt_error_check(err);
  gt_assert(arguments);

  /* created only after <arguments> has been asserted to be non-NULL */
  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stdout);

  /* Set sequence encoder options. Defaults are ok. */
  el = gt_encseq_loader_new();
  gt_encseq_loader_set_logger(el, logger);

  /* Open sequence file */
  encseq = gt_encseq_loader_load(el, indexname, err);
  if (!encseq)
    had_err = -1;

  /* Always search for PPT. */
  tests_to_run |= GT_LTRDIGEST_RUN_PPT;

  /* Open tRNA library if given. */
  if (!had_err && arguments->trna_lib
        && gt_str_length(arguments->trna_lib) > 0) {
    tests_to_run |= GT_LTRDIGEST_RUN_PBS;
    arguments->pbs_opts.trna_lib = gt_bioseq_new(gt_str_get(arguments->trna_lib),
                                                 err);
    if (gt_error_is_set(err))
      had_err = -1;
  }

#ifdef HAVE_HMMER
  /* Open HMMER files if given. */
  if (!had_err && gt_str_array_size(arguments->pdom_opts.hmm_files) > 0) {
    tests_to_run |= GT_LTRDIGEST_RUN_PDOM;
    if (!strcmp(gt_str_get(arguments->cutoffs), "GA")) {
      arguments->pdom_opts.cutoff = GT_PHMM_CUTOFF_GA;
    } else if (!strcmp(gt_str_get(arguments->cutoffs), "TC")) {
      arguments->pdom_opts.cutoff = GT_PHMM_CUTOFF_TC;
    } else if (!strcmp(gt_str_get(arguments->cutoffs), "NONE")) {
      arguments->pdom_opts.cutoff = GT_PHMM_CUTOFF_NONE;
    } else {
      gt_error_set(err, "invalid cutoff setting!");
      had_err = -1;
    }
  }
#endif

  if (!had_err) {
    /* set up stream flow
     * ------------------*/
    last_stream = gff3_in_stream = gt_gff3_in_stream_new_sorted(argv[arg]);

    last_stream = ltrdigest_stream = gt_ltrdigest_stream_new(last_stream,
                                                  tests_to_run,
                                                  encseq,
                                                  &arguments->pbs_opts,
                                                  &arguments->ppt_opts,
#ifdef HAVE_HMMER
                                                  &arguments->pdom_opts,
#endif
                                                  err);
    if (!ltrdigest_stream)
      had_err = -1;
  }

  /* attach tabular output stream, if requested */
  if (!had_err && gt_str_length(arguments->prefix) > 0) {
    tab_out_stream = gt_ltr_fileout_stream_new(last_stream,
                                              tests_to_run,
                                              encseq,
                                              gt_str_get(arguments->prefix),
                                              &arguments->ppt_opts,
                                              &arguments->pbs_opts,
#ifdef HAVE_HMMER
                                              &arguments->pdom_opts,
#endif
                                              gt_str_get(arguments->trna_lib),
                                              argv[arg+1],
                                              argv[arg],
                                              arguments->seqnamelen,
                                              err);
    if (!tab_out_stream) {
      /* do not configure or chain a stream which could not be created */
      had_err = -1;
    }
    else {
      last_stream = tab_out_stream;
#ifdef HAVE_HMMER
      /* test the option values; testing their addresses (as done before)
         is always true and enabled both outputs unconditionally */
      if (arguments->pdom_opts.write_alignments)
        gt_ltr_fileout_stream_enable_pdom_alignment_output(tab_out_stream);
      if (arguments->pdom_opts.write_aaseqs)
        gt_ltr_fileout_stream_enable_aa_sequence_output(tab_out_stream);
#endif
    }
  }

  if (!had_err) {
    last_stream = gff3_out_stream = gt_gff3_out_stream_new(last_stream,
                                                           arguments->outfp);

    /* pull the features through the stream and free them afterwards */
    had_err = gt_node_stream_pull(last_stream, err);
  }

  gt_node_stream_delete(gff3_out_stream);
  gt_node_stream_delete(ltrdigest_stream);
  if (tab_out_stream != NULL)
    gt_node_stream_delete(tab_out_stream);
  gt_node_stream_delete(gff3_in_stream);

  gt_encseq_loader_delete(el);
  gt_encseq_delete(encseq);
  encseq = NULL;
  gt_bioseq_delete(arguments->pbs_opts.trna_lib);
  gt_logger_delete(logger);

  return had_err;
}
/* Tool runner for LTRdigest (region mapping based variant).
   The sequence source is either taken from the seqid2file options or, as a
   legacy fallback, from an encoded sequence index named by the last command
   line argument. Depending on the options, protein domain, PBS and PPT
   visitors are chained behind the sorted GFF3 input stream
   (argv[parsed_args]), followed by a strand assignment visitor, an optional
   tabular output stream and the GFF3 output stream.
   Returns 0 on success and a non-zero value on error (see <err>). */
static int gt_ltrdigest_runner(GT_UNUSED int argc, const char **argv,
                               int parsed_args, void *tool_arguments,
                               GtError *err)
{
  GtLTRdigestOptions *arguments = tool_arguments;
  GtNodeStream *gff3_in_stream  = NULL,
               *gff3_out_stream = NULL,
               *pdom_stream     = NULL,
               *ppt_stream      = NULL,
               *pbs_stream      = NULL,
               *tab_out_stream  = NULL,
               *sa_stream       = NULL,
               *last_stream     = NULL;
  int had_err      = 0,
      tests_to_run = 0,
      arg          = parsed_args;
  GtRegionMapping *rmap = NULL;
  GtPdomModelSet *ms = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  /* determine and open sequence source */
  if (gt_seqid2file_option_used(arguments->s2fi)) {
    /* create region mapping */
    rmap = gt_seqid2file_region_mapping_new(arguments->s2fi, err);
    if (!rmap)
      had_err = -1;
  } else {
    GtEncseqLoader *el;
    GtEncseq *encseq;
    /* no new-style sequence source option given, fall back to legacy syntax */
    if (argc < 3) {
      gt_error_set(err, "missing mandatory argument(s)");
      had_err = -1;
    }
    if (!had_err) {
      el = gt_encseq_loader_new();
      gt_encseq_loader_disable_autosupport(el);
      gt_encseq_loader_require_md5_support(el);
      gt_encseq_loader_require_description_support(el);
      encseq = gt_encseq_loader_load(el, argv[argc-1], err);
      /* XXX: clip off terminal argument */
      gt_free((char*) argv[argc-1]);
      argv[argc-1] = NULL;
      argc--;
      gt_encseq_loader_delete(el);
      if (!encseq)
        had_err = -1;
      else {
        rmap = gt_region_mapping_new_encseq_seqno(encseq);
        gt_encseq_delete(encseq);
      }
    }
  }
  gt_assert(had_err || rmap);

  /* Always search for PPT. */
  tests_to_run |= GT_LTRDIGEST_RUN_PPT;

  /* Open tRNA library if given. */
  if (!had_err && arguments->trna_lib
        && gt_str_length(arguments->trna_lib) > 0) {
    tests_to_run |= GT_LTRDIGEST_RUN_PBS;
    arguments->trna_lib_bs = gt_bioseq_new(gt_str_get(arguments->trna_lib),
                                           err);
    if (gt_error_is_set(err))
      had_err = -1;
  }

  /* Set HMMER cutoffs. */
  if (!had_err && gt_str_array_size(arguments->hmm_files) > 0) {
    tests_to_run |= GT_LTRDIGEST_RUN_PDOM;
    if (!strcmp(gt_str_get(arguments->cutoffs), "GA")) {
      arguments->cutoff = GT_PHMM_CUTOFF_GA;
    } else if (!strcmp(gt_str_get(arguments->cutoffs), "TC")) {
      arguments->cutoff = GT_PHMM_CUTOFF_TC;
    } else if (!strcmp(gt_str_get(arguments->cutoffs), "NONE")) {
      arguments->cutoff = GT_PHMM_CUTOFF_NONE;
    } else {
      gt_error_set(err, "invalid cutoff setting!");
      had_err = -1;
    }
  }

  if (!had_err) {
    last_stream = gff3_in_stream = gt_gff3_in_stream_new_sorted(argv[arg]);
  }

  /* protein domain search, if HMM files were given */
  if (!had_err && gt_str_array_size(arguments->hmm_files) > 0) {
    GtNodeVisitor *pdom_v;
    ms = gt_pdom_model_set_new(arguments->hmm_files, err);
    if (ms != NULL) {
      pdom_v = gt_ltrdigest_pdom_visitor_new(ms, arguments->evalue_cutoff,
                                             arguments->chain_max_gap_length,
                                             arguments->cutoff, rmap, err);
      if (pdom_v == NULL)
        had_err = -1;
      if (!had_err) {
        gt_ltrdigest_pdom_visitor_set_source_tag((GtLTRdigestPdomVisitor*)
                                                                       pdom_v,
                                                 GT_LTRDIGEST_TAG);
        if (arguments->output_all_chains)
          gt_ltrdigest_pdom_visitor_output_all_chains((GtLTRdigestPdomVisitor*)
                                                                       pdom_v);
        last_stream = pdom_stream = gt_visitor_stream_new(last_stream, pdom_v);
      }
    } else
      had_err = -1;
  }

  /* PBS search, if a tRNA library was loaded */
  if (!had_err && arguments->trna_lib_bs) {
    GtNodeVisitor *pbs_v;
    pbs_v = gt_ltrdigest_pbs_visitor_new(rmap, arguments->pbs_radius,
                                         arguments->max_edist,
                                         arguments->alilen,
                                         arguments->offsetlen,
                                         arguments->trnaoffsetlen,
                                         arguments->ali_score_match,
                                         arguments->ali_score_mismatch,
                                         arguments->ali_score_insertion,
                                         arguments->ali_score_deletion,
                                         arguments->trna_lib_bs, err);
    if (pbs_v != NULL)
      last_stream = pbs_stream = gt_visitor_stream_new(last_stream, pbs_v);
    else
      had_err = -1;
  }

  /* PPT search */
  if (!had_err) {
    GtNodeVisitor *ppt_v;
    ppt_v = gt_ltrdigest_ppt_visitor_new(rmap, arguments->ppt_len,
                                         arguments->ubox_len,
                                         arguments->ppt_pyrimidine_prob,
                                         arguments->ppt_purine_prob,
                                         arguments->bkg_a_prob,
                                         arguments->bkg_g_prob,
                                         arguments->bkg_t_prob,
                                         arguments->bkg_c_prob,
                                         arguments->ubox_u_prob,
                                         arguments->ppt_radius,
                                         arguments->max_ubox_dist, err);
    if (ppt_v != NULL)
      last_stream = ppt_stream = gt_visitor_stream_new(last_stream, ppt_v);
    else
      had_err = -1;
  }

  /* strand assignment */
  if (!had_err) {
    GtNodeVisitor *sa_v;
    sa_v = gt_ltrdigest_strand_assign_visitor_new();
    gt_assert(sa_v);
    last_stream = sa_stream = gt_visitor_stream_new(last_stream, sa_v);
  }

  /* attach tabular output stream, if requested */
  if (!had_err && gt_str_length(arguments->prefix) > 0) {
    tab_out_stream = gt_ltrdigest_file_out_stream_new(last_stream,
                                                 tests_to_run, rmap,
                                                 gt_str_get(arguments->prefix),
                                                 arguments->seqnamelen, err);
    if (!tab_out_stream)
      had_err = -1;
    else
      last_stream = tab_out_stream;
    if (!had_err && arguments->print_metadata) {
      had_err = gt_ltrdigest_file_out_stream_write_metadata(
                                    (GtLTRdigestFileOutStream*) tab_out_stream,
                                    tests_to_run,
                                    gt_str_get(arguments->trna_lib),
                                    argv[arg],
                                    arguments->ppt_len,
                                    arguments->ubox_len,
                                    arguments->ppt_radius,
                                    arguments->alilen,
                                    arguments->max_edist,
                                    arguments->offsetlen,
                                    arguments->trnaoffsetlen,
                                    arguments->pbs_radius,
                                    arguments->hmm_files,
                                    arguments->chain_max_gap_length,
                                    arguments->evalue_cutoff,
                                    err);
    }
    if (!had_err) {
      if (arguments->write_alignments)
        gt_ltrdigest_file_out_stream_enable_pdom_alignment_output(
                                                               tab_out_stream);
      if (arguments->write_aaseqs)
        gt_ltrdigest_file_out_stream_enable_aa_sequence_output(
                                                               tab_out_stream);
    }
  }

  /* Only run the pipeline if everything above succeeded. Previously a
     failure while creating the tabular output stream or while writing the
     metadata was overwritten by the result of the pull (which then also ran
     on a NULL input stream in the former case). */
  if (!had_err) {
    last_stream = gff3_out_stream = gt_gff3_out_stream_new(last_stream,
                                                           arguments->outfp);

    /* pull the features through the stream and free them afterwards */
    had_err = gt_node_stream_pull(last_stream, err);
  }

  gt_pdom_model_set_delete(ms);
  gt_node_stream_delete(gff3_out_stream);
  gt_node_stream_delete(ppt_stream);
  gt_node_stream_delete(pbs_stream);
  gt_node_stream_delete(sa_stream);
  gt_node_stream_delete(pdom_stream);
  gt_node_stream_delete(tab_out_stream);
  gt_node_stream_delete(gff3_in_stream);
  gt_bioseq_delete(arguments->trna_lib_bs);
  gt_region_mapping_delete(rmap);

  return had_err;
}
/* Tool runner: prints the input sequences which pass all configured filters
   (step, sampling probability, minimum and maximum length) as FASTA entries
   to <arguments->outfp>, stopping to emit sequences once
   <arguments->maxseqnum> sequences have passed. Limits equal to
   GT_UNDEF_UWORD are treated as "not set". On success a one-line statistic
   is written to stderr.
   Returns 0 on success; otherwise the non-zero value reported by the bioseq
   iterator (the error is stored in <err>). */
static int gt_seqfilter_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GtError *err)
{
  SeqFilterArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  GtBioseq *bioseq;
  GtUint64 passed = 0, filtered = 0, num_of_sequences = 0, steps = 0;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(tool_arguments);

  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  while (!(had_err = gt_bioseq_iterator_next(bsi, &bioseq, err)) &&
         bioseq != NULL) {
    GtUword i;
    GtUint64 current_num = gt_bioseq_number_of_sequences(bioseq);
    for (i = 0; i < current_num &&
                (arguments->maxseqnum == GT_UNDEF_UWORD ||
                 passed + 1 <= arguments->maxseqnum); i++) {
      char *seq;
      if ((arguments->step == 1 ||
           steps + 1 == arguments->step) &&
          (arguments->sample_prob == 1.0 ||
           gt_rand_0_to_1() <= arguments->sample_prob) &&
          (arguments->minlength == GT_UNDEF_UWORD ||
           gt_bioseq_get_sequence_length(bioseq, i) >= arguments->minlength) &&
          (arguments->maxlength == GT_UNDEF_UWORD ||
           gt_bioseq_get_sequence_length(bioseq, i) <= arguments->maxlength)) {
        seq = gt_bioseq_get_sequence(bioseq, i);
        gt_fasta_show_entry(gt_bioseq_get_description(bioseq, i),
                            seq,
                            gt_bioseq_get_sequence_length(bioseq, i),
                            arguments->width, arguments->outfp);
        gt_free(seq);
        passed++;
      }
      else {
        filtered++;
      }
      /* cyclic counter: wraps to 0 each time the step width is reached */
      steps = (steps + 1 == arguments->step) ? 0 : steps + 1;
    }
    /* sequences skipped because the maximum number was reached count as
       filtered, too */
    filtered += current_num - i;
    num_of_sequences += current_num;
    gt_bioseq_delete(bioseq);
  }

  /* show statistics */
  if (!had_err) {
    double removed_percent = 0.0;
    gt_assert(passed + filtered == num_of_sequences);
    /* without any input sequence the ratio would be 0/0 (printed as NaN) */
    if (num_of_sequences > 0)
      removed_percent = ((double) filtered / num_of_sequences) * 100.0;
    fprintf(stderr, "# " GT_LLU " out of " GT_LLU
            " sequences have been removed (%.3f%%)\n",
            filtered, num_of_sequences, removed_percent);
  }

  gt_bioseq_iterator_delete(bsi);

  return had_err;
}
/* Tool runner: prints every input sequence whose MD5 fingerprint has not been
   seen before (as decided by gt_md5set_add_sequence(), which also receives
   <arguments->rev>) as a FASTA entry to <arguments->outfp>; all others are
   counted as duplicates. Without <arguments->seqit> the files are loaded one
   by one as GtBioseq objects, otherwise they are streamed through a
   GtSeqIterator (optionally showing a progress bar). On success a one-line
   statistic is written to stderr.
   Returns 0 on success and -1 on error (see <err>). */
static int gt_sequniq_runner(int argc, const char **argv, int parsed_args,
                             void *tool_arguments, GtError *err)
{
  GtSequniqArguments *arguments = tool_arguments;
  GtUint64 duplicates = 0, num_of_sequences = 0;
  int i, had_err = 0;
  GtMD5Set *md5set;

  gt_error_check(err);
  gt_assert(arguments);
  md5set = gt_md5set_new(arguments->nofseqs);
  if (!arguments->seqit) {
    GtUword j;
    GtBioseq *bs;

    for (i = parsed_args; !had_err && i < argc; i++) {
      if (!(bs = gt_bioseq_new(argv[i], err)))
        had_err = -1;
      if (!had_err) {
        GtMD5SetStatus retval;
        for (j = 0; j < gt_bioseq_number_of_sequences(bs) && !had_err; j++) {
          char *seq = gt_bioseq_get_sequence(bs, j);
          retval = gt_md5set_add_sequence(md5set, seq,
                                          gt_bioseq_get_sequence_length(bs, j),
                                          arguments->rev, err);
          if (retval == GT_MD5SET_NOT_FOUND)
            gt_fasta_show_entry(gt_bioseq_get_description(bs, j), seq,
                                gt_bioseq_get_sequence_length(bs, j),
                                arguments->width, arguments->outfp);
          else if (retval != GT_MD5SET_ERROR)
            duplicates++;
          else
            had_err = -1;
          num_of_sequences++;
          gt_free(seq);
        }
        gt_bioseq_delete(bs);
      }
    }
  }
  else {
    GtSeqIterator *seqit;
    GtStrArray *files;
    off_t totalsize;
    const GtUchar *sequence;
    char *desc;
    GtUword len;

    files = gt_str_array_new();
    for (i = parsed_args; i < argc; i++)
      gt_str_array_add_cstr(files, argv[i]);
    totalsize = gt_files_estimate_total_size(files);
    seqit = gt_seq_iterator_sequence_buffer_new(files, err);
    if (!seqit)
      had_err = -1;
    if (!had_err) {
      if (arguments->verbose) {
        gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                           (GtUint64)
                                                           totalsize),
                             (GtUint64) totalsize);
      }
      while (!had_err) {
        GtMD5SetStatus retval;
        int rval = gt_seq_iterator_next(seqit, &sequence, &len, &desc, err);
        if (rval < 0) {
          /* NOTE(review): a negative value is assumed to signal an iterator
             error (with <err> set); previously it was silently treated like
             the end of the input and 0 was returned -- confirm against the
             GtSeqIterator documentation. */
          had_err = -1;
          break;
        }
        if (rval != 1)
          break;
        retval = gt_md5set_add_sequence(md5set, (const char*) sequence, len,
                                        arguments->rev, err);
        if (retval == GT_MD5SET_NOT_FOUND)
          gt_fasta_show_entry(desc, (const char*) sequence, len,
                              arguments->width, arguments->outfp);
        else if (retval != GT_MD5SET_ERROR)
          duplicates++;
        else
          had_err = -1;
        num_of_sequences++;
      }
      if (arguments->verbose)
        gt_progressbar_stop();
      gt_seq_iterator_delete(seqit);
    }
    gt_str_array_delete(files);
  }

  /* show statistics */
  if (!had_err) {
    double removed_percent = 0.0;
    /* without any input sequence the ratio would be 0/0 (printed as NaN) */
    if (num_of_sequences > 0)
      removed_percent = ((double) duplicates / (double) num_of_sequences)
                        * 100.0;
    fprintf(stderr,
            "# "GT_WU" out of "GT_WU" sequences have been removed (%.3f%%)\n",
            (GtUword)duplicates, (GtUword)num_of_sequences, removed_percent);
  }

  gt_md5set_delete(md5set);
  return had_err;
}
/* Unit test for the PBS (primer binding site) search.
   A 640 character test sequence and a temporary two-entry tRNA library file
   are set up, gt_pbs_find() is run on a 600 character window of the sequence
   (and on the reverse complement of that window), and the two expected hits
   are checked: one on the forward strand for tRNA "test1" and one on the
   reverse strand for tRNA "test2".
   Returns the value of <had_err> as maintained by the ensure() checks.
   NOTE(review): statements outside of ensure() (e.g. the calls on <hit> and
   on <o.trna_lib>) are executed unconditionally; presumably ensure() skips
   its check once <had_err> is set -- confirm that these calls are safe after
   an earlier failure. */
int gt_pbs_unit_test(GtError *err)
{
  int had_err = 0;
  GtLTRElement element;
  GtPBSOptions o;
  GtStr *tmpfilename;
  FILE *tmpfp;
  GtPBSResults *res;
  GtPBSHit *hit;
  double score1, score2;
  GtRange rng;
  char *rev_seq, *seq, tmp[BUFSIZ];
  /* test sequence; the forward PBS literal starts at offset 120 and the
     reverse PBS literal at offset 506, matching the coordinates checked
     below */
  const char *fullseq = "aaaaaaaaaaaaaaaaaaaa"
                        "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                        "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                        "acatactaggatgctag" /* <- PBS forward */
                        "aatatagtttcgaatatagcactgcatttcgaa"
                        "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                        "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                        "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                        "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                        "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                        "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                        "tatagcactgcatttcgaatatagtttcgaatatag"
                        /* PBS reverse -> */ "gatcctaaggctac"
                        "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                        "tatagcactgcatttcgaatatagtttcgaatatagcactgcatttcgaa"
                        "aaaaaaaaaaaaaaaaaaaa";

  /* notice previous errors */
  gt_error_check(err);

  /* create temporary tRNA library file */
  tmpfilename = gt_str_new();
  tmpfp = gt_xtmpfp(tmpfilename);
  fprintf(tmpfp, ">test1\nccccccccccccccctagcatcctagtatgtccc\n"
                 ">test2\ncccccccccgatcctagggctaccctttc\n");
  gt_fa_xfclose(tmpfp);
  ensure(had_err, gt_file_exists(gt_str_get(tmpfilename)));

  /* setup testing parameters */
  o.radius = 30;
  o.max_edist = 1;
  o.alilen.start = 11;
  o.alilen.end = 30;
  o.offsetlen.start = 0;
  o.offsetlen.end = 5;
  o.trnaoffsetlen.start = 0;
  o.trnaoffsetlen.end = 40;
  o.ali_score_match = 5;
  o.ali_score_mismatch = -10;
  o.ali_score_insertion = o.ali_score_deletion = -20;
  /* load the library written above; it must contain both entries */
  o.trna_lib = gt_bioseq_new(gt_str_get(tmpfilename), err);
  ensure(had_err, gt_bioseq_number_of_sequences(o.trna_lib) == 2);

  /* LTR boundaries of the element handed to the search */
  element.leftLTR_5 = 20;
  element.leftLTR_3 = 119;
  element.rightLTR_5 = 520;
  element.rightLTR_3 = 619;

  /* setup sequences: both buffers get the 600 characters following the
     first 20 of <fullseq>; the second one is then reverse complemented */
  seq = gt_malloc(600 * sizeof (char));
  rev_seq = gt_malloc(600 * sizeof (char));
  memcpy(seq, fullseq + 20, 600);
  memcpy(rev_seq, fullseq + 20, 600);
  gt_reverse_complement(rev_seq, 600, err);

  /* try to find PBS in sequences */
  res = gt_pbs_find(seq, rev_seq, &element, &o, err);
  ensure(had_err, res != NULL);
  ensure(had_err, gt_pbs_results_get_number_of_hits(res) == 2);

  /* check first hit on forward strand */
  hit = gt_pbs_results_get_ranked_hit(res, 0);
  ensure(had_err, hit != NULL);
  ensure(had_err, gt_pbs_hit_get_alignment_length(hit) == 17);
  ensure(had_err, gt_pbs_hit_get_edist(hit) == 0);
  ensure(had_err, gt_pbs_hit_get_offset(hit) == 0);
  ensure(had_err, gt_pbs_hit_get_tstart(hit) == 3);
  ensure(had_err, strcmp(gt_pbs_hit_get_trna(hit), "test1") == 0);
  rng = gt_pbs_hit_get_coords(hit);
  ensure(had_err, rng.start == 120);
  ensure(had_err, rng.end == 136);
  score1 = gt_pbs_hit_get_score(hit);
  ensure(had_err, gt_pbs_hit_get_strand(hit) == GT_STRAND_FORWARD);
  /* extract the reported range from <fullseq> and compare it with the
     expected PBS; the preceding memset provides the terminating zero */
  memset(tmp, 0, BUFSIZ-1);
  memcpy(tmp, fullseq + (rng.start * sizeof (char)),
         (rng.end - rng.start + 1) * sizeof (char));
  ensure(had_err, strcmp(tmp, "acatactaggatgctag" ) == 0);

  /* check second hit on reverse strand */
  hit = gt_pbs_results_get_ranked_hit(res, 1);
  ensure(had_err, hit != NULL);
  ensure(had_err, gt_pbs_hit_get_alignment_length(hit) == 14);
  ensure(had_err, gt_pbs_hit_get_edist(hit) == 1);
  ensure(had_err, gt_pbs_hit_get_offset(hit) == 0);
  ensure(had_err, gt_pbs_hit_get_tstart(hit) == 6);
  ensure(had_err, strcmp(gt_pbs_hit_get_trna(hit), "test2") == 0);
  rng = gt_pbs_hit_get_coords(hit);
  ensure(had_err, rng.start == 506);
  ensure(had_err, rng.end == 519);
  score2 = gt_pbs_hit_get_score(hit);
  /* the hit at rank 0 must have the higher score */
  ensure(had_err, gt_double_compare(score1, score2) > 0);
  ensure(had_err, gt_pbs_hit_get_strand(hit) == GT_STRAND_REVERSE);
  memset(tmp, 0, BUFSIZ-1);
  memcpy(tmp, fullseq + (rng.start * sizeof (char)),
         (rng.end - rng.start + 1) * sizeof (char));
  ensure(had_err, strcmp(tmp, "gatcctaaggctac" ) == 0);

  /* clean up */
  gt_xremove(gt_str_get(tmpfilename));
  ensure(had_err, !gt_file_exists(gt_str_get(tmpfilename)));
  gt_str_delete(tmpfilename);
  gt_bioseq_delete(o.trna_lib);
  gt_free(rev_seq);
  gt_free(seq);
  gt_pbs_results_delete(res);

  return had_err;
}
/* Trains the model of <bssm_param> selected by <termtype> (GT donor, GC donor
   or AG acceptor) from the NUMOFFILES training files found below <path> in
   the sub directory belonging to the type ("/GT_donor/", "/GC_donor/" or
   "/AG_acceptor/"); ".gz" is appended to each file name if <gzip> is set.
   Before a file is used, every sequence in it is checked to have length
   STRINGSIZE and to carry the expected dinucleotide at (0-based) positions
   50 and 51.
   Returns 0 on success and -1 on error (in which case <err> is set). */
int gth_bssm_param_parameterize(GthBSSMParam *bssm_param, const char *path,
                                Termtype termtype, bool gzip, GtError *err)
{
  GtAlphabet *alphabet = NULL;
  GtBioseq *bioseq;
  GtStr *file2proc;
  GtUword i, j;
  int had_err = 0;

  gt_error_check(err);

  file2proc = gt_str_new();

  /* set version number */
  bssm_param->version_num = (unsigned char) MYVERSION;

  /* set model to true and set window sizes */
  switch (termtype) {
    case GT_DONOR_TYPE:
      bssm_param->gt_donor_model_set = true;
      set_window_sizes_in_Bssmmodel(&bssm_param->gt_donor_model);
      break;
    case GC_DONOR_TYPE:
      bssm_param->gc_donor_model_set = true;
      set_window_sizes_in_Bssmmodel(&bssm_param->gc_donor_model);
      break;
    case AG_ACCEPTOR_TYPE:
      bssm_param->ag_acceptor_model_set = true;
      set_window_sizes_in_Bssmmodel(&bssm_param->ag_acceptor_model);
      break;
    default: gt_assert(0);
  }

  for (i = 0; !had_err && i < NUMOFFILES; i++) {
    /* process datafile */
    gt_str_append_cstr(file2proc, path);
    switch (termtype) {
      case GT_DONOR_TYPE:
        gt_str_append_cstr(file2proc, "/GT_donor/");
        gt_str_append_cstr(file2proc, filenames[i]);
        break;
      case GC_DONOR_TYPE:
        gt_str_append_cstr(file2proc, "/GC_donor/");
        gt_str_append_cstr(file2proc, filenames[i]);
        break;
      case AG_ACCEPTOR_TYPE:
        gt_str_append_cstr(file2proc, "/AG_acceptor/");
        gt_str_append_cstr(file2proc, filenames[i]);
        break;
      default: gt_assert(0);
    }

    if (gzip)
      gt_str_append_cstr(file2proc, ".gz");

    if (!(bioseq = gt_bioseq_new(gt_str_get(file2proc), err)))
      had_err = -1;

    if (!had_err)
      alphabet = gt_bioseq_get_alphabet(bioseq);

    /* check here if all sequences have the length 102 and correct bases at
       positions 51 and 52 (i.e., GT, GC, or AG) */
    for (j = 0; !had_err && j < gt_bioseq_number_of_sequences(bioseq); j++) {
      GtUchar encoded_seq[2];
      /* check length */
      if (gt_bioseq_get_sequence_length(bioseq, j) != STRINGSIZE) {
        gt_error_set(err,
                     "sequence "GT_WU" in file \"%s\" does not have length %u",
                     j, gt_str_get(file2proc), STRINGSIZE);
        had_err = -1;
      }
      if (!had_err) {
        /* Only now is it known that positions 50 and 51 exist; fetching
           them before the length check has passed would read beyond the end
           of a too short sequence. */
        encoded_seq[0] = gt_bioseq_get_encoded_char(bioseq, j, 50);
        encoded_seq[1] = gt_bioseq_get_encoded_char(bioseq, j, 51);
        /* check base correctness */
        switch (termtype) {
          case GT_DONOR_TYPE:
            if (encoded_seq[0] != gt_alphabet_encode(alphabet, 'G') ||
                encoded_seq[1] != gt_alphabet_encode(alphabet, 'T')) {
              gt_error_set(err, "sequence "GT_WU" in file \"%s\" is not a GT "
                                "sequence", j, gt_str_get(file2proc));
              had_err = -1;
            }
            break;
          case GC_DONOR_TYPE:
            if (encoded_seq[0] != gt_alphabet_encode(alphabet, 'G') ||
                encoded_seq[1] != gt_alphabet_encode(alphabet, 'C')) {
              gt_error_set(err, "sequence "GT_WU" in file \"%s\" is not a GC "
                                "sequence", j, gt_str_get(file2proc));
              had_err = -1;
            }
            break;
          case AG_ACCEPTOR_TYPE:
            if (encoded_seq[0] != gt_alphabet_encode(alphabet, 'A') ||
                encoded_seq[1] != gt_alphabet_encode(alphabet, 'G')) {
              gt_error_set(err, "sequence "GT_WU" in file \"%s\" is not a AG "
                                "sequence", j, gt_str_get(file2proc));
              had_err = -1;
            }
            break;
          default: gt_assert(0);
        }
      }
    }

    if (!had_err) {
      switch (termtype) {
        case GT_DONOR_TYPE:
          build_bssm(bioseq, &bssm_param->gt_donor_model, i);
          break;
        case GC_DONOR_TYPE:
          build_bssm(bioseq, &bssm_param->gc_donor_model, i);
          break;
        case AG_ACCEPTOR_TYPE:
          build_bssm(bioseq, &bssm_param->ag_acceptor_model, i);
          break;
        default: gt_assert(0);
      }
    }

    /* reset */
    gt_str_reset(file2proc);

    /* free space */
    gt_bioseq_delete(bioseq);
  }
  gt_str_delete(file2proc);

  return had_err;
}
/* Runner of the sketch_page tool: draws the features of a GFF3 file onto the
   pages of a PDF or PostScript document using cairo.

   argv[parsed_args] is used as the output file name and argv[parsed_args+1]
   as the GFF3 input file. The remaining settings (style file, sequence
   region, display range, output format, page and text dimensions, optional
   sequence file for a GC content track) are taken from <tool_arguments>,
   which must point to a SketchPageArguments object.

   The display range is walked in steps of arguments->width; for each step a
   diagram and layout are built and sketched below the previous one. A new
   page is started whenever the next layout would not fit anymore. Each page
   gets a header drawn by draw_header().

   Returns 0 on success and a non-zero value on failure (the callees are
   expected to have stored the reason in <err>). */
static int gt_sketch_page_runner(GT_UNUSED int argc, const char **argv,
                                 int parsed_args, void *tool_arguments,
                                 GtError *err)
{
  SketchPageArguments *arguments = tool_arguments;
  int had_err = 0;
  GtFeatureIndex *features = NULL;
  GtRange qry_range, sequence_region_range;
  GtStyle *sty = NULL;
  GtStr *gt_style_file = NULL;
  GtBioseq *bioseq = NULL;
  const char *seqid = NULL, *outfile;
  unsigned long start, height, num_pages = 0;
  double offsetpos, usable_height;
  cairo_surface_t *surf = NULL;
  cairo_t *cr = NULL;
  GtTextWidthCalculator *twc;

  gt_error_check(err);

  features = gt_feature_index_memory_new();

  if (cairo_version() < CAIRO_VERSION_ENCODE(1, 8, 6))
    gt_warning("Your cairo library (version %s) is older than version 1.8.6! "
               "These versions contain a bug which may result in "
               "corrupted PDF output!", cairo_version_string());

  /* get style */
  sty = gt_style_new(err);
  if (!sty)
    had_err = -1;
  if (!had_err) {
    if (gt_str_length(arguments->stylefile) == 0) {
      /* no style file given: fall back to the default one in gtdata */
      GtStr *prog = gt_str_new();
      gt_str_append_cstr_nt(prog, argv[0],
                            gt_cstr_length_up_to_char(argv[0], ' '));
      gt_style_file = gt_get_gtdata_path(gt_str_get(prog), err);
      gt_str_delete(prog);
      if (gt_style_file)
        gt_str_append_cstr(gt_style_file, "/sketch/default.style");
      else
        had_err = -1;
    }
    else {
      gt_style_file = gt_str_ref(arguments->stylefile);
    }
  }
  if (!had_err)
    had_err = gt_style_load_file(sty, gt_str_get(gt_style_file), err);

  outfile = argv[parsed_args];
  if (!had_err) {
    /* get features */
    had_err = gt_feature_index_add_gff3file(features, argv[parsed_args+1],
                                            err);
    if (!had_err && gt_str_length(arguments->seqid) == 0) {
      seqid = gt_feature_index_get_first_seqid(features);
      if (seqid == NULL) {
        gt_error_set(err, "GFF input file must contain a sequence region!");
        had_err = -1;
      }
    }
    else if (!had_err
               && !gt_feature_index_has_seqid(features,
                                              gt_str_get(arguments->seqid))) {
      gt_error_set(err, "sequence region '%s' does not exist in GFF input file",
                   gt_str_get(arguments->seqid));
      had_err = -1;
    }
    else if (!had_err)
      seqid = gt_str_get(arguments->seqid);
  }

  /* set text: default to the GFF3 file name if none was given */
  if (gt_str_length(arguments->text) == 0) {
    gt_str_delete(arguments->text);
    arguments->text = gt_str_new_cstr(argv[parsed_args+1]);
  }

  if (!had_err) {
    /* set display range */
    gt_feature_index_get_range_for_seqid(features, &sequence_region_range,
                                         seqid);
    qry_range.start = (arguments->range.start == GT_UNDEF_ULONG ?
                         sequence_region_range.start :
                         arguments->range.start);
    qry_range.end   = (arguments->range.end == GT_UNDEF_ULONG ?
                         sequence_region_range.end :
                         arguments->range.end);

    /* set output format */
    /* NOTE(review): surf stays NULL if the format is neither "pdf" nor "ps";
       presumably option parsing restricts the choices -- confirm. */
    if (strcmp(gt_str_get(arguments->format), "pdf") == 0) {
      surf = cairo_pdf_surface_create(outfile,
                                      mm_to_pt(arguments->pwidth),
                                      mm_to_pt(arguments->pheight));
    }
    else if (strcmp(gt_str_get(arguments->format), "ps") == 0) {
      surf = cairo_ps_surface_create(outfile,
                                     mm_to_pt(arguments->pwidth),
                                     mm_to_pt(arguments->pheight));
    }
    gt_log_log("created page with %.2f:%.2f dimensions\n",
               mm_to_pt(arguments->pwidth),
               mm_to_pt(arguments->pheight));

    offsetpos = TEXT_SPACER + arguments->theight + TEXT_SPACER;
    usable_height = mm_to_pt(arguments->pheight)
                      - arguments->theight
                      - arguments->theight
                      - 4*TEXT_SPACER;

    if (gt_str_length(arguments->seqfile) > 0) {
      bioseq = gt_bioseq_new(gt_str_get(arguments->seqfile), err);
      /* do not silently continue with an error stored in <err> */
      if (!bioseq)
        had_err = -1;
    }

    cr = cairo_create(surf);
    cairo_set_font_size(cr, 8);
    twc = gt_text_width_calculator_cairo_new(cr, sty);

    for (start = qry_range.start;
         !had_err && start <= qry_range.end;
         start += arguments->width) {
      /* Per-iteration objects are declared here and start out as NULL, so
         the cleanup at the end of the body never sees a pointer which was
         already freed in a previous iteration. */
      GtRange single_range;
      GtDiagram *d = NULL;
      GtLayout *l = NULL;
      GtCanvas *canvas = NULL;
      GtCustomTrack *ct = NULL;

      single_range.start = start;
      single_range.end = start + arguments->width;

      d = gt_diagram_new(features, seqid, &single_range, sty, err);
      if (!d) {
        had_err = -1;
        break;
      }
      if (bioseq) {
        /* add a GC content track computed from the first sequence */
        const char *seq = gt_bioseq_get_sequence(bioseq, 0);
        ct = gt_custom_track_gc_content_new(seq,
                                            gt_bioseq_get_sequence_length(
                                                                     bioseq, 0),
                                            800, 70, 0.4, true);
        gt_diagram_add_custom_track(d, ct);
      }

      l = gt_layout_new_with_twc(d, mm_to_pt(arguments->width), sty, twc, err);
      if (!l)
        had_err = -1;
      if (!had_err)
        had_err = gt_layout_get_height(l, &height, err);
      if (!had_err) {
        /* start a new page if this layout does not fit on the current one */
        if (gt_double_smaller_double(usable_height - 10 - 2*TEXT_SPACER
                                       - arguments->theight,
                                     offsetpos + height)) {
          draw_header(cr, gt_str_get(arguments->text), argv[parsed_args+1],
                      seqid, num_pages, mm_to_pt(arguments->pwidth),
                      mm_to_pt(arguments->pheight), arguments->theight);
          cairo_show_page(cr);
          offsetpos = TEXT_SPACER + arguments->theight + TEXT_SPACER;
          num_pages++;
        }
        canvas = gt_canvas_cairo_context_new(sty, cr, offsetpos,
                                             mm_to_pt(arguments->pwidth),
                                             height, NULL, err);
        if (!canvas)
          had_err = -1;
        offsetpos += height;
        if (!had_err)
          had_err = gt_layout_sketch(l, canvas, err);
      }

      gt_canvas_delete(canvas);
      gt_layout_delete(l);
      gt_diagram_delete(d);
      if (ct)
        gt_custom_track_delete(ct);
    }

    /* finish the last (possibly only) page */
    draw_header(cr, gt_str_get(arguments->text), argv[parsed_args+1], seqid,
                num_pages, mm_to_pt(arguments->pwidth),
                mm_to_pt(arguments->pheight), arguments->theight);
    cairo_show_page(cr);
    num_pages++;
    gt_log_log("finished, should be %lu pages\n", num_pages);

    gt_text_width_calculator_delete(twc);
    cairo_destroy(cr);
    cairo_surface_flush(surf);
    cairo_surface_finish(surf);
    cairo_surface_destroy(surf);
    cairo_debug_reset_static_data();
    if (bioseq)
      gt_bioseq_delete(bioseq);
  }

  /* Release the objects created before the drawing stage on every path, so
     that early failures (style, GFF3 input, seqid) do not leak them. */
  if (sty)
    gt_style_delete(sty);
  if (gt_style_file)
    gt_str_delete(gt_style_file);
  gt_feature_index_delete(features);

  return had_err;
}