/*
  Runner of the sequence filter tool.

  Iterates over all sequence files given after the parsed options and writes
  every sequence which satisfies the configured limits (minimum length,
  maximum length, maximum number of emitted sequences) as FASTA to
  <arguments->outfp>. A limit equal to GT_UNDEF_ULONG is treated as "not set".
  On success a one-line summary of the removed sequences goes to stderr.

  Returns 0 on success, otherwise the error code of the bioseq iterator
  (<err> is set by the iterator in that case).
*/
static int gt_seqfilter_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GtError *err)
{
  SeqFilterArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  GtBioseq *bioseq;
  unsigned long long passed = 0, filtered = 0, num_of_sequences = 0;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(tool_arguments);

  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  for (;;) {
    unsigned long seqnum;

    had_err = gt_bioseq_iterator_next(bsi, &bioseq, err);
    if (had_err || !bioseq)
      break; /* error or no more sequence files */

    for (seqnum = 0; seqnum < gt_bioseq_number_of_sequences(bioseq);
         seqnum++) {
      unsigned long seqlen = gt_bioseq_get_sequence_length(bioseq, seqnum);
      int too_short = arguments->minlength != GT_UNDEF_ULONG &&
                      seqlen < arguments->minlength;
      int too_long = arguments->maxlength != GT_UNDEF_ULONG &&
                     seqlen > arguments->maxlength;
      /* emitting one more sequence would exceed the requested maximum */
      int quota_used = arguments->maxseqnum != GT_UNDEF_ULONG &&
                       passed + 1 > arguments->maxseqnum;

      if (too_short || too_long || quota_used) {
        filtered++;
      }
      else {
        gt_fasta_show_entry(gt_bioseq_get_description(bioseq, seqnum),
                            gt_bioseq_get_sequence(bioseq, seqnum),
                            seqlen, arguments->width, arguments->outfp);
        passed++;
      }
      num_of_sequences++;
    }
    gt_bioseq_delete(bioseq);
  }

  /* show statistics */
  if (!had_err) {
    gt_assert(passed + filtered == num_of_sequences);
    fprintf(stderr,
            "# %llu out of %llu sequences have been removed (%.3f%%)\n",
            filtered, num_of_sequences,
            ((double) filtered / num_of_sequences) * 100.0);
  }

  gt_bioseq_iterator_delete(bsi);
  return had_err;
}
/*
  Extracts the range <start>..<end> (both inclusive) from the sequence
  selected via <seqid>.

  The sequence is located with grep_desc(); on success a newly extracted
  range is stored in <*seq>. If <start> or <end> lies outside of the
  sequence, <err> is set and -1 is returned. Any error reported by
  grep_desc() is passed through unchanged.
*/
static int gt_bioseq_col_grep_desc(GtSeqCol *sc, char **seq, GtUword start,
                                   GtUword end, GtStr *seqid, GtError *err)
{
  GtUword filenum = 0, seqnum = 0, seqlength;
  int had_err;
  GtBioseqCol *bsc;

  bsc = gt_bioseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(bsc && seq && seqid);

  had_err = grep_desc(bsc, &filenum, &seqnum, seqid, err);
  if (!had_err) {
    seqlength = gt_bioseq_get_sequence_length(bsc->bioseqs[filenum], seqnum);
    /* compare against seqlength itself: "seqlength - 1" would wrap around
       for an empty sequence and let every range pass the check */
    if (start >= seqlength || end >= seqlength) {
      had_err = -1;
      gt_error_set(err, "trying to extract range "GT_WU"-"GT_WU" on sequence "
                        "``%s'' which is not covered by that sequence (only "
                        ""GT_WU" characters in size). Has the sequence-region "
                        "to sequence mapping been defined correctly?",
                   start, end, gt_str_get(seqid), seqlength);
    }
  }
  if (!had_err) {
    *seq = gt_bioseq_get_sequence_range(bsc->bioseqs[filenum], seqnum, start,
                                        end);
  }
  return had_err;
}
/*
  Writes every sequence of <bs> whose description matches <pattern> (as
  decided by gt_grep()) to <outfp> in FASTA format with line width <width>.

  Stops at the first gt_grep() failure and returns its error code;
  returns 0 if all descriptions could be checked.
*/
static int extractseq_match(GtFile *outfp, GtBioseq *bs, const char *pattern,
                            unsigned long width, GtError *err)
{
  unsigned long seqnum = 0;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(bs && pattern);

  while (!had_err && seqnum < gt_bioseq_number_of_sequences(bs)) {
    bool matched;
    const char *description = gt_bioseq_get_description(bs, seqnum);

    gt_assert(description);
    had_err = gt_grep(&matched, pattern, description, err);
    /* <matched> is only inspected if gt_grep() succeeded */
    if (!had_err && matched) {
      gt_fasta_show_entry_generic(description,
                                  gt_bioseq_get_sequence(bs, seqnum),
                                  gt_bioseq_get_sequence_length(bs, seqnum),
                                  width, outfp);
    }
    seqnum++;
  }
  return had_err;
}
/*
  Runner of the sequence mutation tool.

  For each sequence of each input file a mutated copy is produced with
  gt_mutate_seq() (using <arguments->rate>) and written as FASTA to
  <arguments->outfp>.

  Returns 0 on success, otherwise the error code of the bioseq iterator.
*/
static int gt_seqmutate_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GtError *err)
{
  MutateArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  GtBioseq *bioseq;
  int had_err;

  gt_error_check(err);
  gt_assert(arguments);

  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  for (;;) {
    unsigned long seqnum;

    had_err = gt_bioseq_iterator_next(bsi, &bioseq, err);
    if (had_err || !bioseq)
      break; /* error or all files processed */

    for (seqnum = 0; seqnum < gt_bioseq_number_of_sequences(bioseq);
         seqnum++) {
      GtSeq *mutant;

      mutant = gt_mutate_seq(gt_bioseq_get_description(bioseq, seqnum),
                             gt_bioseq_get_sequence(bioseq, seqnum),
                             gt_bioseq_get_sequence_length(bioseq, seqnum),
                             gt_bioseq_get_alphabet(bioseq),
                             arguments->rate);
      gt_fasta_show_entry(gt_seq_get_description(mutant),
                          gt_seq_get_orig(mutant),
                          gt_seq_length(mutant),
                          arguments->width, arguments->outfp);
      gt_seq_delete(mutant);
    }
    gt_bioseq_delete(bioseq);
  }

  gt_bioseq_iterator_delete(bsi);
  return had_err;
}
/*
  Returns the length of sequence <seqnum> in sequence file <filenum> of the
  bioseq collection <sc>. <filenum> must be smaller than the number of
  sequence files of the collection (asserted).
*/
static GtUword gt_bioseq_col_get_sequence_length(const GtSeqCol *sc,
                                                 GtUword filenum,
                                                 GtUword seqnum)
{
  GtBioseqCol *collection = gt_bioseq_col_cast(sc);

  gt_assert(collection && filenum < collection->num_of_seqfiles);
  return gt_bioseq_get_sequence_length(collection->bioseqs[filenum], seqnum);
}
/*
  Callback which prints the sequence described by the TargetInfo object
  passed via <data> as a FASTA entry (last argument of gt_fasta_show_entry()
  is 0). The position argument is not used. Always returns true.
*/
static bool show_target(GT_UNUSED unsigned long pos, void *data)
{
  TargetInfo *target = data;
  const char *description, *sequence;
  unsigned long seqlen;

  gt_assert(target);

  description = gt_bioseq_get_description(target->bioseq, target->seqnum);
  sequence = gt_bioseq_get_sequence(target->bioseq, target->seqnum);
  seqlen = gt_bioseq_get_sequence_length(target->bioseq, target->seqnum);

  gt_fasta_show_entry(description, sequence, seqlen, 0);
  return true;
}
/*
  Creates a new GtSeq object for sequence number <idx> of <bs>.

  The sequence characters obtained from gt_bioseq_get_sequence() are handed
  to gt_seq_new_own(); the description of the sequence is attached to the
  new object. <idx> must be a valid sequence number (asserted).
*/
GtSeq* gt_bioseq_get_seq(GtBioseq *bs, GtUword idx)
{
  GtSeq *result;
  char *residues;
  GtUword length;

  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));

  residues = gt_bioseq_get_sequence(bs, idx);
  length = gt_bioseq_get_sequence_length(bs, idx);
  result = gt_seq_new_own(residues, length, gt_encseq_alphabet(bs->encseq));
  gt_seq_set_description(result, gt_bioseq_get_description(bs, idx));

  return result;
}
/*
  Collects the lengths of all sequences of <bs> in a GtDiscDistri object and
  prints the resulting distribution, preceded by a heading line, to <outfp>.
*/
void gt_bioseq_show_seqlengthdistri(GtBioseq *bs, GtFile *outfp)
{
  GtDiscDistri *lengths;
  GtUword seqnum = 0;

  gt_assert(bs);

  lengths = gt_disc_distri_new();
  while (seqnum < gt_bioseq_number_of_sequences(bs)) {
    gt_disc_distri_add(lengths, gt_bioseq_get_sequence_length(bs, seqnum));
    seqnum++;
  }

  gt_file_xprintf(outfp, "sequence length distribution:\n");
  gt_disc_distri_show(lengths, outfp);
  gt_disc_distri_delete(lengths);
}
/*
  Writes all sequences of <bs> to <outfp> in FASTA format using the line
  width <width>. The character buffer returned by gt_bioseq_get_sequence()
  is released after each entry.
*/
void gt_bioseq_show_as_fasta(GtBioseq *bs, GtUword width, GtFile *outfp)
{
  GtUword seqnum = 0;

  gt_assert(bs);

  while (seqnum < gt_bioseq_number_of_sequences(bs)) {
    char *residues = gt_bioseq_get_sequence(bs, seqnum);

    gt_fasta_show_entry(gt_bioseq_get_description(bs, seqnum), residues,
                        gt_bioseq_get_sequence_length(bs, seqnum), width,
                        outfp);
    gt_free(residues);
    seqnum++;
  }
}
/*
  Writes the single sequence <seqnum> of <bs> to <outfp> in FASTA format
  using the line width <width>. <seqnum> must be a valid sequence number
  (asserted). The temporary sequence buffer is released before returning.
*/
void gt_bioseq_show_sequence_as_fasta(GtBioseq *bs, GtUword seqnum,
                                      GtUword width, GtFile *outfp)
{
  char *residues;

  gt_assert(bs);
  gt_assert(seqnum < gt_bioseq_number_of_sequences(bs));

  residues = gt_bioseq_get_sequence(bs, seqnum);
  gt_fasta_show_entry(gt_bioseq_get_description(bs, seqnum), residues,
                      gt_bioseq_get_sequence_length(bs, seqnum), width,
                      outfp);
  gt_free(residues);
}
/*
  Stores the length of the sequence selected via <seqid> in <*length>.

  The sequence is located with grep_desc(); if that fails, its error code is
  returned and <*length> is left untouched. Returns 0 on success.
*/
static int gt_bioseq_col_grep_desc_sequence_length(GtSeqCol *sc,
                                                   GtUword *length,
                                                   GtStr *seqid,
                                                   GtError *err)
{
  GtBioseqCol *collection;
  GtUword file_index = 0, seq_index = 0;
  int had_err;

  collection = gt_bioseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(collection && length && seqid);

  had_err = grep_desc(collection, &file_index, &seq_index, seqid, err);
  if (had_err)
    return had_err;

  *length = gt_bioseq_get_sequence_length(collection->bioseqs[file_index],
                                          seq_index);
  return had_err;
}
/*
  Prints statistics about <bs> to <outfp>: the name of the sequence file,
  the number of sequences, a total length (computed as the total length of
  the encoded sequence minus the number of sequences plus one) and the
  length of every single sequence (numbered starting with 1).
*/
void gt_bioseq_show_stat(GtBioseq *bs, GtFile *outfp)
{
  GtUword seqnum, num_of_seqs, total_length;

  gt_assert(bs);

  num_of_seqs = gt_bioseq_number_of_sequences(bs);
  total_length = gt_encseq_total_length(bs->encseq)
                 - gt_encseq_num_of_sequences(bs->encseq) + 1;

  gt_file_xprintf(outfp, "showing statistics for sequence file \"%s\"\n",
                  gt_str_get(bs->sequence_file));
  gt_file_xprintf(outfp, "number of sequences: "GT_WU"\n", num_of_seqs);
  gt_file_xprintf(outfp, "total length: "GT_WU"\n", total_length);

  seqnum = 0;
  while (seqnum < num_of_seqs) {
    gt_file_xprintf(outfp, "sequence #"GT_WU" length: "GT_WU"\n", seqnum+1,
                    gt_bioseq_get_sequence_length(bs, seqnum));
    seqnum++;
  }
}
/*
  Stores the length of the sequence identified by the MD5 sequence ID
  <md5_seqid> in <*len>.

  <md5_seqid> must carry the MD5 seqid prefix (asserted). The sequence is
  resolved with md5_to_index(); if that fails, its error code is returned
  and <*len> is left untouched. Returns 0 on success.
*/
int gt_bioseq_col_md5_to_sequence_length(GtSeqCol *sc, GtUword *len,
                                         GtStr *md5_seqid, GtError *err)
{
  GtBioseqCol *collection;
  GtBioseq *hit = NULL;
  GtUword hit_seqnum = GT_UNDEF_UWORD;
  int had_err = 0;

  collection = gt_bioseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(collection && len && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));

  had_err = md5_to_index(&hit, &hit_seqnum, collection, md5_seqid, err);
  if (had_err)
    return had_err;

  /* a successful lookup must have produced a sequence number */
  gt_assert(hit_seqnum != GT_UNDEF_UWORD);
  *len = gt_bioseq_get_sequence_length(hit, hit_seqnum);
  return had_err;
}
/*
  Runner of the sequence transformation tool.

  Every sequence of every input file is written as FASTA to
  <arguments->outfp>. If <arguments->addstopaminos> is set, the file has a
  protein alphabet and a non-empty sequence does not already end with
  GT_STOP_AMINO, GT_STOP_AMINO_CSTR is passed as suffix to the FASTA writer.

  Returns 0 on success, otherwise the error code of the bioseq iterator.
*/
static int gt_seqtransform_runner(int argc, const char **argv, int parsed_args,
                                  void *tool_arguments, GtError *err)
{
  SeqtransformArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  GtBioseq *bioseq;
  int had_err;

  gt_error_check(err);
  gt_assert(arguments);

  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  for (;;) {
    unsigned long seqnum;
    bool is_protein;

    had_err = gt_bioseq_iterator_next(bsi, &bioseq, err);
    if (had_err || !bioseq)
      break; /* error or all files processed */

    is_protein = gt_alphabet_is_protein(gt_bioseq_get_alphabet(bioseq));

    for (seqnum = 0; seqnum < gt_bioseq_number_of_sequences(bioseq);
         seqnum++) {
      const char *description = gt_bioseq_get_description(bioseq, seqnum);
      char *residues = gt_bioseq_get_sequence(bioseq, seqnum);
      unsigned long length = gt_bioseq_get_sequence_length(bioseq, seqnum);
      const char *suffix;

      /* the last character is only inspected for non-empty sequences */
      suffix = (arguments->addstopaminos && is_protein && length &&
                residues[length-1] != GT_STOP_AMINO)
                 ? GT_STOP_AMINO_CSTR
                 : NULL;

      gt_fasta_show_entry_with_suffix(description, residues, length, suffix,
                                      arguments->width, arguments->outfp);
      gt_free(residues);
    }
    gt_bioseq_delete(bioseq);
  }

  gt_bioseq_iterator_delete(bsi);
  return had_err;
}
/*
  Splits the sequence file <filename> into one FASTA file per sequence.

  Each output file is named "<splitdesc>/<description><suffix>", where
  <description> is the description of the sequence and <suffix> is the
  result of gt_file_suffix(<filename>). Files are opened for writing with
  gt_output_file_xopen_forcecheck() honoring <force>.

  Returns 0 on success and -1 if the input could not be read or an output
  file could not be opened (<err> is set by the failing call).
*/
static int split_description(const char *filename, GtStr *splitdesc,
                             unsigned long width, bool force, GtError *err)
{
  GtBioseq *bioseq;
  GtStr *outpath;
  unsigned long seqnum = 0;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(filename && splitdesc && gt_str_length(splitdesc));

  outpath = gt_str_new();
  bioseq = gt_bioseq_new(filename, err);
  if (!bioseq)
    had_err = -1;

  while (!had_err && seqnum < gt_bioseq_number_of_sequences(bioseq)) {
    const char *description = gt_bioseq_get_description(bioseq, seqnum);
    GtFile *outfp;
    char *residues;

    /* assemble the name of the output file for this sequence */
    gt_str_reset(outpath);
    gt_str_append_str(outpath, splitdesc);
    gt_str_append_char(outpath, '/');
    gt_str_append_cstr(outpath, description);
    gt_str_append_cstr(outpath, gt_file_suffix(filename));

    outfp = gt_output_file_xopen_forcecheck(gt_str_get(outpath), "w", force,
                                            err);
    if (!outfp) {
      had_err = -1;
      break;
    }

    residues = gt_bioseq_get_sequence(bioseq, seqnum);
    gt_fasta_show_entry(description, residues,
                        gt_bioseq_get_sequence_length(bioseq, seqnum), width,
                        outfp);
    gt_free(residues);
    gt_file_delete(outfp);
    seqnum++;
  }

  gt_bioseq_delete(bioseq);
  gt_str_delete(outpath);
  return had_err;
}
/*
  Produces the next fragment of the shredder.

  The fragment length is <minlength> plus (unless <minlength> equals
  <maxlength>) a value drawn with gt_rand_max(); it is shortened if it would
  run past the end of the current sequence. The fragment is taken from the
  current sequence starting at <shredder->pos>; its length is stored in
  <*fragment_length> and <desc> is set to the description of the sequence.

  Afterwards the shredder state is advanced: to the start of the next
  sequence if the fragment reached the end of the current one, otherwise by
  the fragment length minus the overlap (but at least by one position).

  Returns the fragment, or NULL if all sequences have been processed.
*/
static char* generate_fragment(GtShredder *shredder,
                               unsigned long *fragment_length, GtStr *desc)
{
  unsigned long seqlen, fraglen;
  char *fragment;

  gt_assert(shredder && fragment_length);

  if (shredder->seqnum >= gt_bioseq_number_of_sequences(shredder->bioseq))
    return NULL; /* nothing left to shred */

  seqlen = gt_bioseq_get_sequence_length(shredder->bioseq, shredder->seqnum);

  /* determine the length of this fragment */
  fraglen = shredder->minlength;
  if (shredder->maxlength != shredder->minlength)
    fraglen += gt_rand_max(shredder->maxlength - shredder->minlength);
  gt_assert(fraglen >= shredder->minlength);
  if (shredder->pos + fraglen > seqlen)
    fraglen = seqlen - shredder->pos; /* clip at the sequence end */
  *fragment_length = fraglen;

  gt_str_reset(desc);
  gt_str_append_cstr(desc, gt_bioseq_get_description(shredder->bioseq,
                                                     shredder->seqnum));

  gt_assert(shredder->pos + fraglen <= seqlen);
  fragment = gt_bioseq_get_sequence_range(shredder->bioseq, shredder->seqnum,
                                          shredder->pos,
                                          shredder->pos + fraglen -1);

  /* advance the shredder state for the next call */
  if (shredder->pos + fraglen == seqlen) {
    /* last fragment of this sequence */
    shredder->seqnum++;
    shredder->pos = 0;
  }
  else if (fraglen > shredder->overlap) {
    shredder->pos += fraglen - shredder->overlap;
  }
  else {
    shredder->pos++; /* go at least one base further each step */
  }

  return fragment;
}
/*
  Runner of the sequniq tool: writes every input sequence whose MD5 sum has
  not been seen before as FASTA to <arguments->outfp> and counts the others
  as duplicates.

  Two input modes exist:
  - without <arguments->seqit>, each file is loaded as a GtBioseq;
  - with <arguments->seqit>, all files are streamed through a GtSeqIterator
    (optionally showing a progress bar if <arguments->verbose> is set).

  On success a one-line summary is printed to stderr and 0 is returned.
  Returns -1 if a file cannot be opened, the MD5 set reports an error or
  the sequence iterator fails; <err> is set by the failing call.
*/
static int gt_sequniq_runner(int argc, const char **argv, int parsed_args,
                             void *tool_arguments, GtError *err)
{
  GtSequniqArguments *arguments = tool_arguments;
  GtUint64 duplicates = 0, num_of_sequences = 0;
  int i, had_err = 0;
  GtMD5Set *md5set;

  gt_error_check(err);
  gt_assert(arguments);
  md5set = gt_md5set_new(arguments->nofseqs);

  if (!arguments->seqit) {
    GtUword j;
    GtBioseq *bs;

    for (i = parsed_args; !had_err && i < argc; i++) {
      if (!(bs = gt_bioseq_new(argv[i], err)))
        had_err = -1;
      if (!had_err) {
        GtMD5SetStatus retval;
        for (j = 0; j < gt_bioseq_number_of_sequences(bs) && !had_err; j++) {
          char *seq = gt_bioseq_get_sequence(bs, j);
          retval = gt_md5set_add_sequence(md5set, seq,
                                          gt_bioseq_get_sequence_length(bs, j),
                                          arguments->rev, err);
          if (retval == GT_MD5SET_NOT_FOUND)
            gt_fasta_show_entry(gt_bioseq_get_description(bs, j), seq,
                                gt_bioseq_get_sequence_length(bs, j),
                                arguments->width, arguments->outfp);
          else if (retval != GT_MD5SET_ERROR)
            duplicates++;
          else
            had_err = -1;
          num_of_sequences++;
          gt_free(seq);
        }
        gt_bioseq_delete(bs);
      }
    }
  }
  else {
    GtSeqIterator *seqit;
    GtStrArray *files;
    off_t totalsize;
    const GtUchar *sequence;
    char *desc;
    GtUword len;

    files = gt_str_array_new();
    for (i = parsed_args; i < argc; i++)
      gt_str_array_add_cstr(files, argv[i]);
    totalsize = gt_files_estimate_total_size(files);
    seqit = gt_seq_iterator_sequence_buffer_new(files, err);
    if (!seqit)
      had_err = -1;
    if (!had_err) {
      if (arguments->verbose) {
        gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                          (GtUint64) totalsize),
                             (GtUint64) totalsize);
      }
      while (!had_err) {
        GtMD5SetStatus retval;
        int next_rval = gt_seq_iterator_next(seqit, &sequence, &len, &desc,
                                             err);
        if (next_rval < 0) {
          /* a negative value signals a read/parse error; it must not be
             mistaken for the regular end of input */
          had_err = -1;
          break;
        }
        if (next_rval != 1)
          break; /* no more sequences */
        retval = gt_md5set_add_sequence(md5set, (const char*) sequence, len,
                                        arguments->rev, err);
        if (retval == GT_MD5SET_NOT_FOUND)
          gt_fasta_show_entry(desc, (const char*) sequence, len,
                              arguments->width, arguments->outfp);
        else if (retval != GT_MD5SET_ERROR)
          duplicates++;
        else
          had_err = -1;
        num_of_sequences++;
      }
      if (arguments->verbose)
        gt_progressbar_stop();
      gt_seq_iterator_delete(seqit);
    }
    gt_str_array_delete(files);
  }

  /* show statistics */
  if (!had_err) {
    fprintf(stderr,
            "# "GT_WU" out of "GT_WU" sequences have been removed (%.3f%%)\n",
            (GtUword)duplicates, (GtUword)num_of_sequences,
            ((double) duplicates / (double)num_of_sequences) * 100.0);
  }

  gt_md5set_delete(md5set);
  return had_err;
}
/*
  Runner of the paginated sketch tool.

  argv[parsed_args] is the output file, argv[parsed_args+1] the GFF3 input.
  The features of one sequence region are drawn in consecutive windows of
  <arguments->width> positions onto a cairo PDF or PostScript surface; a new
  page is started whenever the next window does not fit on the current one.
  If <arguments->seqfile> is given, a GC content custom track computed from
  the first sequence of that file is added to every window.

  Returns 0 on success and a non-zero error code otherwise (<err> is set by
  the failing call). All objects created here are released on every path.
*/
static int gt_sketch_page_runner(GT_UNUSED int argc, const char **argv,
                                 int parsed_args, void *tool_arguments,
                                 GtError *err)
{
  SketchPageArguments *arguments = tool_arguments;
  int had_err = 0;
  GtFeatureIndex *features = NULL;
  GtRange qry_range, sequence_region_range;
  GtStyle *sty = NULL;
  GtStr *prog, *gt_style_file = NULL;
  GtBioseq *bioseq = NULL;
  const char *seqid = NULL, *outfile;
  unsigned long start, height, num_pages = 0;
  double offsetpos, usable_height;
  cairo_surface_t *surf = NULL;
  cairo_t *cr = NULL;
  GtTextWidthCalculator *twc;

  gt_error_check(err);

  features = gt_feature_index_memory_new();

  if (cairo_version() < CAIRO_VERSION_ENCODE(1, 8, 6))
    gt_warning("Your cairo library (version %s) is older than version 1.8.6! "
               "These versions contain a bug which may result in "
               "corrupted PDF output!", cairo_version_string());

  /* get style */
  sty = gt_style_new(err);
  if (!sty)
    had_err = -1;
  if (!had_err) {
    if (gt_str_length(arguments->stylefile) == 0) {
      /* fall back to the default style shipped in the gtdata directory */
      prog = gt_str_new();
      gt_str_append_cstr_nt(prog, argv[0],
                            gt_cstr_length_up_to_char(argv[0], ' '));
      gt_style_file = gt_get_gtdata_path(gt_str_get(prog), err);
      gt_str_delete(prog);
      if (gt_style_file)
        gt_str_append_cstr(gt_style_file, "/sketch/default.style");
      else
        had_err = -1;
    }
    else {
      gt_style_file = gt_str_ref(arguments->stylefile);
    }
  }
  if (!had_err)
    had_err = gt_style_load_file(sty, gt_str_get(gt_style_file), err);

  outfile = argv[parsed_args];
  if (!had_err) {
    /* get features */
    had_err = gt_feature_index_add_gff3file(features, argv[parsed_args+1],
                                            err);
    if (!had_err) {
      if (gt_str_length(arguments->seqid) == 0) {
        seqid = gt_feature_index_get_first_seqid(features);
        if (seqid == NULL) {
          gt_error_set(err, "GFF input file must contain a sequence region!");
          had_err = -1;
        }
      }
      else if (!gt_feature_index_has_seqid(features,
                                           gt_str_get(arguments->seqid))) {
        gt_error_set(err,
                     "sequence region '%s' does not exist in GFF input file",
                     gt_str_get(arguments->seqid));
        had_err = -1;
      }
      else
        seqid = gt_str_get(arguments->seqid);
    }
  }

  /* set text */
  if (gt_str_length(arguments->text) == 0) {
    gt_str_delete(arguments->text);
    arguments->text = gt_str_new_cstr(argv[parsed_args+1]);
  }

  if (!had_err) {
    /* set display range */
    gt_feature_index_get_range_for_seqid(features, &sequence_region_range,
                                         seqid);
    qry_range.start = (arguments->range.start == GT_UNDEF_ULONG ?
                         sequence_region_range.start :
                         arguments->range.start);
    qry_range.end   = (arguments->range.end == GT_UNDEF_ULONG ?
                         sequence_region_range.end :
                         arguments->range.end);

    /* set output format */
    if (strcmp(gt_str_get(arguments->format), "pdf") == 0) {
      surf = cairo_pdf_surface_create(outfile,
                                      mm_to_pt(arguments->pwidth),
                                      mm_to_pt(arguments->pheight));
    }
    else if (strcmp(gt_str_get(arguments->format), "ps") == 0) {
      surf = cairo_ps_surface_create(outfile,
                                     mm_to_pt(arguments->pwidth),
                                     mm_to_pt(arguments->pheight));
    }
    gt_log_log("created page with %.2f:%.2f dimensions\n",
               mm_to_pt(arguments->pwidth),
               mm_to_pt(arguments->pheight));

    offsetpos = TEXT_SPACER + arguments->theight + TEXT_SPACER;
    usable_height = mm_to_pt(arguments->pheight)
                      - arguments->theight
                      - arguments->theight
                      - 4*TEXT_SPACER;

    if (gt_str_length(arguments->seqfile) > 0) {
      bioseq = gt_bioseq_new(gt_str_get(arguments->seqfile), err);
      if (!bioseq)
        had_err = -1; /* do not continue with <err> already set */
    }

    cr = cairo_create(surf);
    cairo_set_font_size(cr, 8);
    twc = gt_text_width_calculator_cairo_new(cr, sty);

    for (start = qry_range.start; start <= qry_range.end;
         start += arguments->width) {
      GtRange single_range;
      GtCustomTrack *ct = NULL;
      /* per-window objects start out as NULL in every iteration so that
         the cleanup below never touches an object of a previous window */
      GtDiagram *d = NULL;
      GtLayout *l = NULL;
      GtCanvas *canvas = NULL;
      const char *seq;

      single_range.start = start;
      single_range.end = start + arguments->width;

      if (had_err)
        break;

      d = gt_diagram_new(features, seqid, &single_range, sty, err);
      if (!d) {
        had_err = -1;
        break;
      }
      if (bioseq) {
        seq = gt_bioseq_get_sequence(bioseq, 0);
        ct = gt_custom_track_gc_content_new(seq,
                                     gt_bioseq_get_sequence_length(bioseq, 0),
                                     800, 70, 0.4, true);
        gt_diagram_add_custom_track(d, ct);
      }

      l = gt_layout_new_with_twc(d, mm_to_pt(arguments->width), sty, twc, err);
      if (!l)
        had_err = -1;
      else
        had_err = gt_layout_get_height(l, &height, err);

      if (!had_err) {
        /* start a new page if this window does not fit anymore */
        if (gt_double_smaller_double(usable_height - 10 - 2*TEXT_SPACER
                                       - arguments->theight,
                                     offsetpos + height)) {
          draw_header(cr, gt_str_get(arguments->text), argv[parsed_args+1],
                      seqid, num_pages, mm_to_pt(arguments->pwidth),
                      mm_to_pt(arguments->pheight), arguments->theight);
          cairo_show_page(cr);
          offsetpos = TEXT_SPACER + arguments->theight + TEXT_SPACER;
          num_pages++;
        }
        canvas = gt_canvas_cairo_context_new(sty, cr, offsetpos,
                                             mm_to_pt(arguments->pwidth),
                                             height, NULL, err);
        if (!canvas)
          had_err = -1;
        offsetpos += height;
        if (!had_err)
          had_err = gt_layout_sketch(l, canvas, err);
      }

      gt_canvas_delete(canvas);
      gt_layout_delete(l);
      gt_diagram_delete(d);
      if (ct)
        gt_custom_track_delete(ct);
    }

    /* finish the last page */
    draw_header(cr, gt_str_get(arguments->text), argv[parsed_args+1], seqid,
                num_pages, mm_to_pt(arguments->pwidth),
                mm_to_pt(arguments->pheight), arguments->theight);
    cairo_show_page(cr);
    num_pages++;
    gt_log_log("finished, should be %lu pages\n", num_pages);

    gt_text_width_calculator_delete(twc);
    cairo_destroy(cr);
    cairo_surface_flush(surf);
    cairo_surface_finish(surf);
    cairo_surface_destroy(surf);
    cairo_debug_reset_static_data();
    if (bioseq)
      gt_bioseq_delete(bioseq);
  }

  /* released unconditionally so that early failures do not leak them */
  gt_style_delete(sty);
  gt_str_delete(gt_style_file);
  gt_feature_index_delete(features);

  return had_err;
}
/*
  Updates the BSSM model <bssm_model> for hypothesis <hypothesisnum> from
  the training sequences in <bioseq>.

  Every sequence is expected to have length STRINGSIZE (asserted only).
  The function counts, per position, how often each symbol and each pair of
  neighbouring symbols occurs, and stores the derived frequencies in
  bssm_model->hypotables.hypo7table[hypothesisnum]:
  - slot 0 holds the relative symbol frequencies at the first position,
  - slot k (k >= 1) holds the dinucleotide frequency divided by the
    mononucleotide frequency at position k-1.
  Rows containing an entry equal to NULLPROB are adjusted with PSEUDOPROB.

  NOTE(review): all frequencies are divided by the number of sequences in
  <bioseq>; the behaviour for an empty <bioseq> is not handled here --
  confirm that callers never pass one.
*/
static void build_bssm(GtBioseq *bioseq, GthBSSMModel *bssm_model,
                       unsigned int hypothesisnum)
{
  GtUword mono_ct[STRINGSIZE-1][ALPHSIZE],         /* mononucleotide counts */
          di_ct[STRINGSIZE-1][ALPHSIZE][ALPHSIZE]; /* dinucleotide counts */
  double mono_freq, /* mononucleotide relative frequency */
         di_freq;   /* dinucleotide relative frequency */
  GtUword i, j, k, /* iterator variables */
          len, curlen = 0,
          num_entries = gt_bioseq_number_of_sequences(bioseq);
  GtUchar *encoded_seq = NULL;

  /* initialize all counters */
  for (i = 0; i < (STRINGSIZE-1); i++) {
    for (j = 0; j < ALPHSIZE; j++) {
      mono_ct[i][j] = INITVAL_INT;
      for (k = 0; k < ALPHSIZE; k++)
        di_ct[i][j][k] = INITVAL_INT;
    }
  }

  /* mononucleotides: count the symbol at each of the first STRINGSIZE-1
     positions of every sequence */
  for (j = 0; j < num_entries; j++) {
    len = gt_bioseq_get_sequence_length(bioseq, j);
    gt_assert(len == STRINGSIZE);
    /* the buffer only grows; it is reused for all sequences */
    if (len > curlen) {
      encoded_seq = gt_realloc(encoded_seq, len);
      curlen = len;
    }
    gt_bioseq_get_encoded_sequence(bioseq, encoded_seq, j);
    for (i = 0; i < (STRINGSIZE-1); i++) {
      gt_assert(encoded_seq[i] < ALPHSIZE);
      mono_ct[i][encoded_seq[i]]++;
    }
  }

  /* dinucleotides: count each pair of neighbouring symbols (i, i+1) */
  for (j = 0; j < num_entries; j++) {
    len = gt_bioseq_get_sequence_length(bioseq, j);
    gt_assert(len == STRINGSIZE);
    if (len > curlen) {
      encoded_seq = gt_realloc(encoded_seq, len);
      curlen = len;
    }
    gt_bioseq_get_encoded_sequence(bioseq, encoded_seq, j);
    for (i = 0; i < (STRINGSIZE-1); i++) {
      di_ct[i][encoded_seq[i]]
              [encoded_seq[i + 1]]++;
    }
  }
  gt_free(encoded_seq);

  /* Record equilibrium frequencies (1st "slot" in transition freqs);
     the value depends only on i, so it is the same for every j */
  for (i = 0; i < ALPHSIZE; i++) {
    for (j = 0; j < ALPHSIZE; j++) {
      bssm_model->hypotables
      .hypo7table[hypothesisnum][0][i][j] = (GthFlt) mono_ct[0][i] /
                                            num_entries;
    }
  }

  /* Populate the remaining transition frequencies */
  for (k = 1; k < STRINGSIZE; k++) {
    for (i = 0; i < ALPHSIZE; i++) {
      mono_freq = (double) mono_ct[k-1][i] / num_entries;
      for (j = 0; j < ALPHSIZE; j++) {
        di_freq = (double) di_ct[k-1][i][j] / num_entries;
        if (mono_freq == 0.0) {
          /* symbol i never occurs at position k-1: no transition defined */
          bssm_model->hypotables
          .hypo7table[hypothesisnum][k][i][j] = (GthFlt) NULLPROB;
        }
        else {
          bssm_model->hypotables
          .hypo7table[hypothesisnum][k][i][j] = (GthFlt)
                                                (di_freq / mono_freq);
        }
      }

      /* Remove NULLPROB transition probabilities: entries equal to
         NULLPROB are replaced by PSEUDOPROB, and all other entries p of the
         same row are replaced by p = p * (1 - 4 * PSEUDOPROB) + PSEUDOPROB.
         NOTE(review): the original comment speaks of 0.0 entries
         (dinucleotide absent in training corpus), i.e. it presumably
         assumes NULLPROB == 0.0 -- confirm. */
      for (j = 0; j < ALPHSIZE; ++j) {
        /* If any entry is NULLPROB, ALL elements in the row need fixing */
        if (bssm_model->hypotables
            .hypo7table[hypothesisnum][k][i][j] == NULLPROB) {
          /* Fix all elements in the row, then break. The inner loop
             deliberately reuses j; the break below leaves the outer loop
             right afterwards, so each row is adjusted at most once. */
          for (j = 0; j < ALPHSIZE; j++) {
            if (bssm_model->hypotables
                .hypo7table[hypothesisnum][k][i][j] == NULLPROB) {
              bssm_model->hypotables
              .hypo7table[hypothesisnum][k][i][j] = (GthFlt) PSEUDOPROB;
            }
            else {
              /* Adjust non-zero transition prob */
              bssm_model->hypotables.hypo7table[hypothesisnum][k][i][j] =
                (GthFlt)
                (bssm_model->hypotables.hypo7table[hypothesisnum][k][i][j] *
                (1 - (4 * PSEUDOPROB)) + PSEUDOPROB);
            }
          }
          break;
        }
      }
    }
  }
}
/*
  Parameterizes <bssm_param> for the splice site type <termtype> from the
  training files below <path>.

  For each of the NUMOFFILES entries of <filenames> the file
  "<path>/<GT_donor|GC_donor|AG_acceptor>/<filename>" (with ".gz" appended
  if <gzip> is set) is read. Every sequence must have length STRINGSIZE and
  carry the expected dinucleotide (GT, GC or AG) at the 0-based positions 50
  and 51. Valid files are passed to build_bssm() with the file index as
  hypothesis number.

  Returns 0 on success and -1 if a file cannot be read or a sequence is
  invalid (<err> is set accordingly).
*/
int gth_bssm_param_parameterize(GthBSSMParam *bssm_param, const char *path,
                                Termtype termtype, bool gzip, GtError *err)
{
  GtAlphabet *alphabet = NULL;
  GtBioseq *bioseq;
  GtStr *file2proc;
  GtUword i, j;
  int had_err = 0;

  gt_error_check(err);

  file2proc = gt_str_new();

  /* set version number */
  bssm_param->version_num = (unsigned char) MYVERSION;

  /* set model to true and set window sizes */
  switch (termtype) {
    case GT_DONOR_TYPE:
      bssm_param->gt_donor_model_set = true;
      set_window_sizes_in_Bssmmodel(&bssm_param->gt_donor_model);
      break;
    case GC_DONOR_TYPE:
      bssm_param->gc_donor_model_set = true;
      set_window_sizes_in_Bssmmodel(&bssm_param->gc_donor_model);
      break;
    case AG_ACCEPTOR_TYPE:
      bssm_param->ag_acceptor_model_set = true;
      set_window_sizes_in_Bssmmodel(&bssm_param->ag_acceptor_model);
      break;
    default: gt_assert(0);
  }

  for (i = 0; !had_err && i < NUMOFFILES; i++) {
    /* process datafile */
    gt_str_append_cstr(file2proc, path);
    switch (termtype) {
      case GT_DONOR_TYPE:
        gt_str_append_cstr(file2proc, "/GT_donor/");
        gt_str_append_cstr(file2proc, filenames[i]);
        break;
      case GC_DONOR_TYPE:
        gt_str_append_cstr(file2proc, "/GC_donor/");
        gt_str_append_cstr(file2proc, filenames[i]);
        break;
      case AG_ACCEPTOR_TYPE:
        gt_str_append_cstr(file2proc, "/AG_acceptor/");
        gt_str_append_cstr(file2proc, filenames[i]);
        break;
      default: gt_assert(0);
    }

    if (gzip)
      gt_str_append_cstr(file2proc, ".gz");

    if (!(bioseq = gt_bioseq_new(gt_str_get(file2proc), err)))
      had_err = -1;

    if (!had_err)
      alphabet = gt_bioseq_get_alphabet(bioseq);

    /* check here if all sequences have the required length and the correct
       bases at positions 51 and 52 (i.e., GT, GC, or AG) */
    for (j = 0; !had_err && j < gt_bioseq_number_of_sequences(bioseq); j++) {
      GtUchar encoded_seq[2];

      /* check length */
      if (gt_bioseq_get_sequence_length(bioseq, j) != STRINGSIZE) {
        gt_error_set(err,
                     "sequence "GT_WU" in file \"%s\" does not have length %u",
                     j, gt_str_get(file2proc), STRINGSIZE);
        had_err = -1;
      }

      if (!had_err) {
        /* only access the splice site positions after the length check
           succeeded; a shorter sequence does not have them */
        encoded_seq[0] = gt_bioseq_get_encoded_char(bioseq, j, 50);
        encoded_seq[1] = gt_bioseq_get_encoded_char(bioseq, j, 51);

        /* check base correctness */
        switch (termtype) {
          case GT_DONOR_TYPE:
            if (encoded_seq[0] != gt_alphabet_encode(alphabet, 'G') ||
                encoded_seq[1] != gt_alphabet_encode(alphabet, 'T')) {
              gt_error_set(err, "sequence "GT_WU" in file \"%s\" is not a GT "
                                "sequence", j, gt_str_get(file2proc));
              had_err = -1;
            }
            break;
          case GC_DONOR_TYPE:
            if (encoded_seq[0] != gt_alphabet_encode(alphabet, 'G') ||
                encoded_seq[1] != gt_alphabet_encode(alphabet, 'C')) {
              gt_error_set(err, "sequence "GT_WU" in file \"%s\" is not a GC "
                                "sequence", j, gt_str_get(file2proc));
              had_err = -1;
            }
            break;
          case AG_ACCEPTOR_TYPE:
            if (encoded_seq[0] != gt_alphabet_encode(alphabet, 'A') ||
                encoded_seq[1] != gt_alphabet_encode(alphabet, 'G')) {
              gt_error_set(err, "sequence "GT_WU" in file \"%s\" is not a AG "
                                "sequence", j, gt_str_get(file2proc));
              had_err = -1;
            }
            break;
          default: gt_assert(0);
        }
      }
    }

    if (!had_err) {
      switch (termtype) {
        case GT_DONOR_TYPE:
          build_bssm(bioseq, &bssm_param->gt_donor_model, i);
          break;
        case GC_DONOR_TYPE:
          build_bssm(bioseq, &bssm_param->gc_donor_model, i);
          break;
        case AG_ACCEPTOR_TYPE:
          build_bssm(bioseq, &bssm_param->ag_acceptor_model, i);
          break;
        default: gt_assert(0);
      }
    }

    /* reset */
    gt_str_reset(file2proc);

    /* free space */
    gt_bioseq_delete(bioseq);
  }
  gt_str_delete(file2proc);

  return had_err;
}
/*
  Runner of the sequence filter tool.

  Iterates over all sequence files given after the parsed options and writes
  the selected sequences as FASTA to <arguments->outfp>. A sequence is
  selected if, checked in this order,
  - it is hit by the step counter (every <arguments->step>-th sequence),
  - it passes the random sampling with probability <arguments->sample_prob>,
  - its length is within <arguments->minlength> and <arguments->maxlength>
    (GT_UNDEF_UWORD means "not set").
  Processing of a file stops as soon as <arguments->maxseqnum> sequences
  have been written; the remaining sequences of the file count as filtered.
  On success a one-line summary goes to stderr.

  Returns 0 on success, otherwise the error code of the bioseq iterator.
*/
static int gt_seqfilter_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GtError *err)
{
  SeqFilterArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  GtBioseq *bioseq;
  GtUint64 passed = 0, filtered = 0, num_of_sequences = 0, steps = 0;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(tool_arguments);

  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  for (;;) {
    GtUword idx = 0;
    GtUint64 seqs_in_file;

    had_err = gt_bioseq_iterator_next(bsi, &bioseq, err);
    if (had_err || bioseq == NULL)
      break; /* error or all files processed */

    seqs_in_file = gt_bioseq_number_of_sequences(bioseq);

    while (idx < seqs_in_file &&
           (arguments->maxseqnum == GT_UNDEF_UWORD ||
            passed + 1 <= arguments->maxseqnum)) {
      int selected;

      selected = arguments->step == 1 || steps + 1 == arguments->step;
      if (selected) {
        /* the random number is only drawn for sequences hit by the step
           counter and only if sampling is actually requested */
        selected = arguments->sample_prob == 1.0 ||
                   gt_rand_0_to_1() <= arguments->sample_prob;
      }
      if (selected) {
        GtUword seqlen = gt_bioseq_get_sequence_length(bioseq, idx);
        selected = (arguments->minlength == GT_UNDEF_UWORD ||
                    seqlen >= arguments->minlength) &&
                   (arguments->maxlength == GT_UNDEF_UWORD ||
                    seqlen <= arguments->maxlength);
      }

      if (selected) {
        char *residues = gt_bioseq_get_sequence(bioseq, idx);
        gt_fasta_show_entry(gt_bioseq_get_description(bioseq, idx), residues,
                            gt_bioseq_get_sequence_length(bioseq, idx),
                            arguments->width, arguments->outfp);
        gt_free(residues);
        passed++;
      }
      else {
        filtered++;
      }

      /* advance the step counter, wrapping around at <arguments->step> */
      if (steps + 1 == arguments->step)
        steps = 0;
      else
        steps = steps + 1;
      idx++;
    }

    /* sequences skipped because the maximum number was reached */
    filtered += seqs_in_file - idx;
    num_of_sequences += seqs_in_file;
    gt_bioseq_delete(bioseq);
  }

  /* show statistics */
  if (!had_err) {
    gt_assert(passed + filtered == num_of_sequences);
    fprintf(stderr, "# " GT_LLU " out of " GT_LLU
            " sequences have been removed (%.3f%%)\n",
            filtered, num_of_sequences,
            ((double) filtered / num_of_sequences) * 100.0);
  }

  gt_bioseq_iterator_delete(bsi);
  return had_err;
}
int main(int argc, char *argv[]) { const char *style_file, *png_file, *gff3_file; char *seqid; GtStyle *style; GtBioseq *bioseq; GtFeatureIndex *feature_index; GtRange range; GtDiagram *diagram; GtLayout *layout; GtCanvas *canvas; GtCustomTrack *custom; GtUword height, windowsize; GtError *err; if (argc != 9) { fprintf(stderr, "Usage: %s style_file PNG_file GFF3_file Seq_file seqid" " start end windowsize\n", argv[0]); return EXIT_FAILURE; } style_file = argv[1]; png_file = argv[2]; gff3_file = argv[3]; /* initialize */ gt_lib_init(); /* create error object */ err = gt_error_new(); /* create style */ if (!(style = gt_style_new(err))) handle_error(err); /* load style file */ if (gt_style_load_file(style, style_file, err)) handle_error(err); /* create feature index */ feature_index = gt_feature_index_memory_new(); /* add GFF3 file to index */ if (gt_feature_index_add_gff3file(feature_index, gff3_file, err)) handle_error(err); /* create diagram for first sequence ID in feature index */ seqid = argv[5]; if (gt_feature_index_get_range_for_seqid(feature_index, &range, seqid, err)) handle_error(err); sscanf(argv[6], "%lu", &range.start); sscanf(argv[7], "%lu", &range.end); sscanf(argv[8], "%lu", &windowsize); diagram = gt_diagram_new(feature_index, seqid, &range, style, err); if (gt_error_is_set(err)) handle_error(err); /* load sequence for GC plot */ bioseq = gt_bioseq_new(argv[4], err); if (gt_error_is_set(err)) handle_error(err); /* create custom track with GC plot for first sequence in file, window size 1000, 40px height and average line at 16.5% */ custom = gt_custom_track_gc_content_new(gt_bioseq_get_sequence(bioseq, 0), gt_bioseq_get_sequence_length(bioseq, 0), windowsize, 70, 0.165, true); gt_diagram_add_custom_track(diagram, custom); /* create layout with given width, determine resulting image height */ layout = gt_layout_new(diagram, 600, style, err); if (gt_error_is_set(err)) handle_error(err); if (gt_layout_get_height(layout, &height, err)) handle_error(err); 
/* create PNG canvas */ canvas = gt_canvas_cairo_file_new(style, GT_GRAPHICS_PNG, 600, height, NULL, err); if (!canvas) handle_error(err); /* sketch layout on canvas */ if (gt_layout_sketch(layout, canvas, err)) handle_error(err); /* write canvas to file */ if (gt_canvas_cairo_file_to_file((GtCanvasCairoFile*) canvas, png_file, err)) handle_error(err); /* free */ gt_custom_track_delete(custom); gt_bioseq_delete(bioseq); gt_canvas_delete(canvas); gt_layout_delete(layout); gt_diagram_delete(diagram); gt_feature_index_delete(feature_index); gt_style_delete(style); gt_error_delete(err); /* perform static data cleanup */ gt_lib_clean(); return EXIT_SUCCESS; }