/*
 * Print the GC-, AT- and N-content of <seq> to <outfp>.
 *
 * seq      - sequence characters; only the first <len> are inspected
 * len      - number of characters in <seq>
 * alphabet - DNA alphabet used to encode each character (asserted to be DNA)
 * outfp    - output file handed to gt_file_xprintf()
 *
 * Every character is encoded with <alphabet> and compared against the codes
 * of 'A', 'C', 'G', 'T' and 'N'. A character that encodes to none of these
 * triggers gt_assert(0).
 *
 * For an empty sequence (len == 0) all three percentages are reported as
 * 0.00 instead of dividing by zero.
 */
void gt_gc_content_show(const char *seq, unsigned long len,
                        GtAlphabet *alphabet, GtFile *outfp)
{
  unsigned long i,
                gc = 0, /* number of G/C bases */
                at = 0, /* number of A/T bases */
                n  = 0; /* number of N bases */
  unsigned int a_code, c_code, g_code, t_code, n_code, cc;
  double gc_percent = 0.0, at_percent = 0.0, n_percent = 0.0;

  gt_assert(seq && alphabet);
  gt_assert(gt_alphabet_is_dna(alphabet));

  /* encode the reference symbols once instead of once per character */
  a_code = gt_alphabet_encode(alphabet, 'A');
  c_code = gt_alphabet_encode(alphabet, 'C');
  g_code = gt_alphabet_encode(alphabet, 'G');
  t_code = gt_alphabet_encode(alphabet, 'T');
  n_code = gt_alphabet_encode(alphabet, 'N');

  for (i = 0; i < len; i++) {
    cc = gt_alphabet_encode(alphabet, seq[i]);
    if (cc == g_code || cc == c_code) {
      gc++;
    }
    else if (cc == a_code || cc == t_code) {
      at++;
    }
    else if (cc == n_code) {
      n++;
    }
    else {
      gt_assert(0);
    }
  }

  /* without this guard an empty sequence would print NaN percentages */
  if (len > 0) {
    gc_percent = ((double) gc / len) * 100.0;
    at_percent = ((double) at / len) * 100.0;
    n_percent  = ((double) n / len) * 100.0;
  }

  gt_file_xprintf(outfp, "GC-content: %.2f%% (AT-content: %.2f%%, "
                         "N-content: %.2f%%)\n",
                  gc_percent, at_percent, n_percent);
}
/*
 * Load the encoded sequence named <seqfile> and pass it to output_sequence().
 *
 * If the lossless option in args->eopts is set, lossless support is required
 * from the loader. If the mirrored option is set, the loaded sequence is
 * mirrored first; this is refused with an error for non-DNA alphabets.
 *
 * Returns 0 on success. On failure a non-zero value is returned (-1 for the
 * load and alphabet checks, otherwise whatever gt_encseq_mirror() or
 * output_sequence() returned).
 */
static int decode_sequence_file(const char *seqfile,
                                GtEncseqDecodeArguments *args,
                                GtError *err)
{
  GtEncseqLoader *loader;
  GtEncseq *encseq;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(seqfile);

  loader = gt_encseq_loader_new();
  if (gt_encseq_options_lossless_value(args->eopts)) {
    gt_encseq_loader_require_lossless_support(loader);
  }

  encseq = gt_encseq_loader_load(loader, seqfile, err);
  if (encseq == NULL) {
    had_err = -1;
  }
  else if (gt_encseq_options_mirrored_value(args->eopts)) {
    if (gt_alphabet_is_dna(gt_encseq_alphabet(encseq))) {
      had_err = gt_encseq_mirror(encseq, err);
    }
    else {
      gt_error_set(err, "mirroring is only defined on DNA sequences");
      had_err = -1;
    }
  }

  if (!had_err) {
    had_err = output_sequence(encseq, args, seqfile, err);
  }

  /* encseq is NULL here when loading failed */
  gt_encseq_delete(encseq);
  gt_encseq_loader_delete(loader);
  return had_err;
}
/*
 * Show the GC-content of all sequences in <bs> on <outfp>.
 *
 * Nothing is printed unless the alphabet of the underlying encoded sequence
 * is DNA. Otherwise all sequences are concatenated into one string, which is
 * then handed to gt_gc_content_show().
 */
void gt_bioseq_show_gc_content(GtBioseq *bs, GtFile *outfp)
{
  GtUword seqnum, num_seqs, GT_UNUSED expected_len;
  GtStr *joined;

  gt_assert(bs);
  if (!gt_alphabet_is_dna(gt_encseq_alphabet(bs->encseq))) {
    return;
  }

  num_seqs = gt_encseq_num_of_sequences(bs->encseq);
  /* total length minus (num_seqs - 1); only used by the assertion below */
  expected_len = gt_encseq_total_length(bs->encseq) - num_seqs + 1;

  joined = gt_str_new();
  for (seqnum = 0; seqnum < num_seqs; seqnum++) {
    char *seq = gt_bioseq_get_sequence(bs, seqnum);
    gt_str_append_cstr(joined, seq);
    gt_free(seq);
  }
  gt_assert(gt_str_length(joined) == expected_len);

  gt_file_xprintf(outfp, "showing GC-content for sequence file \"%s\"\n",
                  gt_str_get(bs->sequence_file));
  gt_gc_content_show(gt_str_get(joined), gt_str_length(joined),
                     gt_encseq_alphabet(bs->encseq), outfp);
  gt_str_delete(joined);
}
/*
 * Runner of the seed-extend tool.
 *
 * Loads encoded sequence A (arguments->dbs_indexname) and, if a query name is
 * given, encoded sequence B (otherwise B is a new reference to A), prepares
 * the optional greedy/xdrop extension and output settings, and calls
 * gt_diagbandseed_run(). If showtime is enabled the overall running time is
 * printed to stdout on success.
 *
 * Returns 0 on success and a non-zero value on error; on error both encoded
 * sequences have already been released on the path that detected the error.
 */
static int gt_seed_extend_runner(GT_UNUSED int argc,
                                 GT_UNUSED const char **argv,
                                 GT_UNUSED int parsed_args,
                                 void *tool_arguments,
                                 GtError *err)
{
  GtSeedExtendArguments *arguments = tool_arguments;
  GtEncseqLoader *loader = NULL;
  GtEncseq *aencseq = NULL, *bencseq = NULL;
  GtGreedyextendmatchinfo *greedyinfo = NULL;
  GtXdropmatchinfo *xdropinfo = NULL;
  GtQuerymatchoutoptions *outopt = NULL;
  GtTimer *timer = NULL;
  GtExtendCharAccess cam = GT_EXTEND_CHAR_ACCESS_ANY;
  GtUword errorpercentage;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments != NULL);
  gt_assert(arguments->se_minidentity >= GT_EXTEND_MIN_IDENTITY_PERCENTAGE &&
            arguments->se_minidentity <= 100UL);

  /* the error percentage is the complement of the minimum identity */
  errorpercentage = 100UL - arguments->se_minidentity;

  /* measure the whole running time */
  if (arguments->benchmark || arguments->verbose) {
    gt_showtime_enable();
  }
  if (gt_showtime_enabled()) {
    timer = gt_timer_new();
    gt_timer_start(timer);
  }

  /* load encseq A and, where requested, encseq B */
  loader = gt_encseq_loader_new();
  gt_encseq_loader_enable_autosupport(loader);
  aencseq = gt_encseq_loader_load(loader,
                                  gt_str_get(arguments->dbs_indexname),
                                  err);
  if (aencseq == NULL) {
    had_err = -1;
  }
  else {
    const char *queryname = gt_str_get(arguments->dbs_queryname);

    if (strcmp(queryname, "") == 0) {
      /* no second read set: compare A against itself */
      bencseq = gt_encseq_ref(aencseq);
    }
    else {
      bencseq = gt_encseq_loader_load(loader, queryname, err);
    }
    if (bencseq == NULL) {
      had_err = -1;
      gt_encseq_delete(aencseq);
    }
  }
  gt_encseq_loader_delete(loader);

  /* set character access method */
  if (!had_err && (gt_option_is_set(arguments->se_option_greedy) ||
                   gt_option_is_set(arguments->se_option_xdrop) ||
                   arguments->se_alignmentwidth > 0)) {
    cam = gt_greedy_extend_char_access(
            gt_str_get(arguments->se_char_access_mode), err);
    if ((int) cam == -1) {
      had_err = -1;
      gt_encseq_delete(aencseq);
      gt_encseq_delete(bencseq);
    }
  }

  /* use bias dependent parameters, adapted from E. Myers' DALIGNER */
  if (!had_err && arguments->bias_parameters) {
    const double bias_factor[10] = {.690, .690, .690, .690, .780,
                                    .850, .900, .933, .966, 1.000};
    const GtAlphabet *alpha = gt_encseq_alphabet(aencseq);
    GtUword at = 0, cg = 0;

    if (gt_alphabet_is_dna(alpha)) {
      at = gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 'a'));
      at += gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 't'));
      cg = gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 'c'));
      cg += gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 'g'));
    }

    if (at + cg == 0) {
      /* reached for non-DNA alphabets and for DNA without any a/c/g/t */
      had_err = -1;
      gt_error_set(err, "option \"-bias-parameters\" can only be applied to "
                        "the DNA alphabet");
      gt_encseq_delete(aencseq);
      gt_encseq_delete(bencseq);
    }
    else {
      const double ratio = (double)MIN(at, cg) / (at + cg);
      int bias_index = (int)MAX(0.0, (ratio + 0.025) * 20.0 - 1.0);

      gt_assert(bias_index < 10);
      arguments->se_maxalilendiff = 30;
      arguments->se_perc_match_hist
        = (GtUword)(100.0 - errorpercentage * bias_factor[bias_index]);
      if (arguments->verbose) {
        printf("# Base ratio = %4.2lf -> percmathistory = "GT_WU"\n",
               ratio, arguments->se_perc_match_hist);
      }
    }
  }

  /* prepare options for greedy extension */
  if (!had_err && gt_option_is_set(arguments->se_option_greedy)) {
    greedyinfo = gt_greedy_extend_matchinfo_new(errorpercentage,
                                                arguments->se_maxalilendiff,
                                                arguments->se_historysize,
                                                arguments->se_perc_match_hist,
                                                arguments->se_alignlength,
                                                cam,
                                                arguments->se_extendgreedy);
    if (arguments->benchmark) {
      gt_greedy_extend_matchinfo_silent_set(greedyinfo);
    }
  }

  /* prepare options for xdrop extension */
  if (!had_err && gt_option_is_set(arguments->se_option_xdrop)) {
    xdropinfo = gt_xdrop_matchinfo_new(arguments->se_alignlength,
                                       errorpercentage,
                                       arguments->se_xdropbelowscore,
                                       arguments->se_extendxdrop);
    if (arguments->benchmark) {
      gt_xdrop_matchinfo_silent_set(xdropinfo);
    }
  }

  /* prepare output options */
  if (!had_err && (arguments->se_alignmentwidth > 0 ||
                   gt_option_is_set(arguments->se_option_xdrop))) {
    outopt = gt_querymatchoutoptions_new(arguments->se_alignmentwidth);
    if (gt_option_is_set(arguments->se_option_xdrop) ||
        gt_option_is_set(arguments->se_option_greedy)) {
      const GtUword sensitivity
        = gt_option_is_set(arguments->se_option_greedy)
            ? arguments->se_extendgreedy
            : 100;

      gt_querymatchoutoptions_extend(outopt,
                                     errorpercentage,
                                     arguments->se_maxalilendiff,
                                     arguments->se_historysize,
                                     arguments->se_perc_match_hist,
                                     cam,
                                     sensitivity);
    }
  }

  /* start algorithm */
  if (!had_err) {
    GtDiagbandseed dbs;

    dbs.errorpercentage = errorpercentage;
    dbs.userdefinedleastlength = arguments->se_alignlength;
    dbs.seedlength = arguments->dbs_seedlength;
    dbs.logdiagbandwidth = arguments->dbs_logdiagbandwidth;
    dbs.mincoverage = arguments->dbs_mincoverage;
    dbs.maxfreq = arguments->dbs_maxfreq;
    dbs.memlimit = arguments->dbs_memlimit;
    dbs.mirror = arguments->mirror;
    dbs.overlappingseeds = arguments->overlappingseeds;
    dbs.verify = arguments->dbs_verify;
    dbs.verbose = arguments->verbose;
    dbs.debug_kmer = arguments->dbs_debug_kmer;
    dbs.debug_seedpair = arguments->dbs_debug_seedpair;
    dbs.seed_display = arguments->seed_display;
    dbs.extendgreedyinfo = greedyinfo;
    dbs.extendxdropinfo = xdropinfo;
    dbs.querymatchoutopt = outopt;

    had_err = gt_diagbandseed_run(aencseq, bencseq, &dbs, err);

    /* clean up; each object is released under the condition it was made */
    gt_encseq_delete(aencseq);
    gt_encseq_delete(bencseq);
    if (gt_option_is_set(arguments->se_option_greedy)) {
      gt_greedy_extend_matchinfo_delete(greedyinfo);
    }
    if (gt_option_is_set(arguments->se_option_xdrop)) {
      gt_xdrop_matchinfo_delete(xdropinfo);
    }
    if (arguments->se_alignmentwidth > 0 ||
        gt_option_is_set(arguments->se_option_xdrop)) {
      gt_querymatchoutoptions_delete(outopt);
    }
  }

  /* report the overall time (only on success) and release the timer */
  if (gt_showtime_enabled()) {
    if (!had_err) {
      char *keystring = gt_seed_extend_params_keystring(
                          gt_option_is_set(arguments->se_option_greedy),
                          gt_option_is_set(arguments->se_option_xdrop),
                          arguments->dbs_seedlength,
                          arguments->se_alignlength,
                          arguments->se_minidentity,
                          arguments->se_maxalilendiff,
                          arguments->se_perc_match_hist,
                          arguments->se_extendgreedy,
                          arguments->se_extendxdrop,
                          arguments->se_xdropbelowscore);

      printf("# TIME seedextend-%s", keystring);
      gt_free(keystring);
      gt_timer_show_formatted(timer, " overall " GT_WD ".%06ld\n", stdout);
    }
    gt_timer_delete(timer);
  }
  return had_err;
}
/*
 * Runner of the encseq check tool.
 *
 * Loads the encoded sequence named by argv[parsed_args] and runs the
 * consistency checks on it: start positions, consistency per read mode,
 * special ranges, mark positions, min/max, and (if arguments->prefixlength
 * is positive) gt_verifymappedstr().
 *
 * For non-DNA alphabets only the forward and reverse read modes are checked.
 * Returns 0 if every check passed and a non-zero value otherwise.
 */
static int gt_encseq_check_runner(GT_UNUSED int argc, const char **argv,
                                  int parsed_args, void *tool_arguments,
                                  GtError *err)
{
  GtEncseqCheckArguments *arguments = tool_arguments;
  GtEncseqLoader *loader;
  GtEncseq *encseq;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  loader = gt_encseq_loader_new();
  encseq = gt_encseq_loader_load(loader, argv[parsed_args], err);
  if (encseq == NULL) {
    had_err = -1;
  }
  else {
    int mode;

    gt_encseq_check_startpositions(encseq);

    /* stop at the first read mode whose consistency check fails */
    for (mode = 0; mode < 4 && !had_err; mode++) {
      const GtReadmode readmode = (GtReadmode) mode;

      if (!gt_alphabet_is_dna(gt_encseq_alphabet(encseq)) &&
          readmode != GT_READMODE_FORWARD &&
          readmode != GT_READMODE_REVERSE) {
        continue;
      }
      if (gt_encseq_check_consistency(encseq,
                                      gt_encseq_filenames(encseq),
                                      readmode,
                                      arguments->scantrials,
                                      arguments->multicharcmptrials,
                                      gt_encseq_has_multiseq_support(encseq),
                                      err) != 0) {
        had_err = -1;
      }
    }

    if (!had_err) {
      gt_encseq_check_specialranges(encseq);
      gt_encseq_check_markpos(encseq);
      had_err = gt_encseq_check_minmax(encseq, err);
    }
    if (!had_err && arguments->prefixlength > 0 &&
        gt_verifymappedstr(encseq, arguments->prefixlength, err) != 0) {
      had_err = -1;
    }
  }

  /* encseq is NULL here when loading failed */
  gt_encseq_delete(encseq);
  gt_encseq_loader_delete(loader);
  return had_err;
}
/*
 * For every query sequence read from arguments->queryname, sum the values
 * returned by gt_pck_getShuStringLength() over all suffixes of the query
 * against the packed index of the subject (arguments->indexname).
 *
 * With arguments->shulen_only the sum is printed to stdout. Otherwise the
 * average length and the query's C/G fraction are passed to gt_divergence(),
 * and the result of gt_calculateKr() on that divergence is printed to stdout.
 *
 * logger    - receives the intermediate per-query values
 * arguments - index name, query name and divergence parameters
 * err       - passed on to the index and sequence iterator functions
 *
 * Returns 0 on success and 1 if the index could not be created or the
 * subject alphabet is not DNA.
 */
int gt_genomediff_pck_shu_simple(GtLogger *logger,
                                 const GtGenomediffArguments *arguments,
                                 GtError *err)
{
  int had_err = 0;
  int retval;
  GtSeqIterator *queries = NULL;
  const GtUchar *symbolmap, *currentQuery;
  const GtAlphabet *alphabet;
  GtUchar c_sym = 0, g_sym = 0;
  uint64_t queryNo;
  char *description = NULL;
  unsigned long queryLength, subjectLength = 0, currentSuffix;
  double avgShuLength, currentShuLength = 0.0, gc_query;
  const FMindex *subjectindex = NULL;
  Genericindex *genericindexSubject;
  const GtEncseq *encseq = NULL;
  double *ln_n_fac;

  /* get the precalculation of ln(n!) for 0<n<max_ln_n_fac */
  ln_n_fac = gt_get_ln_n_fac(arguments->max_ln_n_fac);
  gt_log_log("ln(max_ln_n_fac!) = %f\n", ln_n_fac[arguments->max_ln_n_fac]);

  genericindexSubject = genericindex_new(gt_str_get(arguments->indexname),
                                         arguments->with_esa,
                                         true,
                                         false,
                                         true,
                                         arguments->user_max_depth,
                                         logger,
                                         err);
  if (genericindexSubject == NULL) {
    had_err = 1;
  }
  else {
    encseq = genericindex_getencseq(genericindexSubject);
  }

  if (!had_err) {
    subjectLength = genericindex_get_totallength(genericindexSubject) - 1;
    subjectindex = genericindex_get_packedindex(genericindexSubject);

    queries = gt_seqiterator_sequence_buffer_new(arguments->queryname, err);
    gt_assert(queries);

    alphabet = gt_encseq_alphabet(encseq);
    /* the alphabet has to be DNA, because the gc content is calculated */
    if (!gt_alphabet_is_dna(alphabet)) {
      /* NOTE(review): this path returns an error without setting <err> */
      fprintf(stderr, "error: Sequences need to be dna");
      had_err = 1;
    }
    else {
      symbolmap = gt_alphabet_symbolmap(alphabet);
      gt_seqiterator_set_symbolmap(queries, symbolmap);
      c_sym = gt_alphabet_encode(alphabet, 'c');
      g_sym = gt_alphabet_encode(alphabet, 'g');
    }
  }

  for (queryNo = 0; !had_err; queryNo++) {
    retval = gt_seqiterator_next(queries,
                                 &currentQuery,
                                 &queryLength,
                                 &description,
                                 err);
    if (retval != 1) {
      if (retval < 0) {
        gt_free(description);
      }
      break;
    }
    gt_logger_log(logger, "found query of length: %lu", queryLength);

    avgShuLength = 0.0;
    gc_query = 0.0;
    for (currentSuffix = 0; currentSuffix < queryLength; currentSuffix++) {
      currentShuLength = (double) gt_pck_getShuStringLength(
                                     subjectindex,
                                     &currentQuery[currentSuffix],
                                     queryLength - currentSuffix);
      avgShuLength += currentShuLength;
      if (currentQuery[currentSuffix] == c_sym ||
          currentQuery[currentSuffix] == g_sym) {
        gc_query++;
      }
    }

    if (arguments->shulen_only) {
      printf("# Query %d sum of shulen:\n %.0f\n",
             (int) queryNo, avgShuLength);
    }
    else {
      double div, kr;

      avgShuLength /= (double) queryLength;
      gc_query /= (double) queryLength;

      gt_logger_log(logger, "Query %d has an average SHUstring length "
                            "of\n# shulength: %f",
                    (int) queryNo, avgShuLength);
      gt_logger_log(logger, "Query description: %s", description);
      gt_log_log("Query (i): %s", description);

      /* XXX add error checks */
      gt_logger_log(logger, "shulen:\n%f", avgShuLength);
      gt_log_log("shu: %f, gc: %f, len: %lu",
                 avgShuLength, gc_query, subjectLength);
      div = gt_divergence(arguments->divergence_rel_err,
                          arguments->divergence_abs_err,
                          arguments->divergence_m,
                          arguments->divergence_threshold,
                          avgShuLength,
                          subjectLength,
                          gc_query,
                          ln_n_fac,
                          arguments->max_ln_n_fac);
      gt_logger_log(logger, "divergence:\n%f", div);
      kr = gt_calculateKr(div);
      printf("# Kr:\n%f\n", kr);
    }

    /* The error path above frees <description>, which suggests the caller
       owns it; release it per query so it does not leak, and reset the
       pointer so that the error path cannot free it a second time.
       NOTE(review): confirm ownership against the gt_seqiterator_next()
       documentation. */
    gt_free(description);
    description = NULL;
  }

  gt_free(ln_n_fac);
  gt_seqiterator_delete(queries);
  genericindex_delete(genericindexSubject);
  return had_err;
}
/*
 * Walk over <encseq> in forward direction and collect one value per sequence
 * in a newly allocated array of doubles, which is returned.
 *
 * encseq       - encoded sequence; its alphabet is asserted to be DNA
 * with_special - passed through to calculate_gc()
 * calculate    - if true, calculate_gc() is called with the G/C and A/T
 *                counts of each sequence; if false, the plain G/C count is
 *                stored in the array
 * err          - unused
 *
 * For a mirrored <encseq> the array has 2 * (number of sequences / 2)
 * entries, and the first half is copied in reverse order onto the second
 * half before returning. Otherwise it has one entry per sequence.
 */
double *gt_encseq_get_gc(const GtEncseq *encseq, bool with_special,
                         bool calculate, GT_UNUSED GtError *err)
{
  GtEncseqReader *reader;
  GtAlphabet *alphabet;
  double *gc_content;
  unsigned long pos, totallength, num_units,
                seqnum = 0,
                seq_end,
                at_count = 0,
                gc_count = 0;
  unsigned int k;
  bool mirrored;
  /* codes of a, A, t, T (indices 0-3) and c, C, g, G (indices 4-7) */
  GtUchar acgt[8], cc;

  alphabet = gt_encseq_alphabet(encseq);
  gt_assert(gt_alphabet_is_dna(alphabet));
  gt_alphabet_encode_seq(alphabet, acgt, "aAtTcCgG", 8UL);

  totallength = gt_encseq_total_length(encseq);
  reader = gt_encseq_create_reader_with_readmode(encseq,
                                                 GT_READMODE_FORWARD,
                                                 0);
  mirrored = gt_encseq_is_mirrored(encseq);
  num_units = gt_encseq_num_of_sequences(encseq);
  if (mirrored) {
    num_units = GT_DIV2(num_units);
  }
  gc_content = gt_calloc((size_t) (mirrored ? GT_MULT2(num_units)
                                            : num_units),
                         sizeof (double));

  /* position directly behind the last character of the current sequence */
  seq_end = gt_encseq_seqstartpos(encseq, seqnum) +
            gt_encseq_seqlength(encseq, seqnum);

  for (pos = 0; pos < totallength; pos++) {
    if (pos != seq_end) {
      cc = gt_encseq_reader_next_encoded_char(reader);
      /* find the first entry of acgt equal to cc, if any */
      for (k = 0; k < 8 && cc != acgt[k]; k++) {
        /* nothing */
      }
      if (k < 4) {
        at_count++;
      }
      else if (k < 8) {
        gc_count++;
      }
      continue;
    }

    /* end of the current sequence: store its result, move to the next */
    if (calculate) {
      calculate_gc(encseq, gc_content, with_special, seqnum,
                   gc_count, at_count);
    }
    else {
      gc_content[seqnum] = (double) gc_count;
    }
    seqnum++;
    seq_end = gt_encseq_seqstartpos(encseq, seqnum) +
              gt_encseq_seqlength(encseq, seqnum);
    /* nothing was read at pos, so reposition the reader behind it */
    gt_encseq_reader_reinit_with_readmode(reader, encseq,
                                          GT_READMODE_FORWARD,
                                          pos + 1UL);
    gc_count = at_count = 0UL;
  }

  /* the last sequence is not followed by another boundary position */
  if (calculate) {
    calculate_gc(encseq, gc_content, with_special, seqnum,
                 gc_count, at_count);
  }
  else {
    gc_content[seqnum] = (double) gc_count;
  }
  gt_encseq_reader_delete(reader);

  if (mirrored) {
    const unsigned long last = GT_MULT2(num_units) - 1;

    for (seqnum = 0; seqnum < num_units; seqnum++) {
      gc_content[last - seqnum] = gc_content[seqnum];
    }
  }
  return gc_content;
}
static int gt_encseq_info_runner(GT_UNUSED int argc, const char **argv, int parsed_args, void *tool_arguments, GtError *err) { GtEncseqInfoArguments *arguments = tool_arguments; int had_err = 0; GtAlphabet *alpha; const GtUchar *chars; gt_error_check(err); gt_assert(arguments); if (arguments->nomap) { GtEncseqMetadata *emd = gt_encseq_metadata_new(argv[parsed_args], err); if (!emd) had_err = -1; if (!had_err) { if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_version(emd)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_metadata_is64bit(emd) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_total_length(emd)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_sequences(emd)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_files(emd)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_metadata_min_seq_length(emd), gt_encseq_metadata_max_seq_length(emd)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_metadata_accesstype(emd))); alpha = gt_encseq_metadata_alphabet(emd); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), 
(char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } } gt_encseq_metadata_delete(emd); } else { GtEncseqLoader *encseq_loader; GtEncseq *encseq; encseq_loader = gt_encseq_loader_new(); if (arguments->mirror) gt_encseq_loader_mirror(encseq_loader); if (!(encseq = gt_encseq_loader_load(encseq_loader, argv[parsed_args], err))) had_err = -1; if (!had_err) { const GtStrArray *filenames; GtUword i; if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_version(encseq)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_is_64_bit(encseq) ? 
"yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "compressed size: "); gt_file_xprintf(arguments->outfp, ""GT_WU" bytes\n", gt_encseq_sizeofrep(encseq)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_sequences(encseq)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_files(encseq)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_min_seq_length(encseq), gt_encseq_max_seq_length(encseq)); filenames = gt_encseq_filenames(encseq); gt_file_xprintf(arguments->outfp, "original filenames:\n"); for (i = 0; i < gt_str_array_size(filenames); i++) { gt_file_xprintf(arguments->outfp, "\t%s ("GT_WU" characters)\n", gt_str_array_get(filenames, i), (GtUword) gt_encseq_effective_filelength(encseq, i)); } alpha = gt_encseq_alphabet(encseq); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), (char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } gt_file_xprintf(arguments->outfp, "character distribution:\n"); for (i = 0; i < gt_alphabet_num_of_chars(alpha); i++) { GtUword cc; 
cc = gt_encseq_charcount(encseq, gt_alphabet_encode(alpha, chars[i])); gt_file_xprintf(arguments->outfp, "\t%c: "GT_WU" (%.2f%%)\n", (char) chars[i], cc, (cc /(double) (gt_encseq_total_length(encseq) - gt_encseq_num_of_sequences(encseq)+1))*100); } gt_file_xprintf(arguments->outfp, "number of wildcards: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_wildcards(encseq), gt_encseq_realwildcardranges(encseq)); gt_file_xprintf(arguments->outfp, "number of special characters: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_specialcharacters(encseq), gt_encseq_realspecialranges(encseq)); gt_file_xprintf(arguments->outfp, "length of longest non-special " "character stretch: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_lengthoflongestnonspecial(encseq)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_accesstype_get(encseq))); gt_file_xprintf(arguments->outfp, "bits used per character: "); gt_file_xprintf(arguments->outfp, "%f\n", (double) ((uint64_t) CHAR_BIT * (uint64_t) gt_encseq_sizeofrep(encseq)) / (double) gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "has special ranges: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_specialranges(encseq) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "has description support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_description_support(encseq) ? "yes" : "no"); if (gt_encseq_has_description_support(encseq)) { gt_file_xprintf(arguments->outfp, "length of longest description: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_max_desc_length(encseq)); } gt_file_xprintf(arguments->outfp, "has multiple sequence support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_multiseq_support(encseq) ? "yes" : "no"); } gt_encseq_delete(encseq); gt_encseq_loader_delete(encseq_loader); } return had_err; }