static int gt_encseq_info_runner(GT_UNUSED int argc, const char **argv, int parsed_args, void *tool_arguments, GtError *err) { GtEncseqInfoArguments *arguments = tool_arguments; int had_err = 0; GtAlphabet *alpha; const GtUchar *chars; gt_error_check(err); gt_assert(arguments); if (arguments->nomap) { GtEncseqMetadata *emd = gt_encseq_metadata_new(argv[parsed_args], err); if (!emd) had_err = -1; if (!had_err) { if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_version(emd)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_metadata_is64bit(emd) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_total_length(emd)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_sequences(emd)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_files(emd)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_metadata_min_seq_length(emd), gt_encseq_metadata_max_seq_length(emd)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_metadata_accesstype(emd))); alpha = gt_encseq_metadata_alphabet(emd); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), (char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } } gt_encseq_metadata_delete(emd); } else { GtEncseqLoader *encseq_loader; GtEncseq *encseq; encseq_loader = gt_encseq_loader_new(); if (arguments->mirror) gt_encseq_loader_mirror(encseq_loader); if (!(encseq = gt_encseq_loader_load(encseq_loader, argv[parsed_args], err))) had_err = -1; if (!had_err) { const GtStrArray *filenames; GtUword i; if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_version(encseq)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_is_64_bit(encseq) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "compressed size: "); gt_file_xprintf(arguments->outfp, ""GT_WU" bytes\n", gt_encseq_sizeofrep(encseq)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_sequences(encseq)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_files(encseq)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_min_seq_length(encseq), gt_encseq_max_seq_length(encseq)); filenames = gt_encseq_filenames(encseq); gt_file_xprintf(arguments->outfp, "original filenames:\n"); for (i = 0; i < gt_str_array_size(filenames); i++) { gt_file_xprintf(arguments->outfp, "\t%s ("GT_WU" characters)\n", gt_str_array_get(filenames, i), (GtUword) gt_encseq_effective_filelength(encseq, i)); } alpha = gt_encseq_alphabet(encseq); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), (char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } gt_file_xprintf(arguments->outfp, "character distribution:\n"); for (i = 0; i < gt_alphabet_num_of_chars(alpha); i++) { GtUword cc; cc = gt_encseq_charcount(encseq, gt_alphabet_encode(alpha, chars[i])); gt_file_xprintf(arguments->outfp, "\t%c: "GT_WU" (%.2f%%)\n", (char) chars[i], cc, (cc /(double) (gt_encseq_total_length(encseq) - gt_encseq_num_of_sequences(encseq)+1))*100); } gt_file_xprintf(arguments->outfp, "number of wildcards: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_wildcards(encseq), gt_encseq_realwildcardranges(encseq)); gt_file_xprintf(arguments->outfp, "number of special characters: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_specialcharacters(encseq), gt_encseq_realspecialranges(encseq)); gt_file_xprintf(arguments->outfp, "length of longest non-special " "character stretch: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_lengthoflongestnonspecial(encseq)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_accesstype_get(encseq))); gt_file_xprintf(arguments->outfp, "bits used per character: "); gt_file_xprintf(arguments->outfp, "%f\n", (double) ((uint64_t) CHAR_BIT * (uint64_t) gt_encseq_sizeofrep(encseq)) / (double) gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "has special ranges: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_specialranges(encseq) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "has description support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_description_support(encseq) ? "yes" : "no"); if (gt_encseq_has_description_support(encseq)) { gt_file_xprintf(arguments->outfp, "length of longest description: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_max_desc_length(encseq)); } gt_file_xprintf(arguments->outfp, "has multiple sequence support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_multiseq_support(encseq) ? "yes" : "no"); } gt_encseq_delete(encseq); gt_encseq_loader_delete(encseq_loader); } return had_err; }
static int gt_readjoiner_assembly_runner(GT_UNUSED int argc, GT_UNUSED const char **argv, GT_UNUSED int parsed_args, void *tool_arguments, GtError *err) { GtReadjoinerAssemblyArguments *arguments = tool_arguments; GtLogger *verbose_logger, *default_logger; GtEncseqLoader *el; GtEncseq *reads; GtTimer *timer = NULL; GtStrgraph *strgraph = NULL; GtBitsequence *contained = NULL; const char *readset = gt_str_get(arguments->readset); bool eqlen = true; GtUword nreads, tlen, rlen; int had_err = 0; gt_assert(arguments); gt_error_check(err); default_logger = gt_logger_new(!arguments->quiet, GT_LOGGER_DEFLT_PREFIX, stdout); gt_logger_log(default_logger, "gt readjoiner assembly (version "GT_READJOINER_VERSION")"); verbose_logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stdout); gt_logger_log(verbose_logger, "verbose output activated"); gt_logger_log(verbose_logger, "readset name = %s", readset); if (gt_showtime_enabled()) { timer = gt_timer_new_with_progress_description( GT_READJOINER_ASSEMBLY_MSG_COUNTSPM); gt_timer_start(timer); gt_timer_show_cpu_time_by_progress(timer); } if (!arguments->paths2seq) { el = gt_encseq_loader_new(); gt_encseq_loader_drop_description_support(el); gt_encseq_loader_disable_autosupport(el); reads = gt_encseq_loader_load(el, readset, err); if (reads == NULL) { had_err = -1; } if (had_err == 0) { eqlen = gt_encseq_accesstype_get(reads) == GT_ACCESS_TYPE_EQUALLENGTH; nreads = gt_encseq_num_of_sequences(reads); gt_logger_log(default_logger, "number of reads in filtered readset = " GT_WU, nreads); tlen = gt_encseq_total_length(reads) - nreads + 1; gt_logger_log(verbose_logger, "total length of filtered readset = " GT_WU, tlen); if (eqlen) { rlen = gt_encseq_seqlength(reads, 0); gt_logger_log(verbose_logger, "read length = " GT_WU, rlen); gt_encseq_delete(reads); reads = NULL; } else { had_err = gt_readjoiner_assembly_build_contained_reads_list( arguments, &contained, err); rlen = 0; gt_logger_log(verbose_logger, "read length = variable"); gt_assert(reads != NULL); } } if (had_err == 0) { if (!arguments->load) { had_err = gt_readjoiner_assembly_build_graph(arguments, &strgraph, reads, readset, eqlen, rlen, nreads, contained, default_logger, verbose_logger, timer, err); } else { gt_readjoiner_assembly_load_graph(&strgraph, reads, readset, rlen, default_logger, timer); } } if (!eqlen && reads != NULL && !arguments->errors) { gt_encseq_delete(reads); reads = NULL; if (had_err == 0) gt_strgraph_set_encseq(strgraph, NULL); } if (had_err == 0 && arguments->redtrans) { if (gt_showtime_enabled()) gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_REDTRANS, stdout); gt_strgraph_sort_edges_by_len(strgraph, false); (void)gt_strgraph_redtrans(strgraph, false); (void)gt_strgraph_redself(strgraph, false); (void)gt_strgraph_redwithrc(strgraph, false); gt_strgraph_log_stats(strgraph, verbose_logger); } if (had_err == 0 && arguments->errors) { if (gt_showtime_enabled()) gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_CLEANSG, stdout); gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_CLEANSG); had_err = gt_readjoiner_assembly_error_correction(strgraph, arguments->bubble, arguments->deadend, arguments->deadend_depth, verbose_logger); } if (had_err == 0 && arguments->save) { if (gt_showtime_enabled()) gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_SAVESG, stdout); gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_SAVESG); gt_strgraph_show(strgraph, GT_STRGRAPH_BIN, gt_str_get(arguments->readset), GT_READJOINER_SUFFIX_SG, false); } if (!eqlen && reads != NULL) { gt_encseq_delete(reads); reads = NULL; if (had_err == 0) gt_strgraph_set_encseq(strgraph, NULL); } if (had_err == 0) { if (gt_showtime_enabled()) gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_TRAVERSESG, stdout); gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_TRAVERSESG); gt_readjoiner_assembly_show_current_space("(before traversal)"); gt_strgraph_spell(strgraph, (GtUword)arguments->depthcutoff, (GtUword)arguments->lengthcutoff, arguments->vd, readset, GT_READJOINER_SUFFIX_CONTIG_PATHS, NULL, true, arguments->show_contigs_info, false, verbose_logger); } if (contained != NULL) gt_free(contained); gt_strgraph_delete(strgraph); strgraph = NULL; gt_assert(reads == NULL); gt_encseq_loader_delete(el); } if (had_err == 0) { gt_readjoiner_assembly_show_current_space("(before paths2seq)"); had_err = gt_readjoiner_assembly_paths2seq(readset, (GtUword)arguments->lengthcutoff, arguments->vd, arguments->astat, arguments->coverage, arguments->copynum, arguments->buffersize, default_logger, &timer, err); } if (gt_showtime_enabled()) { gt_timer_show_progress_final(timer, stdout); gt_timer_delete(timer); } gt_logger_delete(default_logger); gt_logger_delete(verbose_logger); return had_err; }
static int gt_readjoiner_cnttest_runner(GT_UNUSED int argc, GT_UNUSED const char **argv, GT_UNUSED int parsed_args, void *tool_arguments, GT_UNUSED GtError *err) { GtReadjoinerCnttestArguments *arguments = tool_arguments; GtEncseqLoader *el = NULL; GtEncseq *reads = NULL; GtBitsequence *bits = NULL; GtUword nofreads; int had_err = 0; gt_error_check(err); gt_assert(arguments); if (arguments->test == GT_READJOINER_CNTTEST_SHOWLIST) { GtStr *fn = NULL; fn = gt_str_clone(arguments->readset); gt_str_append_cstr(fn, GT_READJOINER_SUFFIX_CNTLIST); had_err = gt_cntlist_parse(gt_str_get(fn), true, &bits, &nofreads, err); gt_str_delete(fn); } else if (arguments->test == GT_READJOINER_CNTTEST_BRUTEFORCE || arguments->test == GT_READJOINER_CNTTEST_KMP) { el = gt_encseq_loader_new(); gt_encseq_loader_drop_description_support(el); gt_encseq_loader_disable_autosupport(el); if (!arguments->singlestrand) gt_encseq_loader_mirror(el); reads = gt_encseq_loader_load(el, gt_str_get(arguments->readset), err); if (reads == NULL) had_err = -1; else { gt_rdj_pairwise_exact(GT_OVLFIND_CNT, reads, !arguments->singlestrand, false, arguments->test == GT_READJOINER_CNTTEST_KMP, 1UL, true, NULL, NULL, false, NULL, &bits, &nofreads); } gt_encseq_delete(reads); gt_encseq_loader_delete(el); } else if (arguments->test == GT_READJOINER_CNTTEST_ESA) { Sequentialsuffixarrayreader *ssar = NULL; GtUword readlength = 0, firstrevcompl = 0; GtLogger *verbose_logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr); ssar = gt_newSequentialsuffixarrayreaderfromfile(gt_str_get( arguments->readset), SARR_LCPTAB | SARR_SUFTAB | SARR_SSPTAB, true, verbose_logger, err); if (gt_error_is_set(err)) had_err = -1; else { nofreads = gt_encseq_num_of_sequences(ssar->encseq); if (!arguments->singlestrand) { nofreads = GT_DIV2(nofreads); firstrevcompl = nofreads; } GT_INITBITTAB(bits, nofreads); if (!arguments->singlestrand) if (gt_encseq_accesstype_get(ssar->encseq) == GT_ACCESS_TYPE_EQUALLENGTH) readlength = gt_encseq_seqlength(ssar->encseq, 0); (void)gt_contfind_bottomup(ssar, false, bits, arguments->singlestrand ? 0 : firstrevcompl, readlength); } if (ssar != NULL) gt_freeSequentialsuffixarrayreader(&ssar); gt_logger_delete(verbose_logger); } else { gt_assert(false); } if (!had_err) had_err = gt_cntlist_show(bits, nofreads, NULL, false, err); gt_free(bits); return had_err; }