/*
 * Load the encoded sequence stored under <indexname>.
 *
 * Before loading, the loader is told that the des, ssp and sds tables are
 * not required, and <logger> is attached to it.  The temporary loader is
 * released again before returning.
 *
 * Returns whatever gt_encseq_loader_load() delivers; other code in this file
 * treats a NULL result of that call as failure (reported through <err>).
 */
static GtEncseq *mapbwtencoding(const char *indexname, GtLogger *logger,
                                GtError *err)
{
  GtEncseqLoader *loader;
  GtEncseq *encseq;

  gt_error_check(err);

  loader = gt_encseq_loader_new();
  gt_encseq_loader_do_not_require_des_tab(loader);
  gt_encseq_loader_do_not_require_ssp_tab(loader);
  gt_encseq_loader_do_not_require_sds_tab(loader);
  gt_encseq_loader_set_logger(loader, logger);

  encseq = gt_encseq_loader_load(loader, indexname, err);
  gt_encseq_loader_delete(loader);

  return encseq;
}
/*
 * Convert the contig paths of <readset> into contig sequences.
 *
 * The reads are loaded as a mirrored encoded sequence (without description
 * support, autosupport disabled), pumped through the cache and then handed
 * to gt_contigpaths_to_fasta() together with the remaining parameters.
 *
 * <timer> must not be NULL when showtime is enabled; if *<timer> is NULL a
 * new timer is created and started here, otherwise the existing one is
 * advanced to the next progress step.  The timer is not deleted here.
 *
 * Returns 0 on success and a non-zero value on failure.  A failure to load
 * the reads is reported as -1 (with <err> passed to the loader) instead of
 * relying on an assertion, so that builds without assertions do not continue
 * with a NULL encoded sequence.
 */
static int gt_readjoiner_assembly_paths2seq(const char *readset,
                                            GtUword lengthcutoff,
                                            bool showpaths, bool astat,
                                            double coverage,
                                            bool load_copynum,
                                            GtUword buffersize,
                                            GtLogger *default_logger,
                                            GtTimer **timer, GtError *err)
{
  int had_err = 0;
  GtEncseqLoader *el = gt_encseq_loader_new();
  GtEncseq *reads;

  if (gt_showtime_enabled())
  {
    gt_assert(timer != NULL);
    if (*timer == NULL) /* paths2seq */
    {
      *timer = gt_timer_new_with_progress_description(
                                       GT_READJOINER_ASSEMBLY_MSG_PUMPENCSEQ);
      gt_timer_show_cpu_time_by_progress(*timer);
      gt_timer_start(*timer);
    }
    else
    {
      gt_timer_show_progress(*timer, GT_READJOINER_ASSEMBLY_MSG_PUMPENCSEQ,
                             stdout);
    }
  }
  gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_PUMPENCSEQ);

  gt_encseq_loader_drop_description_support(el);
  gt_encseq_loader_disable_autosupport(el);
  gt_encseq_loader_mirror(el);
  reads = gt_encseq_loader_load(el, readset, err);
  if (reads == NULL)
  {
    /* loading failed: report it to the caller instead of asserting */
    had_err = -1;
  }

  if (had_err == 0)
  {
    gt_readjoiner_assembly_pump_encseq_through_cache(reads);
    if (gt_showtime_enabled())
    {
      gt_timer_show_progress(*timer, GT_READJOINER_ASSEMBLY_MSG_OUTPUTCONTIGS,
                             stdout);
    }
    gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_OUTPUTCONTIGS);
    had_err = gt_contigpaths_to_fasta(readset,
                                      GT_READJOINER_SUFFIX_CONTIG_PATHS,
                                      GT_READJOINER_SUFFIX_CONTIGS, reads,
                                      lengthcutoff, showpaths, astat, coverage,
                                      load_copynum, (size_t)buffersize,
                                      default_logger, err);
    gt_encseq_delete(reads);
  }
  gt_encseq_loader_delete(el);
  return had_err;
}
/*
 * Tool runner of the readjoiner assembly phase.
 *
 * Unless only the paths2seq step was requested, the readset is loaded, the
 * string graph is either built or loaded, optionally reduced, cleaned and
 * saved, and finally traversed to write the contig paths.  Afterwards (in
 * both modes) the contig paths are converted into sequences.
 *
 * Two loggers writing to stdout are used: the default logger (silenced by
 * the quiet flag) and the verbose logger (enabled by the verbose flag).
 * A progress timer is only created when showtime is enabled.
 *
 * Returns 0 on success and a non-zero value as soon as one step failed.
 */
static int gt_readjoiner_assembly_runner(GT_UNUSED int argc,
                                         GT_UNUSED const char **argv,
                                         GT_UNUSED int parsed_args,
                                         void *tool_arguments, GtError *err)
{
  GtReadjoinerAssemblyArguments *arguments = tool_arguments;
  GtLogger *verbose_logger, *default_logger;
  GtEncseqLoader *loader;
  GtEncseq *reads;
  GtTimer *timer = NULL;
  GtStrgraph *strgraph = NULL;
  GtBitsequence *contained = NULL;
  const char *readset = gt_str_get(arguments->readset);
  bool eqlen = true;
  GtUword nreads, tlen, rlen;
  int had_err = 0;

  gt_assert(arguments);
  gt_error_check(err);

  default_logger = gt_logger_new(!arguments->quiet, GT_LOGGER_DEFLT_PREFIX,
                                 stdout);
  gt_logger_log(default_logger,
                "gt readjoiner assembly (version "GT_READJOINER_VERSION")");
  verbose_logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX,
                                 stdout);
  gt_logger_log(verbose_logger, "verbose output activated");
  gt_logger_log(verbose_logger, "readset name = %s", readset);

  if (gt_showtime_enabled())
  {
    timer = gt_timer_new_with_progress_description(
                                         GT_READJOINER_ASSEMBLY_MSG_COUNTSPM);
    gt_timer_start(timer);
    gt_timer_show_cpu_time_by_progress(timer);
  }

  if (!arguments->paths2seq)
  {
    /* load the reads without description support */
    loader = gt_encseq_loader_new();
    gt_encseq_loader_drop_description_support(loader);
    gt_encseq_loader_disable_autosupport(loader);
    reads = gt_encseq_loader_load(loader, readset, err);
    if (reads == NULL)
    {
      had_err = -1;
    }
    else
    {
      eqlen = gt_encseq_accesstype_get(reads) == GT_ACCESS_TYPE_EQUALLENGTH;
      nreads = gt_encseq_num_of_sequences(reads);
      gt_logger_log(default_logger,
                    "number of reads in filtered readset = " GT_WU, nreads);
      tlen = gt_encseq_total_length(reads) - nreads + 1;
      gt_logger_log(verbose_logger,
                    "total length of filtered readset = " GT_WU, tlen);
      if (!eqlen)
      {
        /* variable read length: the encoded sequence is kept for now */
        had_err = gt_readjoiner_assembly_build_contained_reads_list(
                                                 arguments, &contained, err);
        rlen = 0;
        gt_logger_log(verbose_logger, "read length = variable");
        gt_assert(reads != NULL);
      }
      else
      {
        /* equal read length: only the length is needed from here on */
        rlen = gt_encseq_seqlength(reads, 0);
        gt_logger_log(verbose_logger, "read length = " GT_WU, rlen);
        gt_encseq_delete(reads);
        reads = NULL;
      }
    }

    /* obtain the string graph, either from disk or by constructing it */
    if (had_err == 0)
    {
      if (arguments->load)
      {
        gt_readjoiner_assembly_load_graph(&strgraph, reads, readset, rlen,
                                          default_logger, timer);
      }
      else
      {
        had_err = gt_readjoiner_assembly_build_graph(arguments, &strgraph,
                                                     reads, readset, eqlen,
                                                     rlen, nreads, contained,
                                                     default_logger,
                                                     verbose_logger, timer,
                                                     err);
      }
    }

    /* release the reads early unless the error correction step is
       requested */
    if (!eqlen && reads != NULL && !arguments->errors)
    {
      gt_encseq_delete(reads);
      reads = NULL;
      if (had_err == 0)
      {
        gt_strgraph_set_encseq(strgraph, NULL);
      }
    }

    if (had_err == 0 && arguments->redtrans)
    {
      if (gt_showtime_enabled())
      {
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_REDTRANS,
                               stdout);
      }
      gt_strgraph_sort_edges_by_len(strgraph, false);
      (void) gt_strgraph_redtrans(strgraph, false);
      (void) gt_strgraph_redself(strgraph, false);
      (void) gt_strgraph_redwithrc(strgraph, false);
      gt_strgraph_log_stats(strgraph, verbose_logger);
    }

    if (had_err == 0 && arguments->errors)
    {
      if (gt_showtime_enabled())
      {
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_CLEANSG,
                               stdout);
      }
      gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_CLEANSG);
      had_err = gt_readjoiner_assembly_error_correction(strgraph,
                                                   arguments->bubble,
                                                   arguments->deadend,
                                                   arguments->deadend_depth,
                                                   verbose_logger);
    }

    if (had_err == 0 && arguments->save)
    {
      if (gt_showtime_enabled())
      {
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_SAVESG,
                               stdout);
      }
      gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_SAVESG);
      gt_strgraph_show(strgraph, GT_STRGRAPH_BIN,
                       gt_str_get(arguments->readset),
                       GT_READJOINER_SUFFIX_SG, false);
    }

    /* the reads are released before the traversal at the latest */
    if (!eqlen && reads != NULL)
    {
      gt_encseq_delete(reads);
      reads = NULL;
      if (had_err == 0)
      {
        gt_strgraph_set_encseq(strgraph, NULL);
      }
    }

    if (had_err == 0)
    {
      if (gt_showtime_enabled())
      {
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_TRAVERSESG,
                               stdout);
      }
      gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_TRAVERSESG);
      gt_readjoiner_assembly_show_current_space("(before traversal)");
      gt_strgraph_spell(strgraph, (GtUword) arguments->depthcutoff,
                        (GtUword) arguments->lengthcutoff, arguments->vd,
                        readset, GT_READJOINER_SUFFIX_CONTIG_PATHS, NULL, true,
                        arguments->show_contigs_info, false, verbose_logger);
    }

    if (contained != NULL)
    {
      gt_free(contained);
    }
    gt_strgraph_delete(strgraph);
    strgraph = NULL;
    gt_assert(reads == NULL);
    gt_encseq_loader_delete(loader);
  }

  /* turn the contig paths into contig sequences */
  if (had_err == 0)
  {
    gt_readjoiner_assembly_show_current_space("(before paths2seq)");
    had_err = gt_readjoiner_assembly_paths2seq(readset,
                                           (GtUword) arguments->lengthcutoff,
                                           arguments->vd, arguments->astat,
                                           arguments->coverage,
                                           arguments->copynum,
                                           arguments->buffersize,
                                           default_logger, &timer, err);
  }

  if (gt_showtime_enabled())
  {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(default_logger);
  gt_logger_delete(verbose_logger);
  return had_err;
}
/*
 * Initialize <suffixarray> and fill it with those parts of the index
 * <indexname> that are selected by the SARR_* bits set in <demand>.
 *
 * The encoded sequence is always loaded.  For the suftab, lcptab (plus the
 * table of large lcp values) and bwttab the flag <map> selects the access
 * method: if <map> is true the table is obtained with
 * gt_fa_mmap_check_size_with_suffix(), otherwise a buffered file stream is
 * set up with INITBufferedfile.  The bcktab is obtained with
 * gt_bcktab_map() in both cases.
 *
 * Returns 0 on success and -1 on failure.  On failure the suffix array is
 * released again with gt_freesuffixarray().
 *
 * NOTE(review): INITBufferedfile is a macro defined elsewhere; presumably it
 * sets the local <haserr> (and uses <err>) when the file cannot be opened --
 * confirm before renaming these locals.
 */
static int inputsuffixarray(bool map, Suffixarray *suffixarray,
                            unsigned int demand, const char *indexname,
                            GtLogger *logger, GtError *err)
{
  bool haserr = false;
  GtEncseqLoader *el;
  GtUword totallength = 0;

  gt_error_check(err);
  initsuffixarray(suffixarray);
  /* configure the loader: each of the des, sds and ssp tables is required
     only if the corresponding bit is set in demand */
  el = gt_encseq_loader_new();
  if (!(demand & SARR_DESTAB))
    gt_encseq_loader_do_not_require_des_tab(el);
  else
    gt_encseq_loader_require_des_tab(el);
  if (!(demand & SARR_SDSTAB))
    gt_encseq_loader_do_not_require_sds_tab(el);
  else
    gt_encseq_loader_require_sds_tab(el);
  if (!(demand & SARR_SSPTAB))
    gt_encseq_loader_do_not_require_ssp_tab(el);
  else
    gt_encseq_loader_require_ssp_tab(el);
  gt_encseq_loader_set_logger(el, logger);
  suffixarray->encseq = gt_encseq_loader_load(el, indexname, err);
  gt_encseq_loader_delete(el);
  if (suffixarray->encseq == NULL)
  {
    haserr = true;
  }
  /* read the keys of the project file into the suffix array structure */
  if (!haserr)
  {
    haserr = scanprjfileuintkeys(suffixarray,indexname,logger,err);
  }
  /* mirror the encoded sequence if the suffix array is flagged as mirrored
     but the loaded sequence is not mirrored yet */
  if (!haserr && suffixarray->mirroredencseq
        && !gt_encseq_is_mirrored(suffixarray->encseq))
  {
    if (gt_encseq_mirror(suffixarray->encseq, err) != 0)
      haserr = true;
  }
  if (!haserr)
  {
    totallength = gt_encseq_total_length(suffixarray->encseq);
  }
  /* suffix table */
  if (!haserr && (demand & SARR_SUFTAB))
  {
    if (map)
    {
      /* nothing is mapped when there are no sorted suffixes */
      if (suffixarray->numberofallsortedsuffixes > 0)
      {
        suffixarray->suftab = gt_fa_mmap_check_size_with_suffix(indexname,
                                   GT_SUFTABSUFFIX,
                                   suffixarray->numberofallsortedsuffixes,
                                   sizeof (*suffixarray->suftab),
                                   err);
        if (suffixarray->suftab == NULL)
        {
          haserr = true;
        }
      }
    } else
    {
#if defined (_LP64) || defined (_WIN64)
      /* the file size decides whether the entries are read as uint32_t
         or as GtUword */
      off_t filesize = gt_file_size_with_suffix(indexname,GT_SUFTABSUFFIX);
      if (filesize == (off_t) sizeof (uint32_t) *
                              suffixarray->numberofallsortedsuffixes)
      {
        gt_logger_log(logger,"read suftab in units of 4 bytes");
        INITBufferedfile(indexname,&suffixarray->suftabstream_uint32_t,uint32_t,
                         GT_SUFTABSUFFIX);
      } else
      {
        gt_logger_log(logger,"read suftab in units of 8 bytes");
        INITBufferedfile(indexname,&suffixarray->suftabstream_GtUword,GtUword,
                         GT_SUFTABSUFFIX);
      }
#else
      /* NOTE(review): the message assumes GtUword is 4 bytes wide on these
         platforms -- confirm */
      gt_logger_log(logger,"read suftab in units of 4 bytes");
      INITBufferedfile(indexname,&suffixarray->suftabstream_GtUword,GtUword,
                       GT_SUFTABSUFFIX);
#endif
    }
    if (!haserr && !suffixarray->longest.defined)
    {
      gt_error_set(err,"longest not defined");
      haserr = true;
    }
  }
  /* lcp table and, if there are large lcp values, the table storing them */
  if (!haserr && (demand & SARR_LCPTAB))
  {
    if (map)
    {
      if (suffixarray->numberofallsortedsuffixes > 0)
      {
        suffixarray->lcptab = gt_fa_mmap_check_size_with_suffix(indexname,
                                   GT_LCPTABSUFFIX,
                                   suffixarray->numberofallsortedsuffixes,
                                   sizeof (*suffixarray->lcptab),
                                   err);
        if (suffixarray->lcptab == NULL)
        {
          haserr = true;
        }
      }
    } else
    {
      INITBufferedfile(indexname,&suffixarray->lcptabstream,GtUchar,
                       GT_LCPTABSUFFIX);
      /* position the stream behind the first entry of the lcp table */
      if (!haserr && fseek(suffixarray->lcptabstream.fp,
                           (GtWord) sizeof (GtUchar),SEEK_SET))
      {
        gt_error_set(err,"fseek(esastream) failed: %s",strerror(errno));
        haserr = true;
      }
    }
    if (!haserr && !suffixarray->numoflargelcpvalues.defined)
    {
      gt_error_set(err,"numoflargelcpvalues not defined");
      haserr = true;
    }
    if (!haserr && suffixarray->numoflargelcpvalues.valueunsignedlong > 0)
    {
      if (map)
      {
        suffixarray->llvtab = gt_fa_mmap_check_size_with_suffix(indexname,
                                   GT_LARGELCPTABSUFFIX,
                                   (GtUword) suffixarray->numoflargelcpvalues.
                                   valueunsignedlong,
                                   sizeof (*suffixarray->llvtab),
                                   err);
        if (suffixarray->llvtab == NULL)
        {
          haserr = true;
        }
      } else
      {
        INITBufferedfile(indexname,&suffixarray->llvtabstream,Largelcpvalue,
                         GT_LARGELCPTABSUFFIX);
      }
    }
  }
  /* bwt table: totallength+1 entries are expected when mapping */
  if (!haserr && (demand & SARR_BWTTAB))
  {
    if (map)
    {
      suffixarray->bwttab = gt_fa_mmap_check_size_with_suffix(indexname,
                                   GT_BWTTABSUFFIX,
                                   totallength+1,
                                   sizeof (*suffixarray->bwttab),
                                   err);
      if (suffixarray->bwttab == NULL)
      {
        haserr = true;
      }
    } else
    {
      INITBufferedfile(indexname,&suffixarray->bwttabstream,GtUchar,
                       GT_BWTTABSUFFIX);
    }
  }
  /* bucket table: obtained with gt_bcktab_map() regardless of map */
  if (!haserr && (demand & SARR_BCKTAB))
  {
    suffixarray->bcktab
      = gt_bcktab_map(indexname,
                      gt_encseq_alphabetnumofchars(suffixarray->encseq),
                      suffixarray->prefixlength,
                      totallength+1,
                      true,
                      err);
    if (suffixarray->bcktab == NULL)
    {
      haserr = true;
    }
  }
  /* on any failure release everything acquired so far */
  if (haserr)
  {
    gt_freesuffixarray(suffixarray);
  }
  return haserr ? -1 : 0;
}
/*
 * Tool runner of the condenseq compression.
 *
 * Loads the encoded sequence named by argv[parsed_args] into
 * arguments->input_es, derives all parameters the user left undefined
 * (minalignlength, windowsize, kmersize, initsize), validates their
 * relations and finally runs a GtCondenseqCreator on the input.
 *
 * If no index name was given, the basename of argv[parsed_args] is used.
 * If the kdb flag is set, the k-mer database logger writes to the file
 * "kmer_db.out"; a failure to open that file is reported as an error.
 *
 * Returns 0 on success and -1 (or the creator's error code) on failure.
 * arguments->input_es is not released here.
 */
static int gt_condenseq_compress_runner(GT_UNUSED int argc, const char **argv,
                                        int parsed_args, void *tool_arguments,
                                        GtError *err)
{
  GtCondenseqCompressArguments *arguments = tool_arguments;
  GtLogger *logger, *kdb_logger;
  FILE *kmer_fp = NULL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);
  kdb_logger = gt_logger_new(arguments->kdb, GT_LOGGER_DEFLT_PREFIX, stderr);
  if (arguments->kdb)
  {
    kmer_fp = gt_fa_fopen("kmer_db.out", "w", err);
    if (kmer_fp == NULL)
    {
      /* never hand a NULL stream to the logger or to gt_fa_fclose() */
      had_err = -1;
    }
    else
    {
      gt_logger_set_target(kdb_logger, kmer_fp);
    }
  }

  /* default index name: basename of the input */
  if (gt_str_length(arguments->indexname) == 0UL)
  {
    char *basenameptr;
    basenameptr = gt_basename(argv[parsed_args]);
    gt_str_set(arguments->indexname, basenameptr);
    gt_free(basenameptr);
  }

  if (!had_err)
  {
    GtEncseqLoader *es_l = gt_encseq_loader_new();
    arguments->input_es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->input_es == NULL)
      had_err = -1;
    gt_encseq_loader_delete(es_l);
  }

  /* derive undefined parameters, first top-down from the ones given ... */
  if (!had_err)
  {
    if (arguments->minalignlength == GT_UNDEF_UWORD)
      arguments->minalignlength = arguments->initsize != GT_UNDEF_UWORD
                                    ? arguments->initsize / (GtUword) 3UL
                                    : GT_UNDEF_UWORD;
    if (arguments->windowsize == GT_UNDEF_UINT)
      arguments->windowsize = arguments->minalignlength != GT_UNDEF_UWORD
                                ? (unsigned int)
                                  (arguments->minalignlength / 5U)
                                : GT_UNDEF_UINT;
    if (arguments->windowsize < 4U)
      arguments->windowsize = 4U;
    if (arguments->kmersize == GT_UNDEF_UINT)
    {
      unsigned int size =
        gt_alphabet_num_of_chars(gt_encseq_alphabet(arguments->input_es));
      /* size^k ~= 100000 */
      gt_safe_assign(arguments->kmersize,
                     gt_round_to_long(gt_log_base(100000.0, (double) size)));
      gt_logger_log(logger, "|A|: %u, k: %u", size, arguments->kmersize);
    }
    /* ... then bottom-up from the k-mer size for what is still undefined */
    if (arguments->windowsize == GT_UNDEF_UINT)
    {
      arguments->windowsize = 5U * arguments->kmersize;
    }
    if (arguments->minalignlength == GT_UNDEF_UWORD)
    {
      arguments->minalignlength = (GtUword) (3UL * arguments->windowsize);
    }
    if (arguments->initsize == GT_UNDEF_UWORD)
    {
      arguments->initsize = (GtUword) (3UL * arguments->minalignlength);
    }
  }

  /* validate: kmersize < windowsize <= minalignlength <= initsize */
  if (!had_err && arguments->windowsize <= arguments->kmersize)
  {
    gt_error_set(err, "-windowsize (%u) must be larger -kmersize (%u)!",
                 arguments->windowsize, arguments->kmersize);
    had_err = -1;
  }
  if (!had_err && arguments->minalignlength < (GtUword) arguments->windowsize)
  {
    gt_error_set(err, "-alignlength (" GT_WU ") must be at least "
                 "-windowsize (%u)!",
                 arguments->minalignlength, arguments->windowsize);
    had_err = -1;
  }
  if (!had_err && (arguments->initsize < arguments->minalignlength))
  {
    gt_error_set(err, "-initsize (" GT_WU ") must be at least "
                 "-alignlength (" GT_WU ")!",
                 arguments->initsize, arguments->minalignlength);
    had_err = -1;
  }

  if (!had_err)
  {
    GtCondenseqCreator *ces_c =
      gt_condenseq_creator_new(arguments->initsize,
                               arguments->minalignlength,
                               arguments->xdrop,
                               &(arguments->scores),
                               arguments->kmersize,
                               arguments->windowsize,
                               logger,
                               err);
    if (ces_c == NULL)
    {
      had_err = -1;
    }
    else
    {
      /* cutoff: undefined selects the mean cutoff, 0 disables it */
      if (arguments->cutoff_value == GT_UNDEF_UWORD)
        gt_condenseq_creator_use_mean_cutoff(ces_c);
      else if (arguments->cutoff_value == 0)
        gt_condenseq_creator_disable_cutoff(ces_c);
      else
        gt_condenseq_creator_set_cutoff(ces_c, arguments->cutoff_value);
      gt_condenseq_creator_set_mean_fraction(ces_c, arguments->fraction);
      /* NOTE(review): a set prune flag disables pruning -- presumably the
         flag stores a "disable" option; confirm against the option parser */
      if (arguments->prune)
        gt_condenseq_creator_disable_prune(ces_c);
      if (arguments->brute)
        gt_condenseq_creator_enable_brute_force(ces_c);
      if (!arguments->diags)
        gt_condenseq_creator_disable_diagonals(ces_c);
      if (arguments->full_diags)
        gt_condenseq_creator_enable_full_diagonals(ces_c);
      if (arguments->clean_percent != GT_UNDEF_UINT)
        gt_condenseq_creator_set_diags_clean_limit(ces_c,
                                                   arguments->clean_percent);

      had_err = gt_condenseq_creator_create(ces_c,
                                            arguments->indexname,
                                            arguments->input_es,
                                            logger, kdb_logger, err);
      gt_condenseq_creator_delete(ces_c);
    }
  }

  gt_logger_delete(logger);
  gt_logger_delete(kdb_logger);
  if (kmer_fp != NULL)
    gt_fa_fclose(kmer_fp);
  return had_err;
}
static int gt_encseq_info_runner(GT_UNUSED int argc, const char **argv, int parsed_args, void *tool_arguments, GtError *err) { GtEncseqInfoArguments *arguments = tool_arguments; int had_err = 0; GtAlphabet *alpha; const GtUchar *chars; gt_error_check(err); gt_assert(arguments); if (arguments->nomap) { GtEncseqMetadata *emd = gt_encseq_metadata_new(argv[parsed_args], err); if (!emd) had_err = -1; if (!had_err) { if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_version(emd)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_metadata_is64bit(emd) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_total_length(emd)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_sequences(emd)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_files(emd)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_metadata_min_seq_length(emd), gt_encseq_metadata_max_seq_length(emd)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_metadata_accesstype(emd))); alpha = gt_encseq_metadata_alphabet(emd); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), 
(char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } } gt_encseq_metadata_delete(emd); } else { GtEncseqLoader *encseq_loader; GtEncseq *encseq; encseq_loader = gt_encseq_loader_new(); if (arguments->mirror) gt_encseq_loader_mirror(encseq_loader); if (!(encseq = gt_encseq_loader_load(encseq_loader, argv[parsed_args], err))) had_err = -1; if (!had_err) { const GtStrArray *filenames; GtUword i; if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_version(encseq)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_is_64_bit(encseq) ? 
"yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "compressed size: "); gt_file_xprintf(arguments->outfp, ""GT_WU" bytes\n", gt_encseq_sizeofrep(encseq)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_sequences(encseq)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_files(encseq)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_min_seq_length(encseq), gt_encseq_max_seq_length(encseq)); filenames = gt_encseq_filenames(encseq); gt_file_xprintf(arguments->outfp, "original filenames:\n"); for (i = 0; i < gt_str_array_size(filenames); i++) { gt_file_xprintf(arguments->outfp, "\t%s ("GT_WU" characters)\n", gt_str_array_get(filenames, i), (GtUword) gt_encseq_effective_filelength(encseq, i)); } alpha = gt_encseq_alphabet(encseq); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), (char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } gt_file_xprintf(arguments->outfp, "character distribution:\n"); for (i = 0; i < gt_alphabet_num_of_chars(alpha); i++) { GtUword cc; 
cc = gt_encseq_charcount(encseq, gt_alphabet_encode(alpha, chars[i])); gt_file_xprintf(arguments->outfp, "\t%c: "GT_WU" (%.2f%%)\n", (char) chars[i], cc, (cc /(double) (gt_encseq_total_length(encseq) - gt_encseq_num_of_sequences(encseq)+1))*100); } gt_file_xprintf(arguments->outfp, "number of wildcards: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_wildcards(encseq), gt_encseq_realwildcardranges(encseq)); gt_file_xprintf(arguments->outfp, "number of special characters: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_specialcharacters(encseq), gt_encseq_realspecialranges(encseq)); gt_file_xprintf(arguments->outfp, "length of longest non-special " "character stretch: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_lengthoflongestnonspecial(encseq)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_accesstype_get(encseq))); gt_file_xprintf(arguments->outfp, "bits used per character: "); gt_file_xprintf(arguments->outfp, "%f\n", (double) ((uint64_t) CHAR_BIT * (uint64_t) gt_encseq_sizeofrep(encseq)) / (double) gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "has special ranges: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_specialranges(encseq) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "has description support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_description_support(encseq) ? "yes" : "no"); if (gt_encseq_has_description_support(encseq)) { gt_file_xprintf(arguments->outfp, "length of longest description: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_max_desc_length(encseq)); } gt_file_xprintf(arguments->outfp, "has multiple sequence support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_multiseq_support(encseq) ? "yes" : "no"); } gt_encseq_delete(encseq); gt_encseq_loader_delete(encseq_loader); } return had_err; }
/*
 * Tool runner of the seed extension.
 *
 * Loads encoded sequence A (and B, if a query name is given; otherwise B is
 * a new reference to A), determines the character access mode, optionally
 * adapts the parameters to the base composition of A, prepares the greedy /
 * xdrop extension and output options and finally runs the diagonal band
 * seed algorithm.
 *
 * The running time is measured if showtime is enabled; the benchmark and
 * verbose flags enable showtime.
 *
 * Returns 0 on success and a non-zero value on failure.  Both encoded
 * sequences are released on every path that got hold of them.
 */
static int gt_seed_extend_runner(GT_UNUSED int argc,
                                 GT_UNUSED const char **argv,
                                 GT_UNUSED int parsed_args,
                                 void *tool_arguments,
                                 GtError *err)
{
  GtSeedExtendArguments *arguments = tool_arguments;
  GtEncseqLoader *loader = NULL;
  GtEncseq *aencseq = NULL, *bencseq = NULL;
  GtGreedyextendmatchinfo *greedyinfo = NULL;
  GtXdropmatchinfo *xdropinfo = NULL;
  GtQuerymatchoutoptions *outoptions = NULL;
  GtTimer *overalltimer = NULL;
  GtExtendCharAccess cam = GT_EXTEND_CHAR_ACCESS_ANY;
  GtUword errorpercentage = 0UL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments != NULL);
  gt_assert(arguments->se_minidentity >= GT_EXTEND_MIN_IDENTITY_PERCENTAGE &&
            arguments->se_minidentity <= 100UL);

  /* the error percentage is the complement of the minimum identity */
  errorpercentage = 100UL - arguments->se_minidentity;

  /* measure the overall running time */
  if (arguments->benchmark || arguments->verbose)
  {
    gt_showtime_enable();
  }
  if (gt_showtime_enabled())
  {
    overalltimer = gt_timer_new();
    gt_timer_start(overalltimer);
  }

  /* encoded sequence A */
  loader = gt_encseq_loader_new();
  gt_encseq_loader_enable_autosupport(loader);
  aencseq = gt_encseq_loader_load(loader,
                                  gt_str_get(arguments->dbs_indexname),
                                  err);
  if (aencseq == NULL)
  {
    had_err = -1;
  }

  /* encoded sequence B: a second read set, or A itself */
  if (!had_err)
  {
    if (strcmp(gt_str_get(arguments->dbs_queryname), "") == 0)
    {
      bencseq = gt_encseq_ref(aencseq);
    }
    else
    {
      bencseq = gt_encseq_loader_load(loader,
                                      gt_str_get(arguments->dbs_queryname),
                                      err);
    }
    if (bencseq == NULL)
    {
      had_err = -1;
      gt_encseq_delete(aencseq);
    }
  }
  gt_encseq_loader_delete(loader);

  /* character access method, only needed for extension or alignments */
  if (!had_err && (gt_option_is_set(arguments->se_option_greedy) ||
                   gt_option_is_set(arguments->se_option_xdrop) ||
                   arguments->se_alignmentwidth > 0))
  {
    cam = gt_greedy_extend_char_access(
                         gt_str_get(arguments->se_char_access_mode), err);
    if ((int) cam == -1)
    {
      had_err = -1;
      gt_encseq_delete(aencseq);
      gt_encseq_delete(bencseq);
    }
  }

  /* Use bias dependent parameters, adapted from E. Myers' DALIGNER */
  if (!had_err && arguments->bias_parameters)
  {
    const GtAlphabet *alphabet = gt_encseq_alphabet(aencseq);
    const double bias_factor[10] = {.690, .690, .690, .690, .780,
                                    .850, .900, .933, .966, 1.000};
    bool applied = false;

    if (gt_alphabet_is_dna(alphabet))
    {
      GtUword at, cg;

      at = gt_encseq_charcount(aencseq, gt_alphabet_encode(alphabet, 'a'));
      at += gt_encseq_charcount(aencseq, gt_alphabet_encode(alphabet, 't'));
      cg = gt_encseq_charcount(aencseq, gt_alphabet_encode(alphabet, 'c'));
      cg += gt_encseq_charcount(aencseq, gt_alphabet_encode(alphabet, 'g'));
      if (at + cg > 0)
      {
        const double ratio = (double)MIN(at, cg) / (at + cg);
        int bias_index = (int)MAX(0.0, (ratio + 0.025) * 20.0 - 1.0);

        gt_assert(bias_index < 10);
        arguments->se_maxalilendiff = 30;
        arguments->se_perc_match_hist
          = (GtUword)(100.0 - errorpercentage * bias_factor[bias_index]);
        if (arguments->verbose)
        {
          printf("# Base ratio = %4.2lf -> percmathistory = "GT_WU"\n",
                 ratio, arguments->se_perc_match_hist);
        }
        applied = true;
      }
    }
    if (!applied)
    {
      /* either not a DNA alphabet or none of a, c, g, t occurs */
      had_err = -1;
      gt_error_set(err, "option \"-bias-parameters\" can only be applied to "
                        "the DNA alphabet");
      gt_encseq_delete(aencseq);
      gt_encseq_delete(bencseq);
    }
  }

  /* options for greedy extension */
  if (!had_err && gt_option_is_set(arguments->se_option_greedy))
  {
    greedyinfo = gt_greedy_extend_matchinfo_new(errorpercentage,
                                               arguments->se_maxalilendiff,
                                               arguments->se_historysize,
                                               arguments->se_perc_match_hist,
                                               arguments->se_alignlength,
                                               cam,
                                               arguments->se_extendgreedy);
    if (arguments->benchmark)
    {
      gt_greedy_extend_matchinfo_silent_set(greedyinfo);
    }
  }

  /* options for xdrop extension */
  if (!had_err && gt_option_is_set(arguments->se_option_xdrop))
  {
    xdropinfo = gt_xdrop_matchinfo_new(arguments->se_alignlength,
                                       errorpercentage,
                                       arguments->se_xdropbelowscore,
                                       arguments->se_extendxdrop);
    if (arguments->benchmark)
    {
      gt_xdrop_matchinfo_silent_set(xdropinfo);
    }
  }

  /* output options */
  if (!had_err && (arguments->se_alignmentwidth > 0 ||
                   gt_option_is_set(arguments->se_option_xdrop)))
  {
    outoptions = gt_querymatchoutoptions_new(arguments->se_alignmentwidth);
    if (gt_option_is_set(arguments->se_option_xdrop) ||
        gt_option_is_set(arguments->se_option_greedy))
    {
      GtUword sensitivity;

      if (gt_option_is_set(arguments->se_option_greedy))
      {
        sensitivity = arguments->se_extendgreedy;
      }
      else
      {
        sensitivity = 100;
      }
      gt_querymatchoutoptions_extend(outoptions,
                                     errorpercentage,
                                     arguments->se_maxalilendiff,
                                     arguments->se_historysize,
                                     arguments->se_perc_match_hist,
                                     cam,
                                     sensitivity);
    }
  }

  /* run the algorithm and release everything that was set up for it */
  if (!had_err)
  {
    GtDiagbandseed dbs;

    dbs.errorpercentage = errorpercentage;
    dbs.userdefinedleastlength = arguments->se_alignlength;
    dbs.seedlength = arguments->dbs_seedlength;
    dbs.logdiagbandwidth = arguments->dbs_logdiagbandwidth;
    dbs.mincoverage = arguments->dbs_mincoverage;
    dbs.maxfreq = arguments->dbs_maxfreq;
    dbs.memlimit = arguments->dbs_memlimit;
    dbs.mirror = arguments->mirror;
    dbs.overlappingseeds = arguments->overlappingseeds;
    dbs.verify = arguments->dbs_verify;
    dbs.verbose = arguments->verbose;
    dbs.debug_kmer = arguments->dbs_debug_kmer;
    dbs.debug_seedpair = arguments->dbs_debug_seedpair;
    dbs.seed_display = arguments->seed_display;
    dbs.extendgreedyinfo = greedyinfo;
    dbs.extendxdropinfo = xdropinfo;
    dbs.querymatchoutopt = outoptions;

    had_err = gt_diagbandseed_run(aencseq, bencseq, &dbs, err);

    gt_encseq_delete(aencseq);
    gt_encseq_delete(bencseq);
    if (gt_option_is_set(arguments->se_option_greedy))
    {
      gt_greedy_extend_matchinfo_delete(greedyinfo);
    }
    if (gt_option_is_set(arguments->se_option_xdrop))
    {
      gt_xdrop_matchinfo_delete(xdropinfo);
    }
    if (arguments->se_alignmentwidth > 0 ||
        gt_option_is_set(arguments->se_option_xdrop))
    {
      gt_querymatchoutoptions_delete(outoptions);
    }
  }

  /* report the overall time, tagged with the parameter key string */
  if (gt_showtime_enabled())
  {
    if (!had_err)
    {
      char *key
        = gt_seed_extend_params_keystring(
                              gt_option_is_set(arguments->se_option_greedy),
                              gt_option_is_set(arguments->se_option_xdrop),
                              arguments->dbs_seedlength,
                              arguments->se_alignlength,
                              arguments->se_minidentity,
                              arguments->se_maxalilendiff,
                              arguments->se_perc_match_hist,
                              arguments->se_extendgreedy,
                              arguments->se_extendxdrop,
                              arguments->se_xdropbelowscore);
      printf("# TIME seedextend-%s", key);
      gt_free(key);
      gt_timer_show_formatted(overalltimer,
                              " overall " GT_WD ".%06ld\n",
                              stdout);
    }
    gt_timer_delete(overalltimer);
  }
  return had_err;
}
/*
 * Tool runner which outputs the sequences of the encoded sequence named by
 * argv[parsed_args] in a different order.
 *
 * Depending on the arguments the sequences are output in inverted order, in
 * shuffled order, or sorted (ascending for sort, descending for revsort) as
 * determined by gt_seqorder_sort().  A warning is issued if the encoded
 * sequence has no description support.
 *
 * Returns 0 on success and -1 if the encoded sequence could not be loaded.
 */
static int gt_seqorder_runner(GT_UNUSED int argc, const char **argv,
                              int parsed_args, void *tool_arguments,
                              GtError *err)
{
  GtSeqorderArguments *arguments = tool_arguments;
  GtEncseqLoader *el;
  GtEncseq *encseq;
  unsigned long idx, numofseqs;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments != NULL);

  /* load encseq */
  el = gt_encseq_loader_new();
  encseq = gt_encseq_loader_load(el, argv[parsed_args], err);
  if (encseq == NULL)
  {
    had_err = -1;
  }
  if (had_err == 0 && !gt_encseq_has_description_support(encseq))
  {
    gt_warning("%s has no description support", argv[parsed_args]);
  }

  if (!had_err)
  {
    numofseqs = gt_encseq_num_of_sequences(encseq);
    if (arguments->invert)
    {
      /* last sequence first */
      for (idx = numofseqs; idx > 0; idx--)
      {
        gt_seqorder_output(idx - 1, encseq);
      }
    }
    else if (arguments->shuffle)
    {
      unsigned long *permutation;

      permutation = gt_malloc(sizeof (unsigned long) * numofseqs);
      gt_seqorder_get_shuffled_seqnums(numofseqs, permutation);
      for (idx = 0; idx < numofseqs; idx++)
      {
        gt_seqorder_output(permutation[idx], encseq);
      }
      gt_free(permutation);
    }
    else
    {
      GtSuffixsortspace *sortspace;

      gt_assert(arguments->sort || arguments->revsort);
      /* one entry per sequence; the start position of the last sequence is
         passed as second argument */
      sortspace = gt_suffixsortspace_new(numofseqs,
                                  gt_encseq_seqstartpos(encseq, numofseqs-1),
                                  false, NULL);
      gt_seqorder_sort(sortspace, encseq);
      if (arguments->sort)
      {
        for (idx = 0; idx < numofseqs; idx++)
        {
          gt_seqorder_output(gt_encseq_seqnum(encseq,
                               gt_suffixsortspace_getdirect(sortspace, idx)),
                             encseq);
        }
      }
      else
      {
        /* revsort: walk the sorted positions backwards */
        for (idx = numofseqs; idx > 0; idx--)
        {
          gt_seqorder_output(gt_encseq_seqnum(encseq,
                           gt_suffixsortspace_getdirect(sortspace, idx - 1)),
                             encseq);
        }
      }
      gt_suffixsortspace_delete(sortspace, false);
    }
  }

  gt_encseq_loader_delete(el);
  gt_encseq_delete(encseq);
  return had_err;
}
/*
 * Tool runner of encseq2spm.
 *
 * Loads the encoded sequence named by arguments->encseqinput (without
 * description support, autosupport disabled) and mirrors it; the
 * singlestrand flag is rejected as not implemented.
 *
 * If arguments->singlescan is greater than 0 only a single k-mer scan is
 * run: 4 selects gt_rungetencseqkmers(), 1 to 3 select
 * gt_firstcode_runkmerscan() with mode singlescan - 1.  Any other value is
 * rejected with an error; this check is done regardless of whether showtime
 * is enabled, so that an invalid mode never reaches the scanning code.
 * A progress timer is only used when showtime is enabled.
 *
 * If arguments->singlescan is 0 the suffix-prefix matches are computed with
 * storefirstcodes_getencseqkmers_twobitencoding(); one processing state per
 * thread is created if matches are to be counted or output.
 *
 * Returns 0 on success and -1 on failure.
 */
static int gt_encseq2spm_runner(GT_UNUSED int argc,
                                GT_UNUSED const char **argv,
                                GT_UNUSED int parsed_args,
                                void *tool_arguments,
                                GtError *err)
{
  GtEncseq2spmArguments *arguments = tool_arguments;
  GtEncseqLoader *el = NULL;
  GtEncseq *encseq = NULL;
  bool haserr = false;

  gt_error_check(err);
  gt_assert(arguments);
  el = gt_encseq_loader_new();
  gt_encseq_loader_drop_description_support(el);
  gt_encseq_loader_disable_autosupport(el);
  encseq = gt_encseq_loader_load(el, gt_str_get(arguments->encseqinput),
                                 err);
  if (encseq == NULL)
  {
    haserr = true;
  }
  if (!haserr)
  {
    if (arguments->singlestrand)
    {
      gt_error_set(err,"option -singlestand is not implemented");
      haserr = true;
    } else
    {
      if (gt_encseq_mirror(encseq, err) != 0)
      {
        haserr = true;
      }
    }
  }
  if (!haserr && arguments->singlescan > 0)
  {
    GtTimer *timer = NULL;
    char *outmsg = NULL;

    /* validate the scanning mode independently of showtime */
    switch (arguments->singlescan)
    {
      case 1: outmsg = "to run fast scanning";
              break;
      case 2: outmsg = "to run fast scanning with check";
              break;
      case 3: outmsg = "to run fast scanning with output";
              break;
      case 4: outmsg = "to run old scanning code";
              break;
      default:
              gt_error_set(err,"argument %u to option -singlescan not allowed",
                           arguments->singlescan);
              haserr = true;
    }
    if (!haserr && gt_showtime_enabled())
    {
      timer = gt_timer_new_with_progress_description(outmsg);
      gt_timer_start(timer);
    }
    if (!haserr)
    {
      unsigned int kmersize = 0;

      haserr = gt_encseq2spm_kmersize(arguments, &kmersize, err);
      if (!haserr)
      {
        if (arguments->singlescan == 4U)
        {
          gt_rungetencseqkmers(encseq,kmersize);
        } else
        {
          gt_firstcode_runkmerscan(encseq,arguments->singlescan - 1,kmersize,
                                   arguments->minmatchlength);
        }
      }
    }
    if (timer != NULL)
    {
      gt_timer_show_progress_final(timer, stdout);
      gt_timer_delete(timer);
    }
  }
  if (!haserr && arguments->singlescan == 0)
  {
    GtLogger *logger;
    const GtReadmode readmode = GT_READMODE_FORWARD;
    GtBUstate_spmsk **spmsk_states = NULL;
    unsigned int kmersize, threadcount;
#ifdef GT_THREADS_ENABLED
    const unsigned int threads = gt_jobs;
#else
    const unsigned int threads = 1U;
#endif

    /* one processing state per thread, only if matches are counted or
       output */
    if (arguments->countspms || arguments->outputspms)
    {
      spmsk_states = gt_malloc(sizeof (*spmsk_states) * threads);
      for (threadcount = 0; threadcount < threads; threadcount++)
      {
        spmsk_states[threadcount]
          = gt_spmsk_inl_new(encseq,
                             readmode,
                             (unsigned long) arguments->minmatchlength,
                             arguments->countspms,
                             arguments->outputspms,
                             gt_str_get(arguments->encseqinput));
      }
    }
    logger = gt_logger_new(arguments->verbose,GT_LOGGER_DEFLT_PREFIX, stdout);
    haserr = gt_encseq2spm_kmersize(arguments, &kmersize, err);
    if (!haserr)
    {
      if (storefirstcodes_getencseqkmers_twobitencoding(encseq,
                                    kmersize,
                                    arguments->numofparts,
                                    arguments->maximumspace,
                                    arguments->minmatchlength,
                                    /* use false */ arguments->checksuftab,
                                    /* use false */ arguments->onlyaccum,
                                    /* use false */ arguments->
                                                    onlyallfirstcodes,
                                    /* use 5U */ arguments->
                                                 addbscache_depth,
                                    /* specify the extra space needed for
                                       the function processing the interval */
                                    arguments->phase2extra,
                                    /* use true */ arguments->radixlarge
                                                     ? false : true,
                                    /* use 2 without threads and
                                       use 1 with threads */
                                    arguments->radixparts,
                                    spmsk_states != NULL
                                      ? gt_spmsk_inl_process
                                      : NULL,
                                    gt_spmsk_inl_process_end,
                                    spmsk_states,
                                    logger,
                                    err) != 0)
      {
        haserr = true;
      }
    }
    if (spmsk_states != NULL)
    {
      unsigned long countmatches = 0;

      /* deleting a state delivers the number of matches it has seen */
      for (threadcount = 0; threadcount < threads; threadcount++)
      {
        countmatches += gt_spmsk_inl_delete(spmsk_states[threadcount]);
      }
      if (arguments->countspms)
      {
        printf("number of suffix-prefix matches=%lu\n",countmatches);
      }
      gt_free(spmsk_states);
    }
    gt_logger_delete(logger);
  }
  gt_encseq_delete(encseq);
  gt_encseq_loader_delete(el);
  return haserr ? -1 : 0;
}
/* Reads the description file of index <indexname> line by line and extracts
   one key per line via desc2key(). All keys must have the same non-zero
   length.
   - If <sortkeys> is false, the key length followed by the NUL-terminated
     keys is written to the keys file of the index; the keys must be in
     strictly increasing lexicographic order.
   - If <sortkeys> is true, the encoded sequence of the index is loaded, the
     keys are sorted with compareFixedkeys and the corresponding sequences
     are written to stdout via giextract_encodedseq2fasta().
   Returns 0 on success and -1 on error (err is set). */
int gt_extractkeysfromdesfile(const char *indexname,
                              bool sortkeys,
                              GtLogger *logger,
                              GtError *err)
{
  FILE *fpin, *fpout = NULL;
  GtStr *line = NULL;
  const char *keyptr;
  unsigned long keylen, constantkeylen = 0, linenum;
  bool haserr = false, firstdesc = true;
  char *previouskey = NULL;
  Fixedsizekey *keytab = NULL, *keytabptr = NULL;
  GtEncseq *encseq = NULL;
  unsigned long numofentries = 0;
  const unsigned long linewidth = 60UL;

  fpin = gt_fa_fopen_with_suffix(indexname,GT_DESTABFILESUFFIX,"rb",err);
  if (fpin == NULL)
  {
    return -1;
  }
  if (!sortkeys)
  {
    fpout = gt_fa_fopen_with_suffix(indexname,GT_KEYSTABFILESUFFIX,"wb",err);
    if (fpout == NULL)
    {
      haserr = true;
    }
  }
  if (!haserr)
  {
    line = gt_str_new();
  }
  for (linenum = 0;
       !haserr && gt_str_read_next_line(line, fpin) != EOF;
       linenum++)
  {
    keyptr = desc2key(&keylen,gt_str_get(line),err);
    if (keyptr == NULL)
    {
      haserr = true;
      break;
    }
    if (keylen == 0)
    {
      gt_error_set(err,"key of length 0 in \"%s\" not expected",
                   gt_str_get(line));
      haserr = true;
      break;
    }
    if (firstdesc)
    {
      /* the first key fixes the length all other keys have to match */
      if (keylen > (unsigned long) CHAR_MAX)
      {
        gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                         "no key must be larger than %d",
                     (int) keylen,(int) keylen,keyptr,keylen,CHAR_MAX);
        haserr = true;
        break;
      }
      constantkeylen = keylen;
      previouskey = gt_malloc(sizeof (char) * (constantkeylen+1));
      firstdesc = false;
      if (!sortkeys)
      {
        gt_xfputc((char) constantkeylen,fpout);
      } else
      {
        GtEncseqLoader *el;

        if (constantkeylen > (unsigned long) MAXFIXEDKEYSIZE)
        {
          gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                           "no key must be larger than %d",
                       (int) keylen,(int) keylen,keyptr,keylen,
                       MAXFIXEDKEYSIZE);
          haserr = true;
          break;
        }
        el = gt_encseq_loader_new();
        gt_encseq_loader_set_logger(el, logger);
        encseq = gt_encseq_loader_load(el, indexname, err);
        gt_encseq_loader_delete(el);
        if (encseq == NULL)
        {
          haserr = true;
          break;
        }
        numofentries = gt_encseq_num_of_sequences(encseq);
        gt_assert(numofentries > 0);
        keytab = gt_malloc(sizeof (*keytab) * numofentries);
        keytabptr = keytab;
      }
    } else
    {
      if (constantkeylen != keylen)
      {
        gt_error_set(err,"key \"%*.*s\" of length %lu: all keys must be of "
                         "the same length which for all previously seen "
                         "headers is %lu",
                     (int) keylen,(int) keylen,keyptr,keylen,
                     constantkeylen);
        haserr = true;
        break;
      }
      gt_assert(previouskey != NULL);
      if (!sortkeys && strncmp(previouskey,keyptr,(size_t) constantkeylen) >= 0)
      {
        gt_error_set(err,"previous key \"%s\" is not lexicographically smaller "
                         "than current key \"%*.*s\"",
                     previouskey,(int) keylen,(int) keylen,keyptr);
        haserr = true;
        break;
      }
    }
    if (!sortkeys)
    {
      gt_xfwrite(keyptr,sizeof *keyptr,(size_t) keylen,fpout);
      gt_xfputc('\0',fpout);
    } else
    {
      gt_assert(keytabptr != NULL);
      /* keytab only has room for one key per sequence: refuse to write
         beyond it if the description file has more lines than that */
      if (linenum >= numofentries)
      {
        gt_error_set(err,"description file of index \"%s\" contains more "
                         "keys than the %lu sequences of the index",
                     indexname,numofentries);
        haserr = true;
        break;
      }
      strncpy(keytabptr->key,keyptr,(size_t) constantkeylen);
      keytabptr->key[constantkeylen] = '\0';
      keytabptr->seqnum = linenum;
      keytabptr++;
    }
    strncpy(previouskey,keyptr,(size_t) constantkeylen);
    previouskey[constantkeylen] = '\0';
    gt_str_reset(line);
  }
  if (!haserr)
  {
    gt_logger_log(logger,"number of keys of length %lu = %lu",
                  constantkeylen,linenum);
  }
  gt_str_delete(line);
  gt_fa_fclose(fpin);
  gt_fa_fclose(fpout);
  gt_free(previouskey);
  if (!haserr && sortkeys)
  {
    if (keytab == NULL)
    {
      /* no description line was read, so no key table was built */
      gt_error_set(err,"description file of index \"%s\" contains no keys",
                   indexname);
      haserr = true;
    } else if (linenum != numofentries)
    {
      /* fewer keys than sequences: the table is only partially filled */
      gt_error_set(err,"description file of index \"%s\" contains %lu keys "
                       "but the index has %lu sequences",
                   indexname,linenum,numofentries);
      haserr = true;
    }
  }
  if (!haserr && sortkeys)
  {
    gt_assert(keytabptr == keytab + numofentries);
    qsort(keytab,(size_t) numofentries,sizeof (*keytab),compareFixedkeys);
    for (keytabptr = keytab;
         !haserr && keytabptr < keytab + numofentries;
         keytabptr++)
    {
      if (giextract_encodedseq2fasta(stdout,
                                     encseq,
                                     keytabptr->seqnum,
                                     NULL,
                                     linewidth,
                                     err) != 0)
      {
        haserr = true;
        break;
      }
    }
  }
  if (encseq != NULL)
  {
    gt_encseq_delete(encseq);
    encseq = NULL;
  }
  gt_free(keytab);
  return haserr ? -1 : 0;
}
/* Runner of the kmer database tool.
   Loads the encoded sequence named by argv[parsed_args], walks over it in
   randomly sized intervals and either adds the kmers to a hash
   (arguments->use_hash) or to a GtKmerDatabase, which -- unless merge_only or
   bench is set -- is checked against a second database filled kmer by kmer.
   With arguments->bench the loading is timed and a fixed number of random
   accesses is performed afterwards.
   Returns 0 on success and a non-zero value on error (err is set). */
static int gt_kmer_database_runner(GT_UNUSED int argc, const char **argv,
                                   int parsed_args, void *tool_arguments,
                                   GtError *err)
{
  GtKmerDatabaseArguments *arguments = tool_arguments;
  int had_err = 0;
  GtEncseq *es = NULL;
  GtUword es_length = 0,
          nu_kmer_codes = 0;
  GtKmerDatabase *compare_db = NULL,
                 *db = NULL;
  GtLogger *logger;
  FILE *fp = NULL;
  GtHashmap *kmer_hash = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->use_hash)
    kmer_hash = gt_hashmap_new(GT_HASH_DIRECT, NULL,
                               (GtFree) gt_kmer_database_delete_hash_value);
  if (arguments->bench)
    timer = gt_timer_new_with_progress_description("loading encoded sequence");

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  if (arguments->verbose && gt_str_length(arguments->print_filename) > 0UL) {
    fp = gt_fa_fopen(gt_str_get(arguments->print_filename), "w", err);
    /* do not redirect the logger to a file that could not be opened */
    if (fp == NULL)
      had_err = -1;
    else
      gt_logger_set_target(logger, fp);
  }

  if (!had_err) {
    GtEncseqLoader *es_l;
    if (arguments->bench)
      gt_timer_start(timer);
    es_l = gt_encseq_loader_new();
    es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->bench)
      gt_timer_show_progress(timer, "saving kmers (+iterating over file)",
                             stdout);
    if (es == NULL) {
      had_err = -1;
    }
    gt_encseq_loader_delete(es_l);
  }
  if (!had_err) {
    es_length = gt_encseq_total_length(es);
    if (es_length < (GtUword) arguments->kmersize) {
      gt_error_set(err, "Input is too short for used kmersize. File length: "
                   GT_WU " kmersize: %u", es_length, arguments->kmersize);
      had_err = -1;
    }
  }
  if (!had_err) {
    GtAlphabet *alphabet;
    alphabet = gt_encseq_alphabet(es);
    if (arguments->bench)
      nu_kmer_codes = gt_power_for_small_exponents(
                                            gt_alphabet_num_of_chars(alphabet),
                                            arguments->kmersize);
    if (!arguments->merge_only && !arguments->use_hash && !arguments->bench) {
      compare_db = gt_kmer_database_new(gt_alphabet_num_of_chars(alphabet),
                                        arguments->kmersize,
                                        arguments->sb_size, es);
    }
    if (!arguments->use_hash) {
      db = gt_kmer_database_new(gt_alphabet_num_of_chars(alphabet),
                                arguments->kmersize,
                                arguments->sb_size, es);
      if (arguments->cutoff) {
        if (arguments->mean_cutoff)
          gt_kmer_database_use_mean_cutoff(db, (GtUword) 2,
                                           arguments->cutoff_value);
        else
          gt_kmer_database_set_cutoff(db, arguments->cutoff_value);
        /* NOTE(review): pruning is switched on when arguments->prune is
           NOT set; looks inverted -- confirm against the option's meaning */
        if (!arguments->prune)
          gt_kmer_database_set_prune(db);
      }
    }
  }

  if (!had_err) {
    GtUword startpos = 0,
            endpos;
    GtKmercodeiterator *iter;
    const GtKmercode *kmercode = NULL;
    iter = gt_kmercodeiterator_encseq_new(es, GT_READMODE_FORWARD,
                                          arguments->kmersize, 0);
    while (!had_err && startpos < es_length - (arguments->kmersize - 1)) {
      GtUword startpos_add_kmer = startpos;
      /* choose the end of the next interval at random */
      if (arguments->merge_only) {
        endpos = startpos + (arguments->kmersize - 1) +
                 (gt_rand_max((arguments->sb_size - 1) * 2));
        if (endpos > es_length)
          endpos = es_length;
      }
      else {
        endpos = startpos + (arguments->kmersize - 1) +
                 (gt_rand_max(arguments->sb_size - 1));
      }
      gt_kmercodeiterator_reset(iter, GT_READMODE_FORWARD, startpos);
      while ((kmercode = gt_kmercodeiterator_encseq_next(iter)) != NULL &&
             startpos_add_kmer <= endpos - (arguments->kmersize - 1)) {
        if (!arguments->merge_only && !arguments->use_hash &&
            !kmercode->definedspecialposition && !arguments->bench) {
          gt_kmer_database_add_kmer(compare_db, kmercode->code,
                                    startpos_add_kmer);
        }
        if (arguments->use_hash && !kmercode->definedspecialposition) {
          gt_kmer_database_add_to_hash(kmer_hash, kmercode->code,
                                       startpos_add_kmer);
        }
        startpos_add_kmer++;
      }
      if (!arguments->use_hash) {
        gt_kmer_database_add_interval(db, startpos, endpos);
        gt_kmer_database_print_buffer(db, logger);
        if (!arguments->bench)
          had_err = gt_kmer_database_check_consistency(db, err);
      }
      startpos = endpos + 1;
    }
    if (!arguments->use_hash) {
      gt_kmer_database_flush(db);
      gt_kmer_database_print_buffer(db, logger);
      if (!had_err && !arguments->bench)
        had_err = gt_kmer_database_check_consistency(db, err);
      if (!arguments->merge_only && !had_err && !arguments->bench)
        had_err = gt_kmer_database_check_consistency(compare_db, err);
      if (!arguments->merge_only && !arguments->bench)
        gt_kmer_database_print(compare_db, logger, true);
      if (!arguments->merge_only && !had_err && !arguments->bench)
        had_err = gt_kmer_database_compare(compare_db, db, err);
      gt_kmer_database_print(db, logger, true);
    }
    gt_kmercodeiterator_delete(iter);
  }

  if (arguments->bench) {
    /* the random accesses need a completely built database/hash, so they
       are skipped after an error; the timer is released in any case */
    if (!had_err) {
      GtKmerStartpos pos;
      GtArrayGtUword *pos_hash;
      GtUword rand_access = (GtUword) 50000000,
              rand_code,
              i,
              sum = 0;
      gt_timer_show_progress(timer, "random access", stdout);
      for (i = 0; i < rand_access; i++) {
        rand_code = gt_rand_max(nu_kmer_codes - 1);
        if (arguments->use_hash) {
          pos_hash = gt_hashmap_get(kmer_hash, (const void *) rand_code);
          if (pos_hash != NULL)
            sum += pos_hash->spaceGtUword[pos_hash->nextfreeGtUword - 1];
        }
        else {
          pos = gt_kmer_database_get_startpos(db, rand_code);
          if (pos.no_positions > 0)
            sum += pos.startpos[pos.no_positions - 1];
        }
      }
      printf("sum: " GT_WU "\n", sum);

      gt_timer_show_progress(timer, "", stdout);
      gt_timer_stop(timer);
    }
    gt_timer_delete(timer);
  }
  if (arguments->use_hash)
    gt_hashmap_delete(kmer_hash);
  gt_encseq_delete(es);
  if (!arguments->use_hash)
    gt_kmer_database_delete(db);
  if (!arguments->merge_only && !arguments->bench)
    gt_kmer_database_delete(compare_db);
  gt_logger_delete(logger);
  gt_fa_fclose(fp);

  return had_err;
}
/* Runner of the ltrdigest tool (region mapping based variant).
   Determines the sequence source (seqid2file options or, as legacy syntax,
   an encoded sequence given as last argument), then builds a node stream
   chain GFF3 input -> [protein domain visitor] -> [PBS visitor] ->
   PPT visitor -> strand assignment -> [tabular output] -> GFF3 output and
   pulls all nodes through it.
   Returns 0 on success and a non-zero value on error (err is set). */
static int gt_ltrdigest_runner(GT_UNUSED int argc, const char **argv,
                               int parsed_args, void *tool_arguments,
                               GtError *err)
{
  GtLTRdigestOptions *arguments = tool_arguments;
  GtNodeStream *gff3_in_stream  = NULL,
               *gff3_out_stream = NULL,
               *pdom_stream     = NULL,
               *ppt_stream      = NULL,
               *pbs_stream      = NULL,
               *tab_out_stream  = NULL,
               *sa_stream       = NULL,
               *last_stream     = NULL;
  int had_err      = 0,
      tests_to_run = 0,
      arg = parsed_args;
  GtRegionMapping *rmap = NULL;
  GtPdomModelSet *ms = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  /* determine and open sequence source */
  if (gt_seqid2file_option_used(arguments->s2fi)) {
    /* create region mapping */
    rmap = gt_seqid2file_region_mapping_new(arguments->s2fi, err);
    if (!rmap)
      had_err = -1;
  } else {
    GtEncseqLoader *el;
    GtEncseq *encseq;
    /* no new-style sequence source option given, fall back to legacy syntax */
    if (argc < 3) {
      gt_error_set(err, "missing mandatory argument(s)");
      had_err = -1;
    }
    if (!had_err) {
      el = gt_encseq_loader_new();
      gt_encseq_loader_disable_autosupport(el);
      gt_encseq_loader_require_md5_support(el);
      gt_encseq_loader_require_description_support(el);
      encseq = gt_encseq_loader_load(el, argv[argc-1], err);
      /* XXX: clip off terminal argument */
      gt_free((char*) argv[argc-1]);
      argv[argc-1] = NULL;
      argc--;
      gt_encseq_loader_delete(el);
      if (!encseq)
        had_err = -1;
      else {
        rmap = gt_region_mapping_new_encseq_seqno(encseq);
        gt_encseq_delete(encseq);
      }
    }
  }
  gt_assert(had_err || rmap);

  /* Always search for PPT. */
  tests_to_run |= GT_LTRDIGEST_RUN_PPT;

  /* Open tRNA library if given. */
  if (!had_err && arguments->trna_lib
        && gt_str_length(arguments->trna_lib) > 0)
  {
    tests_to_run |= GT_LTRDIGEST_RUN_PBS;
    arguments->trna_lib_bs = gt_bioseq_new(gt_str_get(arguments->trna_lib),
                                           err);
    if (gt_error_is_set(err))
      had_err = -1;
  }

  /* Set HMMER cutoffs. */
  if (!had_err && gt_str_array_size(arguments->hmm_files) > 0)
  {
    tests_to_run |= GT_LTRDIGEST_RUN_PDOM;
    if (!strcmp(gt_str_get(arguments->cutoffs), "GA"))
    {
      arguments->cutoff = GT_PHMM_CUTOFF_GA;
    } else if (!strcmp(gt_str_get(arguments->cutoffs), "TC"))
    {
      arguments->cutoff = GT_PHMM_CUTOFF_TC;
    } else if (!strcmp(gt_str_get(arguments->cutoffs), "NONE"))
    {
      arguments->cutoff = GT_PHMM_CUTOFF_NONE;
    } else {
      gt_error_set(err, "invalid cutoff setting!");
      had_err = -1;
    }
  }

  if (!had_err) {
    last_stream = gff3_in_stream = gt_gff3_in_stream_new_sorted(argv[arg]);
  }

  if (!had_err && gt_str_array_size(arguments->hmm_files) > 0) {
    GtNodeVisitor *pdom_v;
    ms = gt_pdom_model_set_new(arguments->hmm_files, err);
    if (ms != NULL) {
      pdom_v = gt_ltrdigest_pdom_visitor_new(ms, arguments->evalue_cutoff,
                                             arguments->chain_max_gap_length,
                                             arguments->cutoff, rmap, err);
      if (pdom_v == NULL)
        had_err = -1;
      if (!had_err) {
        gt_ltrdigest_pdom_visitor_set_source_tag(
                                               (GtLTRdigestPdomVisitor*) pdom_v,
                                               GT_LTRDIGEST_TAG);
        if (arguments->output_all_chains)
          gt_ltrdigest_pdom_visitor_output_all_chains(
                                              (GtLTRdigestPdomVisitor*) pdom_v);
        last_stream = pdom_stream = gt_visitor_stream_new(last_stream, pdom_v);
      }
    } else had_err = -1;
  }

  if (!had_err && arguments->trna_lib_bs) {
    GtNodeVisitor *pbs_v;
    pbs_v = gt_ltrdigest_pbs_visitor_new(rmap, arguments->pbs_radius,
                                         arguments->max_edist,
                                         arguments->alilen,
                                         arguments->offsetlen,
                                         arguments->trnaoffsetlen,
                                         arguments->ali_score_match,
                                         arguments->ali_score_mismatch,
                                         arguments->ali_score_insertion,
                                         arguments->ali_score_deletion,
                                         arguments->trna_lib_bs, err);
    if (pbs_v != NULL)
      last_stream = pbs_stream = gt_visitor_stream_new(last_stream, pbs_v);
    else
      had_err = -1;
  }

  if (!had_err) {
    GtNodeVisitor *ppt_v;
    ppt_v = gt_ltrdigest_ppt_visitor_new(rmap, arguments->ppt_len,
                                         arguments->ubox_len,
                                         arguments->ppt_pyrimidine_prob,
                                         arguments->ppt_purine_prob,
                                         arguments->bkg_a_prob,
                                         arguments->bkg_g_prob,
                                         arguments->bkg_t_prob,
                                         arguments->bkg_c_prob,
                                         arguments->ubox_u_prob,
                                         arguments->ppt_radius,
                                         arguments->max_ubox_dist, err);
    if (ppt_v != NULL)
      last_stream = ppt_stream = gt_visitor_stream_new(last_stream, ppt_v);
    else
      had_err = -1;
  }

  if (!had_err) {
    GtNodeVisitor *sa_v;
    sa_v = gt_ltrdigest_strand_assign_visitor_new();
    gt_assert(sa_v);
    last_stream = sa_stream = gt_visitor_stream_new(last_stream, sa_v);
  }

  /* attach tabular output stream, if requested */
  if (!had_err && gt_str_length(arguments->prefix) > 0) {
    tab_out_stream = gt_ltrdigest_file_out_stream_new(
                                                  last_stream,
                                                  tests_to_run,
                                                  rmap,
                                                  gt_str_get(arguments->prefix),
                                                  arguments->seqnamelen,
                                                  err);
    /* only chain the stream in if it was actually created, so that
       last_stream never becomes NULL */
    if (!tab_out_stream)
      had_err = -1;
    else
      last_stream = tab_out_stream;
    if (!had_err && arguments->print_metadata) {
      had_err = gt_ltrdigest_file_out_stream_write_metadata(
                                     (GtLTRdigestFileOutStream*) tab_out_stream,
                                     tests_to_run,
                                     gt_str_get(arguments->trna_lib),
                                     argv[arg],
                                     arguments->ppt_len,
                                     arguments->ubox_len,
                                     arguments->ppt_radius,
                                     arguments->alilen,
                                     arguments->max_edist,
                                     arguments->offsetlen,
                                     arguments->trnaoffsetlen,
                                     arguments->pbs_radius,
                                     arguments->hmm_files,
                                     arguments->chain_max_gap_length,
                                     arguments->evalue_cutoff,
                                     err);
    }
    if (!had_err) {
      if (arguments->write_alignments)
        gt_ltrdigest_file_out_stream_enable_pdom_alignment_output(
                                                                tab_out_stream);
      if (arguments->write_aaseqs)
        gt_ltrdigest_file_out_stream_enable_aa_sequence_output(
                                                                tab_out_stream);
    }
  }

  /* An error while setting up the tabular output must neither be
     overwritten by the result of the pull nor lead to a GFF3 output stream
     on top of a missing input stream. */
  if (!had_err) {
    last_stream = gff3_out_stream = gt_gff3_out_stream_new(last_stream,
                                                           arguments->outfp);

    /* pull the features through the stream and free them afterwards */
    had_err = gt_node_stream_pull(last_stream, err);
  }

  gt_pdom_model_set_delete(ms);
  gt_node_stream_delete(gff3_out_stream);
  gt_node_stream_delete(ppt_stream);
  gt_node_stream_delete(pbs_stream);
  gt_node_stream_delete(sa_stream);
  gt_node_stream_delete(pdom_stream);
  gt_node_stream_delete(tab_out_stream);
  gt_node_stream_delete(gff3_in_stream);
  gt_bioseq_delete(arguments->trna_lib_bs);
  gt_region_mapping_delete(rmap);

  return had_err;
}
/* Runner of the ltrdigest tool (encoded sequence based variant).
   argv[parsed_args] names the GFF3 input and argv[parsed_args+1] the index
   of the encoded sequence. Builds the stream chain GFF3 input -> ltrdigest
   stream -> [tabular output] -> GFF3 output and pulls all nodes through it.
   Returns 0 on success and a non-zero value on error (err is set). */
static int gt_ltrdigest_runner(GT_UNUSED int argc, const char **argv,
                               int parsed_args, void *tool_arguments,
                               GtError *err)
{
  GtLTRdigestOptions *arguments = tool_arguments;
  GtNodeStream *gff3_in_stream   = NULL,
               *gff3_out_stream  = NULL,
               *ltrdigest_stream = NULL,
               *tab_out_stream   = NULL,
               *last_stream      = NULL;
  int had_err      = 0,
      tests_to_run = 0,
      arg = parsed_args;
  const char *indexname = argv[arg+1];
  GtLogger *logger;
  GtEncseqLoader *el;
  GtEncseq *encseq;

  gt_error_check(err);
  gt_assert(arguments);

  /* created only after arguments has been checked */
  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stdout);

  /* Set sequence encoder options. Defaults are ok. */
  el = gt_encseq_loader_new();
  gt_encseq_loader_set_logger(el, logger);

  /* Open sequence file */
  encseq = gt_encseq_loader_load(el, indexname, err);
  if (!encseq)
    had_err = -1;

  /* Always search for PPT. */
  tests_to_run |= GT_LTRDIGEST_RUN_PPT;

  /* Open tRNA library if given. */
  if (!had_err && arguments->trna_lib
        && gt_str_length(arguments->trna_lib) > 0)
  {
    tests_to_run |= GT_LTRDIGEST_RUN_PBS;
    arguments->pbs_opts.trna_lib = gt_bioseq_new(gt_str_get(arguments->trna_lib),
                                                 err);
    if (gt_error_is_set(err))
      had_err = -1;
  }

#ifdef HAVE_HMMER
  /* Open HMMER files if given. */
  if (!had_err && gt_str_array_size(arguments->pdom_opts.hmm_files) > 0)
  {
    tests_to_run |= GT_LTRDIGEST_RUN_PDOM;
    if (!strcmp(gt_str_get(arguments->cutoffs), "GA"))
    {
      arguments->pdom_opts.cutoff = GT_PHMM_CUTOFF_GA;
    } else if (!strcmp(gt_str_get(arguments->cutoffs), "TC"))
    {
      arguments->pdom_opts.cutoff = GT_PHMM_CUTOFF_TC;
    } else if (!strcmp(gt_str_get(arguments->cutoffs), "NONE"))
    {
      arguments->pdom_opts.cutoff = GT_PHMM_CUTOFF_NONE;
    } else {
      gt_error_set(err, "invalid cutoff setting!");
      had_err = -1;
    }
  }
#endif

  if (!had_err)
  {
    /* set up stream flow
     * ------------------*/
    last_stream = gff3_in_stream  = gt_gff3_in_stream_new_sorted(argv[arg]);

    last_stream = ltrdigest_stream = gt_ltrdigest_stream_new(last_stream,
                                                  tests_to_run,
                                                  encseq,
                                                  &arguments->pbs_opts,
                                                  &arguments->ppt_opts,
#ifdef HAVE_HMMER
                                                  &arguments->pdom_opts,
#endif
                                                  err);
    if (!ltrdigest_stream)
      had_err = -1;
  }

  /* attach tabular output stream, if requested */
  if (!had_err && gt_str_length(arguments->prefix) > 0)
  {
    tab_out_stream = gt_ltr_fileout_stream_new(last_stream,
                                              tests_to_run,
                                              encseq,
                                              gt_str_get(arguments->prefix),
                                              &arguments->ppt_opts,
                                              &arguments->pbs_opts,
#ifdef HAVE_HMMER
                                              &arguments->pdom_opts,
#endif
                                              gt_str_get(arguments->trna_lib),
                                              argv[arg+1],
                                              argv[arg],
                                              arguments->seqnamelen,
                                              err);
    if (tab_out_stream == NULL)
    {
      /* never continue with a NULL stream in the chain */
      had_err = -1;
    } else
    {
      last_stream = tab_out_stream;
#ifdef HAVE_HMMER
      /* test the option values themselves; taking their address, as was
         done before, is always true */
      if (arguments->pdom_opts.write_alignments)
        gt_ltr_fileout_stream_enable_pdom_alignment_output(tab_out_stream);
      if (arguments->pdom_opts.write_aaseqs)
        gt_ltr_fileout_stream_enable_aa_sequence_output(tab_out_stream);
#endif
    }
  }

  if (!had_err)
  {
    last_stream = gff3_out_stream = gt_gff3_out_stream_new(last_stream,
                                                           arguments->outfp);

    /* pull the features through the stream and free them afterwards */
    had_err = gt_node_stream_pull(last_stream, err);
  }

  gt_node_stream_delete(gff3_out_stream);
  gt_node_stream_delete(ltrdigest_stream);
  if (tab_out_stream != NULL)
    gt_node_stream_delete(tab_out_stream);
  gt_node_stream_delete(gff3_in_stream);

  gt_encseq_loader_delete(el);
  gt_encseq_delete(encseq);
  encseq = NULL;
  gt_bioseq_delete(arguments->pbs_opts.trna_lib);
  gt_logger_delete(logger);

  return had_err;
}
/* Runner of the encseq check tool. Loads the encoded sequence named by
   argv[parsed_args] and runs the gt_encseq_check_*() functions on it; the
   consistency check is done for the forward and reverse readmode and, if
   the alphabet is a DNA alphabet, for the other two readmodes as well.
   Returns 0 if all checks pass and a non-zero value otherwise. */
static int gt_encseq_check_runner(GT_UNUSED int argc, const char **argv,
                                  int parsed_args, void *tool_arguments,
                                  GtError *err)
{
  GtEncseqCheckArguments *arguments = tool_arguments;
  GtEncseqLoader *loader;
  GtEncseq *encseq;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  loader = gt_encseq_loader_new();
  encseq = gt_encseq_loader_load(loader, argv[parsed_args], err);
  if (encseq == NULL)
    had_err = -1;

  if (!had_err)
  {
    int mode = 0;

    gt_encseq_check_startpositions(encseq);

    /* stop at the first readmode failing the consistency check */
    while (!had_err && mode < 4)
    {
      const GtReadmode readmode = (GtReadmode) mode;

      if (gt_alphabet_is_dna(gt_encseq_alphabet(encseq)) ||
          readmode == GT_READMODE_FORWARD ||
          readmode == GT_READMODE_REVERSE)
      {
        if (gt_encseq_check_consistency(encseq,
                                        gt_encseq_filenames(encseq),
                                        readmode,
                                        arguments->scantrials,
                                        arguments->multicharcmptrials,
                                        gt_encseq_has_multiseq_support(encseq),
                                        err) != 0)
        {
          had_err = -1;
        }
      }
      mode++;
    }

    if (!had_err)
    {
      gt_encseq_check_specialranges(encseq);
      gt_encseq_check_markpos(encseq);
      had_err = gt_encseq_check_minmax(encseq, err);
    }
    if (!had_err && arguments->prefixlength > 0 &&
        gt_verifymappedstr(encseq, arguments->prefixlength, err) != 0)
    {
      had_err = -1;
    }
  }

  gt_encseq_delete(encseq);
  gt_encseq_loader_delete(loader);
  return had_err;
}
/* Self test for maximal pair computation on the index <indexname>.
   <samples> times two substrings of length at most <substringlength> (capped
   at half the total length) are sampled from the encoded sequence; the
   matches of minimum length <minlength> delivered by
   gt_sarrquerysubstringmatch() and sarrselfsubstringmatch() are sorted and
   compared. A difference is reported on stdout and terminates the program
   with GT_EXIT_PROGRAMMING_ERROR.
   Returns 0 on success and -1 on error (err is set). */
int gt_testmaxpairs(const char *indexname,
                    GtUword samples,
                    unsigned int minlength,
                    GtUword substringlength,
                    GtLogger *logger,
                    GtError *err)
{
  GtEncseq *encseq;
  GtUword totallength = 0, dblen, querylen;
  GtUchar *dbseq = NULL, *query = NULL;
  bool haserr = false;
  GtUword s;
  GtArray *tabmaxquerymatches;
  Maxmatchselfinfo maxmatchselfinfo;
  GtEncseqLoader *el;

  gt_logger_log(logger,"draw "GT_WU" samples",samples);

  el = gt_encseq_loader_new();
  gt_encseq_loader_do_not_require_des_tab(el);
  gt_encseq_loader_do_not_require_ssp_tab(el);
  gt_encseq_loader_do_not_require_sds_tab(el);
  gt_encseq_loader_set_logger(el, logger);
  encseq = gt_encseq_loader_load(el, indexname, err);
  gt_encseq_loader_delete(el);
  if (encseq == NULL)
  {
    haserr = true;
  } else
  {
    totallength = gt_encseq_total_length(encseq);
  }
  if (!haserr)
  {
    if (substringlength > totallength/2)
    {
      substringlength = totallength/2;
    }
    dbseq = gt_malloc(sizeof *dbseq * substringlength);
    query = gt_malloc(sizeof *query * substringlength);
  }
  for (s=0; s<samples && !haserr; s++)
  {
    /* per-sample resources; released at the end of every iteration, also
       if one of the match computations fails */
    maxmatchselfinfo.results = NULL;
    maxmatchselfinfo.querymarkpos = NULL;

    dblen = samplesubstring(dbseq,encseq,substringlength);
    querylen = samplesubstring(query,encseq,substringlength);
    gt_logger_log(logger,"run query match for dblen="GT_WU""
                         ",querylen= "GT_WU", minlength=%u",
                         dblen,
                         querylen,
                         minlength);
    tabmaxquerymatches = gt_array_new(sizeof (Substringmatch));
    if (gt_sarrquerysubstringmatch(dbseq,
                                   dblen,
                                   query,
                                   (GtUword) querylen,
                                   minlength,
                                   gt_encseq_alphabet(encseq),
                                   storemaxmatchquery,
                                   tabmaxquerymatches,
                                   logger,
                                   err) != 0)
    {
      haserr = true;
    }
    if (!haserr)
    {
      gt_logger_log(logger,"run self match for dblen="GT_WU""
                           ",querylen= "GT_WU", minlength=%u",
                           dblen,
                           querylen,
                           minlength);
      maxmatchselfinfo.results = gt_array_new(sizeof (Substringmatch));
      maxmatchselfinfo.dblen = dblen;
      maxmatchselfinfo.querylen = querylen;
      maxmatchselfinfo.querymarkpos
        = sequence2markpositions(&maxmatchselfinfo.numofquerysequences,
                                 query,querylen);
      if (sarrselfsubstringmatch(dbseq,
                                 dblen,
                                 query,
                                 (GtUword) querylen,
                                 minlength,
                                 gt_encseq_alphabet(encseq),
                                 storemaxmatchself,
                                 &maxmatchselfinfo,
                                 logger,
                                 err) != 0)
      {
        haserr = true;
      }
    }
    if (!haserr)
    {
      gt_array_sort(tabmaxquerymatches,orderSubstringmatch);
      gt_array_sort(maxmatchselfinfo.results,orderSubstringmatch);
      if (!gt_array_equal(tabmaxquerymatches,maxmatchselfinfo.results,
                          orderSubstringmatch))
      {
        const GtUword width = 60UL;
        printf("failure for query of length "GT_WU"\n",(GtUword) querylen);
        printf("querymatches\n");
        (void) gt_array_iterate(tabmaxquerymatches,showSubstringmatch,NULL,
                                err);
        printf("dbmatches\n");
        (void) gt_array_iterate(maxmatchselfinfo.results,showSubstringmatch,
                                NULL,err);
        gt_symbolstring2fasta(stdout,"dbseq",
                              gt_encseq_alphabet(encseq),
                              dbseq,
                              (GtUword) dblen,
                              width);
        gt_symbolstring2fasta(stdout,"queryseq",
                              gt_encseq_alphabet(encseq),
                              query,
                              (GtUword) querylen,
                              width);
        exit(GT_EXIT_PROGRAMMING_ERROR);
      }
      printf("# numberofmatches="GT_WU"\n",gt_array_size(tabmaxquerymatches));
    }
    gt_free(maxmatchselfinfo.querymarkpos);
    gt_array_delete(tabmaxquerymatches);
    if (maxmatchselfinfo.results != NULL)
    {
      gt_array_delete(maxmatchselfinfo.results);
    }
  }
  gt_free(dbseq);
  gt_free(query);
  gt_encseq_delete(encseq);
  encseq = NULL;
  return haserr ? -1 : 0;
}
/* Runner of the encseq bitextract tool. Loads the encoded sequence named by
   argv[parsed_args] (optionally mirrored) and, depending on the arguments,
   prints to stdout
   - the two bit encoding extracted at arguments->bitpos,
   - the value of gt_getnexttwobitencodingstoppos() for arguments->stoppos,
   - all ranges delivered by a special range iterator.
   Returns 0 on success and a non-zero value on error. */
static int gt_encseq_bitextract_runner(GT_UNUSED int argc, const char **argv,
                                       GT_UNUSED int parsed_args,
                                       void *tool_arguments,
                                       GT_UNUSED GtError *err)
{
  GtEncseqBitextractArguments *arguments = tool_arguments;
  GtEncseqLoader *loader;
  GtEncseq *encseq;
  GtEncseqReader *reader;
  GtEndofTwobitencoding etbe;
  GtReadmode rm;
  bool fwd;
  char buffer[BUFSIZ];
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  loader = gt_encseq_loader_new();
  encseq = gt_encseq_loader_load(loader, argv[parsed_args], err);
  if (!encseq)
    had_err = -1;
  else if (arguments->mirror)
    had_err = gt_encseq_mirror(encseq, err);

  if (!had_err)
  {
    rm = gt_readmode_parse(gt_str_get(arguments->readmode), NULL);
    if (GT_ISDIRREVERSE(rm))
      fwd = false;
    else
      fwd = true;
  }

  /* extraction of the two bit encoding at a given position */
  if (!had_err && arguments->bitpos != GT_UNDEF_ULONG)
  {
    if (arguments->bitpos >= gt_encseq_total_length(encseq))
    {
      gt_error_set(err, "position %lu exceeds encoded sequence length of %lu",
                   arguments->bitpos, gt_encseq_total_length(encseq));
      had_err = -1;
    }
    else
    {
      unsigned long ret;

      reader = gt_encseq_create_reader_with_readmode(encseq, rm,
                                                     arguments->bitpos);
      ret = gt_encseq_extract2bitencwithtwobitencodingstoppos(
                                                            &etbe, reader,
                                                            encseq, rm,
                                                            arguments->bitpos);
      gt_bitsequence_tostring(buffer, etbe.tbe);
      printf("Twobitencoding %s\n"
             "unitsnotspecial %u\n"
             "position %lu\n"
             "returnvalue %lu\n",
             buffer,
             etbe.unitsnotspecial,
             arguments->bitpos,
             ret);
      gt_encseq_reader_delete(reader);
    }
  }

  /* check stoppos stuff */
  if (!had_err && arguments->stoppos != GT_UNDEF_ULONG)
  {
    if (arguments->stoppos >= gt_encseq_total_length(encseq))
    {
      gt_error_set(err, "position %lu exceeds encoded sequence length of %lu",
                   arguments->stoppos, gt_encseq_total_length(encseq));
      had_err = -1;
    }
    else
    {
      reader = gt_encseq_create_reader_with_readmode(encseq, rm, 0);
      gt_encseq_reader_reinit_with_readmode(reader, encseq, rm,
                                            arguments->stoppos);
      printf("%lu: %lu\n", arguments->stoppos,
             gt_getnexttwobitencodingstoppos(fwd, reader));
      gt_encseq_reader_delete(reader);
    }
  }

  /* check specialrangeiterator stuff */
  if (!had_err && arguments->specialranges
        && gt_encseq_has_specialranges(encseq))
  {
    GtRange srng;
    GtSpecialrangeiterator *sri = gt_specialrangeiterator_new(encseq, fwd);

    while (gt_specialrangeiterator_next(sri, &srng))
      printf("%lu:%lu\n", srng.start, srng.end);
    gt_specialrangeiterator_delete(sri);
  }

  gt_encseq_delete(encseq);
  gt_encseq_loader_delete(loader);
  return had_err;
}
/* Runner of the condenser extract tool. Loads the original encoded sequence
   named by arguments->original, opens the GtNREncseq stored in
   argv[parsed_args] on top of it and writes either the complete original or
   the range arguments->range to stdout, followed by a newline.
   Returns 0 on success and a non-zero value on error. */
static int gt_condenser_extract_runner(GT_UNUSED int argc,
                                       const char **argv,
                                       int parsed_args,
                                       void *tool_arguments,
                                       GtError *err)
{
  GtCondenserExtractArguments *arguments = tool_arguments;
  GtEncseqLoader *loader;
  GtEncseq *orig_encseq = NULL;
  GtNREncseq *nre = NULL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  /* load original encseq */
  loader = gt_encseq_loader_new();
  orig_encseq = gt_encseq_loader_load(loader,
                                      gt_str_get(arguments->original),
                                      err);
  gt_encseq_loader_delete(loader);

  if (orig_encseq == NULL)
  {
    had_err = -1;
  }
  else
  {
    nre = gt_n_r_encseq_new_from_file(argv[parsed_args], orig_encseq, err);
    if (nre == NULL)
      had_err = -1;
  }

  /* TODO get sequences by sequence ids (arguments->seqrange): not yet
     implemented in n_r_encseq */
  if (!had_err)
  {
    GtNREncseqDecompressor *nred = gt_n_r_encseq_decompressor_new(nre);
    const bool no_range_given = arguments->range.start == GT_UNDEF_ULONG &&
                                arguments->range.end == GT_UNDEF_ULONG;

    if (no_range_given)
    {
      had_err = gt_n_r_encseq_decompressor_extract_origin_complete(stdout,
                                                                   nred,
                                                                   true,
                                                                   err);
    }
    else
    {
      had_err = gt_n_r_encseq_decompressor_extract_originrange(
                                                            stdout,
                                                            nred,
                                                            &arguments->range,
                                                            false,
                                                            err);
    }
    gt_xfwrite_one("\n",stdout); /*TODO should better be in n_r_encseq.c?*/
    gt_n_r_encseq_decompressor_delete(nred);
  }

  gt_n_r_encseq_delete(nre);
  gt_encseq_delete(orig_encseq);
  return had_err;
}
/* Makes sure the encoded sequence of <bs> is available and loads it into
   bs->encseq (which must not be set yet). The files belonging to the bioseq
   are (re)constructed via construct_bioseq_files() if <recreate> is set, the
   input comes from stdin, one of the files does not exist, or
   gt_file_is_newer() reports the sequence file as newer than the index file.
   Returns 0 on success and a non-zero value on error (err is set). */
static int bioseq_fill(GtBioseq *bs, bool recreate, GtError *err)
{
  /* suffixes of all files belonging to a bioseq; the index file is first */
  const char *suffixes[] = { GT_ENCSEQFILESUFFIX,
                             GT_OISTABFILESUFFIX,
                             GT_SDSTABFILESUFFIX,
                             GT_MD5TABFILESUFFIX,
                             GT_DESTABFILESUFFIX };
  const size_t numfiles = sizeof suffixes / sizeof suffixes[0];
  GtStr *paths[sizeof suffixes / sizeof suffixes[0]];
  GtStr *bioseq_basename;
  bool rebuild;
  size_t idx;
  int had_err = 0;

  gt_assert(!bs->encseq);

  bioseq_basename = bs->use_stdin ? gt_str_new_cstr("stdin")
                                  : bs->sequence_file;

  /* construct file names */
  for (idx = 0; idx < numfiles; idx++)
  {
    paths[idx] = gt_str_clone(bioseq_basename);
    gt_str_append_cstr(paths[idx], suffixes[idx]);
  }

  /* construct the bioseq files if necessary */
  rebuild = recreate || bs->use_stdin;
  for (idx = 0; !rebuild && idx < numfiles; idx++)
    rebuild = !gt_file_exists(gt_str_get(paths[idx]));
  if (!rebuild)
    rebuild = gt_file_is_newer(gt_str_get(bs->sequence_file),
                               gt_str_get(paths[0]));
  if (rebuild)
    had_err = construct_bioseq_files(bs, bioseq_basename, err);

  if (!had_err)
  {
    GtEncseqLoader *loader = gt_encseq_loader_new();

    gt_encseq_loader_disable_autosupport(loader);
    gt_encseq_loader_require_lossless_support(loader);
    gt_encseq_loader_require_description_support(loader);
    gt_encseq_loader_require_md5_support(loader);
    gt_encseq_loader_require_multiseq_support(loader);
    bs->encseq = gt_encseq_loader_load(loader, gt_str_get(bioseq_basename),
                                       err);
    if (bs->encseq == NULL)
    {
      had_err = -1;
      gt_assert(gt_error_is_set(err));
    }
    gt_encseq_loader_delete(loader);
  }
  if (!had_err)
  {
    gt_assert(bs->encseq);
  }

  /* free; the basename is only owned here if it was created for stdin */
  if (bs->use_stdin)
    gt_str_delete(bioseq_basename);
  for (idx = 0; idx < numfiles; idx++)
    gt_str_delete(paths[idx]);

  return had_err;
}
/*
  Process every query sequence from <idxlocalioptions->queryfiles> against the
  index <idxlocalioptions->indexname>.

  - With <doonline> set, only the encoded sequence is loaded and each query is
    handed to gt_multiapplysmithwaterman().
  - Otherwise a Genericindex is created and each query is handed to
    gt_indexbasedlocali().
  - With <docompare> set, both functions are run, their matches are stored
    and checked against each other by gt_checkandresetstorematch(); without
    it, matches are reported through showmatch.

  Returns 0 on success and -1 if loading the index, opening the query files
  or reading a query sequence failed (details are reported via <err>).
*/
int gt_runidxlocali(const IdxlocaliOptions *idxlocalioptions,GtError *err)
{
  Genericindex *genericindex = NULL;
  bool haserr = false;
  GtLogger *logger;
  const GtEncseq *encseq = NULL;

  logger = gt_logger_new(idxlocalioptions->verbose, GT_LOGGER_DEFLT_PREFIX,
                         stdout);
  if (idxlocalioptions->doonline)
  {
    GtEncseqLoader *el;

    el = gt_encseq_loader_new();
    gt_encseq_loader_require_multiseq_support(el);
    gt_encseq_loader_drop_description_support(el);
    gt_encseq_loader_set_logger(el, logger);
    encseq = gt_encseq_loader_load(el,
                                   gt_str_get(idxlocalioptions->indexname),
                                   err);
    gt_encseq_loader_delete(el);
    if (encseq == NULL)
    {
      haserr = true;
    }
  } else
  {
    genericindex = genericindex_new(gt_str_get(idxlocalioptions->indexname),
                                    idxlocalioptions->withesa,
                                    idxlocalioptions->withesa ||
                                    idxlocalioptions->docompare,
                                    false,
                                    true,
                                    0,
                                    logger,
                                    err);
    if (genericindex == NULL)
    {
      haserr = true;
    } else
    {
      /* the encoded sequence is obtained from the index in this mode */
      encseq = genericindex_getencseq(genericindex);
    }
  }
  if (!haserr)
  {
    GtSeqIterator *seqit;
    const GtUchar *query;
    unsigned long querylen;
    char *desc = NULL;
    int retval;
    Limdfsresources *limdfsresources = NULL;
    const AbstractDfstransformer *dfst;
    SWdpresource *swdpresource = NULL;
    Showmatchinfo showmatchinfo;
    ProcessIdxMatch processmatch;
    GtAlphabet *a;
    void *processmatchinfoonline, *processmatchinfooffline;
    Storematchinfo storeonline, storeoffline;

    a = gt_encseq_alphabet(encseq);
    if (idxlocalioptions->docompare)
    {
      /* collect the matches of both methods separately for comparison */
      processmatch = storematch;
      gt_initstorematch(&storeonline,encseq);
      gt_initstorematch(&storeoffline,encseq);
      processmatchinfoonline = &storeonline;
      processmatchinfooffline = &storeoffline;
    } else
    {
      /* both methods report their matches through the same show-info */
      processmatch = showmatch;
      showmatchinfo.encseq = encseq;
      showmatchinfo.characters = gt_alphabet_characters(a);
      showmatchinfo.wildcardshow = gt_alphabet_wildcard_show(a);
      showmatchinfo.showalignment = idxlocalioptions->showalignment;
      processmatchinfoonline = processmatchinfooffline = &showmatchinfo;
    }
    if (idxlocalioptions->doonline || idxlocalioptions->docompare)
    {
      swdpresource = gt_newSWdpresource(idxlocalioptions->matchscore,
                                        idxlocalioptions->mismatchscore,
                                        idxlocalioptions->gapextend,
                                        idxlocalioptions->threshold,
                                        idxlocalioptions->showalignment,
                                        processmatch,
                                        processmatchinfoonline);
    }
    dfst = gt_locali_AbstractDfstransformer();
    if (!idxlocalioptions->doonline || idxlocalioptions->docompare)
    {
      gt_assert(genericindex != NULL);
      limdfsresources = gt_newLimdfsresources(genericindex,
                                              true,
                                              0,
                                              0,    /* maxpathlength */
                                              true, /* keepexpandedonstack */
                                              processmatch,
                                              processmatchinfooffline,
                                              NULL, /* processresult */
                                              NULL, /* processresult info */
                                              dfst);
    }
    seqit = gt_seq_iterator_sequence_buffer_new(idxlocalioptions->queryfiles,
                                                err);
    if (seqit == NULL)
    {
      haserr = true;
    } else
    {
      gt_seq_iterator_set_symbolmap(seqit, gt_alphabet_symbolmap(a));
      /* showmatchinfo.queryunit doubles as the running query number */
      for (showmatchinfo.queryunit = 0; /* Nothing */;
           showmatchinfo.queryunit++)
      {
        retval = gt_seq_iterator_next(seqit, &query, &querylen, &desc, err);
        if (retval < 0)
        {
          haserr = true;
          break;
        }
        if (retval == 0)
        {
          break;
        }
        printf("process sequence " Formatuint64_t " of length %lu\n",
               PRINTuint64_tcast(showmatchinfo.queryunit),querylen);
        if (idxlocalioptions->doonline || idxlocalioptions->docompare)
        {
          gt_multiapplysmithwaterman(swdpresource,encseq,query,querylen);
        }
        if (!idxlocalioptions->doonline || idxlocalioptions->docompare)
        {
          gt_indexbasedlocali(limdfsresources,
                              idxlocalioptions->matchscore,
                              idxlocalioptions->mismatchscore,
                              idxlocalioptions->gapstart,
                              idxlocalioptions->gapextend,
                              idxlocalioptions->threshold,
                              query,
                              querylen,
                              dfst);
        }
        if (idxlocalioptions->docompare)
        {
          gt_checkandresetstorematch(showmatchinfo.queryunit,
                                     &storeonline,&storeoffline);
        }
      }
    }
    /* The search resources were allocated before the iterator was created,
       so they must be released even if creating the iterator failed. */
    if (limdfsresources != NULL)
    {
      gt_freeLimdfsresources(&limdfsresources,dfst);
    }
    if (swdpresource != NULL)
    {
      gt_freeSWdpresource(swdpresource);
      swdpresource = NULL;
    }
    if (seqit != NULL)
    {
      gt_seq_iterator_delete(seqit);
    }
    if (idxlocalioptions->docompare)
    {
      gt_freestorematch(&storeonline);
      gt_freestorematch(&storeoffline);
    }
  }
  if (genericindex == NULL)
  {
    /* no index was created, so the encoded sequence (if any) was loaded
       directly by this function and is released here */
    gt_encseq_delete((GtEncseq *) encseq);
    encseq = NULL;
  } else
  {
    genericindex_delete(genericindex);
  }
  gt_logger_delete(logger);
  logger = NULL;
  return haserr ? -1 : 0;
}
/*
  Read a condenseq data structure from file.

  Loads the encoded sequence <indexname> (the unique sequences), then reads
  the condenseq data from the file <indexname> with suffix
  GT_CONDENSEQ_FILE_SUFFIX and, for every unique entry, builds the array of
  ids of the link entries referring to it.

  Returns the new GtCondenseq, or NULL on failure (<err> is set by the
  failing call or by the overflow check).
*/
GtCondenseq *gt_condenseq_new_from_file(const char *indexname,
                                        GtLogger *logger, GtError *err)
{
  int had_err = 0;
  FILE *fp;
  GtEncseqLoader *esl;
  GtEncseq *unique_es;
  GtCondenseq *condenseq = NULL;

  /* load unique_es; the loader is released right away on success and on
     failure so that it does not leak when loading fails */
  esl = gt_encseq_loader_new();
  unique_es = gt_encseq_loader_load(esl, indexname, err);
  gt_encseq_loader_delete(esl);
  if (!unique_es)
    had_err = -1;

  if (!had_err) {
    condenseq = condenseq_new_empty(gt_encseq_alphabet(unique_es));
    condenseq->filename = gt_cstr_dup(indexname);
    condenseq->unique_es = unique_es;

    fp = gt_fa_fopen_with_suffix(indexname, GT_CONDENSEQ_FILE_SUFFIX,
                                 "rb", err);
    if (fp == NULL) {
      had_err = -1;
    }
    else {
      had_err = condenseq_io(condenseq, fp, gt_io_error_fread, err);
      /* the file is no longer needed, whether or not reading succeeded */
      gt_fa_fclose(fp);
      if (!had_err) {
        GtUword i;

        gt_assert(condenseq->uniques);
        gt_assert(condenseq->links);
        /* create link array for each unique entry */
        for (i = 0; i < condenseq->udb_nelems; i++) {
          GT_INITARRAY(&(condenseq->uniques[i].links),uint32_t);
        }
        /* check for overflows: link ids are stored as uint32_t below */
        if (condenseq->ldb_nelems >
            (GtUword) ((uint32_t) 0 - (uint32_t) 1)) {
          gt_error_set(err, "Overflow, to many link-elements. "
                            "Can't be stored");
          had_err = -1;
        }
        /* iterate through link entries and store ids in corresponding
           unique entry array */
        for (i = 0; !had_err && i < condenseq->ldb_nelems; i++) {
          GtUword uid = condenseq->links[i].unique_id;
          gt_assert(uid < condenseq->udb_nelems);
          GT_STOREINARRAY(&(condenseq->uniques[uid].links),
                          uint32_t,
                          10,
                          (uint32_t) i);
        }
      }
    }
  }
  if (!had_err) {
    gt_assert(condenseq != NULL);
    if (condenseq->id_len != GT_UNDEF_UWORD)
      gt_logger_log(logger, "IDs const len: " GT_WU, condenseq->id_len);
    else
      gt_logger_log(logger, "using sdstab to access IDs");
  }
  if (had_err) {
    /* condenseq may still be NULL here (loading unique_es failed) */
    gt_condenseq_delete(condenseq);
    condenseq = NULL;
  }
  return (condenseq);
}
/*
  Runner of the readjoiner cnttest tool.

  Depending on <arguments->test> a bit table <bits> over <nofreads> reads is
  obtained in one of three ways and then passed to gt_cntlist_show():
  - GT_READJOINER_CNTTEST_SHOWLIST: parsed from the file named
    <readset> + GT_READJOINER_SUFFIX_CNTLIST via gt_cntlist_parse();
  - GT_READJOINER_CNTTEST_BRUTEFORCE / GT_READJOINER_CNTTEST_KMP: computed by
    gt_rdj_pairwise_exact() on the loaded readset (mirrored unless
    <singlestrand> is set);
  - GT_READJOINER_CNTTEST_ESA: computed by gt_contfind_bottomup() on a
    sequential suffix array reader of the readset.

  Returns 0 on success, otherwise the error code of the failing step.
*/
static int gt_readjoiner_cnttest_runner(GT_UNUSED int argc,
                                        GT_UNUSED const char **argv,
                                        GT_UNUSED int parsed_args,
                                        void *tool_arguments,
                                        GT_UNUSED GtError *err)
{
  GtReadjoinerCnttestArguments *arguments = tool_arguments;
  GtEncseqLoader *el = NULL;
  GtEncseq *reads = NULL;
  GtBitsequence *bits = NULL;
  /* initialised so that no indeterminate value reaches gt_cntlist_show()
     if the final gt_assert(false) branch is compiled out */
  GtUword nofreads = 0;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->test == GT_READJOINER_CNTTEST_SHOWLIST)
  {
    GtStr *fn = NULL;

    fn = gt_str_clone(arguments->readset);
    gt_str_append_cstr(fn, GT_READJOINER_SUFFIX_CNTLIST);
    had_err = gt_cntlist_parse(gt_str_get(fn), true, &bits, &nofreads, err);
    gt_str_delete(fn);
  }
  else if (arguments->test == GT_READJOINER_CNTTEST_BRUTEFORCE ||
           arguments->test == GT_READJOINER_CNTTEST_KMP)
  {
    el = gt_encseq_loader_new();
    gt_encseq_loader_drop_description_support(el);
    gt_encseq_loader_disable_autosupport(el);
    if (!arguments->singlestrand)
      gt_encseq_loader_mirror(el);
    reads = gt_encseq_loader_load(el, gt_str_get(arguments->readset), err);
    if (reads == NULL)
    {
      had_err = -1;
    }
    else
    {
      gt_rdj_pairwise_exact(GT_OVLFIND_CNT, reads, !arguments->singlestrand,
                            false,
                            arguments->test == GT_READJOINER_CNTTEST_KMP,
                            1UL, true, NULL, NULL, false, NULL, &bits,
                            &nofreads);
    }
    gt_encseq_delete(reads);
    gt_encseq_loader_delete(el);
  }
  else if (arguments->test == GT_READJOINER_CNTTEST_ESA)
  {
    Sequentialsuffixarrayreader *ssar = NULL;
    GtUword readlength = 0, firstrevcompl = 0;
    GtLogger *verbose_logger = gt_logger_new(arguments->verbose,
                                             GT_LOGGER_DEFLT_PREFIX, stderr);

    ssar = gt_newSequentialsuffixarrayreaderfromfile(
                                     gt_str_get(arguments->readset),
                                     SARR_LCPTAB | SARR_SUFTAB | SARR_SSPTAB,
                                     true, verbose_logger, err);
    /* never dereference the reader unless it was actually created */
    if (ssar == NULL || gt_error_is_set(err))
    {
      had_err = -1;
    }
    else
    {
      nofreads = gt_encseq_num_of_sequences(ssar->encseq);
      if (!arguments->singlestrand)
      {
        /* only half of the sequences are counted as reads here; the
           index of the first sequence of the second half is handed to
           gt_contfind_bottomup() as <firstrevcompl> */
        nofreads = GT_DIV2(nofreads);
        firstrevcompl = nofreads;
      }
      GT_INITBITTAB(bits, nofreads);
      if (!arguments->singlestrand &&
          gt_encseq_accesstype_get(ssar->encseq) ==
            GT_ACCESS_TYPE_EQUALLENGTH)
      {
        readlength = gt_encseq_seqlength(ssar->encseq, 0);
      }
      (void) gt_contfind_bottomup(ssar, false, bits,
                                  arguments->singlestrand ? 0 : firstrevcompl,
                                  readlength);
    }
    if (ssar != NULL)
      gt_freeSequentialsuffixarrayreader(&ssar);
    gt_logger_delete(verbose_logger);
  }
  else
  {
    /* unknown test mode */
    gt_assert(false);
  }

  if (!had_err)
    had_err = gt_cntlist_show(bits, nofreads, NULL, false, err);
  gt_free(bits);
  return had_err;
}