/* Returns the basename of <condenseq>'s filename with its last suffix
   (everything from the final '.') cut off, or NULL if no filename is set.
   The search for the '.' starts at the second character, so a leading dot
   (e.g. of a hidden file) is never treated as the start of a suffix.
   NOTE(review): the string is the result of gt_basename(); presumably the
   caller has to release it with gt_free() -- confirm. */
char *gt_condenseq_basefilename(const GtCondenseq *condenseq)
{
  char *stem, *last_dot;

  if (condenseq->filename == NULL)
    return NULL;

  stem = gt_basename(condenseq->filename);
  if (strlen(stem) <= (size_t) 1)
    return stem; /* too short to carry a suffix */

  last_dot = strrchr(stem + 1, '.');
  if (last_dot != NULL)
    *last_dot = '\0'; /* remove suffix */
  return stem;
}
/* Default track selector: overwrites <result> with
   "<basename of the source file><GT_FILENAME_TYPE_SEPARATOR><block type>",
   where the source file is taken from the top level feature of <block>.
   <data> is not used. */
static void default_track_selector(GtBlock *block, GtStr *result,
                                   GT_UNUSED void *data)
{
  GtGenomeNode *top_node;
  char *file_base;

  gt_assert(block && result);
  gt_str_reset(result);

  /* Only the basename of the filename is used to get nicer output in the
     generated graphic. This might lead to ``collapsed'' tracks if two files
     with different paths share the same basename. */
  top_node = (GtGenomeNode*) gt_block_get_top_level_feature(block);
  file_base = gt_basename(gt_genome_node_get_filename(top_node));
  gt_str_append_cstr(result, file_base);
  gt_free(file_base);

  gt_str_append_char(result, GT_FILENAME_TYPE_SEPARATOR);
  gt_str_append_cstr(result, gt_block_get_type(block));
}
/* Runner of the encseq encode tool: collects all remaining command line
   arguments (argv[parsed_args] .. argv[argc-1]) as input files, determines
   the index name and encodes the files. If no index name was given, it is
   derived from the basename of the input file; with more than one input
   file this is an error. Statistics are shown afterwards if requested.
   Returns 0 on success and a non-zero value (with <err> set) otherwise. */
static int gt_encseq_encode_runner(GT_UNUSED int argc, const char **argv,
                                   int parsed_args,
                                   GT_UNUSED void *tool_arguments,
                                   GtError *err)
{
  GtEncseqEncodeArguments *arguments
    = (GtEncseqEncodeArguments*) tool_arguments;
  GtStrArray *infiles;
  int argidx, had_err = 0;

  gt_error_check(err);

  infiles = gt_str_array_new();
  argidx = parsed_args;
  while (argidx < argc) {
    gt_str_array_add_cstr(infiles, argv[argidx]);
    argidx++;
  }

  if (gt_str_length(arguments->indexname) == 0UL) {
    if (gt_str_array_size(infiles) <= 1UL) {
      /* no explicit index name: fall back to the basename of the input */
      char *first_base = gt_basename(gt_str_array_get(infiles, 0UL));
      gt_str_set(arguments->indexname, first_base);
      gt_free(first_base);
    }
    else {
      gt_error_set(err,"if more than one input file is given, then "
                       "option -indexname is mandatory");
      had_err = -1;
    }
  }

  if (!had_err) {
    gt_assert(gt_str_length(arguments->indexname) > 0UL);
    had_err = encode_sequence_files(infiles,
                                    arguments->eopts,
                                    gt_str_get(arguments->indexname),
                                    arguments->verbose,
                                    arguments->no_esq_header,
                                    err);
  }

  if (!had_err && arguments->showstats)
    show_encoded_statistics(infiles, gt_str_get(arguments->indexname));

  gt_str_array_delete(infiles);
  return had_err;
}
/* Checks and completes the arguments of the compress tool.
   - At least one input file (option -files) is required.
   - If no -name was given it is derived from the single input file: the
     basename without its last '.'-separated suffix. More than one input
     file without -name is an error.
   - The sampling method ("page", "regular" or "none") is translated into the
     corresponding flags and a default sampling rate is chosen if none was
     set; contradictory rates are rejected.
   - The quality range is copied into <qrng> if it fits into unsigned int.
   Returns 0 on success, -1 (with <err> set) otherwise. */
static int gt_compreads_compress_arguments_check(GT_UNUSED int rest_argc,
                                                 void *tool_arguments,
                                                 GtError *err)
{
  int had_err = 0;
  GtCsrHcrEncodeArguments *arguments = tool_arguments;

  gt_error_check(err);
  gt_assert(arguments);

  if (gt_str_array_size(arguments->files) == 0) {
    gt_error_set(err, "option \"-files\" is mandatory and requires"
                      " at least one filename as argument!");
    had_err = -1;
  }

  if (!had_err && gt_str_length(arguments->name) == 0) {
    if (gt_str_array_size(arguments->files) > 1UL) {
      gt_error_set(err, "option \"-name\" needs to be specified"
                        " if more than one file is given");
      had_err = -1;
    }
    else {
      GtUword i, num_tokens;
      GtSplitter *splitter = gt_splitter_new();
      char *basename = gt_basename(gt_str_array_get(arguments->files, 0));
      /* the splitter works on its own copy, <basename> stays intact */
      GtStr *buffer = gt_str_new_cstr(basename);

      gt_splitter_split(splitter, gt_str_get(buffer), gt_str_length(buffer),
                        '.');
      num_tokens = gt_splitter_size(splitter);
      /* join all tokens except the last one (the suffix); the bounds are
         written as additions so they cannot wrap around for 0 or 1 tokens */
      for (i = 0; i + 1 < num_tokens; i++) {
        gt_str_append_cstr(arguments->name,
                           gt_splitter_get_token(splitter, i));
        if (i + 2 < num_tokens)
          gt_str_append_char(arguments->name, '.');
      }
      /* a file without suffix (or a dot file like ".reads") would end up
         with an empty name, use the complete basename in that case */
      if (gt_str_length(arguments->name) == 0)
        gt_str_append_cstr(arguments->name, basename);

      gt_free(basename);
      gt_splitter_delete(splitter);
      gt_str_delete(buffer);
    }
  }

  if (!had_err) {
    char *sampling_type = gt_str_get(arguments->method);
    static const char *methods[] = { "page", "regular", "none" };

    if (!strcmp(methods[0], sampling_type)) {
      arguments->pagewise = true;
      if (arguments->srate == GT_UNDEF_UWORD)
        arguments->srate = GT_SAMPLING_DEFAULT_PAGE_RATE;
      else if (arguments->srate == 0) {
        gt_error_set(err, "page sampling was chosen, but sampling"
                          " rate was set to "GT_WU"! this seems wrong.",
                     arguments->srate);
        had_err = -1;
      }
    }
    else if (!strcmp(methods[1], sampling_type)) {
      arguments->regular = true;
      if (arguments->srate == GT_UNDEF_UWORD)
        arguments->srate = GT_SAMPLING_DEFAULT_REGULAR_RATE;
      else if (arguments->srate == 0) {
        gt_error_set(err, "regular sampling was chosen, but sampling rate "
                          " was set to "GT_WU"! this seems wrong.",
                     arguments->srate);
        had_err = -1;
      }
    }
    else if (!strcmp(methods[2], sampling_type)) {
      if (arguments->srate == GT_UNDEF_UWORD)
        arguments->srate = 0;
      else if (arguments->srate != 0) {
        gt_error_set(err, "no sampling was chosen, but sampling rate was"
                          " set to "GT_WU"! this seems wrong.",
                     arguments->srate);
        had_err = -1;
      }
    }
    else {
      gt_error_set(err, "somethings wrong with the stype option");
      had_err = -1;
    }
  }

  if (!had_err) {
    /* the quality range is stored with unsigned int bounds, so both ends
       have to be checked before they are assigned */
    if (arguments->arg_range.start != GT_UNDEF_UWORD) {
      if (arguments->arg_range.start <= (GtUword) UINT_MAX) {
        gt_safe_assign(arguments->qrng.start, arguments->arg_range.start);
        if (arguments->arg_range.end <= (GtUword) UINT_MAX)
          gt_safe_assign(arguments->qrng.end, arguments->arg_range.end);
        else
          had_err = -1;
      }
      else
        had_err = -1;
    }
    if (had_err)
      gt_error_set(err, "Range for qualities: value to large! larger than %u",
                   UINT_MAX);
  }
  return had_err;
}
/* Runner of the decompress tool: sets up the alphabet (from the -smap file
   if given, DNA otherwise), opens the HCR decoder for the input file and
   either runs the benchmark or decodes a range of reads. Without a complete
   range all reads are decoded. If no output name was given, the basename of
   the input file is used. Progress is reported on stdout if showtime is
   enabled. Returns 0 on success, a non-zero value (with <err> set)
   otherwise. */
static int gt_compreads_decompress_runner(GT_UNUSED int argc,
                                          GT_UNUSED const char **argv,
                                          GT_UNUSED int parsed_args,
                                          void *tool_arguments,
                                          GtError *err)
{
  GtCsrHcrDecodeArguments *arguments = tool_arguments;
  GtAlphabet *alpha;
  GtHcrDecoder *hcrd = NULL;
  GtTimer *timer = NULL;
  unsigned long first_read, last_read;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("start");
    gt_timer_start(timer);
    gt_assert(timer);
  }

  alpha = gt_str_length(arguments->smap) > 0
            ? gt_alphabet_new_from_file_no_suffix(gt_str_get(arguments->smap),
                                                  err)
            : gt_alphabet_new_dna();
  if (alpha == NULL)
    had_err = -1;

  if (!had_err) {
    if (timer != NULL)
      gt_timer_show_progress(timer, "decoding", stdout);

    if (gt_str_length(arguments->name) == 0) {
      /* no output name given: derive it from the input file */
      char *file_base = gt_basename(gt_str_get(arguments->file));
      gt_str_set(arguments->name, file_base);
      gt_free(file_base);
    }

    hcrd = gt_hcr_decoder_new(gt_str_get(arguments->file), alpha,
                              arguments->descs, timer, err);
    if (hcrd == NULL) {
      had_err = -1;
    }
    else if (arguments->bench != 0) {
      had_err = gt_compreads_decompress_benchmark(hcrd, arguments->bench,
                                                  timer, err);
    }
    else {
      if (arguments->rng.start == GT_UNDEF_ULONG ||
          arguments->rng.end == GT_UNDEF_ULONG) {
        /* no complete range given: decode everything */
        first_read = 0;
        last_read = gt_hcr_decoder_num_of_reads(hcrd) - 1;
      }
      else {
        if (arguments->rng.start >= gt_hcr_decoder_num_of_reads(hcrd) ||
            arguments->rng.end >= gt_hcr_decoder_num_of_reads(hcrd)) {
          gt_error_set(err, "range %lu-%lu includes a read number exceeding "
                            "the total number of reads (%lu)",
                       arguments->rng.start,
                       arguments->rng.end,
                       gt_hcr_decoder_num_of_reads(hcrd));
          had_err = -1;
        }
        first_read = arguments->rng.start;
        last_read = arguments->rng.end;
      }

      if (!had_err) {
        gt_log_log("filebasename: %s", gt_str_get(arguments->name));
        if (gt_hcr_decoder_decode_range(hcrd, gt_str_get(arguments->name),
                                        first_read, last_read, timer, err)
              != 0)
          had_err = -1;
      }
    }
    gt_hcr_decoder_delete(hcrd);
  }

  gt_alphabet_delete(alpha);
  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  if (had_err)
    gt_assert(gt_error_is_set(err));
  return had_err;
}
static void make_sequence_region(GtHashmap *sequence_regions, GtStr *sequenceid, GthRegionFactory *srf, GthInput *input, GtUword filenum, GtUword seqnum) { GtUword offset_is_defined = false; GtRange range, descrange; GtGenomeNode *sr = NULL; gt_assert(sequence_regions && sequenceid && srf && input); if (gth_input_use_substring_spec(input)) { range.start = gth_input_genomic_substring_from(input); range.end = gth_input_genomic_substring_to(input); } else { range = gth_input_get_relative_genomic_range(input, filenum, seqnum); } if (srf->use_desc_ranges) { GtStr *description = gt_str_new(); gth_input_get_genomic_description(input, description, filenum, seqnum); if (!gt_parse_description_range(gt_str_get(description), &descrange)) offset_is_defined = true; gt_str_delete(description); } if (offset_is_defined) range = gt_range_offset(&range, descrange.start); else range = gt_range_offset(&range, 1); /* 1-based */ if (!gt_str_length(sequenceid) || (gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid)) && !offset_is_defined)) { /* sequenceid is empty or exists already (and no offset has been parsed) -> make one up */ GtStr *seqid; char *base; base = gt_basename(gth_input_get_genomic_filename(input, filenum)); seqid = gt_str_new_cstr(base); gt_free(base); gt_str_append_char(seqid, '|'); gt_str_append_uword(seqid, seqnum + 1); /* 1-based */ seqid_store_add(srf->seqid_store, filenum, seqnum, seqid, GT_UNDEF_UWORD); gt_assert(!gt_cstr_table_get(srf->used_seqids, gt_str_get(seqid))); gt_cstr_table_add(srf->used_seqids, gt_str_get(seqid)); sr = gt_region_node_new(seqid_store_get(srf->seqid_store, filenum, seqnum), range.start, range.end); gt_hashmap_add(sequence_regions, (void*) gt_cstr_table_get(srf->used_seqids, gt_str_get(seqid)), sr); gt_str_delete(seqid); } else { /* sequenceid does not exists already (or an offset has been parsed) -> use this one */ if (!gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid))) { /* no sequence region with this id exists -> create 
one */ gt_cstr_table_add(srf->used_seqids, gt_str_get(sequenceid)); seqid_store_add(srf->seqid_store, filenum, seqnum, sequenceid, offset_is_defined ? descrange.start : GT_UNDEF_UWORD); sr = gt_region_node_new(seqid_store_get(srf->seqid_store, filenum, seqnum), range.start, range.end); gt_hashmap_add(sequence_regions, (void*) gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid)), sr); } else { GtRange prev_range, new_range; /* sequence region with this id exists already -> modify range */ sr = gt_hashmap_get(sequence_regions, gt_str_get(sequenceid)); gt_assert(sr); prev_range = gt_genome_node_get_range(sr); new_range = gt_range_join(&prev_range, &range); gt_genome_node_set_range(sr, &new_range); seqid_store_add(srf->seqid_store, filenum, seqnum, sequenceid, offset_is_defined ? descrange.start : GT_UNDEF_UWORD); } } gt_assert(sr); }
/* Runner of the condenseq compress tool: loads the encoded sequence named by
   the first non-option argument, derives all parameters that were not set on
   the command line (minalignlength, windowsize, kmersize, initsize), checks
   them for consistency and creates the condenseq index.
   If no index name was given, the basename of the input is used. With the
   kdb option set, k-mer database output is logged to "kmer_db.out".
   Returns 0 on success, a non-zero value (with <err> set) otherwise. */
static int gt_condenseq_compress_runner(GT_UNUSED int argc, const char **argv,
                                        int parsed_args,
                                        void *tool_arguments,
                                        GtError *err)
{
  GtCondenseqCompressArguments *arguments = tool_arguments;
  GtLogger *logger,
           *kdb_logger;
  FILE *kmer_fp = NULL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);
  kdb_logger = gt_logger_new(arguments->kdb, GT_LOGGER_DEFLT_PREFIX, stderr);
  if (arguments->kdb) {
    kmer_fp = gt_fa_fopen("kmer_db.out", "w", err);
    /* do not hand a NULL stream to the logger, report the failure instead */
    if (kmer_fp == NULL)
      had_err = -1;
    else
      gt_logger_set_target(kdb_logger, kmer_fp);
  }

  if (gt_str_length(arguments->indexname) == 0UL) {
    char *basenameptr;
    basenameptr = gt_basename(argv[parsed_args]);
    gt_str_set(arguments->indexname, basenameptr);
    gt_free(basenameptr);
  }

  if (!had_err) {
    GtEncseqLoader *es_l = gt_encseq_loader_new();
    arguments->input_es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->input_es == NULL)
      had_err = -1;
    gt_encseq_loader_delete(es_l);
  }

  if (!had_err) {
    /* first derive values from parameters which have been set ... */
    if (arguments->minalignlength == GT_UNDEF_UWORD)
      arguments->minalignlength = arguments->initsize != GT_UNDEF_UWORD ?
                                    arguments->initsize / (GtUword) 3UL :
                                    GT_UNDEF_UWORD;
    if (arguments->windowsize == GT_UNDEF_UINT)
      arguments->windowsize = arguments->minalignlength != GT_UNDEF_UWORD ?
                                (unsigned int)
                                  (arguments->minalignlength / 5U) :
                                GT_UNDEF_UINT;
    if (arguments->windowsize < 4U)
      arguments->windowsize = 4U;
    if (arguments->kmersize == GT_UNDEF_UINT) {
      unsigned int size =
        gt_alphabet_num_of_chars(gt_encseq_alphabet(arguments->input_es));
      /* size^k ~= 100000 */
      gt_safe_assign(arguments->kmersize,
                     gt_round_to_long(gt_log_base(100000.0, (double) size)));
      gt_logger_log(logger, "|A|: %u, k: %u", size, arguments->kmersize);
    }
    /* ... then fill whatever is still undefined, starting from kmersize */
    if (arguments->windowsize == GT_UNDEF_UINT) {
      arguments->windowsize = 5U * arguments->kmersize;
    }
    if (arguments->minalignlength == GT_UNDEF_UWORD) {
      arguments->minalignlength = (GtUword) (3UL * arguments->windowsize);
    }
    if (arguments->initsize == GT_UNDEF_UWORD) {
      arguments->initsize = (GtUword) (3UL * arguments->minalignlength);
    }
  }

  if (!had_err && arguments->windowsize <= arguments->kmersize) {
    gt_error_set(err, "-windowsize (%u) must be larger -kmersize (%u)!",
                 arguments->windowsize, arguments->kmersize);
    had_err = -1;
  }
  if (!had_err &&
      arguments->minalignlength < (GtUword) arguments->windowsize) {
    gt_error_set(err, "-alignlength (" GT_WU ") must be at least "
                 "-windowsize (%u)!", arguments->minalignlength,
                 arguments->windowsize);
    had_err = -1;
  }
  if (!had_err && (arguments->initsize < arguments->minalignlength)) {
    gt_error_set(err, "-initsize (" GT_WU ") must be at least "
                 "-alignlength (" GT_WU ")!", arguments->initsize,
                 arguments->minalignlength);
    had_err = -1;
  }

  if (!had_err) {
    GtCondenseqCreator *ces_c =
      gt_condenseq_creator_new(arguments->initsize,
                               arguments->minalignlength,
                               arguments->xdrop,
                               &(arguments->scores),
                               arguments->kmersize,
                               arguments->windowsize,
                               logger,
                               err);
    if (ces_c == NULL)
      had_err = -1;

    if (!had_err) {
      if (arguments->cutoff_value == GT_UNDEF_UWORD)
        gt_condenseq_creator_use_mean_cutoff(ces_c);
      else if (arguments->cutoff_value == 0)
        gt_condenseq_creator_disable_cutoff(ces_c);
      else
        gt_condenseq_creator_set_cutoff(ces_c, arguments->cutoff_value);
      gt_condenseq_creator_set_mean_fraction(ces_c, arguments->fraction);
      if (arguments->prune)
        gt_condenseq_creator_disable_prune(ces_c);
      if (arguments->brute)
        gt_condenseq_creator_enable_brute_force(ces_c);
      if (!arguments->diags)
        gt_condenseq_creator_disable_diagonals(ces_c);
      if (arguments->full_diags)
        gt_condenseq_creator_enable_full_diagonals(ces_c);
      if (arguments->clean_percent != GT_UNDEF_UINT)
        gt_condenseq_creator_set_diags_clean_limit(ces_c,
                                                   arguments->clean_percent);

      had_err = gt_condenseq_creator_create(ces_c,
                                            arguments->indexname,
                                            arguments->input_es,
                                            logger, kdb_logger, err);

      gt_condenseq_creator_delete(ces_c);
    }
  }

  gt_logger_delete(logger);
  gt_logger_delete(kdb_logger);
  /* only close what has actually been opened */
  if (kmer_fp != NULL)
    gt_fa_fclose(kmer_fp);
  return had_err;
}
/* Shows the help for <progname> by executing the matching Lua documentation
   file "<gtdata path>/doc/<name>.lua". <progname> is split at blanks: the
   first token is used to locate the gtdata directory; <name> is the basename
   of <progname> if there is only one token (the `gt` binary itself) and the
   last token otherwise (a tool). The doc directory is made available to the
   Lua code as the global variable "gtdata_doc_dir".
   Returns 0 on success and -1 (with <err> set) otherwise. */
int gt_gtdata_show_help(const char *progname, GT_UNUSED void *unused,
                        GtError *err)
{
  GtSplitter *tokens;
  GtStr *doc_file;
  lua_State *L = NULL;
  char *progname_copy;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(progname);

  /* the splitter needs a modifiable copy */
  progname_copy = gt_cstr_dup(progname);
  tokens = gt_splitter_new();
  gt_splitter_split(tokens, progname_copy, strlen(progname_copy), ' ');

  doc_file = gt_get_gtdata_path(gt_splitter_get_token(tokens, 0), err);
  if (doc_file == NULL) {
    had_err = -1;
  }
  else {
    gt_str_append_cstr(doc_file, "/doc/");
    /* create Lua state */
    L = luaL_newstate();
    if (L == NULL) {
      gt_error_set(err, "out of memory (cannot create new Lua state)");
      had_err = -1;
    }
  }

  if (!had_err) {
    /* push gtdata_doc_dir to Lua (doc_file is still the directory here) */
    luaL_openlibs(L);
    lua_pushstring(L, gt_str_get(doc_file));
    lua_setglobal(L, "gtdata_doc_dir");

    /* finish creating doc_file */
    if (gt_splitter_size(tokens) != 1) {
      /* general case for the tools */
      gt_str_append_cstr(doc_file,
                         gt_splitter_get_token(tokens,
                                               gt_splitter_size(tokens) - 1));
    }
    else {
      /* special case for `gt` */
      char *prog_base = gt_basename(progname);
      gt_str_append_cstr(doc_file, prog_base);
      gt_free(prog_base);
    }
    gt_str_append_cstr(doc_file, ".lua");

    /* execute doc_file */
    if (luaL_loadfile(L, gt_str_get(doc_file)) || lua_pcall(L, 0, 0, 0)) {
      gt_error_set(err, "cannot run doc file: %s", lua_tostring(L, -1));
      had_err = -1;
    }
  }

  /* free */
  if (L != NULL)
    lua_close(L);
  gt_str_delete(doc_file);
  gt_splitter_delete(tokens);
  gt_free(progname_copy);

  return had_err;
}
/* Parses the command line of the FM-index construction tool into
   <mkfmcallinfo>. The string members of <mkfmcallinfo> are created here.
   If -fmout is not set, the FM-index name defaults to the basename of the
   single input index; more than one input index without -fmout is an error,
   as are superfluous parameters after the options.
   Returns the status of the option parser. */
static GtOPrval parsemkfmindex(Mkfmcallinfo *mkfmcallinfo,
                               int argc,
                               const char **argv,
                               GtError *err)
{
  GtOptionParser *op;
  GtOption *optionfmout, *optionii, *optionsize, *optionnoindexpos;
  GtOPrval oprval;
  int parsed_args;

  gt_error_check(err);

  mkfmcallinfo->indexnametab = gt_str_array_new();
  mkfmcallinfo->outfmindex = gt_str_new();
  mkfmcallinfo->leveldesc = gt_str_new();

  op = gt_option_parser_new("[option ...] -ii indexfile [...]",
                            "Compute FM-index.");
  gt_option_parser_set_mail_address(op, "<*****@*****.**>");

  optionfmout = gt_option_new_string("fmout",
                                     "specify name of FM-index to be "
                                     "generated\n"
                                     "(mandatory if more than one input "
                                     "index is specified)",
                                     mkfmcallinfo->outfmindex, NULL);
  gt_option_parser_add_option(op, optionfmout);

  optionii = gt_option_new_filename_array("ii", "specify indices to be used",
                                          mkfmcallinfo->indexnametab);
  gt_option_is_mandatory(optionii);
  gt_option_parser_add_option(op, optionii);

  optionsize = gt_option_new_string("size",
                                    "specify size (tiny, small, medium, big)",
                                    mkfmcallinfo->leveldesc, "medium");
  gt_option_parser_add_option(op, optionsize);

  optionnoindexpos = gt_option_new_bool("noindexpos",
                                        "store no index positions "
                                        "(hence the positions of\n"
                                        "matches in the index cannot be "
                                        "retrieved)",
                                        &mkfmcallinfo->noindexpos,false);
  gt_option_parser_add_option(op, optionnoindexpos);

  oprval = gt_option_parser_parse(op, &parsed_args, argc, argv,
                                  gt_versionfunc, err);

  if (oprval == GT_OPTION_PARSER_OK && !gt_option_is_set(optionfmout)) {
    if (gt_str_array_size(mkfmcallinfo->indexnametab) <= 1UL) {
      /* default name of the FM-index: basename of the only input index */
      char *index_base
        = gt_basename(gt_str_array_get(mkfmcallinfo->indexnametab, 0));
      gt_str_set(mkfmcallinfo->outfmindex,index_base);
      gt_free(index_base);
    }
    else {
      gt_error_set(err,"if more than one index is given, then "
                       "option -fmout is mandatory");
      oprval = GT_OPTION_PARSER_ERROR;
    }
  }
  gt_option_parser_delete(op);

  if (oprval == GT_OPTION_PARSER_OK && parsed_args != argc) {
    gt_error_set(err,"superfluous program parameters");
    oprval = GT_OPTION_PARSER_ERROR;
  }
  return oprval;
}
/* Sets up and runs the option parser for the suffixerator (<doesa> is true)
   or the packed index construction (<doesa> is false) and stores the results
   in <so>; the string members and option sets of <so> are created here.
   Either -db or -ii has to be given, and -ii excludes the encoding-specific
   options. If the index name is still empty after parsing, it is set to the
   basename of the -ii argument.
   The number of parsed arguments is stored in <parsed_args>; the status of
   the option parser is returned. */
static GtOPrval parse_options(int *parsed_args,
                              bool doesa,
                              Suffixeratoroptions *so,
                              int argc, const char **argv, GtError *err)
{
  GtOptionParser *op;
  GtOption *optionverbose, *optionshowprogress, *optiongenomediff, *optionii;
  GtOPrval oprval;

  gt_error_check(err);
  op = gt_option_parser_new("[option ...] (-db file [...] | -ii index)",
                            doesa ? "Compute enhanced suffix array."
                                  : "Compute packed index.");
  gt_option_parser_set_mail_address(op, "<*****@*****.**>");

  /* input info */
  so->indexname = gt_str_new();
  so->inputindex = gt_str_new();
  so->db = gt_str_array_new();

  /* register options for encoded sequence handling */
  so->encopts = gt_encseq_options_register_encoding(op, so->indexname,
                                                    so->db);
  so->loadopts = gt_encseq_options_register_loading(op, so->indexname);

  /* register options for index handling */
  if (!doesa)
    so->idxopts = gt_index_options_register_packedidx(op, so->indexname,
                                                      so->encopts);
  else
    so->idxopts = gt_index_options_register_esa(op, so->encopts);

  /* verbosity */
  optionverbose = gt_option_new_verbose(&so->beverbose);
  gt_option_parser_add_option(op, optionverbose);

  optionshowprogress = gt_option_new_bool("showprogress",
                                          "show a progress bar",
                                          &so->showprogress,
                                          false);
  gt_option_parser_add_option(op, optionshowprogress);

  optionii = gt_option_new_filename("ii", "specify existing encoded sequence",
                                    so->inputindex);
  gt_option_parser_add_option(op, optionii);

  /* exactly one of -db and -ii; -ii excludes all encoding options */
  gt_option_is_mandatory_either(gt_encseq_options_db_option(so->encopts),
                                optionii);
  gt_option_exclude(gt_encseq_options_db_option(so->encopts), optionii);
  gt_option_exclude(optionii, gt_encseq_options_smap_option(so->encopts));
  gt_option_exclude(optionii, gt_encseq_options_dna_option(so->encopts));
  gt_option_exclude(optionii, gt_encseq_options_protein_option(so->encopts));
  gt_option_exclude(optionii, gt_encseq_options_plain_option(so->encopts));
  gt_option_exclude(optionii, gt_encseq_options_sat_option(so->encopts));

  optiongenomediff = gt_option_new_bool("genomediff",
                                        "directly process the lcp intervals "
                                        "using "
                                        "the genomediff algorithm (suffix "
                                        "array and "
                                        "lcp-tables are not output)",
                                        &so->genomediff,
                                        false);
  gt_option_is_extended_option(optiongenomediff);
  if (gt_index_options_outsuftab_option(so->idxopts) != NULL) {
    gt_option_exclude(optiongenomediff,
                      gt_index_options_outsuftab_option(so->idxopts));
  }
  gt_option_parser_add_option(op, optiongenomediff);

  /* suffixerator and friends do not take arguments */
  gt_option_parser_set_min_max_args(op, 0U, 0U);

  oprval = gt_option_parser_parse(op, parsed_args, argc, argv, gt_versionfunc,
                                  err);

  if (gt_str_length(so->indexname) == 0UL) {
    /* we do not have an indexname yet, so there was none given in the
       -indexname option and it could not be derived from the input filenames.
       So it must be in the -ii parameter. */
    char *ii_base = gt_basename(gt_str_get(so->inputindex));
    gt_str_set(so->indexname, ii_base);
    gt_free(ii_base);
  }

  gt_option_parser_delete(op);
  return oprval;
}
/* Runner of the condenser search tool. If blastn or blastp was requested it
   performs a two-step BLAST search:
   1. a coarse BLAST run of the queries against "<dbpath>.fas",
   2. extraction of all unique sequences hit in step 1 into
      "coarse_<db basename without suffix>.fas",
   3. a fine BLAST run of the queries against the extracted sequences, whose
      hits are written tab-separated to the output file.
   If no fine E-value was given, it is computed from the bit score, the
   average query length and the length of the extracted database.
   Returns 0 on success, a non-zero value (with <err> set) otherwise. */
static int gt_condenser_search_runner(GT_UNUSED int argc,
                                      GT_UNUSED const char **argv,
                                      GT_UNUSED int parsed_args,
                                      void *tool_arguments,
                                      GtError *err)
{
  GtCondenserSearchArguments *arguments = tool_arguments;
  int i, had_err = 0;
  char *querypath;
  GtStr *coarse_fname;
  char *db_basename = NULL;
  char *suffix_ptr = NULL;
  GtTimer *timer = NULL;
  GtLogger *logger = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  /* only touch <arguments> after it has been checked */
  querypath = gt_str_get(arguments->querypath);
  coarse_fname = gt_str_new_cstr("coarse_");

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  db_basename = gt_basename(gt_str_get(arguments->dbpath));
  /* if first char is '.' this might be a hidden file */
  if (strlen(db_basename) > (size_t) 1 &&
      (suffix_ptr = strrchr(db_basename + 1, '.')) != NULL) {
    /* remove suffix */
    *suffix_ptr = '\0';
  }
  gt_str_append_cstr(coarse_fname, db_basename);
  gt_str_append_cstr(coarse_fname, ".fas");
  gt_free(db_basename);
  db_basename = NULL;
  suffix_ptr = NULL;

  if (arguments->blastn || arguments->blastp) {
    GtMatch *match;
    GtMatchIterator *mp = NULL;
    GtNREncseq *nrencseq = NULL;
    GtStr *fastaname = gt_str_clone(arguments->dbpath);
    HitPosition *hits;
    double eval,
           raw_eval = 0.0;
    GtUword coarse_db_len = 0;
    GtMatchIteratorStatus status;
    int curr_hits = 0,
        max_hits = 100;

    /* every slot of <hits> owns its range, the array grows in steps of 100 */
    hits = gt_malloc(sizeof (*hits) * (size_t) max_hits);
    gt_str_append_cstr(fastaname, ".fas");

    for (i=0; i < max_hits; i++) {
      hits[i].range = gt_malloc(sizeof (*hits[i].range) * (size_t) 1);
    }

    if (gt_showtime_enabled()) {
      timer = gt_timer_new_with_progress_description("initialization");
      gt_timer_start(timer);
    }

    /*extract sequences from compressed database*/
    if (!had_err) {
      nrencseq = gt_n_r_encseq_new_from_file(gt_str_get(arguments->dbpath),
                                             logger, err);
      if (nrencseq == NULL)
        had_err = -1;
    }

    if (!had_err) {
      if (arguments->ceval == GT_UNDEF_DOUBLE ||
          arguments->feval == GT_UNDEF_DOUBLE) {
        /* from NCBI BLAST tutorial:
           E = Kmne^{-lambdaS}
           calculates E-value for score S with natural scale parameters K for
           search space size and lambda for the scoring system
           E = mn2^-S'
           m being the subject (total) length, n the length of ONE query
           calculates E-value for bit-score S'
         */
        GtFastaReader *reader;
        GtCondenserSearchAvg avg = {0,0};
        reader = gt_fasta_reader_rec_new(arguments->querypath);
        had_err = gt_fasta_reader_run(reader, NULL, NULL,
                                      gt_condenser_search_cum_moving_avg,
                                      &avg,
                                      err);
        if (!had_err) {
          GtUword S = arguments->bitscore;
          gt_log_log(GT_WU " queries, avg query size: " GT_WU,
                     avg.count, avg.avg);
          raw_eval = 1/pow(2.0, (double) S) * avg.avg;
          gt_logger_log(logger, "Raw E-value set to %.4e", raw_eval);
          gt_assert(avg.avg != 0);
        }
        gt_fasta_reader_delete(reader);
      }
    }

    /*create BLAST database from compressed database fasta file*/
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create coarse BLAST db", stderr);
      if (arguments->blastn)
        had_err = gt_condenser_search_create_nucl_blastdb(
                                                 gt_str_get(fastaname), err);
      else
        had_err = gt_condenser_search_create_prot_blastdb(
                                                 gt_str_get(fastaname), err);
    }

    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "coarse BLAST run", stderr);

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();
      gt_blast_process_call_set_db(call, gt_str_get(fastaname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, arguments->ceval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      while (!had_err &&
             (status = gt_match_iterator_next(mp, &match, err)) !=
             GT_MATCHER_STATUS_END)
      {
        if (status == GT_MATCHER_STATUS_OK) {
          GtUword hit_seq_id;
          char string[7];
          const char *dbseqid = gt_match_get_seqid2(match);
          if (sscanf(dbseqid,"%6s" GT_WU, string, &hit_seq_id) == 2) {
            gt_match_get_range_seq2(match, hits[curr_hits].range);
            hits[curr_hits].idx = hit_seq_id;
            curr_hits++;
            if (curr_hits == max_hits) {
              /* all slots used, add another 100 */
              max_hits += 100;
              hits = gt_realloc(hits, sizeof (*hits) * max_hits);
              for (i=max_hits - 100; i < max_hits; i++) {
                hits[i].range = gt_malloc(sizeof (*hits[i].range));
              }
            }
          }
          else {
            gt_error_set(err, "could not parse unique db header %s", dbseqid);
            had_err = -1;
          }
          /* <dbseqid> belongs to <match>, so the match is released only after
             its last use -- and also if the header could not be parsed */
          gt_match_delete(match);
        }
        else if (status == GT_MATCHER_STATUS_ERROR) {
          had_err = -1;
        }
      }
      gt_match_iterator_delete(mp);
    }

    /*extract sequences*/
    if (!had_err) {
      GtNREncseqDecompressor *decomp;
      GtFile *coarse_hits;

      if (timer != NULL)
        gt_timer_show_progress(timer, "extract coarse search hits", stderr);

      coarse_hits = gt_file_new(gt_str_get(coarse_fname),"w", err);
      if (coarse_hits == NULL) {
        /* output file could not be opened, nothing to extract into */
        had_err = -1;
      }
      else {
        decomp = gt_n_r_encseq_decompressor_new(nrencseq);
        /* TODO DW do NOT extract complete uniques! these could be complete
           chromosomes!! just extract something around it? maybe +- max query
           length*/
        for (i = 0; i < curr_hits; i++) {
          gt_n_r_encseq_decompressor_add_unique_idx_to_extract(decomp,
                                                               hits[i].idx);
        }
        had_err =
          gt_n_r_encseq_decompressor_start_unique_extraction(coarse_hits,
                                                             decomp,
                                                             &coarse_db_len,
                                                             err);
        /* the length is only meaningful if the extraction succeeded; a
           failure has to be reported via <err>, not via an assertion */
        if (!had_err)
          gt_assert(coarse_db_len != 0);
        gt_file_delete(coarse_hits);
        gt_n_r_encseq_decompressor_delete(decomp);
      }
    }
    gt_n_r_encseq_delete(nrencseq);

    /* create BLAST database from decompressed database file */
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create fine BLAST db", stderr);
      if (arguments->blastn)
        had_err = gt_condenser_search_create_nucl_blastdb(
                                              gt_str_get(coarse_fname), err);
      else
        had_err = gt_condenser_search_create_prot_blastdb(
                                              gt_str_get(coarse_fname), err);
    }

    /* perform fine BLAST search */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "fine BLAST run", stderr);

      if (arguments->feval == GT_UNDEF_DOUBLE) {
        eval = raw_eval * coarse_db_len;
      } else {
        eval = arguments->feval;
      }

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();

      gt_blast_process_call_set_db(call, gt_str_get(coarse_fname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, eval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      gt_logger_log(logger, "Fine E-value set to: %.4e (len)" GT_WU, eval,
                    coarse_db_len);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      if (!had_err) {
        GtUword numofhits = 0;
        while (!had_err &&
               (status = gt_match_iterator_next(mp, &match, err)) !=
               GT_MATCHER_STATUS_END) {
          if (status == GT_MATCHER_STATUS_OK) {
            GtMatchBlast *matchb = (GtMatchBlast*) match;
            GtRange range_seq1;
            GtRange range_seq2;
            numofhits++;
            gt_match_get_range_seq1(match, &range_seq1);
            gt_match_get_range_seq2(match, &range_seq2);
            gt_file_xprintf(
                    arguments->outfp,
                    "%s\t%s\t%.2f\t" GT_WU "\t" GT_WU "\t" GT_WU "\t" GT_WU "\t"
                    GT_WU "\t%g\t%.3f\n",
                    gt_match_get_seqid1(match),
                    gt_match_get_seqid2(match),
                    gt_match_blast_get_similarity(matchb),
                    gt_match_blast_get_align_length(matchb),
                    range_seq1.start,
                    range_seq1.end,
                    range_seq2.start,
                    range_seq2.end,
                    gt_match_blast_get_evalue(matchb),
                    (double) gt_match_blast_get_bitscore(matchb));
            gt_match_delete(match);
          }
          else if (status == GT_MATCHER_STATUS_ERROR) {
            had_err = -1;
          }
        }
        gt_log_log(GT_WU " hits found\n", numofhits);
      }
      gt_match_iterator_delete(mp);
    }

    if (!had_err)
      if (timer != NULL)
        gt_timer_show_progress_final(timer, stderr);
    gt_timer_delete(timer);

    /*cleanup*/
    for (i=0; i < max_hits; i++) {
      gt_free(hits[i].range);
    }
    gt_free(hits);
    gt_str_delete(fastaname);
  }
  gt_str_delete(coarse_fname);
  gt_logger_delete(logger);
  return had_err;
}