static int gt_readjoiner_assembly_build_graph( GtReadjoinerAssemblyArguments *arguments, GtStrgraph **strgraph, GtEncseq *reads, const char *readset, bool eqlen, GtUword rlen, GtUword nreads, GtBitsequence *contained, GtLogger *default_logger, GtLogger *verbose_logger, GtTimer *timer, GtError *err) { int had_err = 0; *strgraph = gt_strgraph_new(nreads); if (arguments->minmatchlength > 0) gt_logger_log(verbose_logger, "SPM length cutoff = %u", arguments->minmatchlength); had_err = gt_readjoiner_assembly_count_spm(readset, eqlen, arguments->minmatchlength, arguments->nspmfiles, *strgraph, contained, default_logger, err); gt_readjoiner_assembly_show_current_space("(edges counted)"); if (gt_showtime_enabled()) gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_BUILDSG, stdout); gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_BUILDSG); if (had_err == 0) { gt_assert((eqlen && rlen > 0 && reads == NULL) || (!eqlen && rlen == 0 && reads != NULL)); gt_strgraph_allocate_graph(*strgraph, rlen, reads); gt_readjoiner_assembly_show_current_space("(graph allocated)"); had_err = gt_strgraph_load_spm_from_file(*strgraph, (GtUword)arguments->minmatchlength, arguments->redtrans, contained, readset, arguments->nspmfiles, GT_READJOINER_SUFFIX_SPMLIST, err); } return had_err; }
/* Report via <logger> the maximal prefix length <maxprefixlen> (argument of
   option -pl) and the <recommended> prefix length. */
void gt_showmaximalprefixlength(GtLogger *logger,
                                unsigned int maxprefixlen,
                                unsigned int recommended)
{
  const unsigned int maximal = maxprefixlen;
  const unsigned int suggested = recommended;

  gt_logger_log(logger,
                "for this input size and alphabet size, the maximal "
                "prefixlength");
  gt_logger_log(logger, "(argument of option -pl) is %u,", maximal);
  gt_logger_log(logger, "the recommended prefixlength is %u", suggested);
}
/* Check the stop codon annotation of the mRNA currently held by the visitor.
 *
 * The expected stop codon range is derived from the CDS segments stored in
 * v->cds: the last three positions of the last segment, or, on the reverse
 * strand, the first three positions of the first segment.
 *   - more than one explicit stop codon: a message is logged;
 *   - exactly one explicit stop codon: a message is logged if its range
 *     differs from the expected range;
 *   - no explicit stop codon: a "stop_codon" feature with the expected range
 *     is created, added as a child of the mRNA and appended to v->stops.
 * Nothing is done when the mRNA has no CDS segments. */
static void infer_cds_visitor_check_stop(AgnInferCDSVisitor *v)
{
  if(gt_array_size(v->cds) == 0)
    return;

  const char *mrnaid = gt_feature_node_get_attribute(v->mrna, "ID");
  unsigned int ln = gt_genome_node_get_line_number((GtGenomeNode *)v->mrna);
  GtStrand strand = gt_feature_node_get_strand(v->mrna);
  GtRange stoprange;

  /* forward strand (default): stop codon ends the last CDS segment */
  GtUword threeprimeindex = gt_array_size(v->cds) - 1;
  GtGenomeNode **threeprimesegment = gt_array_get(v->cds, threeprimeindex);
  stoprange = gt_genome_node_get_range(*threeprimesegment);
  stoprange.start = stoprange.end - 2;
  if(strand == GT_STRAND_REVERSE)
  {
    /* reverse strand: stop codon starts the first CDS segment */
    threeprimesegment = gt_array_get(v->cds, 0);
    stoprange = gt_genome_node_get_range(*threeprimesegment);
    stoprange.end = stoprange.start + 2;
  }

  if(gt_array_size(v->stops) > 1)
  {
    /* report the number of stop codons (the original reported the number of
       start codons here) */
    gt_logger_log(v->logger, "mRNA '%s' (line %u) has %lu stop codons",
                  mrnaid, ln, gt_array_size(v->stops));
  }
  else if(gt_array_size(v->stops) == 1)
  {
    GtGenomeNode **codon = gt_array_get(v->stops, 0);
    GtRange testrange = gt_genome_node_get_range(*codon);
    if(gt_range_compare(&stoprange, &testrange) != 0)
    {
      gt_logger_log(v->logger, "stop codon inferred from CDS [%lu, %lu] does "
                    "not match explicitly provided stop codon [%lu, %lu] for "
                    "mRNA '%s'", stoprange.start, stoprange.end,
                    testrange.start, testrange.end, mrnaid);
    }
  }
  else /* here gt_array_size(v->stops) == 0 */
  {
    GtStr *seqid = gt_genome_node_get_seqid((GtGenomeNode *)v->mrna);
    GtGenomeNode *codonfeature = gt_feature_node_new(seqid, "stop_codon",
                                                     stoprange.start,
                                                     stoprange.end, strand);
    if(v->source)
      gt_feature_node_set_source((GtFeatureNode *)codonfeature, v->source);

    GtFeatureNode *cf = (GtFeatureNode *)codonfeature;
    gt_feature_node_add_child(v->mrna, cf);
    gt_array_add(v->stops, cf);
  }
}
/* Parse the <nspmfiles> SPM list files
   "<readset>.<i><GT_READJOINER_SUFFIX_SPMLIST>" (i = 0..nspmfiles-1) and feed
   every SPM of length >= <minmatchlength> to the edge counter of <strgraph>.

   If <eqlen> is true the counter is called directly; otherwise the SPMs are
   routed through gt_spmproc_skip, configured with <contained> as the set to
   skip, before reaching the counter.

   Parsing stops at the first file that fails, so that a later successful
   file cannot hide the failure. Returns 0 on success, otherwise the error
   code of gt_spmlist_parse (details in <err>). */
static int gt_readjoiner_assembly_count_spm(const char *readset, bool eqlen,
    unsigned int minmatchlength, unsigned int nspmfiles, GtStrgraph *strgraph,
    GtBitsequence *contained, GtLogger *default_logger, GtError *err)
{
  GtSpmprocSkipData skipdata;
  int had_err = 0;
  unsigned int i;
  GtStr *filename = gt_str_new();

  gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_COUNTSPM);
  if (!eqlen)
  {
    /* skipdata is only read in the !eqlen case, see the call below */
    skipdata.out.e.proc = gt_spmproc_strgraph_count;
    skipdata.to_skip = contained;
    skipdata.out.e.data = strgraph;
  }
  for (i = 0; i < nspmfiles && had_err == 0; i++)
  {
    gt_str_append_cstr(filename, readset);
    gt_str_append_char(filename, '.');
    gt_str_append_uint(filename, i);
    gt_str_append_cstr(filename, GT_READJOINER_SUFFIX_SPMLIST);
    had_err = gt_spmlist_parse(gt_str_get(filename),
        (GtUword)minmatchlength,
        eqlen ? gt_spmproc_strgraph_count : gt_spmproc_skip,
        eqlen ? (void*)strgraph : (void*)&skipdata, err);
    /* reuse the same string object for the next file name */
    gt_str_reset(filename);
  }
  gt_str_delete(filename);
  return had_err;
}
/* Load the (mirrored) encoded read set <readset>, pump it through the cache
   and write the contigs described by the contig paths file as FASTA.

   If showtime is enabled, <timer> must not be NULL; a timer is created in
   <*timer> if none exists yet, otherwise progress is reported on the
   existing one.

   Returns -1 if the read set cannot be loaded (details in <err>), otherwise
   the result of gt_contigpaths_to_fasta. */
static int gt_readjoiner_assembly_paths2seq(const char *readset,
    GtUword lengthcutoff, bool showpaths, bool astat, double coverage,
    bool load_copynum, GtUword buffersize, GtLogger *default_logger,
    GtTimer **timer, GtError *err)
{
  int had_err;
  GtEncseqLoader *el = gt_encseq_loader_new();
  GtEncseq *reads;

  if (gt_showtime_enabled())
  {
    gt_assert(timer != NULL);
    if (*timer == NULL) /* paths2seq */
    {
      *timer = gt_timer_new_with_progress_description(
          GT_READJOINER_ASSEMBLY_MSG_PUMPENCSEQ);
      gt_timer_show_cpu_time_by_progress(*timer);
      gt_timer_start(*timer);
    }
    else
      gt_timer_show_progress(*timer, GT_READJOINER_ASSEMBLY_MSG_PUMPENCSEQ,
          stdout);
  }
  gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_PUMPENCSEQ);
  gt_encseq_loader_drop_description_support(el);
  gt_encseq_loader_disable_autosupport(el);
  gt_encseq_loader_mirror(el);
  reads = gt_encseq_loader_load(el, readset, err);
  if (reads == NULL)
  {
    /* report the load failure to the caller instead of relying on an
       assertion, which would abort (or be compiled out and lead to a NULL
       dereference below) */
    gt_encseq_loader_delete(el);
    return -1;
  }
  gt_readjoiner_assembly_pump_encseq_through_cache(reads);
  if (gt_showtime_enabled())
    gt_timer_show_progress(*timer, GT_READJOINER_ASSEMBLY_MSG_OUTPUTCONTIGS,
        stdout);
  gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_OUTPUTCONTIGS);
  had_err = gt_contigpaths_to_fasta(readset,
      GT_READJOINER_SUFFIX_CONTIG_PATHS, GT_READJOINER_SUFFIX_CONTIGS, reads,
      lengthcutoff, showpaths, astat, coverage, load_copynum,
      (size_t)buffersize, default_logger, err);
  gt_encseq_delete(reads);
  gt_encseq_loader_delete(el);
  return had_err;
}
/* Log a summary for the index <inputindex>: the total number of mers (as
   returned by addupdistribution for the occurrence distribution of <state>)
   and an explanation of the distribution table, then print the distribution
   itself via showmerdistribution. If state->performtest is set, the total is
   first passed to checknumofmers. */
static void showfinalstatistics(const TyrDfsstate *state,
                                const char *inputindex,
                                GtLogger *logger)
{
  const uint64_t total_mers = addupdistribution(&state->occdistribution);

  if (state->performtest)
  {
    checknumofmers(state, total_mers);
  }

  gt_logger_log(logger,
                "the following output refers to the set of all sequences");
  gt_logger_log(logger, "represented by the index \"%s\"", inputindex);
  gt_logger_log(logger,
                "number of "GT_WU"-mers in the sequences not containing a "
                "wildcard: " Formatuint64_t,
                (GtUword) state->mersize,
                PRINTuint64_tcast(total_mers));

  /* explanation of the table printed by showmerdistribution() */
  gt_logger_log(logger,
                "show the distribution of the number of occurrences of "
                GT_WU "-mers",
                (GtUword) state->mersize);
  gt_logger_log(logger,
                "not containing a wildcard as rows of the form "
                "i d where");
  gt_logger_log(logger,
                "d is the number of events that a "GT_WU
                "-mer occurs exactly i times",
                (GtUword) state->mersize);

  showmerdistribution(state);
}
/* Run up to <bubble> rounds of p-bubble removal followed by up to <deadend>
   rounds of dead-end path removal (with depth <deadend_depth>) on
   <strgraph>. Each phase stops early as soon as a round removes nothing.
   Per-round and per-phase counts go to <verbose_logger>. Always returns 0. */
static int gt_readjoiner_assembly_error_correction(GtStrgraph *strgraph,
    unsigned int bubble, unsigned int deadend, unsigned int deadend_depth,
    GtLogger *verbose_logger)
{
  unsigned int round;
  GtUword removed, total;

  /* phase 1: p-bubbles */
  gt_logger_log(verbose_logger, "remove p-bubbles");
  total = 0;
  removed = 1UL; /* non-zero so that the first round is entered */
  round = 0;
  while (round < bubble && removed > 0)
  {
    removed = gt_strgraph_redpbubbles(strgraph, 0, 1UL, false);
    total += removed;
    round++;
    gt_logger_log(verbose_logger, "removed p-bubble edges [round %u] = "GT_WU,
                  round, removed);
  }
  gt_logger_log(verbose_logger, "removed p-bubble edges [%u rounds] = "GT_WU,
                round, total);

  /* phase 2: dead-end paths */
  gt_logger_log(verbose_logger, "remove dead-end paths");
  total = 0;
  removed = 1UL;
  round = 0;
  while (round < deadend && removed > 0)
  {
    removed = gt_strgraph_reddepaths(strgraph, (GtUword)deadend_depth, false);
    total += removed;
    round++;
    gt_logger_log(verbose_logger, "removed dead-end path edges [round %u] = "
                  GT_WU, round, removed);
  }
  gt_logger_log(verbose_logger, "removed dead-end path edges [%u rounds] = "
                GT_WU, round, total);

  return 0;
}
/* Load the string graph stored for <readset> (file suffix
   GT_READJOINER_SUFFIX_SG) into <*strgraph> and report progress and the
   current space usage. */
static void gt_readjoiner_assembly_load_graph(GtStrgraph **strgraph,
    GtEncseq *reads, const char *readset, GtUword rlen,
    GtLogger *default_logger, GtTimer *timer)
{
  GtStrgraph *loaded;

  loaded = gt_strgraph_new_from_file(reads, rlen, readset,
                                     GT_READJOINER_SUFFIX_SG);
  *strgraph = loaded;

  if (gt_showtime_enabled())
  {
    gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_LOADSG, stdout);
  }
  gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_LOADSG);
  gt_readjoiner_assembly_show_current_space("(graph loaded)");
}
/* Run <hmmsearch_path> with the profile <hmm_filename> against the unique
   FASTA file of <ces>, using the options --noali, --notextw and
   --domtblout <table_filename>. If debug logging is enabled, the output read
   from the pipe is forwarded line by line to the debug log.
   Returns 0 on success and -1 if the process could not be started. */
static int hmmsearch_call_coarse_search(GtCondenseq* ces,
                                        char *hmmsearch_path,
                                        char *table_filename,
                                        char *hmm_filename,
                                        GtLogger *logger,
                                        GtError *err)
{
  char *no_env[] = { NULL };
  char **argvec = NULL;
  char *opt_noali, *opt_notextw, *opt_domtblout;
  GtSafePipe *child = NULL;
  GtStr *coarse_fas = gt_condenseq_unique_fasta_file(ces);

  gt_assert(coarse_fas != NULL);

  /* 7 arguments plus the terminating NULL entry (provided by calloc) */
  argvec = gt_calloc((size_t) 8, sizeof (*argvec));
  opt_noali = gt_cstr_dup("--noali");
  opt_notextw = gt_cstr_dup("--notextw");
  opt_domtblout = gt_cstr_dup("--domtblout");
  argvec[0] = hmmsearch_path;
  argvec[1] = opt_noali;
  argvec[2] = opt_notextw;
  argvec[3] = opt_domtblout;
  argvec[4] = table_filename;
  argvec[5] = hmm_filename;
  argvec[6] = gt_str_get(coarse_fas);

  gt_logger_log(logger, "calling: %s", hmmsearch_path);
  child = gt_safe_popen(hmmsearch_path, argvec, no_env, err);

  /* the argument vector is released right after the spawn attempt */
  gt_free(opt_noali);
  gt_free(opt_notextw);
  gt_free(opt_domtblout);
  gt_free(argvec);
  gt_str_delete(coarse_fas);

  if (child == NULL)
  {
    return -1;
  }

  if (gt_log_enabled())
  {
    GtStr *outline = gt_str_new();
    for (;;)
    {
      if (gt_str_read_next_line(outline, child->read_fd) != 0)
      {
        break;
      }
      gt_log_log("%s", gt_str_get(outline));
      gt_str_reset(outline);
    }
    gt_str_delete(outline);
  }
  (void) gt_safe_pclose(child);
  return 0;
}
/* Run <hmmsearch_path> with the profile <hmm_filename> against
   <fine_fasta_filename> and print every line read from the pipe to stdout.
   If <table_filename> is not NULL, "--tblout <table_filename>" is added to
   the command line.
   Returns 0 on success and -1 if the process could not be started. */
static int hmmsearch_call_fine_search(GtStr *table_filename,
                                      char *fine_fasta_filename,
                                      char *hmmsearch_path,
                                      char *hmm_filename,
                                      GtLogger *logger,
                                      GtError *err)
{
  const bool with_table = (table_filename != NULL);
  /* program, hmm, fasta and terminating NULL; two more for --tblout FILE */
  const size_t nslots = with_table ? (size_t) 6 : (size_t) 4;
  char *no_env[] = { NULL };
  char **argvec = gt_calloc(nslots, sizeof (*argvec));
  unsigned int next = 0;
  GtSafePipe *child = NULL;
  GtStr *outline;

  argvec[next++] = hmmsearch_path;
  if (with_table)
  {
    argvec[next++] = gt_cstr_dup("--tblout");
    argvec[next++] = gt_str_get(table_filename);
  }
  argvec[next++] = hmm_filename;
  argvec[next++] = fine_fasta_filename;
  gt_assert(argvec[next] == NULL);

  gt_logger_log(logger, "calling: %s", hmmsearch_path);
  child = gt_safe_popen(hmmsearch_path, argvec, no_env, err);

  /* only the duplicated option string (slot 1) is owned by this function */
  if (with_table)
  {
    gt_free(argvec[1]);
  }
  gt_free(argvec);

  if (child == NULL)
  {
    return -1;
  }

  outline = gt_str_new();
  for (;;)
  {
    if (gt_str_read_next_line(outline, child->read_fd) != 0)
    {
      break;
    }
    printf("%s\n", gt_str_get(outline));
    gt_str_reset(outline);
  }
  gt_str_delete(outline);
  (void) gt_safe_pclose(child);
  return 0;
}
/* Allocate a suffix sort space for <numofentries> (> 0) entries holding
   values up to <maxvalue>. Depending on gt_decide_to_use_uint(useuint,
   maxvalue) the table is stored in the uinttab or in the ulongtab member; the
   other member is set to NULL. The table size in bytes is computed with an
   overflow-checked multiplication. <logger> is only used on _LP64 builds. */
GtSuffixsortspace *gt_suffixsortspace_new(unsigned long numofentries,
                                          unsigned long maxvalue,
                                          bool useuint,
                                          GT_UNUSED GtLogger *logger)
{
  GtSuffixsortspace *sssp;
  unsigned long tablebytes;

  gt_assert(numofentries > 0);
  sssp = gt_malloc(sizeof (*sssp));

  /* bookkeeping members */
  sssp->maxindex = numofentries-1;
  sssp->maxvalue = maxvalue;
  sssp->partoffset = 0;
  sssp->bucketleftidx = 0;
  sssp->unmapsortspace = false;
  sssp->currentexport = false;
  sssp->longestidx.defined = false;
  sssp->longestidx.valueunsignedlong = 0;
  sssp->exportptr.ulongtabsectionptr = NULL;
  sssp->exportptr.uinttabsectionptr = NULL;

#ifdef _LP64
  gt_logger_log(logger,"suftab uses %dbit values: "
                       "maxvalue=%lu,numofentries=%lu",
                       gt_decide_to_use_uint(useuint,maxvalue) ? 32 : 64,
                       maxvalue,numofentries);
#endif

  /* element size of the table */
  if (gt_decide_to_use_uint(useuint,maxvalue))
  {
    sssp->basesize = sizeof (*sssp->uinttab);
  } else
  {
    sssp->basesize = sizeof (*sssp->ulongtab);
  }

  tablebytes = gt_safe_mult_ulong_check((unsigned long) sssp->basesize,
                                        numofentries,
                                        gt_suffixsortspace_overflow_abort,
                                        &numofentries);
  gt_log_log("sizeof (suftab)=%lu bytes",tablebytes);

  /* exactly one of the two tables is allocated */
  if (gt_decide_to_use_uint(useuint,maxvalue))
  {
    sssp->ulongtab = NULL;
    sssp->uinttab = gt_malloc((size_t) tablebytes);
  } else
  {
    sssp->uinttab = NULL;
    sssp->ulongtab = gt_malloc((size_t) tablebytes);
  }
  return sssp;
}
/* Tool runner of genomediff.
 *
 * The remaining command line arguments (argv[parsed_args..argc-1]) are
 * collected as file names.
 *   - More than one file name: the files are encoded as DNA into an encoded
 *     sequence named arguments->indexname.
 *   - Otherwise the first file name is appended to arguments->indexname; with
 *     an ESA or packed index the project file is scanned for a line
 *     "mirrored=1" to decide whether the sequence has to be loaded mirrored.
 * The encoded sequence is then loaded, the unit information is set up
 * (optionally from the unit file), the shustring length sums are computed
 * (from an existing index, or by running the suffixerator) and finally
 * passed to gt_genomediff_kr_calc.
 *
 * Returns 0 on success and a non-zero value on error. */
static int gt_genomediff_runner(int argc, const char **argv,
                                int parsed_args, void *tool_arguments,
                                GtError *err)
{
  bool mirrored = false;
  int had_err = 0,
      i;
  GtEncseq *encseq = NULL;
  GtGenomediffArguments *arguments = tool_arguments;
  GtLogger *logger;
  GtShuUnitFileInfo *unit_info = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose,
                         GT_LOGGER_DEFLT_PREFIX,
                         stdout);
  gt_assert(logger);

  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("start");
    /* check the timer before it is used, not afterwards */
    gt_assert(timer);
    gt_timer_start(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "start shu search", stdout);

  if (gt_str_array_size(arguments->filenames) > 1UL) {
    GtEncseqEncoder *ee = gt_encseq_encoder_new();
    gt_encseq_encoder_set_timer(ee, timer);
    gt_encseq_encoder_set_logger(ee, logger);
    /* kr only makes sense for dna, so we can check this already with ee */
    gt_encseq_encoder_set_input_dna(ee);
    had_err = gt_encseq_encoder_encode(ee, arguments->filenames,
                                       gt_str_get(arguments->indexname), err);
    gt_encseq_encoder_delete(ee);
  }
  else {
    /* NOTE(review): assumes at least one file name was given; presumably
       enforced by the option parser -- confirm */
    gt_str_append_str(arguments->indexname,
                      gt_str_array_get_str(arguments->filenames, 0));
    if (arguments->with_esa || arguments->with_pck) {
      GtStr *current_line = gt_str_new();
      FILE *prj_fp;
      const char *buffer;
      char **elements = NULL;

      prj_fp = gt_fa_fopen_with_suffix(gt_str_get(arguments->indexname),
                                       GT_PROJECTFILESUFFIX,"rb",err);
      if (prj_fp == NULL)
        had_err = -1;
      /* scan the project file for "mirrored=<value>" */
      while (!had_err &&
             gt_str_read_next_line(current_line, prj_fp) != EOF) {
        buffer = gt_str_get(current_line);
        /* release the tokens of the previous line */
        if (elements != NULL) {
          gt_free(elements[0]);
          gt_free(elements[1]);
        }
        gt_free(elements);
        elements = gt_cstr_split(buffer, '=');
        if (elements[0] != NULL) {
          gt_log_log("%s", elements[0]);
          /* a line without '=' has no value token; do not hand NULL to
             strcmp() or to a %s conversion */
          if (strcmp("mirrored", elements[0]) == 0 && elements[1] != NULL) {
            gt_log_log("%s", elements[1]);
            if (strcmp("1", elements[1]) == 0) {
              mirrored = true;
              gt_log_log("sequences are treated as mirrored");
            }
          }
        }
        gt_str_reset(current_line);
      }
      gt_str_delete(current_line);
      if (elements != NULL) {
        gt_free(elements[0]);
        gt_free(elements[1]);
      }
      gt_free(elements);
      if (prj_fp != NULL)
        gt_fa_xfclose(prj_fp);
    }
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    if (mirrored)
      gt_encseq_loader_mirror(el);
    encseq =
      gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  /* also true if a previous step failed and nothing was loaded */
  if (encseq == NULL)
    had_err = -1;

  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (!had_err) {
    uint64_t **shusums = NULL;
    if (arguments->with_esa || arguments->with_pck) {
      shusums = gt_genomediff_shulen_sum(arguments, unit_info,
                                         logger, timer, err);
      if (shusums == NULL)
        had_err = -1;
    }
    else {
      const bool doesa = true;
      GenomediffInfo gd_info;
      Suffixeratoroptions sopts;
      sopts.beverbose = arguments->verbose;
      sopts.indexname = arguments->indexname;
      sopts.db = NULL;
      sopts.encopts = NULL;
      sopts.genomediff = true;
      sopts.inputindex = arguments->indexname;
      sopts.loadopts = arguments->loadopts;
      sopts.showprogress = false;
      sopts.idxopts = arguments->idxopts;

      gt_assert(unit_info != NULL);
      gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                          unit_info->num_of_genomes);
      gd_info.shulensums = shusums;
      gd_info.unit_info = unit_info;
      had_err = gt_runsuffixerator(doesa, &sopts, &gd_info, logger, err);
    }
    if (!had_err && shusums != NULL) {
      had_err = gt_genomediff_kr_calc(shusums, arguments, unit_info,
                                      arguments->with_pck, logger, timer, err);
    }
    /* release the matrix on the error path too (it used to leak when the
       suffixerator run failed) */
    if (shusums != NULL) {
      gt_array2dim_delete(shusums);
    }
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);

  return had_err;
}
/* Derive the order of the suffixes in <suffixsortspace> bucket by bucket,
   visiting the first characters in the order given by bucketspec2->order.

   For every source character the function
     - marks every not yet sorted subbucket (source,second) with
       source != second as sorted and adds its width to a "hard work"
       counter (no sorting is performed in this function itself),
     - calls forwardderive() and backwardderive() with target offsets taken
       from the start and end indexes of the subbuckets (idx,source),
     - marks all subbuckets (idx,source) and the superbucket of source as
       sorted.
   Finally the accumulated hard work is logged, absolute and relative to the
   total length of bucketspec2->encseq.

   NOTE(review): although <bucketspec2> is a pointer to const, the tables it
   points to are modified through it. */
void gt_copysort_derivesorting(const GtBucketspec2 *bucketspec2,
                               GtSuffixsortspace *suffixsortspace,
                               GtLogger *logger)
{
  GtUword hardwork = 0,
          *targetoffset;
  unsigned int idx, idxsource, source, second;

  /* debugging aid; NOTE(review): uses readmode and encseq, which are not
     declared in this function -- this block presumably only compiles if
     they are available at file scope; confirm */
#ifdef WITHSUFFIXES
  {
    GtUword idx;
    for (idx = 0; idx < bucketspec2->partwidth; idx++)
    {
      gt_encseq_showatstartpos(
                            stdout,
                            GT_ISDIRREVERSE(readmode) ? false : true,
                            GT_ISDIRCOMPLEMENT(readmode) ? true : false,
                            encseq,
                            gt_suffixsortspace_getdirect(suffixsortspace,idx));
    }
  }
#endif
  /* one target position per first character, reused for both directions */
  targetoffset = gt_malloc(sizeof (*targetoffset) * bucketspec2->numofchars);
  for (idxsource = 0; idxsource<bucketspec2->numofchars; idxsource++)
  {
    source = bucketspec2->order[idxsource];
    for (second = 0; second < bucketspec2->numofchars; second++)
    {
      if (!bucketspec2->subbuckettab[source][second].sorted && source != second)
      {
        gt_assert(bucketspec2->subbuckettab[source][second].hardworktodo);
        gt_logger_log(logger,"hard work for %u %u",source,second);
        hardwork += getendidx(bucketspec2,source,second) -
                    getstartidx(bucketspec2,source,second);
        bucketspec2->subbuckettab[source][second].sorted = true;
      } else
      {
        gt_assert(!bucketspec2->subbuckettab[source][second].hardworktodo);
      }
    }
    /* forward pass: only if there is something before subbucket
       (source,source) */
    if (getstartidx(bucketspec2,source,0) <
        getstartidx(bucketspec2,source,source))
    {
      for (idx = 0; idx < bucketspec2->numofchars; idx++)
      {
        targetoffset[idx] = getstartidx(bucketspec2,idx,source);
      }
      forwardderive(bucketspec2,
                    suffixsortspace,
                    targetoffset,
                    source,
                    getstartidx(bucketspec2,source,0));
    }
    /* backward pass: only if there is something after subbucket
       (source,source) */
    if (getendidx(bucketspec2,source,source) <
        getendidx(bucketspec2,source,bucketspec2->numofchars))
    {
      for (idx = 0; idx < bucketspec2->numofchars; idx++)
      {
        /* do not need to assert that getendidx(idx,source) > 0, as later
           the value stored in targetoffset is incremented */
        targetoffset[idx] = getendidx(bucketspec2,idx,source) - 1;
      }
      gt_assert(getendidx(bucketspec2,source,bucketspec2->numofchars) > 0);
      backwardderive(bucketspec2,
                     suffixsortspace,
                     targetoffset,
                     source,
                     getendidx(bucketspec2,source,bucketspec2->numofchars) - 1);
    }
    /* everything with second character source is now in order */
    for (idx = 0; idx < bucketspec2->numofchars; idx++)
    {
      bucketspec2->subbuckettab[idx][source].sorted = true;
    }
    bucketspec2->superbuckettab[source].sorted = true;
  }
  gt_free(targetoffset);
  gt_logger_log(logger,"hardwork = "GT_WU" (%.2f)",
                hardwork,
                (double) hardwork/gt_encseq_total_length(bucketspec2->encseq));
}
/* Read the hit table <table_filename> written by the coarse search and run
   the fine search on the sequences behind the hits.

   Every line not starting with '#' is split at blanks; column 0 is parsed as
   the number of the hit target, column 3 is the query name. For each hit,
   gt_condenseq_each_redundant_seq() is called with hmmsearch_process_seq and
   the tree <sequences>. Whenever the counter of distinct consecutive query
   names reaches arguments->max_queries, the collected sequences are written
   with hmmsearch_create_fine_fas(), the fine search is run on them and the
   collection is cleared. A last fine search is run for what remains after
   the table has been read.

   If arguments->outtable_filename is not empty, a running file number is
   appended to it for each fine search and removed again afterwards (not
   after the final call).

   Returns 0 on success and a non-zero value on error (details in <err>). */
static int hmmsearch_process_coarse_hits(
                                       char *table_filename,
                                       GtCondenseq *ces,
                                       GtCondenseqHmmsearchArguments *arguments,
                                       GtLogger *logger,
                                       GtError *err) {
  int had_err = 0;
  GtStr *line = gt_str_new();
  FILE *table = NULL;
  GtSplitter *splitter = gt_splitter_new();
  GtStr *query = gt_str_new(),
        *fine_fasta_filename = gt_str_new_cstr("condenseq");
  GtRBTree *sequences = NULL;
  GtUword filecount = (GtUword) 1;
  unsigned int querycount = 0;
  /* lengths of the unmodified names, used to cut off what is appended per
     batch; presumably hmmsearch_create_fine_fas() extends the FASTA file
     name -- confirm */
  const GtUword fine_fasta_name_length = gt_str_length(fine_fasta_filename);
  const GtUword table_name_length = gt_str_length(arguments->outtable_filename);

  table = gt_xfopen(table_filename, "r");

  sequences = gt_rbtree_new(hmmsearch_cmp_seqnum,
                            hmmsearch_tree_free_node, NULL);

  while (!had_err && gt_str_read_next_line(line, table) == 0) {
    char *c_line = gt_str_get(line);
    GtUword uid;
    const GtUword target_column = 0,
          query_column = (GtUword) 3;

    /* lines starting with '#' are comments */
    if (c_line[0] != '#') {
      gt_splitter_split_non_empty(splitter, c_line, gt_str_length(line), ' ');
      /* NOTE(review): the expected column count of the table is only
         enforced by an assertion */
      gt_assert(gt_splitter_size(splitter) == (GtUword) 23);
      if (sscanf(gt_splitter_get_token(splitter, target_column),
                 GT_WU, &uid) != 1) {
        gt_error_set(err, "couldn't parse target number: %s",
                     gt_splitter_get_token(splitter, target_column));
        had_err = -1;
      }
      /* a change of the query name starts a new query */
      if (gt_str_length(query) == 0 ||
          strcmp(gt_str_get(query),
                 gt_splitter_get_token(splitter, query_column)) != 0) {
        gt_str_set(query, gt_splitter_get_token(splitter, query_column));
        gt_logger_log(logger, "new query: %s", gt_str_get(query));
        querycount++;
      }

      /* batch is full: search the sequences collected so far; the hit of the
         current line is added to the next batch below */
      if (!had_err && querycount == arguments->max_queries) {
        hmmsearch_create_fine_fas(fine_fasta_filename, sequences, ces);
        if (table_name_length != 0)
          gt_str_append_uword(arguments->outtable_filename, filecount++);
        had_err =
          hmmsearch_call_fine_search(table_name_length != 0 ?
                                       arguments->outtable_filename :
                                       NULL,
                                     gt_str_get(fine_fasta_filename),
                                     gt_str_get(arguments->hmmsearch_path),
                                     gt_str_get(arguments->hmm),
                                     logger, err);
        gt_rbtree_clear(sequences);
        /* restore the base names for the next batch */
        gt_str_set_length(fine_fasta_filename, fine_fasta_name_length);
        if (table_name_length != 0)
          gt_str_set_length(arguments->outtable_filename, table_name_length);
        querycount = 0;
      }

      if (!had_err) {
        /* a return value of 0 is treated as failure here */
        if (gt_condenseq_each_redundant_seq(ces, uid,
                                            hmmsearch_process_seq,
                                            sequences, err) == 0) {
          had_err = -1;
        }
      }
      gt_splitter_reset(splitter);
    }
    gt_str_reset(line);
  }
  gt_splitter_delete(splitter);
  gt_str_delete(line);
  gt_str_delete(query);
  gt_xfclose(table);

  /* final batch with the sequences collected since the last fine search */
  if (!had_err) {
    hmmsearch_create_fine_fas(fine_fasta_filename, sequences, ces);
    if (table_name_length != 0)
      gt_str_append_uword(arguments->outtable_filename, filecount++);
    had_err =
      hmmsearch_call_fine_search(table_name_length != 0 ?
                                   arguments->outtable_filename :
                                   NULL,
                                 gt_str_get(fine_fasta_filename),
                                 gt_str_get(arguments->hmmsearch_path),
                                 gt_str_get(arguments->hmm),
                                 logger, err);
  }
  /* NOTE(review): filecount starts at 1 and is only incremented when an
     output table name is set, so this is not necessarily the number of
     files created -- confirm the intended meaning */
  gt_log_log("created " GT_WU " files", filecount);
  gt_rbtree_delete(sequences);
  gt_str_delete(fine_fasta_filename);
  return had_err;
}
/* For every query sequence in arguments->queryname compute the shustring
   lengths of all its suffixes with respect to the packed index
   arguments->indexname.

   With arguments->shulen_only the sum of the shustring lengths is printed per
   query; otherwise the average shustring length and the GC content of the
   query are used to compute the divergence (gt_divergence) and from that the
   Kr value (gt_calculateKr), which is printed to stdout.

   Returns 0 on success and 1 on error (index could not be loaded, the
   alphabet is not DNA, or reading a query failed). */
int gt_genomediff_pck_shu_simple(GtLogger *logger,
                                 const GtGenomediffArguments *arguments,
                                 GtError *err)
{
  int had_err = 0;
  int retval;
  GtSeqIterator *queries = NULL;
  const GtUchar *symbolmap, *currentQuery;
  const GtAlphabet *alphabet;
  GtUchar c_sym = 0, g_sym = 0;
  uint64_t queryNo;
  char *description = NULL;
  unsigned long queryLength,
                subjectLength = 0,
                currentSuffix;
  double avgShuLength,
         currentShuLength = 0.0,
         /*gc_subject,*/
         gc_query /*, gc*/;
  const FMindex *subjectindex = NULL;
  Genericindex *genericindexSubject;
  const GtEncseq *encseq = NULL;
  double *ln_n_fac;

  /* get the precalculation of ln(n!) for 0<n<max_ln_n_fac */
  ln_n_fac = gt_get_ln_n_fac(arguments->max_ln_n_fac);
  gt_log_log("ln(max_ln_n_fac!) = %f\n",
             ln_n_fac[arguments->max_ln_n_fac]);

  genericindexSubject = genericindex_new(gt_str_get(arguments->indexname),
                                         arguments->with_esa,
                                         true,
                                         false,
                                         true,
                                         arguments->user_max_depth,
                                         logger,
                                         err);
  if (genericindexSubject == NULL)
  {
    had_err = 1;
  }
  else
  {
    encseq = genericindex_getencseq(genericindexSubject);
  }

  if (!had_err)
  {
    subjectLength = genericindex_get_totallength(genericindexSubject) - 1;
    /*subjectLength /= 2;*/
    /*gt_log_log("subject length: %lu", subjectLength);*/
    subjectindex = genericindex_get_packedindex(genericindexSubject);

    queries = gt_seqiterator_sequence_buffer_new(arguments->queryname, err);
    gt_assert(queries);
    alphabet = gt_encseq_alphabet(encseq);
    /* makes assumption that alphabet is dna, it has to calculate the gc! */
    if (!gt_alphabet_is_dna(alphabet))
    {
      fprintf(stderr, "error: Sequences need to be dna");
      had_err = 1;
    }
    else
    {
      symbolmap = gt_alphabet_symbolmap(alphabet);
      gt_seqiterator_set_symbolmap(queries, symbolmap);
      c_sym = gt_alphabet_encode(alphabet, 'c');
      g_sym = gt_alphabet_encode(alphabet, 'g');
    }
  }

  for (queryNo = 0; !had_err; queryNo++)
  {
    retval = gt_seqiterator_next(queries,
                                 &currentQuery,
                                 &queryLength,
                                 &description,
                                 err);
    if (retval != 1)
    {
      if (retval < 0)
      {
        /* reading the query failed: report it to the caller instead of
           silently ending the loop with a success status */
        gt_free(description);
        description = NULL;
        had_err = 1;
      }
      break;
    }
    gt_logger_log(logger, "found query of length: %lu", queryLength);
    avgShuLength = 0.0;
    gc_query = 0.0;
    /* sum up the shustring lengths of all suffixes and count G/C symbols */
    for (currentSuffix = 0; currentSuffix < queryLength; currentSuffix++)
    {
      currentShuLength = (double) gt_pck_getShuStringLength(
                                            subjectindex,
                                            &currentQuery[currentSuffix],
                                            queryLength - currentSuffix);
      avgShuLength += currentShuLength;
      if (currentQuery[currentSuffix] == c_sym ||
          currentQuery[currentSuffix] == g_sym)
      {
        gc_query++;
      }
    }
    if (arguments->shulen_only)
    {
      printf("# Query %d sum of shulen:\n %.0f\n",
             (int) queryNo, avgShuLength);
    }
    else
    {
      /* NOTE(review): a query of length 0 leads to a division by zero in
         floating point here */
      avgShuLength /= (double) queryLength;
      gc_query /= (double) queryLength;

      gt_logger_log(logger, "Query %d has an average SHUstring length "
                            "of\n# shulength: %f",
                            (int) queryNo, avgShuLength);
      gt_logger_log(logger, "Query description: %s", description);
      gt_log_log("Query (i): %s", description);

      /* XXX add error checks */

      if (!had_err)
      {
        double div, kr;

        gt_logger_log(logger, "shulen:\n%f", avgShuLength);
        gt_log_log("shu: %f, gc: %f, len: %lu",
                   avgShuLength, gc_query, subjectLength);
        div = gt_divergence(arguments->divergence_rel_err,
                            arguments->divergence_abs_err,
                            arguments->divergence_m,
                            arguments->divergence_threshold,
                            avgShuLength,
                            subjectLength,
                            gc_query,
                            ln_n_fac,
                            arguments->max_ln_n_fac);
        gt_logger_log(logger, "divergence:\n%f", div);

        kr = gt_calculateKr(div);

        printf("# Kr:\n%f\n", kr);
      }
    }
    /* NOTE(review): description is not released per query; whether the
       caller owns it after gt_seqiterator_next() cannot be told from here */
  }
  gt_free(ln_n_fac);
  gt_seqiterator_delete(queries);
  genericindex_delete(genericindexSubject);
  return had_err;
}
/* Tool runner of gdiffcalc.
 *
 * Loads the encoded sequence arguments->indexname, sets up the unit
 * information (optionally from the unit file) and reads a square table of
 * average shustring lengths from the first file name given on the command
 * line. Each token of the file is one row; the fields of a row are separated
 * by ';', a field starting with '#' is treated as a name and only logged.
 * The table is then passed to gt_genomediff_calculate_div_from_avg.
 *
 * Returns 0 on success and a non-zero value on error (details in <err>). */
static int gt_gdiffcalc_runner(int argc, const char **argv,
                               int parsed_args, void *tool_arguments,
                               GT_UNUSED GtError *err)
{
  GtGenomediffArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtUword lcounter = 0, zcounter = 0;
  double **shusums = NULL;
  GtEncseq *encseq = NULL;
  GtLogger *logger;
  GtShuUnitFileInfo *unit_info = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose,
                         GT_LOGGER_DEFLT_PREFIX,
                         stdout);
  gt_assert(logger);

  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("load encseq");
    gt_timer_start(timer);
    gt_assert(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    encseq =
      gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;

  if (timer != NULL)
    gt_timer_show_progress(timer, "load units", stdout);

  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "read table", stdout);

  if (!had_err) {
    GtIO *table_file = NULL;
    GtTokenizer *tokenizer = NULL;
    GtStr *line = NULL;

    gt_assert(unit_info != NULL);
    gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                        unit_info->num_of_genomes);

    table_file = gt_io_new(gt_str_array_get(arguments->filenames, 0), "r");
    tokenizer = gt_tokenizer_new(table_file);
    line = gt_tokenizer_get_token(tokenizer);
    while (line != NULL && !had_err) {
      char *cline = gt_str_get(line);
      char *elem = strtok(cline, ";");
      zcounter = 0;
      while (elem != NULL && !had_err) {
        if (*elem != '#') {
          /* the matrix has num_of_genomes rows and columns; never write
             outside of it, whatever the input file contains */
          if (lcounter >= unit_info->num_of_genomes ||
              zcounter >= unit_info->num_of_genomes) {
            had_err = 1;
            gt_error_set(err, "table has more rows or columns than genomes");
            break;
          }
          if (1 != sscanf(elem, "%lf",
                          &shusums[lcounter][zcounter])) {
            had_err = 1;
            gt_error_set(err, "couldn't scan");
            break;
          }
          gt_logger_log(logger,"wert: %lf", shusums[lcounter][zcounter]);
          zcounter++;
        }
        else {
          gt_logger_log(logger, "name: %s", elem);
        }
        elem = strtok(NULL, ";");
      }
      gt_tokenizer_next_token(tokenizer);
      gt_str_delete(line);
      line = gt_tokenizer_get_token(tokenizer);
      lcounter++;
      /* lcounter is unsigned, so print it with the unsigned format */
      gt_logger_log(logger, "line "GT_WU"", lcounter);
    }
    /* after an error the token fetched last has not been released yet */
    if (line != NULL)
      gt_str_delete(line);
    /* NOTE(review): tokenizer and table_file are never released in this
       function; presumably the tokenizer owns the GtIO and should be deleted
       here -- confirm against the tokenizer API */
  }
  if (!had_err) {
    GtUword num_of_seq, file_idx, seq_idx, startpos;
    GT_UNUSED GtUword oldpos = 0;

    gt_assert(unit_info != NULL);
    gt_assert(lcounter == zcounter);
    gt_assert(lcounter == unit_info->num_of_genomes);

    /* debug output: mapping of sequences to files and genomes */
    num_of_seq = gt_encseq_num_of_sequences(unit_info->encseq);

    for (seq_idx = 0; seq_idx < num_of_seq; seq_idx++) {
      startpos = gt_encseq_seqstartpos(unit_info->encseq, seq_idx);
      file_idx = gt_encseq_filenum(unit_info->encseq, startpos);
      gt_log_log("seq: "GT_WU" starts at: "GT_WU"\n"
                 "belonges to file: "GT_WU" which is part of genome: %s",
                 seq_idx, startpos, file_idx,
                 gt_str_array_get(unit_info->genome_names,
                                  unit_info->map_files[file_idx]));
      gt_assert(oldpos <= startpos);
      oldpos = startpos;
    }
  }
  if (!had_err && shusums != NULL) {
    had_err = gt_genomediff_calculate_div_from_avg(shusums, arguments,
                                                   unit_info,
                                                   logger, timer, err);
  }
  /* release the table on the error path as well */
  if (shusums != NULL) {
    gt_array2dim_delete(shusums);
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);

  return had_err;
}
/* Runner of the condenser search tool.

   If <arguments->blastn> or <arguments->blastp> is set, the following steps
   are performed (each one only if no error occurred before):
   1. load the GtNREncseq stored under <arguments->dbpath>,
   2. if no coarse or fine E-value was given, derive a raw E-value from the
      bitscore and the average query length,
   3. create a BLAST database from "<dbpath>.fas" and run the coarse search,
      collecting the ids of all hit unique sequences,
   4. write the hit unique sequences to "coarse_<db basename>.fas",
   5. create a BLAST database from that file, run the fine search and print
      every hit as one tab separated line to <arguments->outfp>.

   Returns 0 on success and -1 (or the error code of a failing helper) if an
   error occurred. */
static int gt_condenser_search_runner(GT_UNUSED int argc,
                                      GT_UNUSED const char **argv,
                                      GT_UNUSED int parsed_args,
                                      void *tool_arguments,
                                      GtError *err)
{
  GtCondenserSearchArguments *arguments = tool_arguments;
  int i, had_err = 0;
  char *querypath = gt_str_get(arguments->querypath);
  GtStr* coarse_fname = gt_str_new_cstr("coarse_");
  char *db_basename = NULL;
  char *suffix_ptr = NULL;
  GtTimer *timer = NULL;
  GtLogger *logger = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  db_basename = gt_basename(gt_str_get(arguments->dbpath));
  /* if first char is '.' this might be a hidden file */
  if (strlen(db_basename) > (size_t) 1 &&
      (suffix_ptr = strrchr(db_basename + 1, '.')) != NULL) {
    /* remove suffix */
    *suffix_ptr = '\0';
  }
  gt_str_append_cstr(coarse_fname, db_basename);
  gt_str_append_cstr(coarse_fname, ".fas");
  gt_free(db_basename);
  db_basename = NULL;
  suffix_ptr = NULL;

  if (arguments->blastn || arguments->blastp) {
    GtMatch *match;
    GtMatchIterator *mp = NULL;
    GtNREncseq *nrencseq = NULL;
    GtStr *fastaname = gt_str_clone(arguments->dbpath);
    HitPosition *hits;
    double eval, raw_eval = 0.0;
    GtUword coarse_db_len = 0;
    GtMatchIteratorStatus status;
    int curr_hits = 0, max_hits = 100;

    hits = gt_malloc(sizeof (*hits) * (size_t) max_hits);
    gt_str_append_cstr(fastaname, ".fas");

    for (i=0; i < max_hits; i++) {
      hits[i].range = gt_malloc(sizeof (*hits[i].range) * (size_t) 1);
    }

    if (gt_showtime_enabled()) {
      timer = gt_timer_new_with_progress_description("initialization");
      gt_timer_start(timer);
    }

    /*extract sequences from compressed database*/
    if (!had_err) {
      nrencseq = gt_n_r_encseq_new_from_file(gt_str_get(arguments->dbpath),
                                             logger, err);
      if (nrencseq == NULL)
        had_err = -1;
    }

    if (!had_err) {
      if (arguments->ceval == GT_UNDEF_DOUBLE ||
          arguments->feval == GT_UNDEF_DOUBLE) {
        /* from NCBI BLAST tutorial:
           E = Kmne^{-lambdaS}
           calculates E-value for score S with natural scale parameters K for
           search space size and lambda for the scoring system
           E = mn2^-S'
           m being the subject (total) length, n the length of ONE query
           calculates E-value for bit-score S' */
        GtFastaReader *reader;
        GtCondenserSearchAvg avg = {0,0};

        reader = gt_fasta_reader_rec_new(arguments->querypath);
        had_err = gt_fasta_reader_run(reader, NULL, NULL,
                                      gt_condenser_search_cum_moving_avg,
                                      &avg, err);
        if (!had_err) {
          GtUword S = arguments->bitscore;

          gt_log_log(GT_WU " queries, avg query size: " GT_WU,
                     avg.count, avg.avg);
          gt_assert(avg.avg != 0);
          raw_eval = 1/pow(2.0, (double) S) * avg.avg;
          gt_logger_log(logger, "Raw E-value set to %.4e", raw_eval);
        }
        gt_fasta_reader_delete(reader);
      }
    }

    /*create BLAST database from compressed database fasta file*/
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create coarse BLAST db", stderr);
      if (arguments->blastn)
        had_err = gt_condenser_search_create_nucl_blastdb(
                                                   gt_str_get(fastaname), err);
      else
        had_err = gt_condenser_search_create_prot_blastdb(
                                                   gt_str_get(fastaname), err);
    }

    /* coarse BLAST search: remember the id of every hit unique sequence */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "coarse BLAST run", stderr);

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();
      gt_blast_process_call_set_db(call, gt_str_get(fastaname));
      gt_blast_process_call_set_query(call, querypath);
      /* NOTE(review): arguments->ceval is passed on unchanged, even if it is
         GT_UNDEF_DOUBLE; raw_eval is only applied to the fine search --
         confirm that this is intended */
      gt_blast_process_call_set_evalue(call, arguments->ceval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      while (!had_err &&
             (status = gt_match_iterator_next(mp, &match, err)) !=
             GT_MATCHER_STATUS_END) {
        if (status == GT_MATCHER_STATUS_OK) {
          GtUword hit_seq_id;
          char string[7];
          const char *dbseqid = gt_match_get_seqid2(match);

          if (sscanf(dbseqid,"%6s" GT_WU, string, &hit_seq_id) == 2) {
            gt_match_get_range_seq2(match, hits[curr_hits].range);
            hits[curr_hits].idx = hit_seq_id;
            gt_match_delete(match);
            curr_hits++;
            if (curr_hits == max_hits) {
              /* table is full: make room for another 100 hits */
              max_hits += 100;
              hits = gt_realloc(hits, sizeof (*hits) * (size_t) max_hits);
              for (i=max_hits - 100; i < max_hits; i++) {
                hits[i].range = gt_malloc(sizeof (*hits[i].range));
              }
            }
          }
          else {
            gt_error_set(err, "could not parse unique db header %s", dbseqid);
            had_err = -1;
            /* dbseqid is obtained from the match, hence delete afterwards */
            gt_match_delete(match);
          }
        }
        else if (status == GT_MATCHER_STATUS_ERROR) {
          had_err = -1;
        }
      }
      gt_match_iterator_delete(mp);
    }

    /*extract sequences*/
    if (!had_err) {
      GtNREncseqDecompressor *decomp;
      GtFile *coarse_hits;

      if (timer != NULL)
        gt_timer_show_progress(timer, "extract coarse search hits", stderr);

      decomp = gt_n_r_encseq_decompressor_new(nrencseq);
      coarse_hits = gt_file_new(gt_str_get(coarse_fname),"w", err);
      if (coarse_hits == NULL)
        had_err = -1;

      if (!had_err) {
        /* TODO DW do NOT extract complete uniques! these could be complete
           chromosomes!! just extract something around it? maybe +- max query
           length*/
        for (i = 0; i < curr_hits; i++) {
          gt_n_r_encseq_decompressor_add_unique_idx_to_extract(decomp,
                                                               hits[i].idx);
        }
        had_err =
          gt_n_r_encseq_decompressor_start_unique_extraction(coarse_hits,
                                                             decomp,
                                                             &coarse_db_len,
                                                             err);
        /* the length is only meaningful if the extraction succeeded */
        if (!had_err)
          gt_assert(coarse_db_len != 0);
      }
      gt_file_delete(coarse_hits);
      gt_n_r_encseq_decompressor_delete(decomp);
    }
    gt_n_r_encseq_delete(nrencseq);

    /* create BLAST database from decompressed database file */
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create fine BLAST db", stderr);
      if (arguments->blastn)
        had_err = gt_condenser_search_create_nucl_blastdb(
                                                gt_str_get(coarse_fname), err);
      else
        had_err = gt_condenser_search_create_prot_blastdb(
                                                gt_str_get(coarse_fname), err);
    }

    /* perform fine BLAST search */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "fine BLAST run", stderr);

      if (arguments->feval == GT_UNDEF_DOUBLE) {
        eval = raw_eval * coarse_db_len;
      } else {
        eval = arguments->feval;
      }

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();

      gt_blast_process_call_set_db(call, gt_str_get(coarse_fname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, eval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      gt_logger_log(logger, "Fine E-value set to: %.4e (len)" GT_WU, eval,
                    coarse_db_len);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      if (!had_err) {
        GtUword numofhits = 0;

        while (!had_err &&
               (status = gt_match_iterator_next(mp, &match, err)) !=
               GT_MATCHER_STATUS_END) {
          if (status == GT_MATCHER_STATUS_OK) {
            GtMatchBlast *matchb = (GtMatchBlast*) match;
            GtRange range_seq1;
            GtRange range_seq2;

            numofhits++;
            gt_match_get_range_seq1(match, &range_seq1);
            gt_match_get_range_seq2(match, &range_seq2);
            gt_file_xprintf(
                    arguments->outfp,
                    "%s\t%s\t%.2f\t" GT_WU "\t" GT_WU "\t" GT_WU "\t" GT_WU "\t"
                    GT_WU "\t%g\t%.3f\n",
                    gt_match_get_seqid1(match),
                    gt_match_get_seqid2(match),
                    gt_match_blast_get_similarity(matchb),
                    gt_match_blast_get_align_length(matchb),
                    range_seq1.start,
                    range_seq1.end,
                    range_seq2.start,
                    range_seq2.end,
                    gt_match_blast_get_evalue(matchb),
                    (double) gt_match_blast_get_bitscore(matchb));
            gt_match_delete(match);
          }
          else if (status == GT_MATCHER_STATUS_ERROR) {
            had_err = -1;
          }
        }
        gt_log_log(GT_WU " hits found\n", numofhits);
      }
      gt_match_iterator_delete(mp);
    }

    if (!had_err && timer != NULL)
      gt_timer_show_progress_final(timer, stderr);
    gt_timer_delete(timer);

    /*cleanup*/
    for (i=0; i < max_hits; i++) {
      gt_free(hits[i].range);
    }
    gt_free(hits);
    gt_str_delete(fastaname);
  }

  gt_str_delete(coarse_fname);
  gt_logger_delete(logger);
  return had_err;
}
static int enumeratelcpintervals(const char *inputindex, Sequentialsuffixarrayreader *ssar, const char *storeindex, bool storecounts, GtUword mersize, GtUword minocc, GtUword maxocc, bool performtest, GtLogger *logger, GtError *err) { TyrDfsstate *state; bool haserr = false; unsigned int alphasize; gt_error_check(err); state = gt_malloc(sizeof (*state)); GT_INITARRAY(&state->occdistribution,Countwithpositions); state->esrspace = gt_encseq_create_reader_with_readmode( gt_encseqSequentialsuffixarrayreader(ssar), gt_readmodeSequentialsuffixarrayreader(ssar), 0); state->mersize = (GtUword) mersize; state->encseq = gt_encseqSequentialsuffixarrayreader(ssar); alphasize = gt_alphabet_num_of_chars(gt_encseq_alphabet(state->encseq)); state->readmode = gt_readmodeSequentialsuffixarrayreader(ssar); state->storecounts = storecounts; state->minocc = minocc; state->maxocc = maxocc; state->totallength = gt_encseq_total_length(state->encseq); state->performtest = performtest; state->countoutputmers = 0; state->merindexfpout = NULL; state->countsfilefpout = NULL; GT_INITARRAY(&state->largecounts,Largecount); if (strlen(storeindex) == 0) { state->sizeofbuffer = 0; state->bytebuffer = NULL; } else { state->sizeofbuffer = MERBYTES(mersize); state->bytebuffer = gt_malloc(sizeof *state->bytebuffer * state->sizeofbuffer); } if (performtest) { state->currentmer = gt_malloc(sizeof *state->currentmer * state->mersize); state->suftab = gt_suftabSequentialsuffixarrayreader(ssar); } else { state->currentmer = NULL; state->suftab = NULL; } if (state->mersize > state->totallength) { gt_error_set(err,"mersize "GT_WU" > "GT_WU" = totallength not allowed", state->mersize, state->totallength); haserr = true; } else { if (strlen(storeindex) == 0) { state->processoccurrencecount = adddistpos2distribution; } else { state->merindexfpout = gt_fa_fopen_with_suffix(storeindex,MERSUFFIX, "wb",err); if (state->merindexfpout == NULL) { haserr = true; } else { if (state->storecounts) { state->countsfilefpout = 
gt_fa_fopen_with_suffix(storeindex,COUNTSSUFFIX,"wb",err); if (state->countsfilefpout == NULL) { haserr = true; } } } state->processoccurrencecount = outputsortedstring2index; } if (!haserr) { if (gt_depthfirstesa(ssar, tyr_allocateDfsinfo, tyr_freeDfsinfo, tyr_processleafedge, NULL, tyr_processcompletenode, tyr_assignleftmostleaf, tyr_assignrightmostleaf, (Dfsstate*) state, logger, err) != 0) { haserr = true; } if (strlen(storeindex) == 0) { showfinalstatistics(state,inputindex,logger); } } if (!haserr) { if (state->countsfilefpout != NULL) { gt_logger_log(logger,"write "GT_WU" mercounts > "GT_WU " to file \"%s%s\"", state->largecounts.nextfreeLargecount, (GtUword) MAXSMALLMERCOUNT, storeindex, COUNTSSUFFIX); gt_xfwrite(state->largecounts.spaceLargecount, sizeof (Largecount), (size_t) state->largecounts.nextfreeLargecount, state->countsfilefpout); } } if (!haserr) { gt_logger_log(logger,"number of "GT_WU"-mers in index: "GT_WU"", mersize, state->countoutputmers); gt_logger_log(logger,"index size: %.2f megabytes\n", GT_MEGABYTES(state->countoutputmers * state->sizeofbuffer + sizeof (GtUword) * EXTRAINTEGERS)); } } /* now out EXTRAINTEGERS integer values */ if (!haserr && state->merindexfpout != NULL) { outputbytewiseUlongvalue(state->merindexfpout, (GtUword) state->mersize); outputbytewiseUlongvalue(state->merindexfpout,(GtUword) alphasize); } gt_fa_xfclose(state->merindexfpout); gt_fa_xfclose(state->countsfilefpout); GT_FREEARRAY(&state->occdistribution,Countwithpositions); gt_free(state->currentmer); gt_free(state->bytebuffer); GT_FREEARRAY(&state->largecounts,Largecount); gt_encseq_reader_delete(state->esrspace); gt_free(state); return haserr ? -1 : 0; }
/* Reads the description file of index <indexname> (suffix
   GT_DESTABFILESUFFIX) line by line and extracts one key per line via
   desc2key(). All keys must be non-empty and of the same length.

   If <sortkeys> is false, the key length followed by the '\0'-terminated
   keys is written to the file with suffix GT_KEYSTABFILESUFFIX; in this mode
   every key must be lexicographically larger than its predecessor.

   If <sortkeys> is true, the keys are stored together with their line
   number, sorted with compareFixedkeys, and the corresponding sequences of
   the encoded sequence are written to stdout in that order.

   Returns 0 on success and -1 (with <err> set) on failure. */
int gt_extractkeysfromdesfile(const char *indexname,
                              bool sortkeys,
                              GtLogger *logger,
                              GtError *err)
{
  FILE *fpin, *fpout = NULL;
  GtStr *line = NULL;
  const char *keyptr;
  unsigned long keylen, constantkeylen = 0, linenum;
  bool haserr = false, firstdesc = true;
  char *previouskey = NULL;
  Fixedsizekey *keytab = NULL, *keytabptr = NULL;
  GtEncseq *encseq = NULL;
  unsigned long numofentries = 0;
  const unsigned long linewidth = 60UL;

  fpin = gt_fa_fopen_with_suffix(indexname,GT_DESTABFILESUFFIX,"rb",err);
  if (fpin == NULL)
  {
    return -1;
  }
  if (!sortkeys)
  {
    fpout = gt_fa_fopen_with_suffix(indexname,GT_KEYSTABFILESUFFIX,"wb",err);
    if (fpout == NULL)
    {
      haserr = true;
    }
  }
  if (!haserr)
  {
    line = gt_str_new();
  }
  for (linenum = 0;
       !haserr && gt_str_read_next_line(line, fpin) != EOF;
       linenum++)
  {
    keyptr = desc2key(&keylen,gt_str_get(line),err);
    if (keyptr == NULL)
    {
      haserr = true;
      break;
    }
    if (keylen == 0)
    {
      gt_error_set(err,"key of length 0 in \"%s\" not expected",
                   gt_str_get(line));
      haserr = true;
      break;
    }
    if (firstdesc)
    {
      /* the first key determines the length all other keys must have */
      if (keylen > (unsigned long) CHAR_MAX)
      {
        gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                         "no key must be larger than %d",
                     (int) keylen,(int) keylen,keyptr,keylen,CHAR_MAX);
        haserr = true;
        break;
      }
      constantkeylen = keylen;
      previouskey = gt_malloc(sizeof (char) * (constantkeylen+1));
      firstdesc = false;
      if (!sortkeys)
      {
        gt_xfputc((char) constantkeylen,fpout);
      } else
      {
        GtEncseqLoader *el;

        if (constantkeylen > (unsigned long) MAXFIXEDKEYSIZE)
        {
          gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                           "no key must be larger than %d",
                       (int) keylen,(int) keylen,keyptr,keylen,
                       MAXFIXEDKEYSIZE);
          haserr = true;
          break;
        }
        el = gt_encseq_loader_new();
        gt_encseq_loader_set_logger(el, logger);
        encseq = gt_encseq_loader_load(el, indexname, err);
        gt_encseq_loader_delete(el);
        if (encseq == NULL)
        {
          haserr = true;
          break;
        }
        numofentries = gt_encseq_num_of_sequences(encseq);
        gt_assert(numofentries > 0);
        keytab = gt_malloc(sizeof (*keytab) * numofentries);
        keytabptr = keytab;
      }
    } else
    {
      if (constantkeylen != keylen)
      {
        gt_error_set(err,"key \"%*.*s\" of length %lu: all keys must be of "
                         "the same length which for all previously seen "
                         "headers is %lu",
                     (int) keylen,(int) keylen,keyptr,keylen,
                     constantkeylen);
        haserr = true;
        break;
      }
      gt_assert(previouskey != NULL);
      if (!sortkeys &&
          strncmp(previouskey,keyptr,(size_t) constantkeylen) >= 0)
      {
        gt_error_set(err,"previous key \"%s\" is not lexicographically smaller "
                         "than current key \"%*.*s\"",
                     previouskey,(int) keylen,(int) keylen,keyptr);
        haserr = true;
        break;
      }
    }
    if (!sortkeys)
    {
      gt_xfwrite(keyptr,sizeof *keyptr,(size_t) keylen,fpout);
      gt_xfputc('\0',fpout);
    } else
    {
      gt_assert(keytabptr != NULL);
      /* keytab holds exactly numofentries elements: never write beyond it,
         even if the description file has more lines than sequences */
      if (keytabptr >= keytab + numofentries)
      {
        gt_error_set(err,"description file contains more keys than the "
                         "%lu sequences of the index",numofentries);
        haserr = true;
        break;
      }
      strncpy(keytabptr->key,keyptr,(size_t) constantkeylen);
      keytabptr->key[constantkeylen] = '\0';
      keytabptr->seqnum = linenum;
      keytabptr++;
    }
    strncpy(previouskey,keyptr,(size_t) constantkeylen);
    previouskey[constantkeylen] = '\0';
    gt_str_reset(line);
  }
  if (!haserr)
  {
    gt_logger_log(logger,"number of keys of length %lu = %lu",
                  constantkeylen,linenum);
  }
  gt_str_delete(line);
  gt_fa_fclose(fpin);
  gt_fa_fclose(fpout);
  gt_free(previouskey);
  if (!haserr && sortkeys)
  {
    gt_assert(keytabptr != NULL);
    gt_assert(numofentries > 0);
    if (keytabptr != keytab + numofentries)
    {
      /* fewer keys than sequences: sorting would touch uninitialized
         entries, hence report this as an error */
      gt_error_set(err,"number of keys (%lu) differs from the number of "
                       "sequences (%lu) of the index",
                   (unsigned long) (keytabptr - keytab),numofentries);
      haserr = true;
    } else
    {
      qsort(keytab,(size_t) numofentries,sizeof (*keytab),compareFixedkeys);
      for (keytabptr = keytab; keytabptr < keytab + numofentries; keytabptr++)
      {
        if (giextract_encodedseq2fasta(stdout,
                                       encseq,
                                       keytabptr->seqnum,
                                       NULL,
                                       linewidth,
                                       err) != 0)
        {
          haserr = true;
          break;
        }
      }
    }
  }
  if (encseq != NULL)
  {
    gt_encseq_delete(encseq);
    encseq = NULL;
  }
  gt_free(keytab);
  return haserr ? -1 : 0;
}
/* Fills <suffixarray> for the index <indexname>.

   The encoded sequence is always loaded; the bits set in <demand>
   (SARR_DESTAB, SARR_SDSTAB, SARR_SSPTAB, SARR_SUFTAB, SARR_LCPTAB,
   SARR_BWTTAB, SARR_BCKTAB) select which further tables are required.
   If <map> is true, suftab, lcptab, llvtab and bwttab are memory mapped,
   otherwise they are opened as buffered files via INITBufferedfile.

   On failure the suffix array is released with gt_freesuffixarray() and -1
   is returned; on success the return value is 0.

   NOTE(review): INITBufferedfile is a macro defined elsewhere; it presumably
   uses the local names haserr and err, which is why they must keep their
   names -- confirm. */
static int inputsuffixarray(bool map,
                            Suffixarray *suffixarray,
                            unsigned int demand,
                            const char *indexname,
                            GtLogger *logger,
                            GtError *err)
{
  bool haserr = false;
  GtEncseqLoader *el;
  GtUword totallength = 0;

  gt_error_check(err);
  initsuffixarray(suffixarray);

  /* configure the loader according to the demanded tables */
  el = gt_encseq_loader_new();
  if (demand & SARR_DESTAB)
  {
    gt_encseq_loader_require_des_tab(el);
  } else
  {
    gt_encseq_loader_do_not_require_des_tab(el);
  }
  if (demand & SARR_SDSTAB)
  {
    gt_encseq_loader_require_sds_tab(el);
  } else
  {
    gt_encseq_loader_do_not_require_sds_tab(el);
  }
  if (demand & SARR_SSPTAB)
  {
    gt_encseq_loader_require_ssp_tab(el);
  } else
  {
    gt_encseq_loader_do_not_require_ssp_tab(el);
  }
  gt_encseq_loader_set_logger(el, logger);
  suffixarray->encseq = gt_encseq_loader_load(el, indexname, err);
  gt_encseq_loader_delete(el);
  if (suffixarray->encseq == NULL)
  {
    haserr = true;
  }
  if (!haserr)
  {
    haserr = scanprjfileuintkeys(suffixarray,indexname,logger,err);
  }
  if (!haserr
        && suffixarray->mirroredencseq
        && !gt_encseq_is_mirrored(suffixarray->encseq))
  {
    if (gt_encseq_mirror(suffixarray->encseq, err) != 0)
    {
      haserr = true;
    }
  }
  if (!haserr)
  {
    totallength = gt_encseq_total_length(suffixarray->encseq);
  }

  /* suffix table */
  if (!haserr && (demand & SARR_SUFTAB))
  {
    if (!map)
    {
#if defined (_LP64) || defined (_WIN64)
      off_t filesize = gt_file_size_with_suffix(indexname,GT_SUFTABSUFFIX);

      /* the size of the file tells the width of the stored values */
      if (filesize == (off_t) sizeof (uint32_t) *
                              suffixarray->numberofallsortedsuffixes)
      {
        gt_logger_log(logger,"read suftab in units of 4 bytes");
        INITBufferedfile(indexname,&suffixarray->suftabstream_uint32_t,uint32_t,
                         GT_SUFTABSUFFIX);
      } else
      {
        gt_logger_log(logger,"read suftab in units of 8 bytes");
        INITBufferedfile(indexname,&suffixarray->suftabstream_GtUword,GtUword,
                         GT_SUFTABSUFFIX);
      }
#else
      gt_logger_log(logger,"read suftab in units of 4 bytes");
      INITBufferedfile(indexname,&suffixarray->suftabstream_GtUword,GtUword,
                       GT_SUFTABSUFFIX);
#endif
    } else
    {
      if (suffixarray->numberofallsortedsuffixes > 0)
      {
        suffixarray->suftab
          = gt_fa_mmap_check_size_with_suffix(indexname,
                                       GT_SUFTABSUFFIX,
                                       suffixarray->numberofallsortedsuffixes,
                                       sizeof (*suffixarray->suftab),
                                       err);
        if (suffixarray->suftab == NULL)
        {
          haserr = true;
        }
      }
    }
    if (!haserr && !suffixarray->longest.defined)
    {
      gt_error_set(err,"longest not defined");
      haserr = true;
    }
  }

  /* lcp table and the table of large lcp values */
  if (!haserr && (demand & SARR_LCPTAB))
  {
    if (!map)
    {
      INITBufferedfile(indexname,&suffixarray->lcptabstream,GtUchar,
                       GT_LCPTABSUFFIX);
      /* position the stream behind the first value */
      if (!haserr &&
          fseek(suffixarray->lcptabstream.fp,
                (GtWord) sizeof (GtUchar),SEEK_SET))
      {
        gt_error_set(err,"fseek(esastream) failed: %s",strerror(errno));
        haserr = true;
      }
    } else
    {
      if (suffixarray->numberofallsortedsuffixes > 0)
      {
        suffixarray->lcptab
          = gt_fa_mmap_check_size_with_suffix(indexname,
                                       GT_LCPTABSUFFIX,
                                       suffixarray->numberofallsortedsuffixes,
                                       sizeof (*suffixarray->lcptab),
                                       err);
        if (suffixarray->lcptab == NULL)
        {
          haserr = true;
        }
      }
    }
    if (!haserr && !suffixarray->numoflargelcpvalues.defined)
    {
      gt_error_set(err,"numoflargelcpvalues not defined");
      haserr = true;
    }
    if (!haserr && suffixarray->numoflargelcpvalues.valueunsignedlong > 0)
    {
      if (!map)
      {
        INITBufferedfile(indexname,&suffixarray->llvtabstream,Largelcpvalue,
                         GT_LARGELCPTABSUFFIX);
      } else
      {
        suffixarray->llvtab
          = gt_fa_mmap_check_size_with_suffix(indexname,
                                       GT_LARGELCPTABSUFFIX,
                                       (GtUword) suffixarray->
                                                 numoflargelcpvalues.
                                                 valueunsignedlong,
                                       sizeof (*suffixarray->llvtab),
                                       err);
        if (suffixarray->llvtab == NULL)
        {
          haserr = true;
        }
      }
    }
  }

  /* bwt table */
  if (!haserr && (demand & SARR_BWTTAB))
  {
    if (!map)
    {
      INITBufferedfile(indexname,&suffixarray->bwttabstream,GtUchar,
                       GT_BWTTABSUFFIX);
    } else
    {
      suffixarray->bwttab
        = gt_fa_mmap_check_size_with_suffix(indexname,
                                            GT_BWTTABSUFFIX,
                                            totallength+1,
                                            sizeof (*suffixarray->bwttab),
                                            err);
      if (suffixarray->bwttab == NULL)
      {
        haserr = true;
      }
    }
  }

  /* bucket table */
  if (!haserr && (demand & SARR_BCKTAB))
  {
    suffixarray->bcktab
      = gt_bcktab_map(indexname,
                      gt_encseq_alphabetnumofchars(suffixarray->encseq),
                      suffixarray->prefixlength,
                      totallength+1,
                      true,
                      err);
    if (suffixarray->bcktab == NULL)
    {
      haserr = true;
    }
  }

  if (haserr)
  {
    gt_freesuffixarray(suffixarray);
  }
  return haserr ? -1 : 0;
}
/* Self test for the maximal pair computation.

   Draws <samples> pairs of substrings (each of length at most
   <substringlength>, which is capped at half of the total length) from the
   encoded sequence of index <indexname>. For every pair the matches are
   computed once with gt_sarrquerysubstringmatch() and once with
   sarrselfsubstringmatch(); both result sets are sorted and compared. If
   they differ, both sets and both sequences are printed and the program
   exits with GT_EXIT_PROGRAMMING_ERROR.

   Returns 0 on success and -1 if an error occurred. */
int gt_testmaxpairs(const char *indexname,
                    GtUword samples,
                    unsigned int minlength,
                    GtUword substringlength,
                    GtLogger *logger,
                    GtError *err)
{
  GtEncseq *encseq;
  GtUword totallength = 0, dblen, querylen;
  GtUchar *dbseq = NULL, *query = NULL;
  bool haserr = false;
  GtUword s;
  GtArray *tabmaxquerymatches;
  Maxmatchselfinfo maxmatchselfinfo;
  GtEncseqLoader *el;

  gt_logger_log(logger,"draw "GT_WU" samples",samples);

  el = gt_encseq_loader_new();
  gt_encseq_loader_do_not_require_des_tab(el);
  gt_encseq_loader_do_not_require_ssp_tab(el);
  gt_encseq_loader_do_not_require_sds_tab(el);
  gt_encseq_loader_set_logger(el, logger);
  encseq = gt_encseq_loader_load(el, indexname, err);
  gt_encseq_loader_delete(el);
  if (encseq == NULL)
  {
    haserr = true;
  } else
  {
    totallength = gt_encseq_total_length(encseq);
  }
  if (!haserr)
  {
    if (substringlength > totallength/2)
    {
      substringlength = totallength/2;
    }
    dbseq = gt_malloc(sizeof *dbseq * substringlength);
    query = gt_malloc(sizeof *query * substringlength);
  }
  for (s=0; s<samples && !haserr; s++)
  {
    /* everything allocated in this iteration is released at its end, no
       matter whether one of the match computations fails */
    tabmaxquerymatches = NULL;
    maxmatchselfinfo.results = NULL;
    maxmatchselfinfo.querymarkpos = NULL;

    dblen = samplesubstring(dbseq,encseq,substringlength);
    querylen = samplesubstring(query,encseq,substringlength);
    gt_logger_log(logger,"run query match for dblen="GT_WU""
                         ",querylen= "GT_WU", minlength=%u",
                  dblen,
                  querylen,
                  minlength);
    tabmaxquerymatches = gt_array_new(sizeof (Substringmatch));
    if (gt_sarrquerysubstringmatch(dbseq,
                                   dblen,
                                   query,
                                   (GtUword) querylen,
                                   minlength,
                                   gt_encseq_alphabet(encseq),
                                   storemaxmatchquery,
                                   tabmaxquerymatches,
                                   logger,
                                   err) != 0)
    {
      haserr = true;
    }
    if (!haserr)
    {
      gt_logger_log(logger,"run self match for dblen="GT_WU""
                           ",querylen= "GT_WU", minlength=%u",
                    dblen,
                    querylen,
                    minlength);
      maxmatchselfinfo.results = gt_array_new(sizeof (Substringmatch));
      maxmatchselfinfo.dblen = dblen;
      maxmatchselfinfo.querylen = querylen;
      maxmatchselfinfo.querymarkpos
        = sequence2markpositions(&maxmatchselfinfo.numofquerysequences,
                                 query,querylen);
      if (sarrselfsubstringmatch(dbseq,
                                 dblen,
                                 query,
                                 (GtUword) querylen,
                                 minlength,
                                 gt_encseq_alphabet(encseq),
                                 storemaxmatchself,
                                 &maxmatchselfinfo,
                                 logger,
                                 err) != 0)
      {
        haserr = true;
      }
    }
    if (!haserr)
    {
      gt_array_sort(tabmaxquerymatches,orderSubstringmatch);
      gt_array_sort(maxmatchselfinfo.results,orderSubstringmatch);
      if (!gt_array_equal(tabmaxquerymatches,maxmatchselfinfo.results,
                          orderSubstringmatch))
      {
        const GtUword width = 60UL;

        printf("failure for query of length "GT_WU"\n",(GtUword) querylen);
        printf("querymatches\n");
        (void) gt_array_iterate(tabmaxquerymatches,showSubstringmatch,NULL,
                                err);
        printf("dbmatches\n");
        (void) gt_array_iterate(maxmatchselfinfo.results,showSubstringmatch,
                                NULL,err);
        gt_symbolstring2fasta(stdout,"dbseq",
                              gt_encseq_alphabet(encseq),
                              dbseq,
                              (GtUword) dblen,
                              width);
        gt_symbolstring2fasta(stdout,"queryseq",
                              gt_encseq_alphabet(encseq),
                              query,
                              (GtUword) querylen,
                              width);
        exit(GT_EXIT_PROGRAMMING_ERROR);
      }
      printf("# numberofmatches="GT_WU"\n",gt_array_size(tabmaxquerymatches));
    }
    gt_free(maxmatchselfinfo.querymarkpos);
    gt_array_delete(tabmaxquerymatches);
    gt_array_delete(maxmatchselfinfo.results);
  }
  gt_free(dbseq);
  gt_free(query);
  gt_encseq_delete(encseq);
  encseq = NULL;
  return haserr ? -1 : 0;
}
/* Copies the ids of all sequence descriptions of <orig_es> into
   <condenseq->orig_ids>.

   In a first pass the distribution of the id lengths (as reported by
   condenseq_idlen()) is collected. From it two numbers are derived: the
   bytes that would be "wasted" if every id was padded to the maximal
   length, and the size reported by gt_intset_best_memory_size() for storing
   the end positions of the ids. The smaller of both decides whether ids are
   stored with constant length (<condenseq->id_len> is set) or back to back
   with their end positions in <condenseq->sdstab>.

   In a second pass the ids are copied accordingly. */
static void condenseq_process_descriptions(GtCondenseq *condenseq,
                                           const GtEncseq *orig_es,
                                           GtLogger *logger)
{
  GtUword *dist;
  const char *desc;
  char *cur_id_startptr;
  GtUword desclen,
          dist_idx,
          distsize = (GtUword) 128,
          idlen,
          idx,
          maxendidx = 0,
          maxlen = 0,
          minlen = GT_UWORD_MAX,
          wastedmem = 0,
          sdssize,
          cur_total_id_len = 0;
  bool use_const_len;

  condenseq->ids_total_len = 0;
  dist = gt_calloc((size_t) distsize, sizeof (*dist));

  /* first pass: count how often each id length occurs */
  for (idx = 0; idx < condenseq->orig_num_seq; ++idx) {
    desc = gt_encseq_description(orig_es, &desclen, idx);
    idlen = condenseq_idlen(desc, desclen);
    if (distsize <= idlen) {
      /* enlarge the table and clear the new entries */
      dist = gt_realloc(dist, (size_t) (idlen + 1) * sizeof (*dist));
      for (dist_idx = distsize; dist_idx <= idlen; dist_idx++)
        dist[dist_idx] = 0;
      distsize = idlen + 1;
    }
    dist[idlen]++;
    if (idlen > maxlen)
      maxlen = idlen;
    if (idlen < minlen)
      minlen = idlen;
    maxendidx += idlen;
  }

  /* calculate memory we would waste if we assume equal length, and size if we
     store actual descriptions.
     The bound is inclusive: ids of length maxlen waste nothing but count
     for the total length. If there are no sequences at all, minlen is still
     GT_UWORD_MAX and the loop is skipped, so dist is never read outside of
     its bounds. */
  for (dist_idx = minlen; dist_idx <= maxlen; dist_idx++) {
    wastedmem += dist[dist_idx] * (maxlen - dist_idx);
    condenseq->ids_total_len += dist[dist_idx] * dist_idx;
  }

  sdssize = (GtUword) gt_intset_best_memory_size(maxendidx,
                                                 condenseq->orig_num_seq);
  use_const_len = wastedmem < sdssize;

  if (use_const_len) {
    gt_logger_log(logger, "Condenseq descriptions will use const len, "
                  GT_WU ", \"wasting\" " GT_WU " bytes. SDS would use "
                  GT_WU " bytes",
                  maxlen, wastedmem, sdssize);
    condenseq->id_len = maxlen;
    condenseq->ids_total_len = maxlen * condenseq->orig_num_seq;
  }
  else {
    gt_logger_log(logger, "Condenseq descriptions will use sdstab with size "
                  GT_WU ". Const length would have wasted " GT_WU " bytes.",
                  sdssize, wastedmem);
    condenseq->sdstab = gt_intset_best_new(maxendidx,
                                           condenseq->orig_num_seq);
  }

  /* second pass: copy the ids; the buffer is zero initialized, so ids
     shorter than maxlen are followed by zero bytes in const len mode */
  condenseq->orig_ids = gt_calloc((size_t) condenseq->ids_total_len,
                                  sizeof (*condenseq->orig_ids));
  cur_id_startptr = condenseq->orig_ids;

  for (idx = 0; idx < condenseq->orig_num_seq; ++idx) {
    desc = gt_encseq_description(orig_es, &desclen, idx);
    idlen = condenseq_idlen(desc, desclen);
    gt_assert(idlen <= maxlen);
    (void) memcpy(cur_id_startptr, desc, (size_t) idlen);
    if (use_const_len) {
      cur_id_startptr += maxlen;
      cur_total_id_len += maxlen;
    }
    else {
      cur_id_startptr += idlen;
      cur_total_id_len += idlen;
      gt_intset_add(condenseq->sdstab, cur_total_id_len);
    }
  }
  gt_assert(cur_total_id_len == condenseq->ids_total_len);
  gt_free(dist);
}
/* Runner of the readjoiner assembly tool.

   Unless <arguments->paths2seq> is set, the readset is loaded, the string
   graph is either built from the SPM files or loaded (<arguments->load>),
   optionally reduced (<arguments->redtrans>), cleaned (<arguments->errors>)
   and saved (<arguments->save>), and finally traversed to output the contig
   paths. Afterwards, if no error occurred,
   gt_readjoiner_assembly_paths2seq() is called.

   Returns 0 on success and a non-zero value on failure. */
static int gt_readjoiner_assembly_runner(GT_UNUSED int argc,
                                         GT_UNUSED const char **argv,
                                         GT_UNUSED int parsed_args,
                                         void *tool_arguments,
                                         GtError *err)
{
  GtReadjoinerAssemblyArguments *arguments = tool_arguments;
  GtLogger *vlog, *dlog;
  GtEncseqLoader *loader;
  GtEncseq *reads;
  GtTimer *timer = NULL;
  GtStrgraph *graph = NULL;
  GtBitsequence *contained = NULL;
  const char *readset = gt_str_get(arguments->readset);
  bool eqlen = true;
  GtUword nreads, tlen, rlen;
  int had_err = 0;

  gt_assert(arguments);
  gt_error_check(err);

  /* one logger for the standard output, one for the verbose output */
  dlog = gt_logger_new(!arguments->quiet, GT_LOGGER_DEFLT_PREFIX, stdout);
  gt_logger_log(dlog,
                "gt readjoiner assembly (version "GT_READJOINER_VERSION")");
  vlog = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stdout);
  gt_logger_log(vlog, "verbose output activated");
  gt_logger_log(vlog, "readset name = %s", readset);

  if (gt_showtime_enabled())
  {
    timer = gt_timer_new_with_progress_description(
              GT_READJOINER_ASSEMBLY_MSG_COUNTSPM);
    gt_timer_start(timer);
    gt_timer_show_cpu_time_by_progress(timer);
  }

  if (!arguments->paths2seq)
  {
    loader = gt_encseq_loader_new();
    gt_encseq_loader_drop_description_support(loader);
    gt_encseq_loader_disable_autosupport(loader);
    reads = gt_encseq_loader_load(loader, readset, err);
    if (reads == NULL)
    {
      had_err = -1;
    }
    else
    {
      eqlen = gt_encseq_accesstype_get(reads) == GT_ACCESS_TYPE_EQUALLENGTH;
      nreads = gt_encseq_num_of_sequences(reads);
      gt_logger_log(dlog, "number of reads in filtered readset = " GT_WU,
                    nreads);
      tlen = gt_encseq_total_length(reads) - nreads + 1;
      gt_logger_log(vlog, "total length of filtered readset = " GT_WU, tlen);

      if (!eqlen)
      {
        had_err = gt_readjoiner_assembly_build_contained_reads_list(
                    arguments, &contained, err);
        rlen = 0;
        gt_logger_log(vlog, "read length = variable");
        gt_assert(reads != NULL);
      }
      else
      {
        /* with reads of equal length the readset itself is not needed any
           more once the read length is known */
        rlen = gt_encseq_seqlength(reads, 0);
        gt_logger_log(vlog, "read length = " GT_WU, rlen);
        gt_encseq_delete(reads);
        reads = NULL;
      }
    }

    if (!had_err)
    {
      if (arguments->load)
      {
        gt_readjoiner_assembly_load_graph(&graph, reads, readset, rlen,
                                          dlog, timer);
      }
      else
      {
        had_err = gt_readjoiner_assembly_build_graph(arguments, &graph,
                                                     reads, readset, eqlen,
                                                     rlen, nreads, contained,
                                                     dlog, vlog, timer, err);
      }
    }

    /* without error correction the reads can be released right away */
    if (!eqlen && reads != NULL && !arguments->errors)
    {
      gt_encseq_delete(reads);
      reads = NULL;
      if (!had_err)
      {
        gt_strgraph_set_encseq(graph, NULL);
      }
    }

    if (!had_err && arguments->redtrans)
    {
      if (gt_showtime_enabled())
      {
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_REDTRANS,
                               stdout);
      }
      gt_strgraph_sort_edges_by_len(graph, false);
      (void) gt_strgraph_redtrans(graph, false);
      (void) gt_strgraph_redself(graph, false);
      (void) gt_strgraph_redwithrc(graph, false);
      gt_strgraph_log_stats(graph, vlog);
    }

    if (!had_err && arguments->errors)
    {
      if (gt_showtime_enabled())
      {
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_CLEANSG,
                               stdout);
      }
      gt_logger_log(dlog, GT_READJOINER_ASSEMBLY_MSG_CLEANSG);
      had_err = gt_readjoiner_assembly_error_correction(graph,
                  arguments->bubble, arguments->deadend,
                  arguments->deadend_depth, vlog);
    }

    if (!had_err && arguments->save)
    {
      if (gt_showtime_enabled())
      {
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_SAVESG,
                               stdout);
      }
      gt_logger_log(dlog, GT_READJOINER_ASSEMBLY_MSG_SAVESG);
      gt_strgraph_show(graph, GT_STRGRAPH_BIN,
                       gt_str_get(arguments->readset),
                       GT_READJOINER_SUFFIX_SG, false);
    }

    /* release the reads in case they are still around */
    if (!eqlen && reads != NULL)
    {
      gt_encseq_delete(reads);
      reads = NULL;
      if (!had_err)
      {
        gt_strgraph_set_encseq(graph, NULL);
      }
    }

    if (!had_err)
    {
      if (gt_showtime_enabled())
      {
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_TRAVERSESG,
                               stdout);
      }
      gt_logger_log(dlog, GT_READJOINER_ASSEMBLY_MSG_TRAVERSESG);
      gt_readjoiner_assembly_show_current_space("(before traversal)");
      gt_strgraph_spell(graph,
                        (GtUword)arguments->depthcutoff,
                        (GtUword)arguments->lengthcutoff,
                        arguments->vd,
                        readset,
                        GT_READJOINER_SUFFIX_CONTIG_PATHS,
                        NULL,
                        true,
                        arguments->show_contigs_info,
                        false,
                        vlog);
    }

    if (contained != NULL)
    {
      gt_free(contained);
    }
    gt_strgraph_delete(graph);
    graph = NULL;
    gt_assert(reads == NULL);
    gt_encseq_loader_delete(loader);
  }

  if (!had_err)
  {
    gt_readjoiner_assembly_show_current_space("(before paths2seq)");
    had_err = gt_readjoiner_assembly_paths2seq(readset,
                                               (GtUword)arguments->lengthcutoff,
                                               arguments->vd,
                                               arguments->astat,
                                               arguments->coverage,
                                               arguments->copynum,
                                               arguments->buffersize,
                                               dlog,
                                               &timer,
                                               err);
  }

  if (gt_showtime_enabled())
  {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(dlog);
  gt_logger_delete(vlog);
  return had_err;
}
/* Reads a condenseq data structure from file.

   Loads the encoded sequence <indexname> (the unique sequences), reads the
   remaining data from the file with suffix GT_CONDENSEQ_FILE_SUFFIX via
   condenseq_io() and finally stores, for every unique element, the indices
   of all link elements referring to it.

   Returns the new GtCondenseq, or NULL (with <err> set) if an error
   occurred. */
GtCondenseq *gt_condenseq_new_from_file(const char *indexname,
                                        GtLogger *logger, GtError *err)
{
  int had_err = 0;
  FILE* fp;
  GtEncseqLoader *esl;
  GtEncseq *unique_es;
  GtCondenseq *condenseq = NULL;

  /*load unique_es*/
  esl = gt_encseq_loader_new();
  unique_es = gt_encseq_loader_load(esl, indexname, err);
  /* the loader is not needed any more, no matter if loading succeeded */
  gt_encseq_loader_delete(esl);
  if (!unique_es)
    had_err = -1;

  if (!had_err) {
    condenseq = condenseq_new_empty(gt_encseq_alphabet(unique_es));
    condenseq->filename = gt_cstr_dup(indexname);
    condenseq->unique_es = unique_es;

    fp = gt_fa_fopen_with_suffix(indexname, GT_CONDENSEQ_FILE_SUFFIX,
                                 "rb", err);
    if (fp == NULL) {
      had_err = -1;
    }
    else {
      had_err = condenseq_io(condenseq, fp, gt_io_error_fread, err);
      /* close the file on success and on failure */
      gt_fa_fclose(fp);
      if (!had_err) {
        GtUword i;

        gt_assert(condenseq->uniques);
        gt_assert(condenseq->links);
        /*create link array for each unique entry*/
        for (i = 0; i < condenseq->udb_nelems; i++) {
          GT_INITARRAY(&(condenseq->uniques[i].links),uint32_t);
        }
        /* check for overflows: link indices are stored as uint32_t */
        if (condenseq->ldb_nelems > (GtUword) ((uint32_t) 0 - (uint32_t) 1)) {
          gt_error_set(err, "Overflow, to many link-elements. Can't be stored");
          had_err = -1;
        }
        /* iterate through link entrys and store ids in corresponding unique
          entry array */
        for (i = 0; !had_err && (GtUword) i < condenseq->ldb_nelems; i++) {
          GtUword uid = condenseq->links[i].unique_id;

          gt_assert(uid < condenseq->udb_nelems);
          GT_STOREINARRAY(&(condenseq->uniques[uid].links),
                          uint32_t,
                          10,
                          (uint32_t) i);
        }
      }
    }
  }
  if (!had_err) {
    gt_assert(condenseq != NULL);
    if (condenseq->id_len != GT_UNDEF_UWORD)
      gt_logger_log(logger, "IDs const len: " GT_WU, condenseq->id_len);
    else
      gt_logger_log(logger, "using sdstab to access IDs");
  }
  if (had_err) {
    gt_condenseq_delete(condenseq);
    condenseq = NULL;
  }
  return (condenseq);
}
/* Runner of the condenseq compress tool.

   Steps, each skipped once an earlier one has failed:
   - set up the regular logger and the kmer-db logger; if <arguments->kdb> is
     set, the latter writes to the file "kmer_db.out"
   - default the index name to the basename of the input (argv[parsed_args])
   - load the input encoded sequence into <arguments->input_es>
   - derive values for options left undefined (see comments below)
   - validate the relation kmersize < windowsize <= minalignlength <= initsize
   - create, configure and run the GtCondenseqCreator

   Returns 0 on success and a non-zero value on failure. */
static int gt_condenseq_compress_runner(GT_UNUSED int argc, const char **argv,
                                        int parsed_args, void *tool_arguments,
                                        GtError *err)
{
  GtCondenseqCompressArguments *arguments = tool_arguments;
  GtLogger *logger, *kdb_logger;
  FILE *kmer_fp = NULL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);
  kdb_logger = gt_logger_new(arguments->kdb, GT_LOGGER_DEFLT_PREFIX, stderr);
  if (arguments->kdb) {
    kmer_fp = gt_fa_fopen("kmer_db.out", "w", err);
    /* opening may fail; never hand a NULL stream to the logger and do not
       continue as if nothing happened */
    if (kmer_fp == NULL)
      had_err = -1;
    else
      gt_logger_set_target(kdb_logger, kmer_fp);
  }

  /* no index name given: use the basename of the input */
  if (gt_str_length(arguments->indexname) == 0UL) {
    char *basenameptr;
    basenameptr = gt_basename(argv[parsed_args]);
    gt_str_set(arguments->indexname, basenameptr);
    gt_free(basenameptr);
  }

  if (!had_err) {
    GtEncseqLoader *es_l = gt_encseq_loader_new();
    arguments->input_es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->input_es == NULL)
      had_err = -1;
    gt_encseq_loader_delete(es_l);
  }

  if (!had_err) {
    /* first pass: derive smaller values from larger ones that were given,
       initsize -> minalignlength (1/3) -> windowsize (1/5) */
    if (arguments->minalignlength == GT_UNDEF_UWORD)
      arguments->minalignlength = arguments->initsize != GT_UNDEF_UWORD ?
                                  arguments->initsize / (GtUword) 3UL :
                                  GT_UNDEF_UWORD;
    if (arguments->windowsize == GT_UNDEF_UINT)
      arguments->windowsize = arguments->minalignlength != GT_UNDEF_UWORD ?
                              (unsigned int) (arguments->minalignlength / 5U) :
                              GT_UNDEF_UINT;
    /* enforce a lower bound of 4 on the window size */
    if (arguments->windowsize < 4U)
      arguments->windowsize = 4U;
    if (arguments->kmersize == GT_UNDEF_UINT) {
      unsigned int size =
        gt_alphabet_num_of_chars(gt_encseq_alphabet(arguments->input_es));
      /* size^k ~= 100000 */
      gt_safe_assign(arguments->kmersize,
                     gt_round_to_long(gt_log_base(100000.0, (double) size)));
      gt_logger_log(logger, "|A|: %u, k: %u", size, arguments->kmersize);
    }
    /* second pass: whatever is still undefined is derived upwards,
       kmersize -> windowsize (x5) -> minalignlength (x3) -> initsize (x3) */
    if (arguments->windowsize == GT_UNDEF_UINT) {
      arguments->windowsize = 5U * arguments->kmersize;
    }
    if (arguments->minalignlength == GT_UNDEF_UWORD) {
      arguments->minalignlength = (GtUword) (3UL * arguments->windowsize);
    }
    if (arguments->initsize == GT_UNDEF_UWORD) {
      arguments->initsize = (GtUword) (3UL * arguments->minalignlength);
    }
  }

  /* consistency checks on the (possibly derived) parameters */
  if (!had_err && arguments->windowsize <= arguments->kmersize) {
    gt_error_set(err, "-windowsize (%u) must be larger -kmersize (%u)!",
                 arguments->windowsize, arguments->kmersize);
    had_err = -1;
  }
  if (!had_err &&
      arguments->minalignlength < (GtUword) arguments->windowsize) {
    gt_error_set(err, "-alignlength (" GT_WU ") must be at least "
                 "-windowsize (%u)!", arguments->minalignlength,
                 arguments->windowsize);
    had_err = -1;
  }
  if (!had_err && (arguments->initsize < arguments->minalignlength)) {
    gt_error_set(err, "-initsize (" GT_WU ") must be at least "
                 "-alignlength (" GT_WU ")!", arguments->initsize,
                 arguments->minalignlength);
    had_err = -1;
  }

  if (!had_err) {
    GtCondenseqCreator *ces_c;

    ces_c = gt_condenseq_creator_new(arguments->initsize,
                                     arguments->minalignlength,
                                     arguments->xdrop,
                                     &(arguments->scores),
                                     arguments->kmersize,
                                     arguments->windowsize,
                                     logger,
                                     err);
    if (ces_c == NULL)
      had_err = -1;

    if (!had_err) {
      /* cutoff: undefined selects the mean cutoff, 0 disables the cutoff,
         any other value is used as given */
      if (arguments->cutoff_value == GT_UNDEF_UWORD)
        gt_condenseq_creator_use_mean_cutoff(ces_c);
      else if (arguments->cutoff_value == 0)
        gt_condenseq_creator_disable_cutoff(ces_c);
      else
        gt_condenseq_creator_set_cutoff(ces_c, arguments->cutoff_value);

      gt_condenseq_creator_set_mean_fraction(ces_c, arguments->fraction);

      if (arguments->prune)
        gt_condenseq_creator_disable_prune(ces_c);
      if (arguments->brute)
        gt_condenseq_creator_enable_brute_force(ces_c);
      if (!arguments->diags)
        gt_condenseq_creator_disable_diagonals(ces_c);
      if (arguments->full_diags)
        gt_condenseq_creator_enable_full_diagonals(ces_c);
      if (arguments->clean_percent != GT_UNDEF_UINT)
        gt_condenseq_creator_set_diags_clean_limit(ces_c,
                                                   arguments->clean_percent);

      had_err = gt_condenseq_creator_create(ces_c,
                                            arguments->indexname,
                                            arguments->input_es,
                                            logger, kdb_logger, err);

      gt_condenseq_creator_delete(ces_c);
    }
  }

  gt_logger_delete(logger);
  gt_logger_delete(kdb_logger);
  /* kmer_fp is non-NULL only if the kmer-db file was opened successfully */
  if (kmer_fp != NULL)
    gt_fa_fclose(kmer_fp);
  return had_err;
}