/* Destroy <outlcpinfo> and everything it owns; a NULL argument is ignored.
   If an lcp2file object is attached, the lcp table file is first padded
   (unless tail values are swallowed) so that exactly numsuffixes2output
   values have been written, then both output files are closed and the
   file-related buffers are released. Otherwise only the lcp value table is
   released. */
void gt_Outlcpinfo_delete(GtOutlcpinfo *outlcpinfo)
{
  if (outlcpinfo == NULL) {
    return;
  }
  gt_turningwheel_delete(outlcpinfo->turnwheel);
  if (outlcpinfo->lcpsubtab.lcp2file == NULL) {
    /* no file output attached: release the table of lcp values */
    gt_free(outlcpinfo->lcpsubtab.tableoflcpvalues.bucketoflcpvalues);
#ifndef NDEBUG
    gt_free(outlcpinfo->lcpsubtab.tableoflcpvalues.isset);
#endif
  } else {
    if (!outlcpinfo->swallow_tail_lcpvalues) {
      if (outlcpinfo->lcpsubtab.lcp2file->countoutputlcpvalues
            < outlcpinfo->numsuffixes2output) {
        /* write the values still missing; the return value of
           outmany0lcpvalues is added to the output counter */
        outlcpinfo->lcpsubtab.lcp2file->countoutputlcpvalues
          += outmany0lcpvalues(outlcpinfo->numsuffixes2output -
                               outlcpinfo->lcpsubtab.lcp2file
                                         ->countoutputlcpvalues,
                               outlcpinfo->lcpsubtab.lcp2file->outfplcptab);
      }
      gt_assert(outlcpinfo->lcpsubtab.lcp2file->countoutputlcpvalues ==
                outlcpinfo->numsuffixes2output);
    }
    GT_FREEARRAY(&outlcpinfo->lcpsubtab.lcp2file->largelcpvalues,
                 Largelcpvalue);
    gt_fa_fclose(outlcpinfo->lcpsubtab.lcp2file->outfplcptab);
    gt_fa_fclose(outlcpinfo->lcpsubtab.lcp2file->outfpllvtab);
    gt_free(outlcpinfo->lcpsubtab.lcp2file->reservoir);
    outlcpinfo->lcpsubtab.lcp2file->smalllcpvalues = NULL;
    outlcpinfo->lcpsubtab.lcp2file->reservoir = NULL;
    outlcpinfo->lcpsubtab.lcp2file->sizereservoir = 0;
    gt_free(outlcpinfo->lcpsubtab.lcp2file);
  }
  gt_free(outlcpinfo->lcpsubtab.lcpprocess);
  /* reset the table description before the container itself is freed */
  outlcpinfo->lcpsubtab.tableoflcpvalues.bucketoflcpvalues = NULL;
#ifndef NDEBUG
  outlcpinfo->lcpsubtab.tableoflcpvalues.isset = NULL;
#endif
  outlcpinfo->lcpsubtab.tableoflcpvalues.numofentries = 0;
  if (outlcpinfo->lcpsubtab.distlcpvalues != NULL) {
    /* the distribution is shown before it is deleted */
    gt_disc_distri_show(outlcpinfo->lcpsubtab.distlcpvalues,NULL);
    gt_disc_distri_delete(outlcpinfo->lcpsubtab.distlcpvalues);
  }
  gt_free(outlcpinfo);
}
/* Create a sequence decoder for the file <name> + HCRFILESUFFIX using the
   alphabet <alpha>. Reads the file information, the base/quality
   distribution and the offset at which the sampling starts, sets up the
   Huffman decoder and finally reads the sampling. Returns NULL (with <err>
   set) if the file cannot be opened or the Huffman initialisation fails. */
static GtHcrSeqDecoder *hcr_seq_decoder_new(GtAlphabet *alpha, const char *name,
                                            GtError *err)
{
  GtHcrSeqDecoder *seq_dec;
  GtBaseQualDistr *bqd = NULL;
  GtWord sampling_offset = 0;
  FILE *fp = NULL;
  GT_UNUSED size_t nread, expected = (size_t) 1;

  seq_dec = gt_malloc(sizeof (GtHcrSeqDecoder));
  seq_dec->alpha = alpha;
  seq_dec->alphabet_size = gt_alphabet_size(alpha);
  seq_dec->cur_read = 0;
  /* everything not yet available starts out as NULL, so that the decoder
     can be handed to hcr_seq_decoder_delete() at any point below */
  seq_dec->data_iter = NULL;
  seq_dec->file_info_rbt = NULL;
  seq_dec->fileinfos = NULL;
  seq_dec->huff_dec = NULL;
  seq_dec->huffman = NULL;
  seq_dec->sampling = NULL;
  seq_dec->symbols = NULL;
  seq_dec->filename = gt_str_new_cstr(name);
  gt_str_append_cstr(seq_dec->filename, HCRFILESUFFIX);

  fp = gt_fa_fopen_with_suffix(name, HCRFILESUFFIX, "rb", err);
  if (gt_error_is_set(err)) {
    hcr_seq_decoder_delete(seq_dec);
    seq_dec = NULL;
  }
  else {
    hcr_read_file_info(seq_dec, fp);
    bqd = hcr_base_qual_distr_new_from_file(fp, seq_dec->alpha);
    seq_dec->qual_offset = bqd->qual_offset;
    nread = gt_xfread_one(&sampling_offset, fp);
    gt_assert(nread == expected);
    seq_dec->start_of_encoding = decoder_calc_start_of_encoded_data(fp);
    seq_decoder_init_huffman(seq_dec, sampling_offset, bqd, err);
    if (gt_error_is_set(err)) {
      hcr_seq_decoder_delete(seq_dec);
      seq_dec = NULL;
    }
    else {
      /* the sampling is stored at the offset read above */
      gt_xfseek(fp, sampling_offset, SEEK_SET);
      seq_dec->sampling = gt_sampling_read(fp);
      seq_dec->file_info_rbt =
        seq_decoder_init_file_info(seq_dec->fileinfos,
                                   seq_dec->num_of_files);
    }
  }
  hcr_base_qual_distr_delete(bqd);
  gt_fa_fclose(fp);
  return seq_dec;
}
/* Destroy the runtime object <gtr>: close its log file, delete its strings,
   toolbox and unit test map, close the Lua state (if any) and, unless built
   WITHOUT_CAIRO, delete the style object. NULL is ignored. */
void gtr_delete(GtR *gtr)
{
  if (gtr == NULL) {
    return;
  }
  gt_fa_fclose(gtr->logfp);
  gt_str_delete(gtr->testspacepeak);
  gt_str_delete(gtr->debugfp);
  gt_str_delete(gtr->test_only);
  gt_str_delete(gtr->manoutdir);
  gt_toolbox_delete(gtr->tools);
  gt_hashmap_delete(gtr->unit_tests);
  if (gtr->L != NULL) {
    lua_close(gtr->L);
  }
#ifndef WITHOUT_CAIRO
  gt_style_delete_without_state(gtr->style);
#endif
  gt_free(gtr);
}
/* Parse the contained reads list stored in <filename>. The first byte of
   the file selects the format (BIN, BIT or ASCII) and the matching parser
   is called with <alloc_cntlist>, <cntlist> and <nofreads>.
   Returns -1 if the file cannot be opened, 1 (with <err> set) if the file
   is empty or its first byte is not a known header, and otherwise the
   return value of the format-specific parser. */
int gt_cntlist_parse(const char *filename, bool alloc_cntlist,
                     GtBitsequence **cntlist, GtUword *nofreads, GtError *err)
{
  int header, retval;
  FILE *infp;

  gt_log_log("parse contained reads list file: %s", filename);
  infp = gt_fa_fopen(filename, "rb", err);
  if (infp == NULL) {
    return -1;
  }
  header = gt_xfgetc(infp);
  if (header == EOF) {
    gt_error_set(err, "%s: unexpected end of file", filename);
    retval = 1;
  }
  else if (header == GT_CNTLIST_BIN_HEADER) {
    gt_log_log("contained reads list format: BIN");
    retval = gt_cntlist_parse_bin(infp, alloc_cntlist, cntlist, nofreads,
                                  err);
  }
  else if (header == GT_CNTLIST_BIT_HEADER) {
    gt_log_log("contained reads list format: BIT");
    retval = gt_cntlist_parse_bit(infp, alloc_cntlist, cntlist, nofreads,
                                  err);
  }
  else if (header == GT_CNTLIST_ASCII_HEADER) {
    /* only in the ASCII case is the header byte pushed back before the
       parser runs */
    gt_xungetc(header, infp);
    gt_log_log("contained reads list format: ASCII");
    retval = gt_cntlist_parse_ascii(infp, alloc_cntlist, cntlist, nofreads,
                                    err);
  }
  else {
    gt_error_set(err, "%s: unrecognized format", filename);
    retval = 1;
  }
  gt_fa_fclose(infp);
  return retval;
}
/* Write the table referenced by <usedptrptr> (interpreted according to
   <type>) to a freshly created temporary file, whose name is stored in
   sfxmappedrange->filename. After writing, the in-memory table is freed and
   the caller's pointer is set to NULL; its address is remembered in
   sfxmappedrange->usedptrptr. <writable> is stored in the object. */
static void gt_Sfxmappedrange_storetmp(GtSfxmappedrange *sfxmappedrange,
                                       GtSfxStoretype usedptrptr,
                                       GtSfxmappedrangetype type,
                                       bool writable)
{
  FILE *tmpfp;

  gt_assert(sfxmappedrange != NULL);
  sfxmappedrange->ptr = NULL;
  sfxmappedrange->filename = gt_str_new();
  sfxmappedrange->writable = writable;
  tmpfp = gt_xtmpfp(sfxmappedrange->filename);
  gt_assert(tmpfp != NULL);
  gt_log_log("write %s to file %s ("GT_WU" units of "GT_WU" bytes)",
             gt_str_get(sfxmappedrange->tablename),
             gt_str_get(sfxmappedrange->filename),
             (GtUword) sfxmappedrange->numofunits,
             (GtUword) sfxmappedrange->sizeofunit);
  /* the three cases differ only in the union member that is used */
  if (type == GtSfxGtBitsequence) {
    gt_xfwrite(*(usedptrptr.bs),sfxmappedrange->sizeofunit,
               sfxmappedrange->numofunits,tmpfp);
    sfxmappedrange->usedptrptr = (void**) usedptrptr.bs;
    gt_free(*(usedptrptr.bs));
    *(usedptrptr.bs) = NULL;
  }
  else if (type == GtSfxunsignedlong) {
    gt_xfwrite(*(usedptrptr.ulong),sfxmappedrange->sizeofunit,
               sfxmappedrange->numofunits,tmpfp);
    sfxmappedrange->usedptrptr = (void**) usedptrptr.ulong;
    gt_free(*(usedptrptr.ulong));
    *(usedptrptr.ulong) = NULL;
  }
  else if (type == GtSfxuint32_t) {
    gt_xfwrite(*(usedptrptr.uint32),sfxmappedrange->sizeofunit,
               sfxmappedrange->numofunits,tmpfp);
    sfxmappedrange->usedptrptr = (void**) usedptrptr.uint32;
    gt_free(*(usedptrptr.uint32));
    *(usedptrptr.uint32) = NULL;
  }
  gt_fa_fclose(tmpfp);
}
/* Write <pckbuckettable> to the index file with suffix PCKBUCKETTABLE:
   first maxdepth (as one Seqpos), then maxnumofvalues Mbtab entries
   starting at mbtab[0]. Returns 0 on success and -1 if the file cannot be
   opened. */
int pckbucket2file(const GtStr *indexname,const Pckbuckettable *pckbuckettable,
                   GtError *err)
{
  FILE *outfp;
  Seqpos depthvalue;

  gt_error_check(err);
  outfp = opensfxfile(indexname,PCKBUCKETTABLE,"wb",err);
  if (outfp != NULL) {
    depthvalue = (Seqpos) pckbuckettable->maxdepth;
    gt_xfwrite(&depthvalue,sizeof (Seqpos),(size_t) 1,outfp);
    gt_xfwrite(pckbuckettable->mbtab[0],sizeof (Mbtab),
               (size_t) pckbuckettable->maxnumofvalues,outfp);
    gt_fa_fclose(outfp);
    return 0;
  }
  return -1;
}
/* Output the contained reads list <cntlist> for <nofreads> reads, either in
   bit format (<binary> true) or in ASCII format. The output goes to the
   file <path>, or to stdout if <path> is NULL. Returns 0 on success and -1
   if <path> cannot be opened. */
int gt_cntlist_show(GtBitsequence *cntlist, GtUword nofreads,
                    const char *path, bool binary, GtError *err)
{
  FILE *out = stdout;
  const bool to_file = (path != NULL);

  gt_assert(cntlist != NULL);
  if (to_file) {
    out = gt_fa_fopen(path, binary ? "wb" : "w", err);
    if (out == NULL) {
      return -1;
    }
  }
  gt_assert(out != NULL);
  if (binary) {
    gt_cntlist_show_bit(cntlist, nofreads, out);
  }
  else {
    gt_cntlist_show_ascii(cntlist, nofreads, out);
  }
  /* stdout is never closed here */
  if (to_file) {
    gt_fa_fclose(out);
  }
  return 0;
}
/* Write <pckbuckettable> to the file <indexname> + PCKBUCKETTABLE: first
   maxdepth (as one unsigned long), then maxnumofvalues Mbtab entries
   starting at mbtab[0]. Returns 0 on success and -1 if the file cannot be
   opened. */
int gt_pckbuckettable_2file(const char *indexname,
                            const Pckbuckettable *pckbuckettable,
                            GtError *err)
{
  FILE *outfp;
  unsigned long depthvalue;

  gt_error_check(err);
  outfp = gt_fa_fopen_with_suffix(indexname,PCKBUCKETTABLE,"wb",err);
  if (outfp != NULL) {
    depthvalue = (unsigned long) pckbuckettable->maxdepth;
    gt_xfwrite(&depthvalue,sizeof (unsigned long),(size_t) 1,outfp);
    gt_xfwrite(pckbuckettable->mbtab[0],sizeof (Mbtab),
               (size_t) pckbuckettable->maxnumofvalues,outfp);
    gt_fa_fclose(outfp);
    return 0;
  }
  return -1;
}
/* Drop one reference to <file>. While the reference count is non-zero it is
   only decremented. Otherwise the underlying handle is closed according to
   the file mode (an uncompressed handle marked is_stdin is left open) and
   the object is deleted. NULL is ignored. */
void gt_file_delete(GtFile *file)
{
  if (file == NULL) {
    return;
  }
  if (file->reference_count != 0) {
    file->reference_count--;
    return;
  }
  if (file->mode == GT_FILE_MODE_UNCOMPRESSED) {
    if (!file->is_stdin) {
      gt_fa_fclose(file->fileptr.file);
    }
  }
  else if (file->mode == GT_FILE_MODE_GZIP) {
    gt_fa_gzclose(file->fileptr.gzfile);
  }
  else if (file->mode == GT_FILE_MODE_BZIP2) {
    gt_fa_bzclose(file->fileptr.bzfile);
  }
  else {
    /* unknown file mode */
    gt_assert(0);
  }
  gt_file_delete_without_handle(file);
}
/* Flush and close the output buffer <lbbuf>, release its memory and
   subtract its workspace from <fcsl>. The number of units written is
   asserted to equal <expectedwritten>. The name of the output file is
   returned; it is not deleted here, so the caller receives the GtStr. */
GtStr *gt_leftborderbuffer_delete(GtLeftborderOutbuffer *lbbuf,
                                  GtFirstcodesspacelog *fcsl,
                                  GT_UNUSED unsigned long expectedwritten)
{
  GtStr *result;

  gt_assert(lbbuf != NULL);
  gt_leftborderbuffer_flush(lbbuf);
  gt_fa_fclose(lbbuf->fp);
  lbbuf->fp = NULL;
  gt_log_log("write %s to file %s (%lu units of size %u)",
             gt_str_get(lbbuf->name),
             gt_str_get(lbbuf->outfilename),
             lbbuf->totalwrite,
             (unsigned int) sizeof (*lbbuf->spaceuint32_t));
  gt_assert(lbbuf->spaceuint32_t != NULL);
  gt_free(lbbuf->spaceuint32_t);
  GT_FCI_SUBTRACTWORKSPACE(fcsl,gt_str_get(lbbuf->name));
  gt_assert(lbbuf->totalwrite == expectedwritten);
  /* keep the file name alive beyond the lifetime of the buffer */
  result = lbbuf->outfilename;
  gt_str_delete(lbbuf->name);
  gt_free(lbbuf);
  return result;
}
/*call function with linear gap costs for all given sequences */ static int gt_all_against_all_alignment_check(bool affine, GtAlignment *align, const GtLinspaceArguments *arguments, GtLinspaceManagement *spacemanager, const GtScoreHandler *scorehandler, const GtUchar *characters, GtUchar wildcardshow, const GtSequenceTable *sequence_table1, const GtSequenceTable *sequence_table2, GtWord left_dist, GtWord right_dist, GtTimer *linspacetimer, GtError *err) { int had_err = 0; const GtUchar *useq, *vseq; GtUword i, j, ulen, vlen; gt_error_check(err); if (linspacetimer != NULL) { gt_timer_start(linspacetimer); } for (i = 0; !had_err && i < sequence_table1->size; i++) { ulen = gt_str_length(sequence_table1->seqarray[i]); useq = (const GtUchar*) gt_str_get(sequence_table1->seqarray[i]); for (j = 0; j< sequence_table2->size; j++) { vlen = gt_str_length(sequence_table2->seqarray[j]); vseq = (const GtUchar*) gt_str_get(sequence_table2->seqarray[j]); gt_alignment_reset(align); if (arguments->global) { if (arguments->diagonal) { if (gt_str_array_size(arguments->diagonalbonds) == 0) { left_dist = LEFT_DIAGONAL_SHIFT(arguments->similarity, ulen, vlen); right_dist = RIGHT_DIAGONAL_SHIFT(arguments->similarity, ulen, vlen); } if ((left_dist > MIN(0, (GtWord)vlen-(GtWord)ulen))|| (right_dist < MAX(0, (GtWord)vlen-(GtWord)ulen))) { gt_error_set(err, "ERROR: invalid diagonalband for global " "alignment (ulen: "GT_WU", vlen: "GT_WU")\n" "left_dist <= MIN(0, vlen-ulen) and " "right_dist >= MAX(0, vlen-ulen)", ulen, vlen); had_err = 1; } if (!had_err) { (affine ? gt_diagonalbandalign_affinegapcost_compute_generic : gt_diagonalbandalign_compute_generic) (spacemanager, scorehandler, align, useq, 0, ulen, vseq, 0, vlen, left_dist, right_dist); } } else { (affine ? gt_linearalign_affinegapcost_compute_generic : gt_linearalign_compute_generic) (spacemanager, scorehandler, align, useq, 0, ulen, vseq, 0, vlen); } } else if (arguments->local) { (affine ? 
gt_linearalign_affinegapcost_compute_local_generic : gt_linearalign_compute_local_generic) (spacemanager, scorehandler, align, useq, 0, ulen, vseq, 0, vlen); } /* show alignment*/ if (!had_err) { gt_assert(align != NULL); if (!strcmp(gt_str_get(arguments->outputfile),"stdout")) { alignment_show_with_sequences(useq, ulen, vseq, vlen, align, characters, wildcardshow, arguments->showscore, !arguments->scoreonly, arguments->showsequences, arguments->global, scorehandler, stdout); } else { FILE *fp = gt_fa_fopen_func(gt_str_get(arguments->outputfile), "a", __FILE__,__LINE__,err); if (fp == NULL) { had_err = -1; } else { alignment_show_with_sequences(useq, ulen, vseq, vlen, align, characters, wildcardshow, arguments->showscore, !arguments->scoreonly, arguments->showsequences, arguments->global, scorehandler,fp); gt_fa_fclose(fp); } } } } } if (linspacetimer != NULL) { gt_timer_stop(linspacetimer); } if (!had_err && arguments->wildcardshow) { printf("# wildcards are represented by %c\n", wildcardshow); } return had_err; }
/* Read the file <fileofkeystoextract> line by line, extract a key of length
   <keysize> from each line and look it up in <keytab> (holding <numofkeys>
   keys). For every key found, the corresponding sequence of <encseq> is
   written to stdout in FASTA format with line width <linewidth>. Keys that
   are not found are only counted, and the count is reported on stdout.
   Returns 0 on success and -1 on error (<linewidth> is 0, the file cannot
   be opened, a line cannot be parsed or the output fails). */
static int itersearchoverallkeys(const GtEncseq *encseq,
                                 const char *keytab,
                                 unsigned long numofkeys,
                                 unsigned long keysize,
                                 const GtStr *fileofkeystoextract,
                                 unsigned long linewidth,
                                 GtError *err)
{
  FILE *keyfp;
  GtStr *linebuf;
  uint64_t linenum = 0;
  unsigned long seqnum, countmissing = 0;
  bool haserr = false;
  Fastakeyquery fastakeyquery;

  if (linewidth == 0) {
    gt_error_set(err,"use option width to specify line width for formatting");
    return -1;
  }
  keyfp = gt_fa_fopen(gt_str_get(fileofkeystoextract),"r",err);
  if (keyfp == NULL) {
    return -1;
  }
  linebuf = gt_str_new();
  /* one key buffer is reused for all lines */
  fastakeyquery.fastakey = gt_malloc(sizeof (char) * (keysize+1));
  while (!haserr && gt_str_read_next_line(linebuf, keyfp) != EOF) {
    if (extractkeyfromcurrentline(&fastakeyquery,
                                  keysize,
                                  linebuf,
                                  linenum,
                                  fileofkeystoextract,
                                  err) != 0) {
      haserr = true;
    }
    else {
      seqnum = searchfastaqueryindes(fastakeyquery.fastakey,keytab,numofkeys,
                                     keysize);
      if (seqnum >= numofkeys) {
        /* a result not smaller than numofkeys means: key not found */
        countmissing++;
      }
      else if (giextract_encodedseq2fasta(stdout,
                                          encseq,
                                          seqnum,
                                          &fastakeyquery,
                                          linewidth,
                                          err) != 0) {
        haserr = true;
      }
    }
    if (!haserr) {
      gt_str_reset(linebuf);
      linenum++;
    }
  }
  if (!haserr && countmissing > 0) {
    printf("# number of unsatified fastakey-queries: %lu\n",countmissing);
  }
  gt_str_delete(linebuf);
  gt_fa_fclose(keyfp);
  gt_free(fastakeyquery.fastakey);
  return haserr ? - 1 : 0;
}
/* Extract the key of every description stored in the file
   <indexname> + GT_DESTABFILESUFFIX. All keys must be non-empty and of the
   same length, which must not exceed CHAR_MAX.

   If <sortkeys> is false, the keys must be in strictly increasing
   lexicographic order; the key length (one byte) followed by the
   '\0'-terminated keys is written to <indexname> + GT_KEYSTABFILESUFFIX.

   If <sortkeys> is true, the key length must not exceed MAXFIXEDKEYSIZE.
   The encoded sequence <indexname> is loaded, the keys are collected
   together with their line numbers, sorted with compareFixedkeys, and the
   sequences are written to stdout in FASTA format in that order.

   Returns 0 on success and -1 on error. */
int gt_extractkeysfromdesfile(const char *indexname,
                              bool sortkeys,
                              GtLogger *logger,
                              GtError *err)
{
  FILE *fpin, *fpout = NULL;
  GtStr *line = NULL;
  const char *keyptr;
  unsigned long keylen, constantkeylen = 0, linenum;
  bool haserr = false, firstdesc = true;
  char *previouskey = NULL;
  Fixedsizekey *keytab = NULL, *keytabptr = NULL;
  GtEncseq *encseq = NULL;
  unsigned long numofentries = 0;
  const unsigned long linewidth = 60UL;

  fpin = gt_fa_fopen_with_suffix(indexname,GT_DESTABFILESUFFIX,"rb",err);
  if (fpin == NULL) {
    return -1;
  }
  if (!sortkeys) {
    fpout = gt_fa_fopen_with_suffix(indexname,GT_KEYSTABFILESUFFIX,"wb",err);
    if (fpout == NULL) {
      haserr = true;
    }
  }
  if (!haserr) {
    line = gt_str_new();
  }
  for (linenum = 0; !haserr && gt_str_read_next_line(line, fpin) != EOF;
       linenum++) {
    keyptr = desc2key(&keylen,gt_str_get(line),err);
    if (keyptr == NULL) {
      haserr = true;
      break;
    }
    if (keylen == 0) {
      gt_error_set(err,"key of length 0 in \"%s\" not expected",
                   gt_str_get(line));
      haserr = true;
      break;
    }
    if (firstdesc) {
      /* the first key determines the length all other keys must have */
      if (keylen > (unsigned long) CHAR_MAX) {
        gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                         "no key must be larger than %d",
                     (int) keylen,(int) keylen,keyptr,keylen,CHAR_MAX);
        haserr = true;
        break;
      }
      constantkeylen = keylen;
      previouskey = gt_malloc(sizeof (char) * (constantkeylen+1));
      firstdesc = false;
      if (!sortkeys) {
        /* the key table file starts with the key length in one byte */
        gt_xfputc((char) constantkeylen,fpout);
      }
      else {
        GtEncseqLoader *el;

        if (constantkeylen > (unsigned long) MAXFIXEDKEYSIZE) {
          gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                           "no key must be larger than %d",
                       (int) keylen,(int) keylen,keyptr,keylen,
                       MAXFIXEDKEYSIZE);
          haserr = true;
          break;
        }
        el = gt_encseq_loader_new();
        gt_encseq_loader_set_logger(el, logger);
        encseq = gt_encseq_loader_load(el, indexname, err);
        gt_encseq_loader_delete(el);
        if (encseq == NULL) {
          haserr = true;
          break;
        }
        numofentries = gt_encseq_num_of_sequences(encseq);
        gt_assert(numofentries > 0);
        keytab = gt_malloc(sizeof (*keytab) * numofentries);
        keytabptr = keytab;
      }
    }
    else {
      if (constantkeylen != keylen) {
        gt_error_set(err,"key \"%*.*s\" of length %lu: all keys must be of "
                         "the same length which for all previously seen "
                         "headers is %lu",
                     (int) keylen,(int) keylen,keyptr,keylen,
                     constantkeylen);
        haserr = true;
        break;
      }
      gt_assert(previouskey != NULL);
      if (!sortkeys && strncmp(previouskey,keyptr,(size_t) constantkeylen)
                         >= 0) {
        gt_error_set(err,"previous key \"%s\" is not lexicographically smaller "
                         "than current key \"%*.*s\"",
                     previouskey,(int) keylen,(int) keylen,keyptr);
        haserr = true;
        break;
      }
    }
    if (!sortkeys) {
      gt_xfwrite(keyptr,sizeof *keyptr,(size_t) keylen,fpout);
      gt_xfputc('\0',fpout);
    }
    else {
      gt_assert(keytabptr != NULL);
      /* keytab has room for exactly numofentries keys; a description table
         with more lines than sequences must not write past its end */
      if (keytabptr >= keytab + numofentries) {
        gt_error_set(err,"description table of index \"%s\" contains more "
                         "keys than the %lu sequences of the index",
                     indexname,numofentries);
        haserr = true;
        break;
      }
      strncpy(keytabptr->key,keyptr,(size_t) constantkeylen);
      keytabptr->key[constantkeylen] = '\0';
      keytabptr->seqnum = linenum;
      keytabptr++;
    }
    /* remember the current key for the order check of the next line */
    strncpy(previouskey,keyptr,(size_t) constantkeylen);
    previouskey[constantkeylen] = '\0';
    gt_str_reset(line);
  }
  if (!haserr) {
    gt_logger_log(logger,"number of keys of length %lu = %lu",
                  constantkeylen,linenum);
  }
  gt_str_delete(line);
  gt_fa_fclose(fpin);
  gt_fa_fclose(fpout);
  gt_free(previouskey);
  if (!haserr && sortkeys) {
    gt_assert(keytabptr != NULL);
    gt_assert(numofentries > 0);
    gt_assert(keytabptr == keytab + numofentries);
    qsort(keytab,(size_t) numofentries,sizeof (*keytab),compareFixedkeys);
    gt_assert(keytabptr != NULL);
    for (keytabptr = keytab; !haserr && keytabptr < keytab + numofentries;
         keytabptr++) {
      if (giextract_encodedseq2fasta(stdout,
                                     encseq,
                                     keytabptr->seqnum,
                                     NULL,
                                     linewidth,
                                     err) != 0) {
        haserr = true;
        break;
      }
    }
  }
  if (encseq != NULL) {
    gt_encseq_delete(encseq);
    encseq = NULL;
  }
  gt_free(keytab);
  return haserr ? -1 : 0;
}
/* Read all key queries from the file <fileofkeystoextract>, one per line,
   sort them with comparefastakeys and remove duplicates. The number of
   remaining queries is stored in <*numofqueries> and the array of queries
   is returned. With <verbose> set, progress messages are written to stdout.
   Returns NULL with <err> set if the file is empty, cannot be opened,
   a line cannot be parsed, or the file contains more lines than were
   counted before reading it. */
static Fastakeyquery *readfileofkeystoextract(bool verbose,
                                              unsigned long *numofqueries,
                                              const GtStr *fileofkeystoextract,
                                              GtError *err)
{
  FILE *fp;
  GtStr *currentline;
  bool haserr = false;
  uint64_t linenum;
  unsigned long idx;
  Fastakeyquery *fastakeyqueries;
#undef SKDEBUG

  gt_error_check(err);
  *numofqueries = gt_file_number_of_lines(gt_str_get(fileofkeystoextract));
  if (*numofqueries == 0) {
    gt_error_set(err,"empty file \"%s\" not allowed",
                 gt_str_get(fileofkeystoextract));
    return NULL;
  }
  fp = gt_fa_fopen(gt_str_get(fileofkeystoextract),"r",err);
  if (fp == NULL) {
    return NULL;
  }
  if (verbose) {
    printf("# opened keyfile \"%s\"\n",gt_str_get(fileofkeystoextract));
  }
  fastakeyqueries = gt_malloc(sizeof (*fastakeyqueries) * (*numofqueries));
  /* On error the whole array is handed to fastakeyqueries_delete(), so no
     entry may hold an indeterminate key pointer.
     NOTE(review): assumes the delete function only releases the fastakey
     member of each entry - confirm. */
  for (idx = 0; idx < *numofqueries; idx++) {
    fastakeyqueries[idx].fastakey = NULL;
  }
  currentline = gt_str_new();
  for (linenum = 0; gt_str_read_next_line(currentline, fp) != EOF; linenum++) {
    /* never write beyond the number of lines the array was sized for */
    if (linenum >= (uint64_t) *numofqueries) {
      gt_error_set(err,"file \"%s\" contains more than the %lu lines "
                       "counted before reading it",
                   gt_str_get(fileofkeystoextract),*numofqueries);
      haserr = true;
      break;
    }
    /* key size 0 is passed here; presumably the key length is then
       determined from the line itself - verify against
       extractkeyfromcurrentline */
    if (extractkeyfromcurrentline(fastakeyqueries + linenum,
                                  0,
                                  currentline,
                                  linenum,
                                  fileofkeystoextract,
                                  err) != 0) {
      haserr = true;
      break;
    }
    gt_str_reset(currentline);
  }
  gt_str_delete(currentline);
  gt_fa_fclose(fp);
  if (haserr) {
    fastakeyqueries_delete(fastakeyqueries,*numofqueries);
    return NULL;
  }
  /* only the entries that were actually parsed take part in sorting */
  *numofqueries = (unsigned long) linenum;
  qsort(fastakeyqueries,(size_t) *numofqueries,sizeof (*fastakeyqueries),
        comparefastakeys);
  if (verbose) {
    printf("# %lu fastakey-queries successfully parsed and sorted\n",
           *numofqueries);
  }
  *numofqueries = remdupsfastakeyqueries(fastakeyqueries,*numofqueries,verbose);
#ifdef SKDEBUG
  for (idx=0; idx<*numofqueries; idx++) {
    printf("%lu %s\n",idx,fastakeyqueries[idx].fastakey);
  }
#endif
  return fastakeyqueries;
}
/* Tool runner of the kmer database tool. Loads the encoded sequence named
   by argv[parsed_args] and feeds its kmers, in randomly sized intervals,
   either into a GtKmerDatabase or (arguments->use_hash) into a hash map.
   Unless benchmarking or merge-only mode is selected, a second database is
   filled kmer by kmer and compared with the first one; consistency checks
   are run along the way. With arguments->bench, timing information and a
   random access benchmark are written to stdout.

   Returns 0 on success and a non-zero value with <err> set on error.
   Failure to open the print file is reported as an error instead of
   installing a NULL logger target, and the random access benchmark only
   runs if everything before it succeeded. */
static int gt_kmer_database_runner(GT_UNUSED int argc, const char **argv,
                                   int parsed_args, void *tool_arguments,
                                   GtError *err)
{
  GtKmerDatabaseArguments *arguments = tool_arguments;
  int had_err = 0;
  GtEncseq *es = NULL;
  GtUword es_length = 0,
          nu_kmer_codes = 0;
  GtKmerDatabase *compare_db = NULL,
                 *db = NULL;
  GtLogger *logger;
  FILE *fp = NULL;
  GtHashmap *kmer_hash = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->use_hash)
    kmer_hash = gt_hashmap_new(GT_HASH_DIRECT, NULL,
                               (GtFree) gt_kmer_database_delete_hash_value);
  if (arguments->bench)
    timer = gt_timer_new_with_progress_description("loading encoded sequence");

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  if (arguments->verbose && gt_str_length(arguments->print_filename) > 0UL) {
    fp = gt_fa_fopen(gt_str_get(arguments->print_filename), "w", err);
    if (fp == NULL)
      had_err = -1;
    else
      gt_logger_set_target(logger, fp);
  }

  if (!had_err) {
    GtEncseqLoader *es_l;
    if (arguments->bench)
      gt_timer_start(timer);
    es_l = gt_encseq_loader_new();
    es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->bench)
      gt_timer_show_progress(timer, "saving kmers (+iterating over file)",
                             stdout);
    if (es == NULL) {
      had_err = -1;
    }
    gt_encseq_loader_delete(es_l);
  }
  if (!had_err) {
    es_length = gt_encseq_total_length(es);
    if (es_length < (GtUword) arguments->kmersize) {
      gt_error_set(err, "Input is too short for used kmersize. "
                        "File length: " GT_WU " kmersize: %u",
                   es_length, arguments->kmersize);
      had_err = -1;
    }
  }
  if (!had_err) {
    GtAlphabet *alphabet;
    alphabet = gt_encseq_alphabet(es);
    if (arguments->bench)
      nu_kmer_codes = gt_power_for_small_exponents(
                                            gt_alphabet_num_of_chars(alphabet),
                                            arguments->kmersize);
    /* the comparison database is only needed for the self check */
    if (!arguments->merge_only && !arguments->use_hash && !arguments->bench) {
      compare_db = gt_kmer_database_new(gt_alphabet_num_of_chars(alphabet),
                                        arguments->kmersize,
                                        arguments->sb_size, es);
    }
    if (!arguments->use_hash) {
      db = gt_kmer_database_new(gt_alphabet_num_of_chars(alphabet),
                                arguments->kmersize,
                                arguments->sb_size, es);
      if (arguments->cutoff) {
        if (arguments->mean_cutoff)
          gt_kmer_database_use_mean_cutoff(db, (GtUword) 2,
                                           arguments->cutoff_value);
        else
          gt_kmer_database_set_cutoff(db, arguments->cutoff_value);
        if (!arguments->prune)
          gt_kmer_database_set_prune(db);
      }
    }
  }

  if (!had_err) {
    GtUword startpos = 0,
            endpos;
    GtKmercodeiterator *iter;
    const GtKmercode *kmercode = NULL;
    iter = gt_kmercodeiterator_encseq_new(es, GT_READMODE_FORWARD,
                                          arguments->kmersize, 0);
    /* process the sequence in intervals of random length */
    while (!had_err && startpos < es_length - (arguments->kmersize - 1)) {
      GtUword startpos_add_kmer = startpos;
      if (arguments->merge_only) {
        endpos = startpos + (arguments->kmersize - 1) +
                 (gt_rand_max((arguments->sb_size - 1) * 2));
        if (endpos > es_length)
          endpos = es_length;
      }
      else {
        endpos = startpos + (arguments->kmersize - 1) +
                 (gt_rand_max(arguments->sb_size - 1));
      }
      gt_kmercodeiterator_reset(iter, GT_READMODE_FORWARD, startpos);
      while ((kmercode = gt_kmercodeiterator_encseq_next(iter)) != NULL &&
             startpos_add_kmer <= endpos - (arguments->kmersize - 1)) {
        if (!arguments->merge_only && !arguments->use_hash &&
            !kmercode->definedspecialposition && !arguments->bench) {
          gt_kmer_database_add_kmer(compare_db, kmercode->code,
                                    startpos_add_kmer);
        }
        if (arguments->use_hash && !kmercode->definedspecialposition) {
          gt_kmer_database_add_to_hash(kmer_hash, kmercode->code,
                                       startpos_add_kmer);
        }
        startpos_add_kmer++;
      }
      if (!arguments->use_hash) {
        gt_kmer_database_add_interval(db, startpos, endpos);
        gt_kmer_database_print_buffer(db, logger);
        if (!arguments->bench)
          had_err = gt_kmer_database_check_consistency(db, err);
      }
      startpos = endpos + 1;
    }
    if (!arguments->use_hash) {
      gt_kmer_database_flush(db);
      gt_kmer_database_print_buffer(db, logger);
      if (!had_err && !arguments->bench)
        had_err = gt_kmer_database_check_consistency(db, err);
      if (!arguments->merge_only && !had_err && !arguments->bench)
        had_err = gt_kmer_database_check_consistency(compare_db, err);
      if (!arguments->merge_only && !arguments->bench)
        gt_kmer_database_print(compare_db, logger, true);
      if (!arguments->merge_only && !had_err && !arguments->bench)
        had_err = gt_kmer_database_compare(compare_db, db, err);
      gt_kmer_database_print(db, logger, true);
    }
    gt_kmercodeiterator_delete(iter);
  }

  if (arguments->bench) {
    /* without a successfully built database (or hash) there is nothing to
       benchmark; the timer is released in any case */
    if (!had_err) {
      GtKmerStartpos pos;
      GtArrayGtUword *pos_hash;
      GtUword rand_access = (GtUword) 50000000,
              rand_code,
              i,
              sum = 0;
      gt_timer_show_progress(timer, "random access", stdout);
      for (i = 0; i < rand_access; i++) {
        rand_code = gt_rand_max(nu_kmer_codes - 1);
        if (arguments->use_hash) {
          pos_hash = gt_hashmap_get(kmer_hash, (const void *) rand_code);
          if (pos_hash != NULL)
            sum += pos_hash->spaceGtUword[pos_hash->nextfreeGtUword - 1];
        }
        else {
          pos = gt_kmer_database_get_startpos(db, rand_code);
          if (pos.no_positions > 0)
            sum += pos.startpos[pos.no_positions - 1];
        }
      }
      /* the sum is printed so that the accesses have a visible effect */
      printf("sum: " GT_WU "\n", sum);
      gt_timer_show_progress(timer, "", stdout);
      gt_timer_stop(timer);
    }
    gt_timer_delete(timer);
  }
  if (arguments->use_hash)
    gt_hashmap_delete(kmer_hash);
  gt_encseq_delete(es);
  if (!arguments->use_hash)
    gt_kmer_database_delete(db);
  if (!arguments->merge_only && !arguments->bench)
    gt_kmer_database_delete(compare_db);
  gt_logger_delete(logger);
  gt_fa_fclose(fp);

  return had_err;
}
/* Read a condenseq data structure from file.
   Loads the encoded sequence <indexname> holding the unique sequences,
   reads the condenseq data from <indexname> + GT_CONDENSEQ_FILE_SUFFIX and
   stores, for each unique entry, the ids of the link entries referring to
   it. Information on how ids are accessed is written to <logger>.
   Returns the new object, or NULL with <err> set on failure. The encseq
   loader and the input file are released on every path, including the
   error paths. */
GtCondenseq *gt_condenseq_new_from_file(const char *indexname,
                                        GtLogger *logger, GtError *err)
{
  int had_err = 0;
  FILE *fp;
  GtEncseqLoader *esl;
  GtEncseq *unique_es;
  GtCondenseq *condenseq = NULL;

  /* load unique_es; the loader is not needed after loading */
  esl = gt_encseq_loader_new();
  unique_es = gt_encseq_loader_load(esl, indexname, err);
  gt_encseq_loader_delete(esl);
  if (unique_es == NULL)
    had_err = -1;

  if (!had_err) {
    condenseq = condenseq_new_empty(gt_encseq_alphabet(unique_es));
    condenseq->filename = gt_cstr_dup(indexname);
    condenseq->unique_es = unique_es;
    fp = gt_fa_fopen_with_suffix(indexname, GT_CONDENSEQ_FILE_SUFFIX,
                                 "rb", err);
    if (fp == NULL) {
      had_err = -1;
    }
    else {
      had_err = condenseq_io(condenseq, fp, gt_io_error_fread, err);
      /* close the file whether or not reading succeeded */
      gt_fa_fclose(fp);
    }
  }
  if (!had_err) {
    GtUword i;
    gt_assert(condenseq->uniques);
    gt_assert(condenseq->links);
    /* create link array for each unique entry */
    for (i = 0; i < condenseq->udb_nelems; i++) {
      GT_INITARRAY(&(condenseq->uniques[i].links),uint32_t);
    }
    /* link ids are stored as uint32_t, so their number must fit */
    if (condenseq->ldb_nelems > (GtUword) ((uint32_t) 0 - (uint32_t) 1)) {
      gt_error_set(err, "Overflow, to many link-elements. Can't be stored");
      had_err = -1;
    }
    /* iterate through link entries and store ids in the corresponding
       unique entry array */
    for (i = 0; !had_err && (GtUword) i < condenseq->ldb_nelems; i++) {
      GtUword uid = condenseq->links[i].unique_id;
      gt_assert(uid < condenseq->udb_nelems);
      GT_STOREINARRAY(&(condenseq->uniques[uid].links),
                      uint32_t,
                      10,
                      (uint32_t) i);
    }
  }
  if (!had_err) {
    gt_assert(condenseq != NULL);
    if (condenseq->id_len != GT_UNDEF_UWORD)
      gt_logger_log(logger, "IDs const len: " GT_WU, condenseq->id_len);
    else
      gt_logger_log(logger, "using sdstab to access IDs");
  }
  if (had_err) {
    gt_condenseq_delete(condenseq);
    condenseq = NULL;
  }
  return (condenseq);
}
/* Tool runner of the condenseq compress tool. Loads the encoded sequence
   named by argv[parsed_args], derives defaults for all parameters that were
   left undefined, validates the parameters and runs the condenseq creator,
   which stores its result under arguments->indexname (by default the
   basename of the input).
   With arguments->kdb, kmer database output is logged to "kmer_db.out";
   failure to open that file is reported as an error instead of installing
   a NULL logger target.
   Returns 0 on success and a non-zero value with <err> set on error. */
static int gt_condenseq_compress_runner(GT_UNUSED int argc,
                                        const char **argv,
                                        int parsed_args,
                                        void *tool_arguments,
                                        GtError *err)
{
  GtCondenseqCompressArguments *arguments = tool_arguments;
  GtLogger *logger,
           *kdb_logger;
  FILE *kmer_fp = NULL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);
  kdb_logger = gt_logger_new(arguments->kdb, GT_LOGGER_DEFLT_PREFIX, stderr);
  if (arguments->kdb) {
    kmer_fp = gt_fa_fopen("kmer_db.out", "w", err);
    if (kmer_fp == NULL)
      had_err = -1;
    else
      gt_logger_set_target(kdb_logger, kmer_fp);
  }

  /* without an explicit index name, use the basename of the input */
  if (gt_str_length(arguments->indexname) == 0UL) {
    char *basenameptr;
    basenameptr = gt_basename(argv[parsed_args]);
    gt_str_set(arguments->indexname, basenameptr);
    gt_free(basenameptr);
  }

  if (!had_err) {
    GtEncseqLoader *es_l = gt_encseq_loader_new();
    arguments->input_es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->input_es == NULL)
      had_err = -1;
    gt_encseq_loader_delete(es_l);
  }

  if (!had_err) {
    /* first derive values top-down from those given by the user ... */
    if (arguments->minalignlength == GT_UNDEF_UWORD &&
        arguments->initsize != GT_UNDEF_UWORD)
      arguments->minalignlength = arguments->initsize / (GtUword) 3UL;
    if (arguments->windowsize == GT_UNDEF_UINT &&
        arguments->minalignlength != GT_UNDEF_UWORD)
      arguments->windowsize =
        (unsigned int) (arguments->minalignlength / 5U);
    if (arguments->windowsize < 4U)
      arguments->windowsize = 4U;
    if (arguments->kmersize == GT_UNDEF_UINT) {
      unsigned int size =
        gt_alphabet_num_of_chars(gt_encseq_alphabet(arguments->input_es));
      /* size^k ~= 100000 */
      gt_safe_assign(arguments->kmersize,
                     gt_round_to_long(gt_log_base(100000.0, (double) size)));
      gt_logger_log(logger, "|A|: %u, k: %u",
                    size, arguments->kmersize);
    }
    /* ... then fill what is still undefined bottom-up from the kmersize */
    if (arguments->windowsize == GT_UNDEF_UINT) {
      arguments->windowsize = 5U * arguments->kmersize;
    }
    if (arguments->minalignlength == GT_UNDEF_UWORD) {
      arguments->minalignlength = (GtUword) (3UL * arguments->windowsize);
    }
    if (arguments->initsize == GT_UNDEF_UWORD) {
      arguments->initsize = (GtUword) (3UL * arguments->minalignlength);
    }
  }
  if (!had_err && arguments->windowsize <= arguments->kmersize) {
    gt_error_set(err, "-windowsize (%u) must be larger -kmersize (%u)!",
                 arguments->windowsize, arguments->kmersize);
    had_err = -1;
  }
  if (!had_err &&
      arguments->minalignlength < (GtUword) arguments->windowsize) {
    gt_error_set(err, "-alignlength (" GT_WU ") must be at least "
                 "-windowsize (%u)!", arguments->minalignlength,
                 arguments->windowsize);
    had_err = -1;
  }
  if (!had_err && (arguments->initsize < arguments->minalignlength)) {
    gt_error_set(err, "-initsize (" GT_WU ") must be at least "
                 "-alignlength (" GT_WU ")!", arguments->initsize,
                 arguments->minalignlength);
    had_err = -1;
  }

  if (!had_err) {
    GtCondenseqCreator *ces_c;

    ces_c = gt_condenseq_creator_new(arguments->initsize,
                                     arguments->minalignlength,
                                     arguments->xdrop,
                                     &(arguments->scores),
                                     arguments->kmersize,
                                     arguments->windowsize,
                                     logger,
                                     err);
    if (ces_c == NULL) {
      had_err = -1;
    }
    else {
      if (arguments->cutoff_value == GT_UNDEF_UWORD)
        gt_condenseq_creator_use_mean_cutoff(ces_c);
      else if (arguments->cutoff_value == 0)
        gt_condenseq_creator_disable_cutoff(ces_c);
      else
        gt_condenseq_creator_set_cutoff(ces_c, arguments->cutoff_value);
      gt_condenseq_creator_set_mean_fraction(ces_c, arguments->fraction);
      if (arguments->prune)
        gt_condenseq_creator_disable_prune(ces_c);
      if (arguments->brute)
        gt_condenseq_creator_enable_brute_force(ces_c);
      if (!arguments->diags)
        gt_condenseq_creator_disable_diagonals(ces_c);
      if (arguments->full_diags)
        gt_condenseq_creator_enable_full_diagonals(ces_c);
      if (arguments->clean_percent != GT_UNDEF_UINT)
        gt_condenseq_creator_set_diags_clean_limit(ces_c,
                                                   arguments->clean_percent);

      had_err = gt_condenseq_creator_create(ces_c,
                                            arguments->indexname,
                                            arguments->input_es,
                                            logger, kdb_logger, err);
      gt_condenseq_creator_delete(ces_c);
    }
  }

  gt_logger_delete(logger);
  gt_logger_delete(kdb_logger);
  if (kmer_fp != NULL)
    gt_fa_fclose(kmer_fp);
  return had_err;
}