/* Read the key size, which is stored as the first byte of the file
   <indexname><GT_KEYSTABFILESUFFIX>.
   Returns the key size (>= 0) on success. Returns -1 and sets <err> if the
   file cannot be opened, a read error occurs, or the file is empty. */
static int readkeysize(const char *indexname,GtError *err)
{
  FILE *fp;
  bool haserr = false;
  char cc = 0; /* defined value even if nothing could be read */

  gt_error_check(err);
  fp = gt_fa_fopen_with_suffix(indexname,GT_KEYSTABFILESUFFIX,"rb",err);
  if (fp == NULL)
  {
    haserr = true;
  }
  if (!haserr)
  {
    size_t ret = fread(&cc,sizeof cc, (size_t) 1, fp);

    if (ferror(fp))
    {
      gt_error_set(err,"error when trying to read first byte of file %s%s: %s",
                   indexname,GT_KEYSTABFILESUFFIX,strerror(errno));
      haserr = true;
    } else
    {
      if (ret != (size_t) 1)
      {
        /* no stream error but nothing read: the file is empty */
        gt_error_set(err,"cannot read first byte of file %s%s: file is empty",
                     indexname,GT_KEYSTABFILESUFFIX);
        haserr = true;
      }
    }
  }
  /* only a successfully read byte has to be a valid (non-negative) size */
  gt_assert(haserr || cc >= 0);
  if (fp != NULL)
  {
    gt_fa_xfclose(fp);
  }
  return haserr ? -1 : (int) cc;
}
/* Write the project information to the file
   <indexname><GT_PROJECTFILESUFFIX> by delegating to showprjinfo().
   Returns 0 on success and -1 (with <err> set by the open call) if the
   project file cannot be opened for writing. */
int gt_outprjfile(const char *indexname,
                  GtReadmode readmode,
                  const GtEncseq *encseq,
                  GtUword numberofallsortedsuffixes,
                  unsigned int prefixlength,
                  GtUword numoflargelcpvalues,
                  double averagelcp,
                  GtUword maxbranchdepth,
                  const Definedunsignedlong *longest,
                  GtError *err)
{
  FILE *outfp;

  gt_error_check(err);
  outfp = gt_fa_fopen_with_suffix(indexname,GT_PROJECTFILESUFFIX,"wb",err);
  if (outfp == NULL)
  {
    return -1;
  }
  showprjinfo(outfp,
              readmode,
              encseq,
              numberofallsortedsuffixes,
              prefixlength,
              numoflargelcpvalues,
              averagelcp,
              maxbranchdepth,
              longest);
  gt_fa_xfclose(outfp);
  return 0;
}
int gt_hcr_decoder_decode_range(GtHcrDecoder *hcr_dec, const char *name, GtUword start, GtUword end, GtTimer *timer, GtError *err) { char qual[BUFSIZ] = {0}, seq[BUFSIZ] = {0}; GtStr *desc = gt_str_new(); int had_err = 0; GtUword cur_width, cur_read; size_t i; FILE *output; GT_UNUSED GtHcrSeqDecoder *seq_dec; gt_error_check(err); gt_assert(hcr_dec && name); seq_dec = hcr_dec->seq_dec; gt_assert(start <= end); gt_assert(start < seq_dec->num_of_reads && end < seq_dec->num_of_reads); if (timer != NULL) gt_timer_show_progress(timer, "decode hcr", stdout); output = gt_fa_fopen_with_suffix(name, HCRFILEDECODEDSUFFIX, "w", err); if (output == NULL) had_err = -1; for (cur_read = start; had_err == 0 && cur_read <= end; cur_read++) { if (gt_hcr_decoder_decode(hcr_dec, cur_read, seq, qual, desc, err) != 0) had_err = -1; else { gt_xfputc(HCR_DESCSEPSEQ, output); if (hcr_dec->encdesc != NULL) gt_xfputs(gt_str_get(desc), output); else fprintf(output, ""GT_WU"", cur_read); gt_xfputc('\n', output); for (i = 0, cur_width = 0; i < strlen(seq); i++, cur_width++) { if (cur_width == HCR_LINEWIDTH) { cur_width = 0; gt_xfputc('\n', output); } gt_xfputc(seq[i], output); } gt_xfputc('\n', output); gt_xfputc(HCR_DESCSEPQUAL, output); gt_xfputc('\n', output); for (i = 0, cur_width = 0; i < strlen(qual); i++, cur_width++) { if (cur_width == HCR_LINEWIDTH) { cur_width = 0; gt_xfputc('\n', output); } gt_xfputc(qual[i], output); } gt_xfputc('\n', output); } } gt_fa_xfclose(output); gt_str_delete(desc); return had_err; }
/* Create a sequence decoder for the file <name><HCRFILESUFFIX> using
   alphabet <alpha>.
   The file information, the base/quality distribution and the offset of the
   sampling table are read from the file, the Huffman decoder is initialized,
   and finally the sampling table and the file info tree are loaded.
   Returns NULL if <err> is set after opening the file or after initializing
   the Huffman decoder; the partially built decoder is deleted in that case. */
static GtHcrSeqDecoder *hcr_seq_decoder_new(GtAlphabet *alpha, const char *name,
                                            GtError *err)
{
  GtHcrSeqDecoder *seq_dec = gt_malloc(sizeof (GtHcrSeqDecoder));
  GtBaseQualDistr *qual_distr = NULL;
  GtWord sampling_offset = 0;
  FILE *hcr_fp = NULL;
  GT_UNUSED size_t read, one = (size_t) 1;

  /* start from a fully defined state so that deletion is always possible */
  seq_dec->alpha = alpha;
  seq_dec->alphabet_size = gt_alphabet_size(alpha);
  seq_dec->cur_read = 0;
  seq_dec->data_iter = NULL;
  seq_dec->file_info_rbt = NULL;
  seq_dec->fileinfos = NULL;
  seq_dec->filename = gt_str_new_cstr(name);
  seq_dec->huff_dec = NULL;
  seq_dec->huffman = NULL;
  seq_dec->sampling = NULL;
  seq_dec->symbols = NULL;
  gt_str_append_cstr(seq_dec->filename, HCRFILESUFFIX);

  hcr_fp = gt_fa_fopen_with_suffix(name, HCRFILESUFFIX, "rb", err);
  if (!gt_error_is_set(err)) {
    hcr_read_file_info(seq_dec, hcr_fp);
    qual_distr = hcr_base_qual_distr_new_from_file(hcr_fp, seq_dec->alpha);
    seq_dec->qual_offset = qual_distr->qual_offset;
    read = gt_xfread_one(&sampling_offset, hcr_fp);
    gt_assert(read == one);
    seq_dec->start_of_encoding = decoder_calc_start_of_encoded_data(hcr_fp);
    seq_decoder_init_huffman(seq_dec, sampling_offset, qual_distr, err);
  }

  if (gt_error_is_set(err)) {
    /* opening the file or setting up the Huffman decoder failed */
    hcr_seq_decoder_delete(seq_dec);
    seq_dec = NULL;
  }
  else {
    gt_xfseek(hcr_fp, sampling_offset, SEEK_SET);
    seq_dec->sampling = gt_sampling_read(hcr_fp);
    seq_dec->file_info_rbt =
      seq_decoder_init_file_info(seq_dec->fileinfos, seq_dec->num_of_files);
  }

  hcr_base_qual_distr_delete(qual_distr);
  gt_fa_fclose(hcr_fp);
  return seq_dec;
}
/* Write the sequence/quality encoding of <hcr_enc> to the file
   <name><HCRFILESUFFIX>.
   Layout as written here: file info, sequence distribution table, one
   placeholder word, then the encoded sequences. The start of the encoding is
   the file position after the placeholder, rounded up to the next multiple
   of hcr_enc->pagesize if it is not already on such a boundary. After the
   sequences have been written, the placeholder is overwritten with
   seq_encoder->startofsamplingtab.
   If <timer> is not NULL, progress is reported on stdout.
   Returns 0 on success and a non-zero error code otherwise. (The previous
   implementation returned 0 unconditionally, hiding every failure.) */
static int hcr_write_seq_qual_data(const char *name, GtHcrEncoder *hcr_enc,
                                   GtTimer *timer, GtError *err)
{
  int had_err = 0;
  FILE *fp;
  GtUword dummy = 0;
  GtWord pos = 0;

  gt_error_check(err);
  fp = gt_fa_fopen_with_suffix(name, HCRFILESUFFIX, "wb", err);
  if (fp == NULL)
    return -1;

  if (timer != NULL)
    gt_timer_show_progress(timer, "write sequences and qualities encoding",
                           stdout);
  hcr_write_file_info(fp, hcr_enc);
  had_err = hcr_write_seqdistrtab(fp, hcr_enc);

  if (!had_err) {
    long after_placeholder;

    /* remember where the placeholder lives so it can be patched later */
    pos = ftell(fp);
    gt_xfwrite_one(&dummy, fp);
    after_placeholder = ftell(fp);

    if ((after_placeholder % hcr_enc->pagesize) != 0)
      hcr_enc->seq_encoder->start_of_encoding =
        (after_placeholder / hcr_enc->pagesize + 1) * hcr_enc->pagesize;
    else
      hcr_enc->seq_encoder->start_of_encoding = after_placeholder;

    if (hcr_enc->page_sampling)
      hcr_enc->seq_encoder->sampling =
        gt_sampling_new_page(hcr_enc->sampling_rate,
                             (off_t) hcr_enc->seq_encoder->start_of_encoding);
    else if (hcr_enc->regular_sampling)
      hcr_enc->seq_encoder->sampling =
        gt_sampling_new_regular(hcr_enc->sampling_rate,
                               (off_t) hcr_enc->seq_encoder->start_of_encoding);

    had_err = hcr_write_seqs(fp, hcr_enc, err);
  }
  if (!had_err) {
    /* patch the placeholder with the position of the sampling table */
    gt_xfseek(fp, pos, SEEK_SET);
    gt_xfwrite_one(&hcr_enc->seq_encoder->startofsamplingtab, fp);
  }
  gt_fa_xfclose(fp);
  return had_err;
}
/* Write <pckbuckettable> to the file <indexname><PCKBUCKETTABLE>: first the
   maximum depth as one unsigned long, then maxnumofvalues entries of type
   Mbtab starting at mbtab[0].
   Returns 0 on success and -1 (with <err> set by the open call) if the file
   cannot be opened. The stream is closed with the checking gt_fa_xfclose(),
   so that a failure to flush the written data does not go unnoticed. */
int gt_pckbuckettable_2file(const char *indexname,
                            const Pckbuckettable *pckbuckettable,
                            GtError *err)
{
  FILE *fp;
  unsigned long seqposmaxdepth;

  gt_error_check(err);
  fp = gt_fa_fopen_with_suffix(indexname,PCKBUCKETTABLE,"wb",err);
  if (fp == NULL)
  {
    return -1;
  }
  seqposmaxdepth = (unsigned long) pckbuckettable->maxdepth;
  gt_xfwrite(&seqposmaxdepth,sizeof (unsigned long),(size_t) 1,fp);
  gt_xfwrite(pckbuckettable->mbtab[0],sizeof (Mbtab),
             (size_t) pckbuckettable->maxnumofvalues,fp);
  gt_fa_xfclose(fp);
  return 0;
}
/* Store the string representation of <alphabet>, as produced by
   gt_alphabet_to_str(), in the file <indexname><GT_ALPHABETFILESUFFIX>.
   Returns 0 on success and -1 (with <err> set by the open call) if the file
   cannot be opened for writing. */
int gt_alphabet_to_file(const GtAlphabet *alphabet, const char *indexname,
                        GtError *err)
{
  FILE *outfp;
  GtStr *alphastr;

  gt_error_check(err);
  outfp = gt_fa_fopen_with_suffix(indexname,GT_ALPHABETFILESUFFIX,"wb",err);
  if (outfp == NULL)
  {
    return -1;
  }
  alphastr = gt_str_new();
  gt_alphabet_to_str(alphabet, alphastr);
  gt_xfwrite(gt_str_get(alphastr), sizeof (char),
             (size_t) gt_str_length(alphastr), outfp);
  gt_fa_xfclose(outfp);
  gt_str_delete(alphastr);
  return 0;
}
/* Open the project file <indexname><GT_PROJECTFILESUFFIX> and let
   scanprjfileuintkeysviafileptr() fill <suffixarray> from it.
   Returns true if an error occurred (file could not be opened or scanning
   reported a non-zero result) and false otherwise. */
static bool scanprjfileuintkeys(Suffixarray *suffixarray,
                                const char *indexname,
                                GtLogger *logger,
                                GtError *err)
{
  FILE *prjfp;
  bool failed;

  gt_error_check(err);
  prjfp = gt_fa_fopen_with_suffix(indexname,GT_PROJECTFILESUFFIX,"rb",err);
  if (prjfp == NULL)
  {
    failed = true;
  } else
  {
    failed = scanprjfileuintkeysviafileptr(suffixarray, indexname, logger,
                                           prjfp, err) != 0;
  }
  gt_fa_xfclose(prjfp);
  return failed;
}
/* Read a condenseq data structure from file.
   Loads the encoded sequence of unique parts for <indexname>, then reads the
   condenseq data from <indexname><GT_CONDENSEQ_FILE_SUFFIX> and rebuilds,
   for every unique entry, the array of indices of the links referring to it.
   Returns the new object, or NULL with <err> set on failure.
   The loader and the file handle are released on every path; previously the
   loader leaked when loading failed and the file stayed open when reading
   the condenseq data failed. */
GtCondenseq *gt_condenseq_new_from_file(const char *indexname,
                                        GtLogger *logger, GtError *err)
{
  int had_err = 0;
  FILE *fp;
  GtEncseqLoader *esl;
  GtEncseq *unique_es;
  GtCondenseq *condenseq = NULL;

  /* load unique_es; the loader is not needed afterwards, success or not */
  esl = gt_encseq_loader_new();
  unique_es = gt_encseq_loader_load(esl, indexname, err);
  gt_encseq_loader_delete(esl);
  if (!unique_es)
    had_err = -1;

  if (!had_err) {
    condenseq = condenseq_new_empty(gt_encseq_alphabet(unique_es));
    condenseq->filename = gt_cstr_dup(indexname);
    condenseq->unique_es = unique_es;

    fp = gt_fa_fopen_with_suffix(indexname, GT_CONDENSEQ_FILE_SUFFIX,
                                 "rb", err);
    if (fp == NULL) {
      had_err = -1;
    }
    else {
      had_err = condenseq_io(condenseq, fp, gt_io_error_fread, err);
      /* close unconditionally, also when reading failed */
      gt_fa_fclose(fp);
      if (!had_err) {
        GtUword i;
        gt_assert(condenseq->uniques);
        gt_assert(condenseq->links);
        /* create link array for each unique entry */
        for (i = 0; i < condenseq->udb_nelems; i++) {
          GT_INITARRAY(&(condenseq->uniques[i].links),uint32_t);
        }
        /* link indices are stored as uint32_t, so check for overflows */
        if (condenseq->ldb_nelems > (GtUword) ((uint32_t) 0 - (uint32_t) 1)) {
          gt_error_set(err, "Overflow, to many link-elements. Can't be stored");
          had_err = -1;
        }
        /* iterate through link entries and store ids in corresponding unique
           entry array */
        for (i = 0; !had_err && (GtUword) i < condenseq->ldb_nelems; i++) {
          GtUword uid = condenseq->links[i].unique_id;
          gt_assert(uid < condenseq->udb_nelems);
          GT_STOREINARRAY(&(condenseq->uniques[uid].links),
                          uint32_t,
                          10,
                          (uint32_t) i);
        }
      }
    }
  }
  if (!had_err) {
    gt_assert(condenseq != NULL);
    if (condenseq->id_len != GT_UNDEF_UWORD)
      gt_logger_log(logger, "IDs const len: " GT_WU, condenseq->id_len);
    else
      gt_logger_log(logger, "using sdstab to access IDs");
  }
  if (had_err) {
    gt_condenseq_delete(condenseq);
    condenseq = NULL;
  }
  return (condenseq);
}
/* Create a new GtOutlcpinfo object.
   - If <withdistribution> is true, a distribution object for the lcp values
     is created.
   - If <indexname> is NULL, nothing is written to files; if
     <final_process_bucket> is not NULL, it is stored together with
     <final_process_bucket_info> in the lcpprocess substructure.
   - Otherwise the files <indexname><GT_LCPTABSUFFIX> and
     <indexname><GT_LARGELCPTABSUFFIX> are opened for writing.
   - If <prefixlength> is positive, a turning wheel for <prefixlength> and
     <numofchars> is created.
   Returns NULL with <err> set if one of the output files cannot be opened.
   In that case everything allocated or opened so far is released;
   previously only the outer structure was freed. */
GtOutlcpinfo *gt_Outlcpinfo_new(const char *indexname,
                                unsigned int numofchars,
                                unsigned int prefixlength,
                                bool withdistribution,
                                bool swallow_tail_lcpvalues,
                                GtFinalProcessBucket final_process_bucket,
                                void *final_process_bucket_info,
                                GtError *err)
{
  bool haserr = false;
  GtOutlcpinfo *outlcpinfo;

  outlcpinfo = gt_malloc(sizeof (*outlcpinfo));
  outlcpinfo->sizeofinfo = sizeof (*outlcpinfo);
  outlcpinfo->lcpsubtab.lcptabsum = 0.0;
  outlcpinfo->swallow_tail_lcpvalues = swallow_tail_lcpvalues;
  if (withdistribution)
  {
    outlcpinfo->lcpsubtab.distlcpvalues = gt_disc_distri_new();
  } else
  {
    outlcpinfo->lcpsubtab.distlcpvalues = NULL;
  }
  if (indexname == NULL)
  {
    outlcpinfo->lcpsubtab.lcp2file = NULL;
    if (final_process_bucket != NULL)
    {
      outlcpinfo->lcpsubtab.lcpprocess
        = gt_malloc(sizeof (*outlcpinfo->lcpsubtab.lcpprocess));
      outlcpinfo->lcpsubtab.lcpprocess->final_process_bucket
        = final_process_bucket;
      outlcpinfo->lcpsubtab.lcpprocess->final_process_bucket_info
        = final_process_bucket_info;
    } else
    {
      outlcpinfo->lcpsubtab.lcpprocess = NULL;
    }
  } else
  {
    outlcpinfo->lcpsubtab.lcpprocess = NULL;
    outlcpinfo->lcpsubtab.lcp2file
      = gt_malloc(sizeof (*outlcpinfo->lcpsubtab.lcp2file));
    outlcpinfo->sizeofinfo += sizeof (*outlcpinfo->lcpsubtab.lcp2file);
    outlcpinfo->lcpsubtab.lcp2file->countoutputlcpvalues = 0;
    outlcpinfo->lcpsubtab.lcp2file->maxbranchdepth = 0;
    outlcpinfo->lcpsubtab.lcp2file->totalnumoflargelcpvalues = 0;
    outlcpinfo->lcpsubtab.lcp2file->reservoir = NULL;
    outlcpinfo->lcpsubtab.lcp2file->sizereservoir = 0;
    outlcpinfo->lcpsubtab.lcp2file->smalllcpvalues = NULL;
    GT_INITARRAY(&outlcpinfo->lcpsubtab.lcp2file->largelcpvalues,
                 Largelcpvalue);
    /* defined value in case the first open fails and the second one is
       never attempted */
    outlcpinfo->lcpsubtab.lcp2file->outfpllvtab = NULL;
    outlcpinfo->lcpsubtab.lcp2file->outfplcptab
      = gt_fa_fopen_with_suffix(indexname,GT_LCPTABSUFFIX,"wb",err);
    if (outlcpinfo->lcpsubtab.lcp2file->outfplcptab == NULL)
    {
      haserr = true;
    }
    if (!haserr)
    {
      outlcpinfo->lcpsubtab.lcp2file->outfpllvtab
        = gt_fa_fopen_with_suffix(indexname,GT_LARGELCPTABSUFFIX,"wb",err);
      if (outlcpinfo->lcpsubtab.lcp2file->outfpllvtab == NULL)
      {
        haserr = true;
      }
    }
  }
  outlcpinfo->numsuffixes2output = 0;
  outlcpinfo->minchanged = 0;
  if (!haserr && prefixlength > 0)
  {
    outlcpinfo->turnwheel = gt_turningwheel_new(prefixlength,numofchars);
    outlcpinfo->sizeofinfo += gt_turningwheel_size();
  } else
  {
    outlcpinfo->turnwheel = NULL;
  }
#ifdef SKDEBUG
  outlcpinfo->previoussuffix.startpos = 0;
#endif
  outlcpinfo->previoussuffix.code = 0;
  outlcpinfo->previoussuffix.prefixindex = 0;
  outlcpinfo->previoussuffix.defined = false;
  outlcpinfo->previousbucketwasempty = false;
  outlcpinfo->lcpsubtab.tableoflcpvalues.bucketoflcpvalues = NULL;
  outlcpinfo->lcpsubtab.tableoflcpvalues.numofentries = 0;
#ifndef NDEBUG
  outlcpinfo->lcpsubtab.tableoflcpvalues.isset = NULL;
#endif
  if (haserr)
  {
    /* errors only arise while opening the output files, but stay defensive
       about which parts exist */
    if (outlcpinfo->lcpsubtab.lcp2file != NULL)
    {
      if (outlcpinfo->lcpsubtab.lcp2file->outfplcptab != NULL)
      {
        gt_fa_fclose(outlcpinfo->lcpsubtab.lcp2file->outfplcptab);
      }
      if (outlcpinfo->lcpsubtab.lcp2file->outfpllvtab != NULL)
      {
        gt_fa_fclose(outlcpinfo->lcpsubtab.lcp2file->outfpllvtab);
      }
      gt_free(outlcpinfo->lcpsubtab.lcp2file);
    }
    if (outlcpinfo->lcpsubtab.distlcpvalues != NULL)
    {
      gt_disc_distri_delete(outlcpinfo->lcpsubtab.distlcpvalues);
    }
    gt_free(outlcpinfo);
    return NULL;
  }
  return outlcpinfo;
}
/* Runner of the genomediff tool.
   Steps, as far as visible here:
   1. The remaining command line arguments are collected as file names.
   2. With more than one file name the sequences are encoded (as DNA) into
      the index arguments->indexname. Otherwise the single file name is
      appended to the index name and, if an esa or pck index is used, the
      project file is scanned for a line "mirrored=1".
   3. The encoded sequence is loaded (mirrored if required) and the unit
      information is set up, optionally from arguments->unitfile.
   4. The shulen sums are computed, either from an existing esa/pck index or
      by running the suffixerator, and passed to gt_genomediff_kr_calc().
   Returns 0 on success and a non-zero error code otherwise.
   Fixes relative to the previous version: a project file line without '='
   no longer passes a NULL value to gt_log_log()/strcmp(), and the shulen sum
   table is released on error paths as well. */
static int gt_genomediff_runner(int argc, const char **argv,
                                int parsed_args, void *tool_arguments,
                                GtError *err)
{
  bool mirrored = false;
  int had_err = 0,
      i;
  GtEncseq *encseq = NULL;
  GtGenomediffArguments *arguments = tool_arguments;
  GtLogger *logger;
  GtShuUnitFileInfo *unit_info = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose,
                         GT_LOGGER_DEFLT_PREFIX,
                         stdout);
  gt_assert(logger);

  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("start");
    gt_timer_start(timer);
    gt_assert(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "start shu search", stdout);

  if (gt_str_array_size(arguments->filenames) > 1UL) {
    GtEncseqEncoder *ee = gt_encseq_encoder_new();
    gt_encseq_encoder_set_timer(ee, timer);
    gt_encseq_encoder_set_logger(ee, logger);
    /* kr only makes sense for dna, so we can check this already with ee */
    gt_encseq_encoder_set_input_dna(ee);
    had_err = gt_encseq_encoder_encode(ee, arguments->filenames,
                                       gt_str_get(arguments->indexname), err);
    gt_encseq_encoder_delete(ee);
  }
  else {
    gt_str_append_str(arguments->indexname,
                      gt_str_array_get_str(arguments->filenames, 0));
    if (arguments->with_esa || arguments->with_pck) {
      GtStr *current_line = gt_str_new();
      FILE *prj_fp;
      const char *buffer;
      char **elements = NULL;

      prj_fp = gt_fa_fopen_with_suffix(gt_str_get(arguments->indexname),
                                       GT_PROJECTFILESUFFIX,"rb",err);
      if (prj_fp == NULL)
        had_err = -1;
      while (!had_err &&
             gt_str_read_next_line(current_line, prj_fp) != EOF) {
        buffer = gt_str_get(current_line);
        /* release the key/value pair of the previous line */
        if (elements != NULL) {
          gt_free(elements[0]);
          gt_free(elements[1]);
        }
        gt_free(elements);
        elements = gt_cstr_split(buffer, '=');
        gt_log_log("%s", elements[0]);
        /* a line without '=' has no value part; do not touch it then */
        if (elements[1] != NULL && strcmp("mirrored", elements[0]) == 0) {
          gt_log_log("%s", elements[1]);
          if (strcmp("1", elements[1]) == 0) {
            mirrored = true;
            gt_log_log("sequences are treated as mirrored");
          }
        }
        gt_str_reset(current_line);
      }
      gt_str_delete(current_line);
      if (elements != NULL) {
        gt_free(elements[0]);
        gt_free(elements[1]);
      }
      gt_free(elements);
      gt_fa_xfclose(prj_fp);
    }
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    if (mirrored)
      gt_encseq_loader_mirror(el);
    encseq =
      gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;
  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (!had_err) {
    uint64_t **shusums = NULL;
    if (arguments->with_esa || arguments->with_pck) {
      shusums = gt_genomediff_shulen_sum(arguments, unit_info,
                                         logger, timer, err);
      if (shusums == NULL)
        had_err = -1;
    }
    else {
      const bool doesa = true;
      GenomediffInfo gd_info;
      Suffixeratoroptions sopts;
      sopts.beverbose = arguments->verbose;
      sopts.indexname = arguments->indexname;
      sopts.db = NULL;
      sopts.encopts = NULL;
      sopts.genomediff = true;
      sopts.inputindex = arguments->indexname;
      sopts.loadopts = arguments->loadopts;
      sopts.showprogress = false;
      sopts.idxopts = arguments->idxopts;

      gt_assert(unit_info != NULL);
      gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                          unit_info->num_of_genomes);
      gd_info.shulensums = shusums;
      gd_info.unit_info = unit_info;
      had_err = gt_runsuffixerator(doesa, &sopts, &gd_info, logger, err);
    }
    if (!had_err && shusums != NULL) {
      had_err = gt_genomediff_kr_calc(shusums, arguments, unit_info,
                                      arguments->with_pck, logger, timer, err);
    }
    /* release the table regardless of success to avoid leaking it when the
       computation above failed */
    if (shusums != NULL) {
      gt_array2dim_delete(shusums);
    }
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);

  return had_err;
}
/* Read the leading metadata values of the file
   <indexname><GT_ENCSEQFILESUFFIX> into <emd> and validate them:
   platform code, format version, access type, the sequence statistics, the
   alphabet type and, for a custom alphabet (type 2), the alphabet
   definition itself.
   All reads go through the NEXTFREAD/NEXTFREADWSIZE macros, which are
   defined outside this function; presumably they are no-ops once had_err is
   set and advance byteoffset -- verify against the macro definitions.
   Returns 0 on success and -1 with <err> set otherwise. */
static int readfirstvaluesfromfile(GtEncseqMetadata *emd,
                                   const char *indexname,
                                   GtError *err)
{
  FILE *fp;
  bool had_err = false;
  unsigned long cc, byteoffset = 0, alphatype;
  char *alphadef;

  gt_error_check(err);
  fp = gt_fa_fopen_with_suffix(indexname, GT_ENCSEQFILESUFFIX, "rb", err);
  if (fp == NULL)
  {
    had_err = true;
  }
  NEXTFREAD(emd->is64bit);
  if (!had_err)
  {
    if ((int) emd->is64bit > 1)
    {
      gt_error_set(err, "illegal platform code %u in \"%s%s\"", emd->is64bit,
                   indexname, GT_ENCSEQFILESUFFIX);
      had_err = true;
    }
    /* the index must have been built for the word size of this platform */
    if (!had_err && ((emd->is64bit && sizeof (unsigned long) != (size_t) 8)
          || (!emd->is64bit && sizeof (unsigned long) == (size_t) 8)))
    {
      gt_error_set(err, "trying to load 64-bit index \"%s%s\" on a 32-bit "
                        "system or vice versa -- please use correct index "
                        "for this platform", indexname, GT_ENCSEQFILESUFFIX);
      had_err = true;
    }
  }
  NEXTFREAD(emd->version);
  if (!had_err)
  {
    if (emd->version < GT_ENCSEQ_VERSION) {
      gt_error_set(err, "index \"%s%s\" is format version %lu, current is "
                        "%lu -- please re-encode",
                        indexname, GT_ENCSEQFILESUFFIX,
                        emd->version, GT_ENCSEQ_VERSION);
      had_err = true;
    }
  }
  NEXTFREAD(cc);
  if (!had_err)
  {
    if (cc >= (unsigned long) GT_ACCESS_TYPE_UNDEFINED)
    {
      gt_error_set(err, "illegal type %lu in \"%s%s\"", cc,
                   indexname, GT_ENCSEQFILESUFFIX);
      had_err = true;
    }
  }
  if (!had_err) {
    emd->sat = (GtEncseqAccessType) cc;
    NEXTFREAD(emd->totallength);
    NEXTFREAD(emd->numofdbsequences);
    NEXTFREAD(emd->numofdbfiles);
    NEXTFREAD(emd->lengthofdbfilenames);
    NEXTFREAD(emd->specialcharinfo);
    NEXTFREAD(emd->minseqlen);
    NEXTFREAD(emd->maxseqlen);
  }
  NEXTFREAD(alphatype);
  if (!had_err) {
    if (alphatype > 2UL) {
      gt_error_set(err, "illegal alphabet type %lu in \"%s%s\"", alphatype,
                   indexname, GT_ENCSEQFILESUFFIX);
      had_err = true;
    }
  }
  if (!had_err) {
    NEXTFREAD(emd->lengthofalphadef);
    switch (alphatype) {
      case 0:
        emd->alpha = gt_alphabet_new_dna();
        break;
      case 1:
        emd->alpha = gt_alphabet_new_protein();
        break;
      case 2:
        gt_assert(emd->lengthofalphadef > 0);
        emd->customalphabet = true;
        alphadef = gt_malloc(sizeof (char) * emd->lengthofalphadef);
        NEXTFREADWSIZE(*(alphadef), emd->lengthofalphadef);
        emd->alpha = gt_alphabet_new_from_string(alphadef,
                                                 emd->lengthofalphadef,
                                                 err);
        if (!emd->alpha) {
          had_err = true;
        }
        gt_free(alphadef);
        break;
    }
    /* a missing alphabet is legitimate if the failure has been recorded;
       asserting unconditionally would abort instead of returning -1 */
    gt_assert(had_err || emd->alpha != NULL);
  }
  gt_fa_xfclose(fp);
  return had_err ? -1 : 0;
}
/* Map the FM-index stored under <indexname> into <fmindex>.
   The steps are: scan the file <indexname><FMASCIIFILESUFFIX>, map the
   encoded BWT sequence, take a reference to its alphabet, compute the
   derived key values, and finally map the data file
   <indexname><FMDATAFILESUFFIX>.
   Returns 0 on success. On any failure the index is released with
   gt_freefmindex() and -1 is returned. */
int gt_mapfmindex (Fmindex *fmindex,const char *indexname,
                   GtLogger *logger,GtError *err)
{
  FILE *fmafp = NULL;
  bool haserr = false,
       storeindexpos = true;
  GtSpecialcharinfo specialcharinfo;

  gt_error_check(err);
  fmindex->mappedptr = NULL;
  fmindex->bwtformatching = NULL;
  fmindex->alphabet = NULL;

  /* step 1: header information from the ASCII file */
  fmafp = gt_fa_fopen_with_suffix(indexname,FMASCIIFILESUFFIX,"rb",err);
  if (fmafp == NULL)
  {
    haserr = true;
  } else
  {
    if (scanfmafileviafileptr(fmindex,
                              &specialcharinfo,
                              &storeindexpos,
                              indexname,
                              fmafp,
                              logger,
                              err) != 0)
    {
      haserr = true;
    }
  }
  gt_fa_xfclose(fmafp);

  /* step 2: the encoded BWT */
  if (!haserr)
  {
    fmindex->bwtformatching = mapbwtencoding(indexname,logger,err);
    haserr = (fmindex->bwtformatching == NULL);
  }

  /* step 3: special positions bookkeeping and alphabet */
  if (!haserr)
  {
    fmindex->specpos.nextfreeGtPairBwtidx
      = (unsigned long) gt_determinenumberofspecialstostore(&specialcharinfo);
    fmindex->specpos.spaceGtPairBwtidx = NULL;
    fmindex->specpos.allocatedGtPairBwtidx = 0;
    fmindex->alphabet
      = gt_alphabet_ref(gt_encseq_alphabet(fmindex->bwtformatching));
    haserr = (fmindex->alphabet == NULL);
  }

  /* step 4: derived values and the mapped data file */
  if (!haserr)
  {
    GtStr *datafilename;

    gt_computefmkeyvalues (fmindex,
                           &specialcharinfo,
                           fmindex->bwtlength,
                           fmindex->log2bsize,
                           fmindex->log2markdist,
                           gt_alphabet_num_of_chars(fmindex->alphabet),
                           fmindex->suffixlength,
                           storeindexpos);
    datafilename = gt_str_new_cstr(indexname);
    gt_str_append_cstr(datafilename,FMDATAFILESUFFIX);
    if (gt_fillfmmapspecstartptr(fmindex,storeindexpos,datafilename,err) != 0)
    {
      haserr = true;
    }
    gt_str_delete(datafilename);
  }

  if (haserr)
  {
    gt_freefmindex(fmindex);
    return -1;
  }
  return 0;
}
/* Extract the keys from the description file
   <indexname><GT_DESTABFILESUFFIX>, one key per line (via desc2key()).
   All keys must be non-empty and of identical length.

   If <sortkeys> is false, the keys must appear in strictly increasing
   lexicographic order; they are written to
   <indexname><GT_KEYSTABFILESUFFIX> as one byte holding the key length
   followed by the NUL-terminated keys.

   If <sortkeys> is true, the encoded sequence for <indexname> is loaded,
   the keys are collected together with their line numbers, sorted with
   compareFixedkeys(), and the corresponding sequences are written to stdout
   in the sorted order.

   Returns 0 on success and -1 with <err> set otherwise.
   In sort mode the number of description lines is checked against the
   number of sequences of the index: too many lines would previously have
   written beyond the end of the key table, and a mismatch was only caught by
   assertions. An empty description file results in no output. */
int gt_extractkeysfromdesfile(const char *indexname,
                              bool sortkeys,
                              GtLogger *logger,
                              GtError *err)
{
  FILE *fpin, *fpout = NULL;
  GtStr *line = NULL;
  const char *keyptr;
  unsigned long keylen, constantkeylen = 0, linenum;
  bool haserr = false, firstdesc = true;
  char *previouskey = NULL;
  Fixedsizekey *keytab = NULL, *keytabptr = NULL;
  GtEncseq *encseq = NULL;
  unsigned long numofentries = 0;
  const unsigned long linewidth = 60UL;

  fpin = gt_fa_fopen_with_suffix(indexname,GT_DESTABFILESUFFIX,"rb",err);
  if (fpin == NULL)
  {
    return -1;
  }
  if (!sortkeys)
  {
    fpout = gt_fa_fopen_with_suffix(indexname,GT_KEYSTABFILESUFFIX,"wb",err);
    if (fpout == NULL)
    {
      haserr = true;
    }
  }
  if (!haserr)
  {
    line = gt_str_new();
  }
  for (linenum = 0;
       !haserr && gt_str_read_next_line(line, fpin) != EOF;
       linenum++)
  {
    keyptr = desc2key(&keylen,gt_str_get(line),err);
    if (keyptr == NULL)
    {
      haserr = true;
      break;
    }
    if (keylen == 0)
    {
      gt_error_set(err,"key of length 0 in \"%s\" not expected",
                   gt_str_get(line));
      haserr = true;
      break;
    }
    if (firstdesc)
    {
      /* the first key determines the length all other keys must have; it is
         stored in a single char in the keys file */
      if (keylen > (unsigned long) CHAR_MAX)
      {
        gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                         "no key must be larger than %d",
                     (int) keylen,(int) keylen,keyptr,keylen,CHAR_MAX);
        haserr = true;
        break;
      }
      constantkeylen = keylen;
      previouskey = gt_malloc(sizeof (char) * (constantkeylen+1));
      firstdesc = false;
      if (!sortkeys)
      {
        gt_xfputc((char) constantkeylen,fpout);
      } else
      {
        GtEncseqLoader *el;

        if (constantkeylen > (unsigned long) MAXFIXEDKEYSIZE)
        {
          gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                           "no key must be larger than %d",
                       (int) keylen,(int) keylen,keyptr,keylen,
                       MAXFIXEDKEYSIZE);
          haserr = true;
          break;
        }
        el = gt_encseq_loader_new();
        gt_encseq_loader_set_logger(el, logger);
        encseq = gt_encseq_loader_load(el, indexname, err);
        gt_encseq_loader_delete(el);
        if (encseq == NULL)
        {
          haserr = true;
          break;
        }
        numofentries = gt_encseq_num_of_sequences(encseq);
        gt_assert(numofentries > 0);
        keytab = gt_malloc(sizeof (*keytab) * numofentries);
        keytabptr = keytab;
      }
    } else
    {
      if (constantkeylen != keylen)
      {
        gt_error_set(err,"key \"%*.*s\" of length %lu: all keys must be of "
                         "the same length which for all previously seen "
                         "headers is %lu",
                     (int) keylen,(int) keylen,keyptr,keylen,
                     constantkeylen);
        haserr = true;
        break;
      }
      gt_assert(previouskey != NULL);
      if (!sortkeys &&
          strncmp(previouskey,keyptr,(size_t) constantkeylen) >= 0)
      {
        gt_error_set(err,"previous key \"%s\" is not lexicographically smaller "
                         "than current key \"%*.*s\"",
                     previouskey,(int) keylen,(int) keylen,keyptr);
        haserr = true;
        break;
      }
    }
    if (!sortkeys)
    {
      gt_xfwrite(keyptr,sizeof *keyptr,(size_t) keylen,fpout);
      gt_xfputc('\0',fpout);
    } else
    {
      gt_assert(keytabptr != NULL);
      /* the table has room for exactly one key per sequence of the index */
      if (keytabptr >= keytab + numofentries)
      {
        gt_error_set(err,"file \"%s%s\" contains more descriptions than the "
                         "%lu sequences of the index",
                     indexname,GT_DESTABFILESUFFIX,numofentries);
        haserr = true;
        break;
      }
      strncpy(keytabptr->key,keyptr,(size_t) constantkeylen);
      keytabptr->key[constantkeylen] = '\0';
      keytabptr->seqnum = linenum;
      keytabptr++;
    }
    strncpy(previouskey,keyptr,(size_t) constantkeylen);
    previouskey[constantkeylen] = '\0';
    gt_str_reset(line);
  }
  if (!haserr)
  {
    gt_logger_log(logger,"number of keys of length %lu = %lu",
                  constantkeylen,linenum);
  }
  gt_str_delete(line);
  gt_fa_fclose(fpin);
  gt_fa_fclose(fpout);
  gt_free(previouskey);
  /* keytab is NULL if the description file contained no line at all */
  if (!haserr && sortkeys && keytab != NULL)
  {
    gt_assert(numofentries > 0);
    if (keytabptr != keytab + numofentries)
    {
      /* sorting a partially filled table would use undefined entries */
      gt_error_set(err,"file \"%s%s\" contains %lu descriptions, but the "
                       "index contains %lu sequences",
                   indexname,GT_DESTABFILESUFFIX,
                   (unsigned long) (keytabptr - keytab),numofentries);
      haserr = true;
    } else
    {
      qsort(keytab,(size_t) numofentries,sizeof (*keytab),compareFixedkeys);
      for (keytabptr = keytab;
           !haserr && keytabptr < keytab + numofentries;
           keytabptr++)
      {
        if (giextract_encodedseq2fasta(stdout,
                                       encseq,
                                       keytabptr->seqnum,
                                       NULL,
                                       linewidth,
                                       err) != 0)
        {
          haserr = true;
          break;
        }
      }
    }
  }
  if (encseq != NULL)
  {
    gt_encseq_delete(encseq);
    encseq = NULL;
  }
  gt_free(keytab);
  return haserr ? -1 : 0;
}
static int enumeratelcpintervals(const char *inputindex, Sequentialsuffixarrayreader *ssar, const char *storeindex, bool storecounts, GtUword mersize, GtUword minocc, GtUword maxocc, bool performtest, GtLogger *logger, GtError *err) { TyrDfsstate *state; bool haserr = false; unsigned int alphasize; gt_error_check(err); state = gt_malloc(sizeof (*state)); GT_INITARRAY(&state->occdistribution,Countwithpositions); state->esrspace = gt_encseq_create_reader_with_readmode( gt_encseqSequentialsuffixarrayreader(ssar), gt_readmodeSequentialsuffixarrayreader(ssar), 0); state->mersize = (GtUword) mersize; state->encseq = gt_encseqSequentialsuffixarrayreader(ssar); alphasize = gt_alphabet_num_of_chars(gt_encseq_alphabet(state->encseq)); state->readmode = gt_readmodeSequentialsuffixarrayreader(ssar); state->storecounts = storecounts; state->minocc = minocc; state->maxocc = maxocc; state->totallength = gt_encseq_total_length(state->encseq); state->performtest = performtest; state->countoutputmers = 0; state->merindexfpout = NULL; state->countsfilefpout = NULL; GT_INITARRAY(&state->largecounts,Largecount); if (strlen(storeindex) == 0) { state->sizeofbuffer = 0; state->bytebuffer = NULL; } else { state->sizeofbuffer = MERBYTES(mersize); state->bytebuffer = gt_malloc(sizeof *state->bytebuffer * state->sizeofbuffer); } if (performtest) { state->currentmer = gt_malloc(sizeof *state->currentmer * state->mersize); state->suftab = gt_suftabSequentialsuffixarrayreader(ssar); } else { state->currentmer = NULL; state->suftab = NULL; } if (state->mersize > state->totallength) { gt_error_set(err,"mersize "GT_WU" > "GT_WU" = totallength not allowed", state->mersize, state->totallength); haserr = true; } else { if (strlen(storeindex) == 0) { state->processoccurrencecount = adddistpos2distribution; } else { state->merindexfpout = gt_fa_fopen_with_suffix(storeindex,MERSUFFIX, "wb",err); if (state->merindexfpout == NULL) { haserr = true; } else { if (state->storecounts) { state->countsfilefpout = 
gt_fa_fopen_with_suffix(storeindex,COUNTSSUFFIX,"wb",err); if (state->countsfilefpout == NULL) { haserr = true; } } } state->processoccurrencecount = outputsortedstring2index; } if (!haserr) { if (gt_depthfirstesa(ssar, tyr_allocateDfsinfo, tyr_freeDfsinfo, tyr_processleafedge, NULL, tyr_processcompletenode, tyr_assignleftmostleaf, tyr_assignrightmostleaf, (Dfsstate*) state, logger, err) != 0) { haserr = true; } if (strlen(storeindex) == 0) { showfinalstatistics(state,inputindex,logger); } } if (!haserr) { if (state->countsfilefpout != NULL) { gt_logger_log(logger,"write "GT_WU" mercounts > "GT_WU " to file \"%s%s\"", state->largecounts.nextfreeLargecount, (GtUword) MAXSMALLMERCOUNT, storeindex, COUNTSSUFFIX); gt_xfwrite(state->largecounts.spaceLargecount, sizeof (Largecount), (size_t) state->largecounts.nextfreeLargecount, state->countsfilefpout); } } if (!haserr) { gt_logger_log(logger,"number of "GT_WU"-mers in index: "GT_WU"", mersize, state->countoutputmers); gt_logger_log(logger,"index size: %.2f megabytes\n", GT_MEGABYTES(state->countoutputmers * state->sizeofbuffer + sizeof (GtUword) * EXTRAINTEGERS)); } } /* now out EXTRAINTEGERS integer values */ if (!haserr && state->merindexfpout != NULL) { outputbytewiseUlongvalue(state->merindexfpout, (GtUword) state->mersize); outputbytewiseUlongvalue(state->merindexfpout,(GtUword) alphasize); } gt_fa_xfclose(state->merindexfpout); gt_fa_xfclose(state->countsfilefpout); GT_FREEARRAY(&state->occdistribution,Countwithpositions); gt_free(state->currentmer); gt_free(state->bytebuffer); GT_FREEARRAY(&state->largecounts,Largecount); gt_encseq_reader_delete(state->esrspace); gt_free(state); return haserr ? -1 : 0; }