Example #1
0
/*
  Read the first byte of the file <indexname><GT_KEYSTABFILESUFFIX> and
  return it as an int.
  Returns -1 if the file cannot be opened (<err> is handed to the open
  helper), if the read fails, or if the file does not contain a single byte;
  in the latter two cases <err> is set here.
*/
static int readkeysize(const char *indexname,GtError *err)
{
  FILE *fp;
  bool haserr = false;
  char cc = 0; /* defined value even if nothing could be read */

  gt_error_check(err);
  fp = gt_fa_fopen_with_suffix(indexname,GT_KEYSTABFILESUFFIX,"rb",err);
  if (fp == NULL)
  {
    haserr = true;
  }
  if (!haserr)
  {
    size_t ret;

    ret = fread(&cc,sizeof cc, (size_t) 1, fp);
    if (ferror(fp))
    {
      gt_error_set(err,"error when trying to read first byte of file %s%s: %s",
                   indexname,GT_KEYSTABFILESUFFIX,strerror(errno));
      haserr = true;
    } else if (ret != (size_t) 1)
    {
      /* short read without a stream error: the file is empty */
      gt_error_set(err,"cannot read first byte of file %s%s: file is empty",
                   indexname,GT_KEYSTABFILESUFFIX);
      haserr = true;
    }
  }
  if (!haserr)
  {
    /* only meaningful when a byte was actually read */
    gt_assert(cc >= 0);
  }
  if (fp != NULL)
  {
    gt_fa_xfclose(fp);
  }
  return haserr ? -1 : (int) cc;
}
Example #2
0
/*
  Open <indexname><GT_PROJECTFILESUFFIX> for binary writing and let
  showprjinfo() write the given project values to it.
  Returns 0 on success and -1 if the file could not be opened (<err> is
  handed to the open helper).
*/
int gt_outprjfile(const char *indexname,
                  GtReadmode readmode,
                  const GtEncseq *encseq,
                  GtUword numberofallsortedsuffixes,
                  unsigned int prefixlength,
                  GtUword numoflargelcpvalues,
                  double averagelcp,
                  GtUword maxbranchdepth,
                  const Definedunsignedlong *longest,
                  GtError *err)
{
  FILE *outfp;

  gt_error_check(err);
  outfp = gt_fa_fopen_with_suffix(indexname,GT_PROJECTFILESUFFIX,"wb",err);
  if (outfp == NULL)
  {
    /* nothing was opened, so there is nothing to close */
    return -1;
  }
  showprjinfo(outfp,
              readmode,
              encseq,
              numberofallsortedsuffixes,
              prefixlength,
              numoflargelcpvalues,
              averagelcp,
              maxbranchdepth,
              longest);
  gt_fa_xfclose(outfp);
  return 0;
}
Example #3
0
int gt_hcr_decoder_decode_range(GtHcrDecoder *hcr_dec, const char *name,
                                GtUword start, GtUword end,
                                GtTimer *timer, GtError *err)
{
    char qual[BUFSIZ] = {0},
                        seq[BUFSIZ] = {0};
    GtStr *desc = gt_str_new();
    int had_err = 0;
    GtUword cur_width,
            cur_read;
    size_t i;
    FILE *output;
    GT_UNUSED GtHcrSeqDecoder *seq_dec;

    gt_error_check(err);
    gt_assert(hcr_dec && name);
    seq_dec = hcr_dec->seq_dec;
    gt_assert(start <= end);
    gt_assert(start < seq_dec->num_of_reads && end < seq_dec->num_of_reads);
    if (timer != NULL)
        gt_timer_show_progress(timer, "decode hcr", stdout);
    output = gt_fa_fopen_with_suffix(name, HCRFILEDECODEDSUFFIX, "w", err);
    if (output == NULL)
        had_err = -1;

    for (cur_read = start; had_err == 0 && cur_read <= end; cur_read++) {
        if (gt_hcr_decoder_decode(hcr_dec, cur_read, seq, qual, desc, err) != 0)
            had_err = -1;
        else {
            gt_xfputc(HCR_DESCSEPSEQ, output);
            if (hcr_dec->encdesc != NULL)
                gt_xfputs(gt_str_get(desc), output);
            else
                fprintf(output, ""GT_WU"", cur_read);
            gt_xfputc('\n', output);
            for (i = 0, cur_width = 0; i < strlen(seq); i++, cur_width++) {
                if (cur_width == HCR_LINEWIDTH) {
                    cur_width = 0;
                    gt_xfputc('\n', output);
                }
                gt_xfputc(seq[i], output);
            }
            gt_xfputc('\n', output);
            gt_xfputc(HCR_DESCSEPQUAL, output);
            gt_xfputc('\n', output);
            for (i = 0, cur_width = 0; i < strlen(qual); i++, cur_width++) {
                if (cur_width == HCR_LINEWIDTH) {
                    cur_width = 0;
                    gt_xfputc('\n', output);
                }
                gt_xfputc(qual[i], output);
            }
            gt_xfputc('\n', output);
        }
    }
    gt_fa_xfclose(output);
    gt_str_delete(desc);
    return had_err;
}
Example #4
0
/* Create a sequence decoder for the file <name><HCRFILESUFFIX>.
   The file info, the base/quality distribution, the offset of the sampling
   table and the start of the encoded data are read from the file in that
   order, the Huffman data is initialised, and finally the sampling is read
   from the stored offset.
   Returns the new decoder, or NULL if <err> is set after opening the file or
   after initialising the Huffman data. */
static GtHcrSeqDecoder *hcr_seq_decoder_new(GtAlphabet *alpha, const char *name,
        GtError *err)
{
    GtHcrSeqDecoder *decoder = gt_malloc(sizeof (GtHcrSeqDecoder));
    GtBaseQualDistr *distr = NULL;
    GtWord sampling_offset = 0;
    FILE *infile = NULL;
    GT_UNUSED size_t items_read;

    /* start from a fully defined, empty decoder */
    decoder->alpha = alpha;
    decoder->alphabet_size = gt_alphabet_size(alpha);
    decoder->cur_read = 0;
    decoder->data_iter = NULL;
    decoder->file_info_rbt = NULL;
    decoder->fileinfos = NULL;
    decoder->filename = gt_str_new_cstr(name);
    decoder->huff_dec = NULL;
    decoder->huffman = NULL;
    decoder->sampling = NULL;
    decoder->symbols = NULL;
    gt_str_append_cstr(decoder->filename, HCRFILESUFFIX);

    infile = gt_fa_fopen_with_suffix(name, HCRFILESUFFIX, "rb", err);
    if (!gt_error_is_set(err)) {
        /* the reads below must happen in exactly this order */
        hcr_read_file_info(decoder, infile);

        distr = hcr_base_qual_distr_new_from_file(infile, decoder->alpha);
        decoder->qual_offset = distr->qual_offset;

        items_read = gt_xfread_one(&sampling_offset, infile);
        gt_assert(items_read == (size_t) 1);

        decoder->start_of_encoding = decoder_calc_start_of_encoded_data(infile);

        seq_decoder_init_huffman(decoder, sampling_offset, distr, err);
    }

    if (gt_error_is_set(err)) {
        /* either opening the file or the Huffman initialisation failed */
        hcr_seq_decoder_delete(decoder);
        decoder = NULL;
    }
    else {
        gt_xfseek(infile, sampling_offset, SEEK_SET);
        decoder->sampling = gt_sampling_read(infile);

        decoder->file_info_rbt = seq_decoder_init_file_info(decoder->fileinfos,
                                 decoder->num_of_files);
    }

    hcr_base_qual_distr_delete(distr);
    gt_fa_fclose(infile);
    return decoder;
}
Example #5
0
/* Write the sequence/quality encoding of <hcr_enc> to the file
   <name><HCRFILESUFFIX>: file info, the distribution table, a placeholder
   word, and the encoded sequences. After the sequences have been written,
   the placeholder is overwritten with seq_encoder->startofsamplingtab.
   The start of the encoding is set to the next multiple of
   hcr_enc->pagesize at or after the position following the placeholder.
   If <timer> is not NULL, a progress message is shown on stdout.
   Returns 0 on success and a non-zero value if the file cannot be opened or
   one of the write steps reports failure. */
static int hcr_write_seq_qual_data(const char *name, GtHcrEncoder *hcr_enc,
                                   GtTimer *timer, GtError *err)
{
    int had_err = 0;
    FILE *fp;
    GtUword dummy = 0;
    GtWord pos = 0;

    gt_error_check(err);

    fp = gt_fa_fopen_with_suffix(name, HCRFILESUFFIX, "wb", err);
    if (fp == NULL)
        had_err = -1;

    if (!had_err) {
        if (timer != NULL)
            gt_timer_show_progress(timer, "write sequences and qualities encoding",
                                   stdout);
        hcr_write_file_info(fp, hcr_enc);

        had_err = hcr_write_seqdistrtab(fp, hcr_enc);

        if (!had_err) {
            bool is_not_at_pageborder;

            /* remember where the placeholder lives so it can be patched later */
            pos = ftell(fp);
            gt_xfwrite_one(&dummy, fp);

            is_not_at_pageborder = (ftell(fp) % hcr_enc->pagesize) != 0;

            if (is_not_at_pageborder)
                hcr_enc->seq_encoder->start_of_encoding =
                    (ftell(fp) / hcr_enc->pagesize + 1) * hcr_enc->pagesize;
            else
                hcr_enc->seq_encoder->start_of_encoding = ftell(fp);

            if (hcr_enc->page_sampling)
                hcr_enc->seq_encoder->sampling =
                    gt_sampling_new_page(hcr_enc->sampling_rate,
                                         (off_t) hcr_enc->seq_encoder->start_of_encoding);
            else if (hcr_enc->regular_sampling)
                hcr_enc->seq_encoder->sampling =
                    gt_sampling_new_regular(hcr_enc->sampling_rate,
                                            (off_t) hcr_enc->seq_encoder->start_of_encoding);

            had_err = hcr_write_seqs(fp, hcr_enc, err);
        }
        if (!had_err) {
            gt_assert(fp);
            /* replace the placeholder by the real sampling table offset */
            gt_xfseek(fp, pos, SEEK_SET);
            gt_xfwrite_one(&hcr_enc->seq_encoder->startofsamplingtab, fp);
        }
        gt_fa_xfclose(fp);
    }
    /* report failures to the caller instead of always claiming success */
    return had_err;
}
Example #6
0
/*
  Write <pckbuckettable> to the file <indexname><PCKBUCKETTABLE>: first
  maxdepth as one unsigned long, then maxnumofvalues entries of type Mbtab
  starting at mbtab[0].
  Returns 0 on success and -1 if the file could not be opened (<err> is
  handed to the open helper).
*/
int gt_pckbuckettable_2file(const char *indexname,
                            const Pckbuckettable *pckbuckettable,
                            GtError *err)
{
  FILE *fp;
  unsigned long seqposmaxdepth;

  gt_error_check(err);
  fp = gt_fa_fopen_with_suffix(indexname,PCKBUCKETTABLE,"wb",err);
  if (fp == NULL)
  {
    return -1;
  }
  seqposmaxdepth = (unsigned long) pckbuckettable->maxdepth;
  gt_xfwrite(&seqposmaxdepth,sizeof (unsigned long),(size_t) 1,fp);
  gt_xfwrite(pckbuckettable->mbtab[0],sizeof (Mbtab),
             (size_t) pckbuckettable->maxnumofvalues,fp);
  /* buffered data is flushed on close, so use the checking close variant as
     the other table writers do; a plain close would hide a failed flush */
  gt_fa_xfclose(fp);
  return 0;
}
Example #7
0
/*
  Write the string representation of <alphabet>, as produced by
  gt_alphabet_to_str(), to the file <indexname><GT_ALPHABETFILESUFFIX>.
  Returns 0 on success and -1 if the file could not be opened (<err> is
  handed to the open helper).
*/
int gt_alphabet_to_file(const GtAlphabet *alphabet, const char *indexname,
                        GtError *err)
{
  FILE *outfp;
  GtStr *alphastr;

  gt_error_check(err);
  outfp = gt_fa_fopen_with_suffix(indexname,GT_ALPHABETFILESUFFIX,"wb",err);
  if (outfp == NULL)
  {
    return -1;
  }
  alphastr = gt_str_new();
  gt_alphabet_to_str(alphabet, alphastr);
  gt_xfwrite(gt_str_get(alphastr), sizeof (char),
             (size_t) gt_str_length(alphastr), outfp);
  gt_fa_xfclose(outfp);
  gt_str_delete(alphastr);
  return 0;
}
Example #8
0
/*
  Open the project file <indexname><GT_PROJECTFILESUFFIX> and let
  scanprjfileuintkeysviafileptr() process it for <suffixarray>.
  Returns true if an error occurred (the file could not be opened or the
  scan returned a non-zero value), false otherwise.
*/
static bool scanprjfileuintkeys(Suffixarray *suffixarray,
                                const char *indexname,
                                GtLogger *logger,
                                GtError *err)
{
  FILE *prjfp;
  bool failed;

  gt_error_check(err);
  prjfp = gt_fa_fopen_with_suffix(indexname,GT_PROJECTFILESUFFIX,"rb",err);
  /* short-circuit: the scan only runs when the file was opened */
  failed = (prjfp == NULL)
           || scanprjfileuintkeysviafileptr(suffixarray,indexname,logger,
                                            prjfp,err) != 0;
  gt_fa_xfclose(prjfp);
  return failed;
}
/* Read a condenseq data structure from file.
   The unique encoded sequence is loaded from <indexname>, the remaining
   data is read from <indexname><GT_CONDENSEQ_FILE_SUFFIX> via condenseq_io().
   Afterwards every unique entry gets an array holding the indices of the
   link entries that refer to it.
   Returns the new object, or NULL on failure. */
GtCondenseq *gt_condenseq_new_from_file(const char *indexname,
                                        GtLogger *logger, GtError *err)
{
  int had_err = 0;
  FILE* fp;
  GtEncseqLoader *esl;
  GtEncseq *unique_es;
  GtCondenseq *condenseq = NULL;
  /* load unique_es */
  esl = gt_encseq_loader_new();
  unique_es = gt_encseq_loader_load(esl, indexname, err);
  /* the loader is not used again, so release it whether or not loading
     succeeded; otherwise it leaks on the failure path */
  gt_encseq_loader_delete(esl);
  if (!unique_es)
    had_err = -1;
  if (!had_err) {
    condenseq = condenseq_new_empty(gt_encseq_alphabet(unique_es));
    condenseq->filename = gt_cstr_dup(indexname);
    condenseq->unique_es = unique_es;
    fp = gt_fa_fopen_with_suffix(indexname, GT_CONDENSEQ_FILE_SUFFIX,
                                 "rb", err);
    if (fp == NULL) {
      had_err = -1;
    }
    else {
      had_err = condenseq_io(condenseq, fp, gt_io_error_fread, err);
      /* close the stream on success and on failure */
      gt_fa_fclose(fp);
      if (!had_err) {
        GtUword i;
        gt_assert(condenseq->uniques);
        gt_assert(condenseq->links);
        /* create link array for each unique entry */
        for (i = 0; i < condenseq->udb_nelems; i++) {
          GT_INITARRAY(&(condenseq->uniques[i].links),uint32_t);
        }
        /* check for overflows: link indices are stored as uint32_t */
        if (condenseq->ldb_nelems > (GtUword) ((uint32_t) 0 - (uint32_t) 1)) {
          gt_error_set(err, "Overflow, to many link-elements. Can't be stored");
          had_err = -1;
        }
        /* iterate through link entries and store ids in corresponding unique
           entry array */
        for (i = 0; !had_err && (GtUword) i < condenseq->ldb_nelems; i++) {
          GtUword uid = condenseq->links[i].unique_id;
          gt_assert(uid < condenseq->udb_nelems);
          GT_STOREINARRAY(&(condenseq->uniques[uid].links),
                          uint32_t,
                          10,
                          (uint32_t) i);
        }
      }
    }
  }
  if (!had_err) {
    gt_assert(condenseq != NULL);
    if (condenseq->id_len != GT_UNDEF_UWORD)
      gt_logger_log(logger, "IDs const len: " GT_WU, condenseq->id_len);
    else
      gt_logger_log(logger, "using sdstab to access IDs");
  }
  if (had_err) {
    gt_condenseq_delete(condenseq);
    condenseq = NULL;
  }
  return (condenseq);
}
Example #10
0
/*
  Create a new GtOutlcpinfo object.
  If <indexname> is NULL, no files are written; if additionally
  <final_process_bucket> is not NULL, it is stored together with
  <final_process_bucket_info> in the lcpprocess member.
  If <indexname> is not NULL, the files <indexname><GT_LCPTABSUFFIX> and
  <indexname><GT_LARGELCPTABSUFFIX> are opened for binary writing.
  If <withdistribution> is true, a distribution object is created.
  If <prefixlength> is greater than 0, a turning wheel is created for
  <prefixlength> and <numofchars>.
  Returns the new object, or NULL if one of the files cannot be opened; in
  that case everything allocated or opened so far is released.
*/
GtOutlcpinfo *gt_Outlcpinfo_new(const char *indexname,
                                unsigned int numofchars,
                                unsigned int prefixlength,
                                bool withdistribution,
                                bool swallow_tail_lcpvalues,
                                GtFinalProcessBucket final_process_bucket,
                                void *final_process_bucket_info,
                                GtError *err)
{
  bool haserr = false;
  GtOutlcpinfo *outlcpinfo;

  outlcpinfo = gt_malloc(sizeof (*outlcpinfo));
  outlcpinfo->sizeofinfo = sizeof (*outlcpinfo);
  outlcpinfo->lcpsubtab.lcptabsum = 0.0;
  outlcpinfo->swallow_tail_lcpvalues = swallow_tail_lcpvalues;
  if (withdistribution)
  {
    outlcpinfo->lcpsubtab.distlcpvalues = gt_disc_distri_new();
  } else
  {
    outlcpinfo->lcpsubtab.distlcpvalues = NULL;
  }
  if (indexname == NULL)
  {
    outlcpinfo->lcpsubtab.lcp2file = NULL;
    if (final_process_bucket != NULL)
    {
      outlcpinfo->lcpsubtab.lcpprocess
        = gt_malloc(sizeof (*outlcpinfo->lcpsubtab.lcpprocess));
      outlcpinfo->lcpsubtab.lcpprocess->final_process_bucket
        = final_process_bucket;
      outlcpinfo->lcpsubtab.lcpprocess->final_process_bucket_info
        = final_process_bucket_info;
    } else
    {
      outlcpinfo->lcpsubtab.lcpprocess = NULL;
    }
  } else
  {
    outlcpinfo->lcpsubtab.lcpprocess = NULL;
    outlcpinfo->lcpsubtab.lcp2file
      = gt_malloc(sizeof (*outlcpinfo->lcpsubtab.lcp2file));
    outlcpinfo->sizeofinfo += sizeof (*outlcpinfo->lcpsubtab.lcp2file);
    outlcpinfo->lcpsubtab.lcp2file->countoutputlcpvalues = 0;
    outlcpinfo->lcpsubtab.lcp2file->maxbranchdepth = 0;
    outlcpinfo->lcpsubtab.lcp2file->totalnumoflargelcpvalues = 0;
    outlcpinfo->lcpsubtab.lcp2file->reservoir = NULL;
    outlcpinfo->lcpsubtab.lcp2file->sizereservoir = 0;
    outlcpinfo->lcpsubtab.lcp2file->smalllcpvalues = NULL;
    GT_INITARRAY(&outlcpinfo->lcpsubtab.lcp2file->largelcpvalues,
                 Largelcpvalue);
    outlcpinfo->lcpsubtab.lcp2file->outfpllvtab = NULL;
    outlcpinfo->lcpsubtab.lcp2file->outfplcptab
      = gt_fa_fopen_with_suffix(indexname,GT_LCPTABSUFFIX,"wb",err);
    if (outlcpinfo->lcpsubtab.lcp2file->outfplcptab == NULL)
    {
      haserr = true;
    }
    if (!haserr)
    {
      outlcpinfo->lcpsubtab.lcp2file->outfpllvtab
        = gt_fa_fopen_with_suffix(indexname,GT_LARGELCPTABSUFFIX,"wb",err);
      if (outlcpinfo->lcpsubtab.lcp2file->outfpllvtab == NULL)
      {
        haserr = true;
      }
    }
  }
  if (haserr)
  {
    /* haserr is only set in the file branch above, so lcp2file is allocated
       and lcpprocess is NULL here; release everything created so far
       instead of only the outer struct */
    if (outlcpinfo->lcpsubtab.lcp2file->outfplcptab != NULL)
    {
      /* the second open failed while the first stream is still open */
      gt_fa_fclose(outlcpinfo->lcpsubtab.lcp2file->outfplcptab);
    }
    gt_free(outlcpinfo->lcpsubtab.lcp2file);
    if (outlcpinfo->lcpsubtab.distlcpvalues != NULL)
    {
      gt_disc_distri_delete(outlcpinfo->lcpsubtab.distlcpvalues);
    }
    gt_free(outlcpinfo);
    return NULL;
  }
  outlcpinfo->numsuffixes2output = 0;
  outlcpinfo->minchanged = 0;
  if (prefixlength > 0)
  {
    outlcpinfo->turnwheel = gt_turningwheel_new(prefixlength,numofchars);
    outlcpinfo->sizeofinfo += gt_turningwheel_size();
  } else
  {
    outlcpinfo->turnwheel = NULL;
  }
#ifdef SKDEBUG
  outlcpinfo->previoussuffix.startpos = 0;
#endif
  outlcpinfo->previoussuffix.code = 0;
  outlcpinfo->previoussuffix.prefixindex = 0;
  outlcpinfo->previoussuffix.defined = false;
  outlcpinfo->previousbucketwasempty = false;
  outlcpinfo->lcpsubtab.tableoflcpvalues.bucketoflcpvalues = NULL;
  outlcpinfo->lcpsubtab.tableoflcpvalues.numofentries = 0;
#ifndef NDEBUG
  outlcpinfo->lcpsubtab.tableoflcpvalues.isset = NULL;
#endif
  return outlcpinfo;
}
Example #11
0
/* Runner of the genomediff tool.
   The arguments argv[parsed_args..argc-1] are added to
   arguments->filenames. With more than one file name, the files are encoded
   as DNA into the index arguments->indexname; otherwise the single file name
   is used as index name and, if an esa or pck index is requested, the
   project file of the index is scanned for a "mirrored=1" entry.
   The encoded sequence is then loaded, the unit information is set up, the
   shulen sums are computed (from an existing index or by running the
   suffixerator) and passed to gt_genomediff_kr_calc().
   Returns 0 on success and a non-zero value on failure. */
static int gt_genomediff_runner(int argc, const char **argv,
                                int parsed_args, void *tool_arguments,
                                GtError *err)
{
    bool mirrored = false;
    int had_err = 0,
        i;
    GtEncseq              *encseq = NULL;
    GtGenomediffArguments *arguments = tool_arguments;
    GtLogger              *logger;
    GtShuUnitFileInfo     *unit_info = NULL;
    GtTimer               *timer = NULL;

    gt_error_check(err);
    gt_assert(arguments);

    logger = gt_logger_new(arguments->verbose,
                           GT_LOGGER_DEFLT_PREFIX,
                           stdout);
    gt_assert(logger);

    for (i = parsed_args; i < argc; i++) {
        gt_str_array_add_cstr(arguments->filenames, argv[i]);
    }

    if (gt_showtime_enabled()) {
        timer = gt_timer_new_with_progress_description("start");
        gt_timer_start(timer);
        gt_assert(timer);
    }

    if (arguments->with_units) {
        gt_logger_log(logger, "unitfile option set, filename is %s\n",
                      gt_str_get(arguments->unitfile));
    }

    if (timer != NULL)
        gt_timer_show_progress(timer, "start shu search", stdout);

    if (gt_str_array_size(arguments->filenames) > 1UL) {
        GtEncseqEncoder *ee = gt_encseq_encoder_new();
        gt_encseq_encoder_set_timer(ee, timer);
        gt_encseq_encoder_set_logger(ee, logger);
        /* kr only makes sense for dna, so we can check this already with ee */
        gt_encseq_encoder_set_input_dna(ee);
        had_err = gt_encseq_encoder_encode(ee, arguments->filenames,
                                           gt_str_get(arguments->indexname), err);
        gt_encseq_encoder_delete(ee);
    }
    else {
        gt_str_append_str(arguments->indexname,
                          gt_str_array_get_str(arguments->filenames, 0));
        if (arguments->with_esa || arguments->with_pck) {
            GtStr *current_line = gt_str_new();
            FILE *prj_fp;
            const char *buffer;
            char **elements = NULL;

            prj_fp = gt_fa_fopen_with_suffix(gt_str_get(arguments->indexname),
                                             GT_PROJECTFILESUFFIX,"rb",err);
            if (prj_fp == NULL)
                had_err = -1;
            /* scan the "key=value" lines of the project file */
            while (!had_err && gt_str_read_next_line(current_line, prj_fp) != EOF) {
                buffer = gt_str_get(current_line);
                /* release the tokens of the previous line */
                if (elements != NULL) {
                    gt_free(elements[0]);
                    gt_free(elements[1]);
                }
                gt_free(elements);
                elements = gt_cstr_split(buffer, '=');
                gt_log_log("%s", elements[0]);
                /* a line without '=' presumably yields no second token; do not
                   dereference it in that case (assumes gt_cstr_split
                   NULL-terminates its result -- confirm) */
                if (strcmp("mirrored", elements[0]) == 0 &&
                        elements[1] != NULL) {
                    gt_log_log("%s", elements[1]);
                    if (strcmp("1", elements[1]) == 0) {
                        mirrored = true;
                        gt_log_log("sequences are treated as mirrored");
                    }
                }
                gt_str_reset(current_line);
            }
            gt_str_delete(current_line);
            if (elements != NULL) {
                gt_free(elements[0]);
                gt_free(elements[1]);
            }
            gt_free(elements);
            gt_fa_xfclose(prj_fp);
        }
    }

    if (!had_err) {
        GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                             err);
        if (mirrored)
            gt_encseq_loader_mirror(el);
        encseq =
            gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
        gt_encseq_loader_delete(el);
    }
    if (encseq == NULL)
        had_err = -1;
    if (!had_err) {
        unit_info = gt_shu_unit_info_new(encseq);
        if (arguments->with_units)
            had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                                 logger, err);
    }

    if (!had_err) {
        uint64_t **shusums = NULL;
        if (arguments->with_esa || arguments->with_pck) {
            shusums = gt_genomediff_shulen_sum(arguments, unit_info,
                                               logger, timer, err);
            if (shusums == NULL)
                had_err = -1;
        }
        else {
            const bool doesa = true;
            GenomediffInfo gd_info;
            Suffixeratoroptions sopts;
            sopts.beverbose = arguments->verbose;
            sopts.indexname = arguments->indexname;
            sopts.db = NULL;
            sopts.encopts = NULL;
            sopts.genomediff = true;
            sopts.inputindex = arguments->indexname;
            sopts.loadopts = arguments->loadopts;
            sopts.showprogress = false;
            sopts.idxopts = arguments->idxopts;

            gt_assert(unit_info != NULL);
            gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                                unit_info->num_of_genomes);
            gd_info.shulensums = shusums;
            gd_info.unit_info = unit_info;
            had_err = gt_runsuffixerator(doesa, &sopts, &gd_info, logger, err);
        }
        if (!had_err && shusums != NULL) {
            had_err = gt_genomediff_kr_calc(shusums, arguments, unit_info,
                                            arguments->with_pck, logger, timer, err);
        }
        /* release the table also when the suffixerator run failed; it was
           previously only freed after a successful run */
        if (shusums != NULL) {
            gt_array2dim_delete(shusums);
        }
    }

    if (timer != NULL) {
        gt_timer_show_progress_final(timer, stdout);
        gt_timer_delete(timer);
    }
    gt_logger_delete(logger);
    gt_encseq_delete(encseq);
    gt_shu_unit_info_delete(unit_info);

    return had_err;
}
Example #12
0
/*
  Read the leading values of the encoded sequence file
  <indexname><GT_ENCSEQFILESUFFIX> into <emd>, in file order: platform code,
  format version, access type, the length/count values, the special
  character info, minimum/maximum sequence length, the alphabet type and the
  length of the alphabet definition. Depending on the alphabet type
  (0: DNA, 1: protein, 2: custom definition stored in the file) the alphabet
  of <emd> is created.
  The values are read with the NEXTFREAD macros, which are defined outside
  of this function; NOTE(review): they presumably use fp, had_err,
  byteoffset and err of this scope -- confirm against the macro definition.
  Returns 0 on success and -1 on failure.
*/
static int readfirstvaluesfromfile(GtEncseqMetadata *emd,
                                   const char *indexname, GtError *err)
{
  FILE *fp;
  bool had_err = false;
  unsigned long cc, byteoffset = 0, alphatype;
  char *alphadef;

  gt_error_check(err);
  fp = gt_fa_fopen_with_suffix(indexname, GT_ENCSEQFILESUFFIX, "rb", err);
  if (fp == NULL)
  {
    had_err = true;
  }
  NEXTFREAD(emd->is64bit);
  if (!had_err)
  {
    if ((int) emd->is64bit > 1)
    {
      gt_error_set(err, "illegal platform code %u in \"%s%s\"", emd->is64bit,
                   indexname, GT_ENCSEQFILESUFFIX);
      had_err = true;
    }
    /* the word size of the index must match the word size of this build */
    if (!had_err && ((emd->is64bit && sizeof (unsigned long) != (size_t) 8)
          || (!emd->is64bit && sizeof (unsigned long) == (size_t) 8)))
    {
      gt_error_set(err, "trying to load 64-bit index \"%s%s\" on a 32-bit "
                        "system or vice versa -- please use correct index "
                        "for this platform", indexname, GT_ENCSEQFILESUFFIX);
      had_err = true;
    }
  }
  NEXTFREAD(emd->version);
  if (!had_err)
  {
    if (emd->version < GT_ENCSEQ_VERSION)    {
      gt_error_set(err, "index \"%s%s\" is format version %lu, current is "
                        "%lu -- please re-encode",
                        indexname, GT_ENCSEQFILESUFFIX,
                        emd->version, GT_ENCSEQ_VERSION);
      had_err = true;
    }
  }
  NEXTFREAD(cc);
  if (!had_err)
  {
    if (cc >= (unsigned long) GT_ACCESS_TYPE_UNDEFINED)
    {
      gt_error_set(err, "illegal type %lu in \"%s%s\"", cc,
                   indexname, GT_ENCSEQFILESUFFIX);
      had_err = true;
    }
  }
  if (!had_err) {
    emd->sat = (GtEncseqAccessType) cc;
    NEXTFREAD(emd->totallength);
    NEXTFREAD(emd->numofdbsequences);
    NEXTFREAD(emd->numofdbfiles);
    NEXTFREAD(emd->lengthofdbfilenames);
    NEXTFREAD(emd->specialcharinfo);
    NEXTFREAD(emd->minseqlen);
    NEXTFREAD(emd->maxseqlen);
  }
  NEXTFREAD(alphatype);
  if (!had_err) {
    if (alphatype > 2UL) {
      gt_error_set(err, "illegal alphabet type %lu in \"%s%s\"", alphatype,
                   indexname, GT_ENCSEQFILESUFFIX);
      had_err = true;
    }
  }
  if (!had_err) {
    NEXTFREAD(emd->lengthofalphadef);
    switch (alphatype) {
      case 0:
        emd->alpha = gt_alphabet_new_dna();
        break;
      case 1:
        emd->alpha = gt_alphabet_new_protein();
        break;
      case 2:
        gt_assert(emd->lengthofalphadef > 0);
        emd->customalphabet = true;
        alphadef = gt_malloc(sizeof (char) * emd->lengthofalphadef);
        NEXTFREADWSIZE(*(alphadef), emd->lengthofalphadef);
        emd->alpha = gt_alphabet_new_from_string(alphadef,
                                                 emd->lengthofalphadef,
                                                 err);
        if (!emd->alpha) {
          had_err = true;
        }
        gt_free(alphadef);
        break;
    }
    /* a failed custom alphabet is a reported error, not a broken invariant:
       only insist on an alphabet when no error has been recorded */
    gt_assert(had_err || emd->alpha != NULL);
  }
  gt_fa_xfclose(fp);
  return had_err ? -1 : 0;
}
Example #13
0
/*
  Fill <fmindex> from the files of the index <indexname>: the file
  <indexname><FMASCIIFILESUFFIX> is scanned, the BWT encoding is mapped, the
  alphabet is taken from it, the key values are computed and finally
  gt_fillfmmapspecstartptr() is called for <indexname><FMDATAFILESUFFIX>.
  Each step is only performed if all previous steps succeeded.
  On failure gt_freefmindex() is called for <fmindex> and -1 is returned;
  on success 0 is returned.
*/
int gt_mapfmindex (Fmindex *fmindex,const char *indexname,
                GtLogger *logger,GtError *err)
{
  FILE *asciifp;
  bool failed, storeindexpos = true;
  GtSpecialcharinfo specialcharinfo;

  gt_error_check(err);
  fmindex->mappedptr = NULL;
  fmindex->bwtformatching = NULL;
  fmindex->alphabet = NULL;

  asciifp = gt_fa_fopen_with_suffix(indexname,FMASCIIFILESUFFIX,"rb",err);
  /* short-circuit: the scan only runs when the file was opened */
  failed = (asciifp == NULL)
           || scanfmafileviafileptr(fmindex,
                                    &specialcharinfo,
                                    &storeindexpos,
                                    indexname,
                                    asciifp,
                                    logger,
                                    err) != 0;
  gt_fa_xfclose(asciifp);

  if (!failed)
  {
    fmindex->bwtformatching = mapbwtencoding(indexname,logger,err);
    failed = (fmindex->bwtformatching == NULL);
  }
  if (!failed)
  {
    fmindex->specpos.nextfreeGtPairBwtidx
      = (unsigned long) gt_determinenumberofspecialstostore(&specialcharinfo);
    fmindex->specpos.spaceGtPairBwtidx = NULL;
    fmindex->specpos.allocatedGtPairBwtidx = 0;
    fmindex->alphabet = gt_alphabet_ref(
                                  gt_encseq_alphabet(fmindex->bwtformatching));
    failed = (fmindex->alphabet == NULL);
  }
  if (!failed)
  {
    GtStr *datafilename;

    gt_computefmkeyvalues (fmindex,
                           &specialcharinfo,
                           fmindex->bwtlength,
                           fmindex->log2bsize,
                           fmindex->log2markdist,
                           gt_alphabet_num_of_chars(fmindex->alphabet),
                           fmindex->suffixlength,
                           storeindexpos);
    datafilename = gt_str_new_cstr(indexname);
    gt_str_append_cstr(datafilename,FMDATAFILESUFFIX);
    failed = (gt_fillfmmapspecstartptr(fmindex,storeindexpos,datafilename,
                                       err) != 0);
    gt_str_delete(datafilename);
  }
  if (failed)
  {
    gt_freefmindex(fmindex);
    return -1;
  }
  return 0;
}
Example #14
0
/*
  Extract one key per line from the description table
  <indexname><GT_DESTABFILESUFFIX>, using desc2key(). All keys must have the
  same non-zero length, which must not exceed CHAR_MAX.
  If <sortkeys> is false, the key length (one byte) followed by the
  '\0'-terminated keys is written to <indexname><GT_KEYSTABFILESUFFIX>; each
  key must be lexicographically larger than its predecessor.
  If <sortkeys> is true, the keys (at most MAXFIXEDKEYSIZE characters each)
  are collected together with their line number, sorted with
  compareFixedkeys, and the corresponding sequences of the encoded sequence
  <indexname> are written to stdout in the sorted order.
  Returns 0 on success and -1 on failure.
*/
int gt_extractkeysfromdesfile(const char *indexname,
                              bool sortkeys,
                              GtLogger *logger,
                              GtError *err)
{
  FILE *fpin, *fpout = NULL;
  GtStr *line = NULL;
  const char *keyptr;
  unsigned long keylen, constantkeylen = 0, linenum;/* incorrectorder = 0;*/
  bool haserr = false, firstdesc = true;
  char *previouskey = NULL;
  Fixedsizekey *keytab = NULL, *keytabptr = NULL;
  GtEncseq *encseq = NULL;
  unsigned long numofentries = 0;
  const unsigned long linewidth = 60UL;

  fpin = gt_fa_fopen_with_suffix(indexname,GT_DESTABFILESUFFIX,"rb",err);
  if (fpin == NULL)
  {
    return -1;
  }
  if (!sortkeys)
  {
    fpout = gt_fa_fopen_with_suffix(indexname,GT_KEYSTABFILESUFFIX,"wb",err);
    if (fpout == NULL)
    {
      haserr = true;
    }
  }
  if (!haserr)
  {
    line = gt_str_new();
  }
  for (linenum = 0; !haserr && gt_str_read_next_line(line, fpin) != EOF;
       linenum++)
  {
    keyptr = desc2key(&keylen,gt_str_get(line),err);
    if (keyptr == NULL)
    {
      haserr = true;
      break;
    }
    if (keylen == 0)
    {
      gt_error_set(err,"key of length 0 in \"%s\" not expected",
                   gt_str_get(line));
      haserr = true;
      break;
    }
    if (firstdesc)
    {
      /* the first key determines the length all other keys must have */
      if (keylen > (unsigned long) CHAR_MAX)
      {
        gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                         "no key must be larger than %d",
                          (int) keylen,(int) keylen,keyptr,keylen,CHAR_MAX);
        haserr = true;
        break;
      }
      constantkeylen = keylen;
      previouskey = gt_malloc(sizeof (char) * (constantkeylen+1));
      firstdesc = false;
      if (!sortkeys)
      {
        gt_xfputc((char) constantkeylen,fpout);
      } else
      {
        GtEncseqLoader *el;
        if (constantkeylen > (unsigned long) MAXFIXEDKEYSIZE)
        {
          gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                           "no key must be larger than %d",
                            (int) keylen,(int) keylen,keyptr,keylen,
                            MAXFIXEDKEYSIZE);
          haserr = true;
          break;
        }
        el = gt_encseq_loader_new();
        gt_encseq_loader_set_logger(el, logger);
        encseq = gt_encseq_loader_load(el, indexname, err);
        gt_encseq_loader_delete(el);
        if (encseq == NULL)
        {
          haserr = true;
          break;
        }
        numofentries = gt_encseq_num_of_sequences(encseq);
        gt_assert(numofentries > 0);
        keytab = gt_malloc(sizeof (*keytab) * numofentries);
        keytabptr = keytab;
      }
    } else
    {
      if (constantkeylen != keylen)
      {
        gt_error_set(err,"key \"%*.*s\" of length %lu: all keys must be of "
                         "the same length which for all previously seen "
                         "headers is %lu",
                         (int) keylen,(int) keylen,keyptr,keylen,
                         constantkeylen);
        haserr = true;
        break;
      }
      gt_assert(previouskey != NULL);
      if (!sortkeys && strncmp(previouskey,keyptr,(size_t) constantkeylen) >= 0)
      {
        gt_error_set(err,"previous key \"%s\" is not lexicographically smaller "
                         "than current key \"%*.*s\"",
                         previouskey,(int) keylen,(int) keylen,keyptr);
        haserr = true;
        break;
        /*
        printf("previous key \"%s\" (no %lu) is lexicographically larger "
               "than current key \"%*.*s\"\n",
               previouskey,linenum,(int) keylen,(int) keylen,keyptr);
        incorrectorder++;
        */
      }
    }
    if (!sortkeys)
    {
      gt_xfwrite(keyptr,sizeof *keyptr,(size_t) keylen,fpout);
      gt_xfputc('\0',fpout);
    } else
    {
      gt_assert(keytabptr != NULL);
      /* linenum keys are stored so far; keytab only has room for
         numofentries keys, so reject surplus description lines instead of
         writing past the end of the table */
      if (linenum >= numofentries)
      {
        gt_error_set(err,"number of keys exceeds the number of sequences "
                         "(%lu) in index \"%s\"",numofentries,indexname);
        haserr = true;
        break;
      }
      strncpy(keytabptr->key,keyptr,(size_t) constantkeylen);
      keytabptr->key[constantkeylen] = '\0';
      keytabptr->seqnum = linenum;
      keytabptr++;
    }
    /* remember the key for the order check of the next line */
    strncpy(previouskey,keyptr,(size_t) constantkeylen);
    previouskey[constantkeylen] = '\0';
    gt_str_reset(line);
  }
  if (!haserr)
  {
    gt_logger_log(logger,"number of keys of length %lu = %lu",
                constantkeylen,linenum);
    /*
    gt_logger_log(logger,"number of incorrectly ordered keys = %lu",
                incorrectorder);
    */
  }
  gt_str_delete(line);
  gt_fa_fclose(fpin);
  gt_fa_fclose(fpout);
  gt_free(previouskey);
  if (!haserr && sortkeys)
  {
    gt_assert(keytabptr != NULL);
    gt_assert(numofentries > 0);
    gt_assert(keytabptr == keytab + numofentries);
    qsort(keytab,(size_t) numofentries,sizeof (*keytab),compareFixedkeys);
    gt_assert(keytabptr != NULL);
    /* output the sequences in the order of their sorted keys */
    for (keytabptr = keytab; !haserr && keytabptr < keytab + numofentries;
         keytabptr++)
    {
      if (giextract_encodedseq2fasta(stdout,
                                     encseq,
                                     keytabptr->seqnum,
                                     NULL,
                                     linewidth,
                                     err) != 0)
      {
        haserr = true;
        break;
      }
    }
  }
  if (encseq != NULL)
  {
    gt_encseq_delete(encseq);
    encseq = NULL;
  }
  gt_free(keytab);
  return haserr ? -1 : 0;
}
Example #15
0
/*
  Set up a TyrDfsstate from the suffix array reader <ssar> and the given
  parameters, run gt_depthfirstesa() with the tyr_* callbacks on it, and
  release all resources acquired here.

  If <storeindex> is the empty string, occurrence counts are handed to
  adddistpos2distribution and showfinalstatistics() is called after the
  traversal. Otherwise a file <storeindex>MERSUFFIX is opened for writing
  (plus <storeindex>COUNTSSUFFIX if <storecounts> is set), occurrence counts
  are handed to outputsortedstring2index, and after a successful traversal
  the collected large counts, <mersize> and the alphabet size are written.

  Returns 0 on success. Returns -1 if <mersize> exceeds the total length of
  the encoded sequence (<err> is set here), if an output file cannot be
  opened, or if gt_depthfirstesa() reports a failure.
*/
static int enumeratelcpintervals(const char *inputindex,
                                 Sequentialsuffixarrayreader *ssar,
                                 const char *storeindex,
                                 bool storecounts,
                                 GtUword mersize,
                                 GtUword minocc,
                                 GtUword maxocc,
                                 bool performtest,
                                 GtLogger *logger,
                                 GtError *err)
{
  TyrDfsstate *dfs;
  unsigned int numofchars;
  bool statsonly;
  bool failed = false;

  gt_error_check(err);
  /* an empty index name means: no files are written, only statistics */
  statsonly = (storeindex[0] == '\0');

  dfs = gt_malloc(sizeof (*dfs));
  GT_INITARRAY(&dfs->occdistribution,Countwithpositions);
  dfs->esrspace = gt_encseq_create_reader_with_readmode(
                                   gt_encseqSequentialsuffixarrayreader(ssar),
                                   gt_readmodeSequentialsuffixarrayreader(ssar),
                                   0);
  dfs->mersize = (GtUword) mersize;
  dfs->encseq = gt_encseqSequentialsuffixarrayreader(ssar);
  numofchars = gt_alphabet_num_of_chars(gt_encseq_alphabet(dfs->encseq));
  dfs->readmode = gt_readmodeSequentialsuffixarrayreader(ssar);
  dfs->storecounts = storecounts;
  dfs->minocc = minocc;
  dfs->maxocc = maxocc;
  dfs->totallength = gt_encseq_total_length(dfs->encseq);
  dfs->performtest = performtest;
  dfs->countoutputmers = 0;
  dfs->merindexfpout = NULL;
  dfs->countsfilefpout = NULL;
  GT_INITARRAY(&dfs->largecounts,Largecount);

  /* the byte buffer is only allocated when an index is to be stored */
  if (!statsonly)
  {
    dfs->sizeofbuffer = MERBYTES(mersize);
    dfs->bytebuffer = gt_malloc(sizeof *dfs->bytebuffer * dfs->sizeofbuffer);
  } else
  {
    dfs->sizeofbuffer = 0;
    dfs->bytebuffer = NULL;
  }

  /* currentmer and suftab are only set up when testing is requested */
  if (!performtest)
  {
    dfs->currentmer = NULL;
    dfs->suftab = NULL;
  } else
  {
    dfs->currentmer = gt_malloc(sizeof *dfs->currentmer * dfs->mersize);
    dfs->suftab = gt_suftabSequentialsuffixarrayreader(ssar);
  }

  /* stage 1: validate the mer size */
  if (dfs->mersize > dfs->totallength)
  {
    gt_error_set(err,"mersize "GT_WU" > "GT_WU" = totallength not allowed",
                 dfs->mersize,
                 dfs->totallength);
    failed = true;
  }

  /* stage 2: select the count handler and open the output files */
  if (!failed)
  {
    if (statsonly)
    {
      dfs->processoccurrencecount = adddistpos2distribution;
    } else
    {
      dfs->processoccurrencecount = outputsortedstring2index;
      dfs->merindexfpout = gt_fa_fopen_with_suffix(storeindex,MERSUFFIX,
                                                   "wb",err);
      if (dfs->merindexfpout == NULL)
      {
        failed = true;
      } else if (dfs->storecounts)
      {
        dfs->countsfilefpout
          = gt_fa_fopen_with_suffix(storeindex,COUNTSSUFFIX,"wb",err);
        if (dfs->countsfilefpout == NULL)
        {
          failed = true;
        }
      }
    }
  }

  /* stage 3: the depth first traversal itself */
  if (!failed)
  {
    if (gt_depthfirstesa(ssar,
                         tyr_allocateDfsinfo,
                         tyr_freeDfsinfo,
                         tyr_processleafedge,
                         NULL,
                         tyr_processcompletenode,
                         tyr_assignleftmostleaf,
                         tyr_assignrightmostleaf,
                         (Dfsstate*) dfs,
                         logger,
                         err) != 0)
    {
      failed = true;
    }
    /* statistics are shown regardless of the traversal's return value */
    if (statsonly)
    {
      showfinalstatistics(dfs,inputindex,logger);
    }
  }

  /* stage 4: write the large counts and report the index size */
  if (!failed)
  {
    if (dfs->countsfilefpout != NULL)
    {
      gt_logger_log(logger,"write "GT_WU" mercounts > "GT_WU
                    " to file \"%s%s\"",
                    dfs->largecounts.nextfreeLargecount,
                    (GtUword) MAXSMALLMERCOUNT,
                    storeindex,
                    COUNTSSUFFIX);
      gt_xfwrite(dfs->largecounts.spaceLargecount, sizeof (Largecount),
                 (size_t) dfs->largecounts.nextfreeLargecount,
                 dfs->countsfilefpout);
    }
    gt_logger_log(logger,"number of "GT_WU"-mers in index: "GT_WU"",
                  mersize,
                  dfs->countoutputmers);
    gt_logger_log(logger,"index size: %.2f megabytes\n",
                  GT_MEGABYTES(dfs->countoutputmers * dfs->sizeofbuffer +
                               sizeof (GtUword) * EXTRAINTEGERS));
  }

  /* stage 5: append the mer size and the alphabet size to the mer index
     (presumably these are the EXTRAINTEGERS values accounted for above) */
  if (!failed && dfs->merindexfpout != NULL)
  {
    outputbytewiseUlongvalue(dfs->merindexfpout,(GtUword) dfs->mersize);
    outputbytewiseUlongvalue(dfs->merindexfpout,(GtUword) numofchars);
  }

  /* cleanup, performed on success and on failure alike */
  gt_fa_xfclose(dfs->merindexfpout);
  gt_fa_xfclose(dfs->countsfilefpout);
  GT_FREEARRAY(&dfs->occdistribution,Countwithpositions);
  gt_free(dfs->currentmer);
  gt_free(dfs->bytebuffer);
  GT_FREEARRAY(&dfs->largecounts,Largecount);
  gt_encseq_reader_delete(dfs->esrspace);
  gt_free(dfs);
  return failed ? -1 : 0;
}