예제 #1
0
파일: fmi-mkindex.c 프로젝트: 9beckert/TIR
/*
 * Translate the level description in mkfmcallinfo into the two log2
 * parameters, construct the FM-index via gt_sufbwt2fmindex() and store it
 * via gt_saveFmindex(). The local index structure is always released with
 * freeconstructedfmindex() before returning.
 *
 * Returns 0 on success and -1 if any step failed. An unknown level
 * description is reported through err here.
 */
static int runmkfmindex(Mkfmcallinfo *mkfmcallinfo,GtLogger *logger,
                        GtError *err)
{
  Fmindex fm;
  GtSpecialcharinfo specialcharinfo;
  unsigned int log2bsize, log2markdist;
  int retval = 0;

  gt_error_check(err);

  /* give every field released at the end a defined initial value */
  GT_INITARRAY(&fm.specpos, GtPairBwtidx);
  fm.suffixlength = 0;
  fm.boundarray = NULL;
  fm.markpostable = NULL;
  fm.tfreq = NULL;
  fm.superbfreq = NULL;
  fm.bfreq = NULL;

  if (levedescl2levelnum(gt_str_get(mkfmcallinfo->leveldesc),
                         &log2bsize,
                         &log2markdist) != 0)
  {
    gt_error_set(err,"undefined level \"%s\"",
                      gt_str_get(mkfmcallinfo->leveldesc));
    retval = -1;
  } else
  {
    /* positions are stored unless the caller asked for noindexpos */
    const bool storeindexpos = !mkfmcallinfo->noindexpos;

    /* saving is only attempted when construction succeeded */
    if (gt_sufbwt2fmindex(&fm,
                          &specialcharinfo,
                          log2bsize,
                          log2markdist,
                          gt_str_get(mkfmcallinfo->outfmindex),
                          mkfmcallinfo->indexnametab,
                          storeindexpos,
                          logger,
                          err) != 0 ||
        gt_saveFmindex(gt_str_get(mkfmcallinfo->outfmindex),
                       &fm,
                       &specialcharinfo,
                       storeindexpos,
                       err) < 0)
    {
      retval = -1;
    }
  }
  freeconstructedfmindex(&fm);
  return retval;
}
예제 #2
0
/*
 * Wrap an already encoded sequence of len symbols in a new GtBareEncseq.
 *
 * The sequence is scanned once: every non-special symbol is counted in
 * charcount, and every maximal run of special symbols (as decided by
 * ISSPECIAL) is recorded with start and length in specialranges.
 * The sequence pointer is stored in the result, not copied.
 */
GtBareEncseq *gt_bare_encseq_new(GtUchar *sequence,GtUword len,
                                 GtUword numofchars)
{
  GtBareEncseq *bare_encseq = gt_malloc(sizeof *bare_encseq);
  GtBareSpecialrange *current_range = NULL;
  GtUword pos, run_length = 0;

  bare_encseq->sequence = sequence;
  bare_encseq->totallength = len;
  bare_encseq->specialcharacters = 0;
  bare_encseq->numofchars = numofchars;
  bare_encseq->charcount = gt_calloc((size_t) bare_encseq->numofchars,
                                     sizeof *bare_encseq->charcount);
  GT_INITARRAY(&bare_encseq->specialranges,GtBareSpecialrange);
  for (pos = 0; pos < len; pos++)
  {
    const GtUchar cc = sequence[pos];

    if (!ISSPECIAL(cc))
    {
      gt_assert((GtUword) cc < bare_encseq->numofchars);
      bare_encseq->charcount[(int) cc]++;
      if (run_length > 0)
      {
        /* an ordinary symbol terminates the open run of specials */
        gt_assert(current_range != NULL);
        current_range->length = run_length;
        run_length = 0;
      }
      continue;
    }
    if (run_length == 0)
    {
      /* first special symbol of a new run: open a range at this position */
      GT_GETNEXTFREEINARRAY(current_range,&bare_encseq->specialranges,
                            GtBareSpecialrange,128UL);
      current_range->start = pos;
    }
    run_length++;
    bare_encseq->specialcharacters++;
  }
  if (run_length > 0)
  {
    /* the sequence ended inside a run of specials */
    gt_assert(current_range != NULL);
    current_range->length = run_length;
  }
  return bare_encseq;
}
/*
 * Create a sequence iterator of the sequence-buffer class that reads from
 * <buffer>. The iterator keeps the value returned by
 * gt_sequence_buffer_ref(buffer) and attaches a newly created description
 * buffer to it. All counters start at zero, the iterator is not exhausted
 * and withsequence is enabled.
 */
GtSeqIterator* gt_seq_iterator_sequence_buffer_new_with_buffer(
                                                       GtSequenceBuffer *buffer)
{
  GtSeqIterator *si
    = gt_seq_iterator_create(gt_seq_iterator_sequence_buffer_class());
  GtSeqIteratorSequenceBuffer *seqit
    = gt_seq_iterator_sequence_buffer_cast(si);

  /* plain state */
  seqit->exhausted = false;
  seqit->withsequence = true;
  seqit->unitnum = 0;
  seqit->currentread = 0;
  seqit->maxread = 0;
  GT_INITARRAY(&seqit->sequencebuffer, GtUchar);

  /* wire the description buffer into the sequence buffer */
  seqit->descptr = gt_desc_buffer_new();
  seqit->fb = gt_sequence_buffer_ref(buffer);
  gt_sequence_buffer_set_desc_buffer(seqit->fb, seqit->descptr);
  return si;
}
/*
 * Append <position> to the position list stored in <hash> for <kmercode>.
 *
 * The k-mer code itself (cast to a pointer) is the hash key. If no list
 * exists yet for the code, an empty one is allocated and registered in
 * the hash before the position is stored.
 *
 * The list grows by a tenth of its current capacity. The increment is
 * computed with integer arithmetic; if that yields zero (capacity below
 * 10, including the empty list) a fixed increment of 20 is used so the
 * array always gains room before the store. The former floating point
 * increment truncated to zero for capacities 1..9.
 */
static void gt_kmer_database_add_to_hash(GtHashmap *hash, GtCodetype kmercode,
                                         GtUword position)
{
  GtArrayGtUword *arr =
    (GtArrayGtUword *) gt_hashmap_get(hash, (void *) kmercode);
  GtUword increment;

  if (arr == NULL) {
    arr = gt_malloc(sizeof (*arr));
    GT_INITARRAY(arr, GtUword);
    gt_hashmap_add(hash, (void *) kmercode, (void *) arr);
  }
  increment = arr->allocatedGtUword / 10;
  if (increment == 0) {
    increment = (GtUword) 20;
  }
  GT_STOREINARRAY(arr, GtUword, increment, position);
}
예제 #5
0
/*
 * Collect codes via getfastastreamkmers() from the files belonging to
 * <encseq>, using its alphabet's symbol map and the given prefix length,
 * and pass the collected list to verifycodelists() together with <encseq>.
 *
 * Returns 0 if both steps succeed and -1 otherwise. The second step is
 * skipped when the first one fails. The temporary code list is always freed.
 */
int gt_verifymappedstr(const GtEncseq *encseq,
                       unsigned int prefixlength,
                       GtError *err)
{
  GtArrayGtCodetype codeliststream;
  unsigned int numofchars;
  int retval = 0;

  gt_error_check(err);
  numofchars = gt_alphabet_num_of_chars(gt_encseq_alphabet(encseq));
  GT_INITARRAY(&codeliststream,GtCodetype);
  /* short-circuit: verification only runs if the k-mer stream was read */
  if (getfastastreamkmers(gt_encseq_filenames(encseq),
                          numofchars,
                          prefixlength,
                          gt_alphabet_symbolmap(
                                gt_encseq_alphabet(encseq)),
                          false,
                          &codeliststream,
                          err) != 0 ||
      verifycodelists(encseq,
                      prefixlength,
                      numofchars,
                      &codeliststream,
                      err) != 0)
  {
    retval = -1;
  }
  GT_FREEARRAY(&codeliststream,GtCodetype);
  return retval;
}
예제 #6
0
/*
 * Read a condenseq data structure from file.
 *
 * First the encoded sequence named <indexname> is loaded, then the file
 * <indexname> + GT_CONDENSEQ_FILE_SUFFIX is read with condenseq_io().
 * Afterwards every unique entry gets a list of the indices of all link
 * entries that refer to it.
 *
 * Returns the new object, or NULL on failure.
 *
 * The encseq loader is released as soon as loading has been attempted and
 * the input file is closed as soon as reading has been attempted, so that
 * neither is leaked when the corresponding step fails.
 */
GtCondenseq *gt_condenseq_new_from_file(const char *indexname,
                                        GtLogger *logger, GtError *err)
{
  int had_err = 0;
  FILE* fp;
  GtEncseqLoader *esl;
  GtEncseq *unique_es;
  GtCondenseq *condenseq = NULL;
  /*load unique_es*/
  esl = gt_encseq_loader_new();
  unique_es = gt_encseq_loader_load(esl, indexname, err);
  /* the loader is not needed any more, whether loading worked or not */
  gt_encseq_loader_delete(esl);
  if (!unique_es)
    had_err = -1;
  if (!had_err) {
    condenseq = condenseq_new_empty(gt_encseq_alphabet(unique_es));
    condenseq->filename = gt_cstr_dup(indexname);
    condenseq->unique_es = unique_es;
    fp = gt_fa_fopen_with_suffix(indexname, GT_CONDENSEQ_FILE_SUFFIX,
                                 "rb", err);
    if (fp == NULL) {
      had_err = -1;
    }
    else {
      had_err = condenseq_io(condenseq, fp, gt_io_error_fread, err);
      /* close on success and on failure alike */
      gt_fa_fclose(fp);
      if (!had_err) {
        GtUword i;
        gt_assert(condenseq->uniques);
        gt_assert(condenseq->links);
        /*create link array for each unique entry*/
        for (i = 0; i < condenseq->udb_nelems; i++) {
          GT_INITARRAY(&(condenseq->uniques[i].links),uint32_t);
        }
        /* link indices are stored as uint32_t below, so the number of link
           entries must not exceed the largest uint32_t value */
        if (condenseq->ldb_nelems > (GtUword) ((uint32_t) 0 - (uint32_t) 1)) {
          gt_error_set(err, "Overflow, to many link-elements. Can't be stored");
          had_err = -1;
        }
        /* iterate through link entries and store their indices in the
           array of the unique entry they refer to */
        for (i = 0; !had_err && (GtUword) i < condenseq->ldb_nelems; i++) {
          GtUword uid = condenseq->links[i].unique_id;
          gt_assert(uid < condenseq->udb_nelems);
          GT_STOREINARRAY(&(condenseq->uniques[uid].links),
                          uint32_t,
                          10,
                          (uint32_t) i);
        }
      }
    }
  }
  if (!had_err) {
    gt_assert(condenseq != NULL);
    if (condenseq->id_len != GT_UNDEF_UWORD)
      gt_logger_log(logger, "IDs const len: " GT_WU, condenseq->id_len);
    else
      gt_logger_log(logger, "using sdstab to access IDs");
  }
  if (had_err) {
    gt_condenseq_delete(condenseq);
    condenseq = NULL;
  }
  return (condenseq);
}
예제 #7
0
/*
 * Allocate and initialise a GtOutlcpinfo.
 *
 * indexname != NULL: lcp values go to files. The lcp2file sub-structure
 *   is allocated and two output files are opened for writing, named
 *   <indexname> + GT_LCPTABSUFFIX and <indexname> + GT_LARGELCPTABSUFFIX.
 * indexname == NULL: nothing is written to file. If final_process_bucket
 *   is given, it is stored together with final_process_bucket_info in a
 *   newly allocated lcpprocess sub-structure.
 *
 * withdistribution selects whether a distribution object is created.
 * A turning wheel is created only if prefixlength > 0.
 *
 * Returns the new object, or NULL if one of the output files could not be
 * opened (the error is reported through err). On that path everything
 * allocated or opened so far is released again; previously only the outer
 * structure was freed.
 */
GtOutlcpinfo *gt_Outlcpinfo_new(const char *indexname,
                                unsigned int numofchars,
                                unsigned int prefixlength,
                                bool withdistribution,
                                bool swallow_tail_lcpvalues,
                                GtFinalProcessBucket final_process_bucket,
                                void *final_process_bucket_info,
                                GtError *err)
{
  bool haserr = false;
  GtOutlcpinfo *outlcpinfo;

  outlcpinfo = gt_malloc(sizeof (*outlcpinfo));
  /* sizeofinfo accumulates the sizes of the parts allocated below */
  outlcpinfo->sizeofinfo = sizeof (*outlcpinfo);
  outlcpinfo->lcpsubtab.lcptabsum = 0.0;
  outlcpinfo->swallow_tail_lcpvalues = swallow_tail_lcpvalues;
  if (withdistribution)
  {
    outlcpinfo->lcpsubtab.distlcpvalues = gt_disc_distri_new();
  } else
  {
    outlcpinfo->lcpsubtab.distlcpvalues = NULL;
  }
  if (indexname == NULL)
  {
    outlcpinfo->lcpsubtab.lcp2file = NULL;
    if (final_process_bucket != NULL)
    {
      outlcpinfo->lcpsubtab.lcpprocess
        = gt_malloc(sizeof (*outlcpinfo->lcpsubtab.lcpprocess));
      outlcpinfo->lcpsubtab.lcpprocess->final_process_bucket
        = final_process_bucket;
      outlcpinfo->lcpsubtab.lcpprocess->final_process_bucket_info
        = final_process_bucket_info;
    } else
    {
      outlcpinfo->lcpsubtab.lcpprocess = NULL;
    }
  } else
  {
    outlcpinfo->lcpsubtab.lcpprocess = NULL;
    outlcpinfo->lcpsubtab.lcp2file
      = gt_malloc(sizeof (*outlcpinfo->lcpsubtab.lcp2file));
    outlcpinfo->sizeofinfo += sizeof (*outlcpinfo->lcpsubtab.lcp2file);
    outlcpinfo->lcpsubtab.lcp2file->countoutputlcpvalues = 0;
    outlcpinfo->lcpsubtab.lcp2file->maxbranchdepth = 0;
    outlcpinfo->lcpsubtab.lcp2file->totalnumoflargelcpvalues = 0;
    outlcpinfo->lcpsubtab.lcp2file->reservoir = NULL;
    outlcpinfo->lcpsubtab.lcp2file->sizereservoir = 0;
    outlcpinfo->lcpsubtab.lcp2file->smalllcpvalues = NULL;
    GT_INITARRAY(&outlcpinfo->lcpsubtab.lcp2file->largelcpvalues,
                 Largelcpvalue);
    outlcpinfo->lcpsubtab.lcp2file->outfplcptab
      = gt_fa_fopen_with_suffix(indexname,GT_LCPTABSUFFIX,"wb",err);
    if (outlcpinfo->lcpsubtab.lcp2file->outfplcptab == NULL)
    {
      haserr = true;
    }
    if (!haserr)
    {
      outlcpinfo->lcpsubtab.lcp2file->outfpllvtab
        = gt_fa_fopen_with_suffix(indexname,GT_LARGELCPTABSUFFIX,"wb",err);
      if (outlcpinfo->lcpsubtab.lcp2file->outfpllvtab == NULL)
      {
        haserr = true;
      }
    }
  }
  outlcpinfo->numsuffixes2output = 0;
  outlcpinfo->minchanged = 0;
  if (!haserr && prefixlength > 0)
  {
    outlcpinfo->turnwheel = gt_turningwheel_new(prefixlength,numofchars);
    outlcpinfo->sizeofinfo += gt_turningwheel_size();
  } else
  {
    outlcpinfo->turnwheel = NULL;
  }
#ifdef SKDEBUG
  outlcpinfo->previoussuffix.startpos = 0;
#endif
  outlcpinfo->previoussuffix.code = 0;
  outlcpinfo->previoussuffix.prefixindex = 0;
  outlcpinfo->previoussuffix.defined = false;
  outlcpinfo->previousbucketwasempty = false;
  outlcpinfo->lcpsubtab.tableoflcpvalues.bucketoflcpvalues = NULL;
  outlcpinfo->lcpsubtab.tableoflcpvalues.numofentries = 0;
#ifndef NDEBUG
  outlcpinfo->lcpsubtab.tableoflcpvalues.isset = NULL;
#endif
  if (haserr)
  {
    /* haserr is only set in the indexname != NULL branch above, so
       lcp2file has been allocated, while lcpprocess and turnwheel are
       NULL. outfpllvtab is either unassigned or NULL here, so only the
       first file can be open. */
    if (outlcpinfo->lcpsubtab.lcp2file->outfplcptab != NULL)
    {
      gt_fa_fclose(outlcpinfo->lcpsubtab.lcp2file->outfplcptab);
    }
    gt_free(outlcpinfo->lcpsubtab.lcp2file);
    if (outlcpinfo->lcpsubtab.distlcpvalues != NULL)
    {
      gt_disc_distri_delete(outlcpinfo->lcpsubtab.distlcpvalues);
    }
    gt_free(outlcpinfo);
    return NULL;
  }
  return outlcpinfo;
}
예제 #8
0
/*
 * Fill a new Pckbuckettable by a stack-driven traversal.
 *
 * Starting with the interval [0, totallength+1) at depth 0, each interval
 * taken from the stack is split by bwtrangesplitallwithoutspecial() into
 * at most numofchars child intervals. Every child is recorded with
 * storeBoundsatdepth(). A child above maxdepth is either pushed for
 * further splitting (if it spans more than one position) or handed to
 * followleafedge(). A fill statistic is printed to stdout at the end.
 */
Pckbuckettable *pckbuckettable_new(const void *voidbwtseq,
                                   unsigned int numofchars,
                                   Seqpos totallength,
                                   unsigned int maxdepth)
{
  GtArrayBoundsatdepth pending;
  Boundsatdepth current, next;
  unsigned long numofchildren, childidx;
  Seqpos *rangeOccs;
  Mbtab *childbounds;
  Pckbuckettable *pckbt;

  /* seed the stack with the root interval */
  GT_INITARRAY(&pending,Boundsatdepth);
  next.lowerbound = 0;
  next.upperbound = totallength+1;
  next.depth = 0;
  next.code = (Codetype) 0;
  GT_STOREINARRAY(&pending,Boundsatdepth,128,next);

  rangeOccs = gt_malloc(sizeof(*rangeOccs) * GT_MULT2(numofchars));
  childbounds = gt_malloc(sizeof(*childbounds) * numofchars);
  pckbt = allocandinitpckbuckettable(numofchars,maxdepth,true);
  while (pending.nextfreeBoundsatdepth > 0)
  {
    /* pop by value: the stack storage may be reallocated by pushes below */
    pending.nextfreeBoundsatdepth--;
    current = pending.spaceBoundsatdepth[pending.nextfreeBoundsatdepth];
    gt_assert(current.lowerbound < current.upperbound);
    numofchildren = bwtrangesplitallwithoutspecial(childbounds,
                                                   rangeOccs,
                                                   voidbwtseq,
                                                   current.lowerbound,
                                                   current.upperbound);
    gt_assert(numofchildren <= (unsigned long) numofchars);
    for (childidx = 0; childidx < numofchildren; childidx++)
    {
      next.lowerbound = childbounds[childidx].lowerbound;
      next.upperbound = childbounds[childidx].upperbound;
      next.depth = current.depth + 1;
      gt_assert(next.depth <= maxdepth);
      next.code = current.code * numofchars + childidx;
      storeBoundsatdepth(pckbt,&next);
      if (next.depth >= maxdepth)
      {
        continue; /* deepest level reached, nothing to expand */
      }
      if (next.lowerbound + 1 < next.upperbound)
      {
        GT_STOREINARRAY(&pending,Boundsatdepth,128,next);
      } else
      {
        followleafedge(pckbt,voidbwtseq,&next);
      }
    }
  }
  GT_FREEARRAY(&pending,Boundsatdepth);
  gt_free(childbounds);
  gt_free(rangeOccs);
  printf("filled: %lu (%.2f)\n",pckbt->numofvalues,
                        (double) pckbt->numofvalues/pckbt->maxnumofvalues);
  return pckbt;
}
예제 #9
0
/*
 * Write all parts registered by <setup> for <data> to <fp>.
 *
 * setup() is called with a fresh GtMapspec and the flag true; it is
 * expected to fill ms->mapspectable. For each entry with at least one
 * unit the units are written according to the entry's type, then
 * gt_mapspec_pad() is called and the number of bytes it reports is added
 * to the running offset. Finally the offset is compared against
 * expectedsize plus the total padding.
 *
 * Returns 0 on success and -1 on error (reported through err).
 *
 * NOTE(review): WRITEACTIONWITHTYPE is defined outside this function;
 * it presumably refers to the locals mapspecptr and fp by name, so these
 * names should not be changed - confirm against the macro definition.
 */
int gt_mapspec_write(GtMapspecSetupFunc setup, FILE *fp,
                     void *data, GtUword expectedsize, GtError *err)
{
  GtMapspecification *mapspecptr;
  GtUword byteoffset = 0;
  int had_err = 0;
  GtUword totalpadunits = 0;
  GtUword byteswritten = 0;
  GtMapspec *ms = gt_malloc(sizeof (GtMapspec));

  gt_error_check(err);
  GT_INITARRAY(&ms->mapspectable,GtMapspecification);
  setup(ms, data, true);
  gt_assert(ms->mapspectable.spaceGtMapspecification != NULL);
  for (mapspecptr = ms->mapspectable.spaceGtMapspecification;
       mapspecptr < ms->mapspectable.spaceGtMapspecification +
                    ms->mapspectable.nextfreeGtMapspecification;
       mapspecptr++)
  {
#ifdef SKDEBUG
    printf("# %s",__func__);
    showmapspec(mapspecptr);
    printf(" at byteoffset "GT_WU"\n",byteoffset);
#endif
    if (mapspecptr->numofunits > 0)
    {
      switch (mapspecptr->typespec)
      {
        case GtCharType:
          WRITEACTIONWITHTYPE(char);
          break;
        case GtFilelengthvaluesType:
          WRITEACTIONWITHTYPE(GtFilelengthvalues);
          break;
        case GtUcharType:
          WRITEACTIONWITHTYPE(GtUchar);
          break;
        case Uint16Type:
          WRITEACTIONWITHTYPE(uint16_t);
          break;
        case Uint32Type:
          WRITEACTIONWITHTYPE(uint32_t);
          break;
        case GtUlongType:
          WRITEACTIONWITHTYPE(GtUlong);
          break;
        case Uint64Type:
          WRITEACTIONWITHTYPE(uint64_t);
          break;
        case GtBitsequenceType:
          WRITEACTIONWITHTYPE(GtBitsequence);
          break;
        case GtUlongBoundType:
          WRITEACTIONWITHTYPE(GtUlongBound);
          break;
        case GtPairBwtidxType:
          WRITEACTIONWITHTYPE(GtPairBwtidx);
          break;
        case GtTwobitencodingType:
          WRITEACTIONWITHTYPE(GtTwobitencoding);
          break;
        case GtSpecialcharinfoType:
          WRITEACTIONWITHTYPE(GtSpecialcharinfo);
          break;
        case GtBitElemType:
          WRITEACTIONWITHTYPE(BitElem);
          break;
        case GtUintType:
          WRITEACTIONWITHTYPE(unsigned int);
          break;
        default:
           gt_error_set(err, "no map specification for size " GT_WU,
                        (GtUword) mapspecptr->sizeofunit);
           had_err = -1;
           break;
      }
    }
    if (had_err)
    {
      break;
    }
    byteoffset = CALLCASTFUNC(uint64_t,unsigned_long,
                              (uint64_t) (byteoffset +
                                          mapspecptr->sizeofunit *
                                          mapspecptr->numofunits));
    if (gt_mapspec_pad(fp,&byteswritten,byteoffset,err) != 0)
    {
      /* Stop right here: byteswritten may not have been set by the failed
         call, and the next entry must not be written after an error. */
      had_err = -1;
      break;
    }
    byteoffset += byteswritten;
    totalpadunits += byteswritten;
  }
  if (!had_err)
  {
    if (expectedsize + totalpadunits != byteoffset)
    {
      gt_error_set(err, "expected file size is " GT_WU " bytes, "
                   "but file has " GT_WU " bytes",
                   expectedsize, byteoffset);
      had_err = -1;
    }
  }
  GT_FREEARRAY(&ms->mapspectable,GtMapspecification);
  gt_free(ms);
  return had_err;
}
예제 #10
0
/*
 * Round *byteoffset up to the next multiple of GT_WORDSIZE_INBYTES and add
 * the number of bytes skipped to *totalpadunits. Does nothing if the
 * offset already is such a multiple.
 */
static void gt_mapspec_read_align(GtUword *byteoffset, GtUword *totalpadunits)
{
  if (*byteoffset % (GtUword) GT_WORDSIZE_INBYTES > 0)
  {
    size_t padunits
      = GT_WORDSIZE_INBYTES - (*byteoffset % GT_WORDSIZE_INBYTES);
    *byteoffset += (GtUword) padunits;
    *totalpadunits += (GtUword) padunits;
  }
}

/*
 * Map <filename> into memory and hand each part registered by <setup> to
 * assigncorrecttype() together with its byte offset inside the mapping.
 *
 * setup() is called with a fresh GtMapspec and the flag false; it is
 * expected to fill ms->mapspectable. The first entry is processed at
 * offset 0; the size of the mapped file is then compared against the
 * value of detexpectedaccordingtomapspec(); the remaining entries follow,
 * each starting at the word-aligned end of its predecessor. Finally the
 * last offset is compared against expectedsize plus the padding.
 *
 * *mapped receives the result of gt_fa_mmap_read(), which is NULL if
 * mapping failed. Returns 0 on success and -1 on error (see err).
 */
int  gt_mapspec_read(GtMapspecSetupFunc setup, void *data,
                     const char *filename, GtUword expectedsize,
                     void **mapped, GtError *err)
{
  GtMapspec *ms = gt_malloc(sizeof (GtMapspec));
  GtMapspecification *mapspecptr;
  void *mapptr;
  size_t numofbytes;
  uint64_t expectedaccordingtomapspec;
  GtUword byteoffset = 0, totalpadunits = 0;
  int had_err = 0;

  gt_error_check(err);
  GT_INITARRAY(&ms->mapspectable, GtMapspecification);
  setup(ms, data, false);

  mapptr = gt_fa_mmap_read(filename, &numofbytes, err);
  *mapped = mapptr;
  if (mapptr == NULL)
  {
    had_err = -1;
  }
  /* the first entry always starts at offset 0 of the mapping */
  if (!had_err &&
      assigncorrecttype(ms->mapspectable.spaceGtMapspecification,
                        mapptr,0,err) != 0)
  {
    had_err = -1;
  }
  if (!had_err)
  {
    expectedaccordingtomapspec =
                               detexpectedaccordingtomapspec(&ms->mapspectable);
    if (expectedaccordingtomapspec != (uint64_t) numofbytes)
    {
      gt_error_set(err, GT_WU " bytes read from %s, but " Formatuint64_t
                   " expected",
                   (GtUword) numofbytes,
                   filename,
                   PRINTuint64_tcast(expectedaccordingtomapspec));
      had_err = -1;
    }
  }
  if (!had_err)
  {
    GtMapspecification *endptr;

    mapspecptr = ms->mapspectable.spaceGtMapspecification;
    gt_assert(mapspecptr != NULL);
    endptr = mapspecptr + ms->mapspectable.nextfreeGtMapspecification;
    byteoffset = CALLCASTFUNC(uint64_t,unsigned_long,
                              (uint64_t) (mapspecptr->sizeofunit *
                                          mapspecptr->numofunits));
    gt_mapspec_read_align(&byteoffset, &totalpadunits);
    for (mapspecptr++; mapspecptr < endptr; mapspecptr++)
    {
      if (assigncorrecttype(mapspecptr,mapptr,byteoffset,err) != 0)
      {
        had_err = -1;
        break;
      }
      byteoffset = CALLCASTFUNC(uint64_t,unsigned_long,
                                (uint64_t) (byteoffset +
                                            mapspecptr->sizeofunit *
                                            mapspecptr->numofunits));
      gt_mapspec_read_align(&byteoffset, &totalpadunits);
    }
  }
  if (!had_err && expectedsize + totalpadunits != byteoffset)
  {
    gt_error_set(err,"mapping: expected file size is "GT_WU" bytes, "
                     "but file has "GT_WU" bytes",
                     expectedsize,byteoffset);
    had_err = -1;
  }
  GT_FREEARRAY(&ms->mapspectable,GtMapspecification);
  gt_free(ms);
  return had_err;
}
예제 #11
0
/*
 * Run gt_depthfirstesa() over <ssar> with the tyr_* callbacks and a
 * freshly set up TyrDfsstate.
 *
 * An empty <storeindex> selects statistics mode: occurrence counts are
 * processed by adddistpos2distribution and showfinalstatistics() is called
 * after the traversal. Otherwise the file <storeindex> + MERSUFFIX is
 * opened for writing (plus <storeindex> + COUNTSSUFFIX if storecounts is
 * set), occurrence counts are processed by outputsortedstring2index, the
 * collected large counts are written to the counts file, and mersize and
 * the alphabet size are appended to the mer index file.
 *
 * Fails if mersize exceeds the total length of the encoded sequence.
 * Returns 0 on success and -1 on error; all resources held by the state
 * are released in either case.
 */
static int enumeratelcpintervals(const char *inputindex,
                                 Sequentialsuffixarrayreader *ssar,
                                 const char *storeindex,
                                 bool storecounts,
                                 GtUword mersize,
                                 GtUword minocc,
                                 GtUword maxocc,
                                 bool performtest,
                                 GtLogger *logger,
                                 GtError *err)
{
  TyrDfsstate *state;
  unsigned int alphasize;
  bool nostore;
  int retval = 0;

  gt_error_check(err);
  state = gt_malloc(sizeof (*state));
  GT_INITARRAY(&state->occdistribution,Countwithpositions);
  state->esrspace = gt_encseq_create_reader_with_readmode(
                                   gt_encseqSequentialsuffixarrayreader(ssar),
                                   gt_readmodeSequentialsuffixarrayreader(ssar),
                                   0);
  state->mersize = (GtUword) mersize;
  state->encseq = gt_encseqSequentialsuffixarrayreader(ssar);
  alphasize = gt_alphabet_num_of_chars(gt_encseq_alphabet(state->encseq));
  state->readmode = gt_readmodeSequentialsuffixarrayreader(ssar);
  state->totallength = gt_encseq_total_length(state->encseq);
  state->storecounts = storecounts;
  state->performtest = performtest;
  state->minocc = minocc;
  state->maxocc = maxocc;
  state->countoutputmers = 0;
  state->merindexfpout = NULL;
  state->countsfilefpout = NULL;
  GT_INITARRAY(&state->largecounts,Largecount);

  /* an empty store name means: only collect and show statistics */
  nostore = (strlen(storeindex) == 0);
  if (nostore)
  {
    state->sizeofbuffer = 0;
    state->bytebuffer = NULL;
  } else
  {
    state->sizeofbuffer = MERBYTES(mersize);
    state->bytebuffer = gt_malloc(sizeof *state->bytebuffer
                                  * state->sizeofbuffer);
  }
  if (performtest)
  {
    state->currentmer = gt_malloc(sizeof *state->currentmer
                                  * state->mersize);
    state->suftab = gt_suftabSequentialsuffixarrayreader(ssar);
  } else
  {
    state->currentmer = NULL;
    state->suftab = NULL;
  }

  if (state->mersize > state->totallength)
  {
    gt_error_set(err,"mersize "GT_WU" > "GT_WU" = totallength not allowed",
                 state->mersize,
                 state->totallength);
    retval = -1;
  } else
  {
    if (nostore)
    {
      state->processoccurrencecount = adddistpos2distribution;
    } else
    {
      state->processoccurrencecount = outputsortedstring2index;
      state->merindexfpout = gt_fa_fopen_with_suffix(storeindex,MERSUFFIX,
                                                    "wb",err);
      if (state->merindexfpout == NULL)
      {
        retval = -1;
      } else if (state->storecounts)
      {
        state->countsfilefpout
          = gt_fa_fopen_with_suffix(storeindex,COUNTSSUFFIX,"wb",err);
        if (state->countsfilefpout == NULL)
        {
          retval = -1;
        }
      }
    }
    if (retval == 0)
    {
      if (gt_depthfirstesa(ssar,
                          tyr_allocateDfsinfo,
                          tyr_freeDfsinfo,
                          tyr_processleafedge,
                          NULL,
                          tyr_processcompletenode,
                          tyr_assignleftmostleaf,
                          tyr_assignrightmostleaf,
                          (Dfsstate*) state,
                          logger,
                          err) != 0)
      {
        retval = -1;
      }
      /* statistics are shown whether or not the traversal succeeded */
      if (nostore)
      {
        showfinalstatistics(state,inputindex,logger);
      }
    }
    if (retval == 0)
    {
      if (state->countsfilefpout != NULL)
      {
        gt_logger_log(logger,"write "GT_WU" mercounts > "GT_WU
                      " to file \"%s%s\"",
                      state->largecounts.nextfreeLargecount,
                      (GtUword) MAXSMALLMERCOUNT,
                      storeindex,
                      COUNTSSUFFIX);
        gt_xfwrite(state->largecounts.spaceLargecount, sizeof (Largecount),
                  (size_t) state->largecounts.nextfreeLargecount,
                  state->countsfilefpout);
      }
      gt_logger_log(logger,"number of "GT_WU"-mers in index: "GT_WU"",
                  mersize,
                  state->countoutputmers);
      gt_logger_log(logger,"index size: %.2f megabytes\n",
                  GT_MEGABYTES(state->countoutputmers * state->sizeofbuffer +
                               sizeof (GtUword) * EXTRAINTEGERS));
    }
  }
  /* append the EXTRAINTEGERS trailing integer values to the mer index */
  if (retval == 0 && state->merindexfpout != NULL)
  {
    outputbytewiseUlongvalue(state->merindexfpout,
                             (GtUword) state->mersize);
    outputbytewiseUlongvalue(state->merindexfpout,(GtUword) alphasize);
  }

  /* cleanup, shared by the success and the error path */
  gt_fa_xfclose(state->merindexfpout);
  gt_fa_xfclose(state->countsfilefpout);
  GT_FREEARRAY(&state->occdistribution,Countwithpositions);
  gt_free(state->currentmer);
  gt_free(state->bytebuffer);
  GT_FREEARRAY(&state->largecounts,Largecount);
  gt_encseq_reader_delete(state->esrspace);
  gt_free(state);
  return retval;
}
예제 #12
0
/*
 * Parse FASTA-like text in filecontents[0..numofbytes-1] in place and
 * return a GtBareEncseq that refers to the transformed buffer.
 *
 * Lines starting with '>' are skipped. Every such line except the first
 * one seen contributes one SEPARATOR symbol. On all other lines each
 * non-whitespace character is translated through the symbol map of
 * <alphabet> and written back to the front of the same buffer.
 * Non-special symbols are counted in charcount; maximal runs of special
 * symbols (separators included) are recorded in specialranges.
 *
 * Returns NULL and sets err if a character maps to UNDEFCHAR. In that
 * case the partially built object is passed to gt_bare_encseq_delete().
 *
 * The read pointer is only advanced past a line terminator while it is
 * still inside the buffer, so it never moves beyond one-past-the-end
 * (which would be undefined behaviour when the input does not end with a
 * newline).
 */
GtBareEncseq *gt_bare_encseq_parse_new(GtUchar *filecontents,size_t numofbytes,
                                       const GtAlphabet *alphabet,
                                       GtError *err)
{
  GtUchar *writeptr = filecontents, *readptr = filecontents;
  const GtUchar *endptr = filecontents + numofbytes;
  bool firstline = true, haserr = false;
  GtUword lastspecialrange_length = 0;
  GtBareSpecialrange *srptr = NULL;
  GtBareEncseq *bare_encseq = gt_malloc(sizeof *bare_encseq);
  const GtUchar *smap = gt_alphabet_symbolmap(alphabet);

  bare_encseq->specialcharacters = 0;
  bare_encseq->numofchars = (GtUword) gt_alphabet_num_of_chars(alphabet);
  bare_encseq->charcount = gt_calloc((size_t) bare_encseq->numofchars,
                                     sizeof *bare_encseq->charcount);
  GT_INITARRAY(&bare_encseq->specialranges,GtBareSpecialrange);
  while (!haserr && readptr < endptr)
  {
    if (*readptr == '>')
    {
      if (!firstline)
      {
        /* a header line after the first one separates two sequences */
        if (lastspecialrange_length == 0)
        {
          GT_GETNEXTFREEINARRAY(srptr,&bare_encseq->specialranges,
                                GtBareSpecialrange,128UL);
          srptr->start = (GtUword) (writeptr - filecontents);
        }
        lastspecialrange_length++;
        *writeptr++ = SEPARATOR;
        bare_encseq->specialcharacters++;
      } else
      {
        firstline = false;
      }
      /* skip the rest of the header line */
      while (readptr < endptr && *readptr != '\n')
      {
        readptr++;
      }
      if (readptr < endptr)
      {
        readptr++; /* step over the newline */
      }
    } else
    {
      while (readptr < endptr && *readptr != '\n')
      {
        if (!isspace(*readptr))
        {
          GtUchar cc = smap[*readptr];
          if (cc == UNDEFCHAR)
          {
            gt_error_set(err,"illegal input characters %c\n",*readptr);
            haserr = true;
            break;
          }
          if (ISSPECIAL(cc))
          {
            if (lastspecialrange_length == 0)
            {
              /* first special symbol of a new run */
              GT_GETNEXTFREEINARRAY(srptr,&bare_encseq->specialranges,
                                    GtBareSpecialrange,128UL);
              srptr->start = (GtUword) (writeptr - filecontents);
            }
            lastspecialrange_length++;
            bare_encseq->specialcharacters++;
          } else
          {
            gt_assert((GtUword) cc < bare_encseq->numofchars);
            bare_encseq->charcount[(int) cc]++;
            if (lastspecialrange_length > 0)
            {
              /* an ordinary symbol terminates the open run of specials */
              gt_assert(srptr != NULL);
              srptr->length = lastspecialrange_length;
            }
            lastspecialrange_length = 0;
          }
          *writeptr++ = cc;
        }
        readptr++;
      }
      if (readptr < endptr)
      {
        readptr++; /* step over the newline (or the offending character) */
      }
    }
  }
  if (lastspecialrange_length > 0)
  {
    /* the input ended inside a run of specials */
    gt_assert(srptr != NULL);
    srptr->length = lastspecialrange_length;
  }
  bare_encseq->sequence = filecontents;
  bare_encseq->totallength = (GtUword) (writeptr - filecontents);
  if (haserr)
  {
    gt_bare_encseq_delete(bare_encseq);
    return NULL;
  }
  return bare_encseq;
}