Exemplo n.º 1
0
/*
 * Create an iterator over substring matches between a database (given by its
 * encoded sequence and a part of its suffix table) and a set of queries.
 *
 * The queries come from exactly one of two sources:
 *   - <query_files> is NULL or empty: the queries are the sequences of
 *     <query_encseq>, which must then be non-NULL;
 *   - otherwise: the queries are read from <query_files> through a sequence
 *     iterator whose symbol map is taken from the alphabet of <dbencseq>;
 *     <query_encseq> must then be NULL.
 *
 * Returns the new iterator, or NULL if the sequence iterator over
 * <query_files> could not be created (<err> is handed to that constructor).
 */
GtQuerysubstringmatchiterator *gt_querysubstringmatchiterator_new(
                                     const GtEncseq *dbencseq,
                                     GtUword totallength,
                                     const ESASuffixptr *suftabpart,
                                     GtReadmode db_readmode,
                                     GtUword numberofsuffixes,
                                     const GtStrArray *query_files,
                                     const GtEncseq *query_encseq,
                                     GtReadmode query_readmode,
                                     unsigned int userdefinedleastlength,
                                     GtError *err)
{
  GtQuerysubstringmatchiterator *iter = gt_malloc(sizeof *iter);

  /* description of the database side */
  iter->dbencseq = dbencseq;
  iter->db_readmode = db_readmode;
  iter->suftabpart = suftabpart;
  iter->numberofsuffixes = numberofsuffixes;
  iter->totallength = totallength;
  iter->userdefinedleastlength = (GtUword) userdefinedleastlength;

  /* state of the iteration, nothing has been processed yet */
  iter->queryunitnum = 0;
  iter->query_seqlen = 0;
  iter->query_for_seqit = NULL;
  iter->desc = NULL;
  iter->dbstart = 0;
  iter->matchlength = 0;

  /* representation of the current query */
  iter->queryrep.encseq = query_encseq;
  iter->queryrep.sequence = NULL;
  iter->queryrep.readmode = query_readmode;
  iter->queryrep.startpos = 0;
  iter->querysubstring.queryrep = &iter->queryrep;

  iter->mmsi = gt_mmsearchiterator_new_empty();
  iter->mmsi_defined = false;

  if (query_files == NULL || gt_str_array_size(query_files) == 0)
  {
    /* queries are taken from the encoded sequence */
    gt_assert(query_encseq != NULL);
    iter->seqit = NULL;
    iter->query_encseq_numofsequences
      = (uint64_t) gt_encseq_num_of_sequences(query_encseq);
    return iter;
  }

  /* queries are streamed from the given files */
  gt_assert(query_encseq == NULL);
  iter->seqit = gt_seq_iterator_sequence_buffer_new(query_files, err);
  if (iter->seqit == NULL)
  {
    gt_querysubstringmatchiterator_delete(iter);
    return NULL;
  }
  gt_seq_iterator_set_symbolmap(iter->seqit,
                      gt_alphabet_symbolmap(gt_encseq_alphabet(dbencseq)));
  return iter;
}
Exemplo n.º 2
0
/*
 * Read every sequence from the single file named in <queryfilenames>, map it
 * with the symbol map of the alphabet of <suffixarray>'s encoded sequence and
 * add the value delivered by gt_esa2shulengthquery() for that sequence to
 * *<totalgmatchlength>.
 *
 * Returns 0 on success and -1 if the sequence iterator could not be created
 * or reported a negative status while reading.
 */
int gt_esa2shulengthqueryfiles(unsigned long *totalgmatchlength,
                               const Suffixarray *suffixarray,
                               const GtStrArray *queryfilenames,
                               GtError *err)
{
  GtSeqIterator *reader;
  GtAlphabet *alpha;
  const GtUchar *seq;
  unsigned long seqlen;
  char *header = NULL;
  int status;

  gt_error_check(err);
  alpha = gt_encseq_alphabet(suffixarray->encseq);
  gt_assert(gt_str_array_size(queryfilenames) == 1UL);

  reader = gt_seq_iterator_sequence_buffer_new(queryfilenames, err);
  if (reader == NULL)
  {
    return -1;
  }
  gt_seq_iterator_set_symbolmap(reader, gt_alphabet_symbolmap(alpha));

  /* a positive status delivers a sequence, 0 ends the input,
     a negative status signals an error */
  while ((status = gt_seq_iterator_next(reader, &seq, &seqlen, &header,
                                        err)) > 0)
  {
    *totalgmatchlength += gt_esa2shulengthquery(suffixarray,seq,seqlen);
  }
  gt_seq_iterator_delete(reader);
  return status < 0 ? -1 : 0;
}
Exemplo n.º 3
0
/*
 * Collect the k-mer codes (k = <prefixlength>) streamed from the files the
 * encoded sequence <encseq> was built from and hand them, together with
 * <encseq>, to verifycodelists().
 *
 * Returns 0 if both steps return 0, and -1 otherwise. The verification step
 * is only run when the streaming step succeeded.
 */
int gt_verifymappedstr(const GtEncseq *encseq,
                       unsigned int prefixlength,
                       GtError *err)
{
  GtArrayGtCodetype kmercodes;
  unsigned int alphasize;
  int status = 0;

  gt_error_check(err);
  alphasize = gt_alphabet_num_of_chars(gt_encseq_alphabet(encseq));
  GT_INITARRAY(&kmercodes,GtCodetype);

  /* short-circuit: verifycodelists() is skipped if streaming fails */
  if (getfastastreamkmers(gt_encseq_filenames(encseq),
                          alphasize,
                          prefixlength,
                          gt_alphabet_symbolmap(gt_encseq_alphabet(encseq)),
                          false,
                          &kmercodes,
                          err) != 0 ||
      verifycodelists(encseq,
                      prefixlength,
                      alphasize,
                      &kmercodes,
                      err) != 0)
  {
    status = -1;
  }
  GT_FREEARRAY(&kmercodes,GtCodetype);
  return status;
}
Exemplo n.º 4
0
/*
 * Encode all reads of the FASTQ files of <hcr_enc> and write them to <fp>,
 * starting at the encoder's start_of_encoding offset. While writing, samples
 * (file position, read number) are added to the encoder's sampling structure
 * whenever gt_sampling_is_next_element_sample() asks for one. After the last
 * read the position of the sampling table is stored in the encoder and the
 * sampling table is written to <fp>.
 *
 * Returns 0 on success. A non-zero value is returned if the FASTQ iterator
 * cannot be created, if it reports an error while reading, or if the current
 * file position cannot be determined; in the latter case <err> is set here.
 */
static int hcr_write_seqs(FILE *fp, GtHcrEncoder *hcr_enc, GtError *err)
{
    int had_err = 0,
        seqit_err = 0;
    GtUword bits_to_write = 0,
            len,
            read_counter = 0,
            page_counter = 0,
            bits_left_in_page,
            cur_read = 0;
    GtWord filepos;
    GtSeqIterator *seqit;
    const GtUchar *seq,
          *qual;
    char *desc;
    GtBitOutStream *bitstream;

    gt_error_check(err);
    gt_assert(hcr_enc->seq_encoder->sampling);

    gt_safe_assign(bits_left_in_page, (hcr_enc->pagesize * 8));

    gt_xfseek(fp, hcr_enc->seq_encoder->start_of_encoding, SEEK_SET);
    bitstream = gt_bitoutstream_new(fp);

    seqit = gt_seq_iterator_fastq_new(hcr_enc->files, err);
    if (!seqit) {
        gt_assert(gt_error_is_set(err));
        had_err = -1;
    }

    if (!had_err) {
        gt_seq_iterator_set_quality_buffer(seqit, &qual);
        gt_seq_iterator_set_symbolmap(seqit,
                                      gt_alphabet_symbolmap(hcr_enc->seq_encoder->alpha));
        hcr_enc->seq_encoder->total_num_of_symbols = 0;
        while (!had_err &&
                (seqit_err = gt_seq_iterator_next(seqit,
                             &seq,
                             &len,
                             &desc, err)) == 1) {

            /* dry run: only count the bits this read will need */
            bits_to_write = hcr_write_seq(hcr_enc->seq_encoder, seq, qual, len,
                                          bitstream, true);

            /* check if a new sample has to be added */
            if (gt_sampling_is_next_element_sample(hcr_enc->seq_encoder->sampling,
                                                   page_counter,
                                                   read_counter,
                                                   bits_to_write,
                                                   bits_left_in_page)) {
                gt_bitoutstream_flush_advance(bitstream);

                filepos = gt_bitoutstream_pos(bitstream);
                if (filepos < 0) {
                    had_err = -1;
                    gt_error_set(err, "error by ftell: %s", strerror(errno));
                }
                else {
                    gt_sampling_add_sample(hcr_enc->seq_encoder->sampling,
                                           (size_t) filepos,
                                           cur_read);

                    /* a sample starts a fresh page */
                    read_counter = 0;
                    page_counter = 0;
                    gt_safe_assign(bits_left_in_page, (hcr_enc->pagesize * 8));
                }
            }

            if (!had_err) {
                /* do the writing */
                bits_to_write = hcr_write_seq(hcr_enc->seq_encoder,
                                              seq, qual, len, bitstream, false);

                /* update counter for sampling */
                while (bits_left_in_page < bits_to_write) {
                    page_counter++;
                    bits_to_write -= bits_left_in_page;
                    gt_safe_assign(bits_left_in_page, (hcr_enc->pagesize * 8));
                }
                bits_left_in_page -= bits_to_write;
                /* always set first page as written */
                if (page_counter == 0)
                    page_counter++;
                read_counter++;
                hcr_enc->seq_encoder->total_num_of_symbols += len;
                cur_read++;
            }
        }
        /* the loop ends with seqit_err == 0 at the regular end of the input;
           any other value left by the iterator is an error */
        if (!had_err && seqit_err) {
            had_err = seqit_err;
            gt_assert(gt_error_is_set(err));
        }
        /* The number of reads can only be expected to match after a complete
           run. Checking it on an error path would abort the program instead
           of returning the error to the caller. */
        if (!had_err)
            gt_assert(hcr_enc->num_of_reads == cur_read);
    }

    if (!had_err) {
        gt_bitoutstream_flush(bitstream);
        filepos = gt_bitoutstream_pos(bitstream);
        if (filepos < 0) {
            had_err = -1;
            gt_error_set(err, "error by ftell: %s", strerror(errno));
        }
        else {
            hcr_enc->seq_encoder->startofsamplingtab = filepos;
            gt_log_log("start of samplingtab: "GT_WU"",
                       hcr_enc->seq_encoder->startofsamplingtab);
            if (hcr_enc->seq_encoder->sampling != NULL)
                gt_sampling_write(hcr_enc->seq_encoder->sampling, fp);
        }
    }
    gt_bitoutstream_delete(bitstream);
    gt_seq_iterator_delete(seqit);
    return had_err;
}
Exemplo n.º 5
0
/*
 * Create an encoder for the FASTQ files named in <files>.
 *
 * Each file is scanned once: all reads of a file must have the same length,
 * and the <base, quality> pairs are collected in a distribution from which
 * the Huffman code for sequences and qualities is built. Per file the
 * cumulative number of reads and the read length are recorded. If <descs> is
 * true a description encoder is created as well; <timer> may be NULL.
 *
 * <files> and <alpha> are stored in the returned encoder, not copied.
 *
 * Returns the new encoder, or NULL on failure. On failure everything that was
 * allocated by this function is released again.
 */
GtHcrEncoder *gt_hcr_encoder_new(GtStrArray *files, GtAlphabet *alpha,
                                 bool descs, GtQualRange qrange, GtTimer *timer,
                                 GtError *err)
{
    GtBaseQualDistr *bqd;
    GtHcrEncoder *hcr_enc;
    GtSeqIterator *seqit;
    GtStrArray *file;
    int had_err = 0,
        status;
    GtUword len1,
            len2,
            i,
            num_of_reads;
    const GtUchar *seq,
          *qual;
    char *desc;

    gt_error_check(err);
    gt_assert(alpha && files);

    if (timer != NULL)
        gt_timer_show_progress(timer, "get <base,qual> distr", stdout);

    if (qrange.start != GT_UNDEF_UINT)
        if (qrange.start == qrange.end) {
            gt_error_set(err, "qrange.start must unequal qrange.end");
            return NULL;
        }

    hcr_enc = gt_malloc(sizeof (GtHcrEncoder));
    hcr_enc->files = files;
    hcr_enc->num_of_files = gt_str_array_size(files);
    hcr_enc->num_of_reads = 0;
    hcr_enc->page_sampling = false;
    hcr_enc->regular_sampling = false;
    hcr_enc->sampling_rate = 0;
    hcr_enc->pagesize = gt_pagesize();
    if (descs) {
        hcr_enc->encdesc_encoder = gt_encdesc_encoder_new();
        if (timer != NULL)
            gt_encdesc_encoder_set_timer(hcr_enc->encdesc_encoder, timer);
    }
    else
        hcr_enc->encdesc_encoder = NULL;

    hcr_enc->seq_encoder = gt_malloc(sizeof (GtHcrSeqEncoder));
    hcr_enc->seq_encoder->alpha = alpha;
    hcr_enc->seq_encoder->sampling = NULL;
    hcr_enc->seq_encoder->fileinfos = gt_calloc((size_t) hcr_enc->num_of_files,
                                      sizeof (*(hcr_enc->seq_encoder->fileinfos)));
    hcr_enc->seq_encoder->qrange = qrange;
    bqd = hcr_base_qual_distr_new(alpha, qrange);

    /* check if reads in the same file are of same length and get
       <base, quality> pair distribution; stop at the first failing file so
       that no further iterator is created while <err> is already set */
    for (i = 0; !had_err && i < hcr_enc->num_of_files; i++) {
        /* per-file state: an empty file must not inherit the values of the
           previous file */
        num_of_reads = 0;
        len1 = 0;

        file = gt_str_array_new();
        gt_str_array_add(file, gt_str_array_get_str(files, i));
        seqit = gt_seq_iterator_fastq_new(file, err);
        if (!seqit) {
            gt_error_set(err, "cannot initialize GtSeqIteratorFastQ object");
            had_err = -1;
        }
        if (!had_err) {
            gt_seq_iterator_set_symbolmap(seqit, gt_alphabet_symbolmap(alpha));
            gt_seq_iterator_set_quality_buffer(seqit, &qual);
            status = gt_seq_iterator_next(seqit, &seq, &len1, &desc, err);

            if (status == 1) {
                num_of_reads = 1UL;
                /* NOTE(review): the distribution is only updated for reads
                   delivered inside this loop, i.e. not for the first read of
                   the file -- confirm that this is intended */
                while (!had_err) {
                    status = gt_seq_iterator_next(seqit, &seq, &len2, &desc, err);
                    if (status == -1)
                        had_err = -1;
                    if (status != 1)
                        break;
                    if (len2 != len1) {
                        gt_error_set(err, "reads have to be of equal length");
                        had_err = -1;
                        break;
                    }
                    if (hcr_base_qual_distr_add(bqd, qual, seq, len1) != 0)
                        had_err = -1;
                    len1 = len2;
                    num_of_reads++;
                }
            }
            else if (status == -1)
                had_err = -1;

            if (!had_err) {
                /* readnum is cumulative over all files up to this one */
                if (i == 0)
                    hcr_enc->seq_encoder->fileinfos[i].readnum = num_of_reads;
                else
                    hcr_enc->seq_encoder->fileinfos[i].readnum =
                        hcr_enc->seq_encoder->fileinfos[i - 1].readnum + num_of_reads;
                hcr_enc->seq_encoder->fileinfos[i].readlength = len1;
                hcr_enc->num_of_reads += num_of_reads;
            }
        }
        gt_str_array_delete(file);
        gt_seq_iterator_delete(seqit);
    }

    if (!had_err) {
        hcr_base_qual_distr_trim(bqd);
        if (timer != NULL)
            gt_timer_show_progress(timer, "build huffman tree for sequences and"
                                   " qualities", stdout);
        hcr_enc->seq_encoder->huffman =
            gt_huffman_new(bqd,
                           hcr_base_qual_distr_func,
                           (GtUword) bqd->ncols * bqd->nrows);
        hcr_enc->seq_encoder->qual_offset = bqd->qual_offset;
        hcr_base_qual_distr_delete(bqd);
        return hcr_enc;
    }

    /* failure: release what was allocated above; the Huffman code has not
       been built at this point, <files> and <alpha> belong to the caller */
    hcr_base_qual_distr_delete(bqd);
    if (hcr_enc->encdesc_encoder != NULL)
        gt_encdesc_encoder_delete(hcr_enc->encdesc_encoder);
    gt_free(hcr_enc->seq_encoder->fileinfos);
    gt_free(hcr_enc->seq_encoder);
    gt_free(hcr_enc);
    return NULL;
}
Exemplo n.º 6
0
/*
 * For every query sequence read from arguments->queryname compute the sum of
 * the shustring lengths of all its suffixes with respect to the packed index
 * named by arguments->indexname.
 *
 * If arguments->shulen_only is set, only that sum is printed. Otherwise the
 * average shustring length and the fraction of c/g symbols of the query are
 * used to compute a divergence via gt_divergence() and from that a Kr value
 * via gt_calculateKr(), which is printed to stdout.
 *
 * Returns 0 on success and 1 if the index cannot be loaded, the query
 * iterator cannot be created, the alphabet of the index is not DNA, or
 * reading a query fails.
 */
int gt_genomediff_pck_shu_simple(GtLogger *logger,
                                 const GtGenomediffArguments *arguments,
                                 GtError *err)
{
  int had_err = 0;
  int retval;
  GtSeqIterator *queries = NULL;
  const GtUchar *symbolmap, *currentQuery;
  const GtAlphabet *alphabet;
  GtUchar c_sym = 0,
          g_sym = 0;
  uint64_t queryNo;
  char *description = NULL;
  unsigned long queryLength,
                subjectLength = 0,
                currentSuffix;
  double avgShuLength,
         currentShuLength = 0.0,
         gc_query;
  const FMindex *subjectindex = NULL;
  Genericindex *genericindexSubject;
  const GtEncseq *encseq = NULL;
  double *ln_n_fac;

  /* get the precalculation of ln(n!) for 0<n<max_ln_n_fac */
  ln_n_fac = gt_get_ln_n_fac(arguments->max_ln_n_fac);
  gt_log_log("ln(max_ln_n_fac!) = %f\n",
             ln_n_fac[arguments->max_ln_n_fac]);

  genericindexSubject = genericindex_new(gt_str_get(
                                           arguments->indexname),
                                         arguments->with_esa,
                                         true,
                                         false,
                                         true,
                                         arguments->user_max_depth,
                                         logger,
                                         err);
  if (genericindexSubject == NULL)
  {
    had_err = 1;
  }
  else
  {
    encseq = genericindex_getencseq(genericindexSubject);
  }

  if (!had_err)
  {
    subjectLength = genericindex_get_totallength(genericindexSubject) - 1;
    subjectindex = genericindex_get_packedindex(genericindexSubject);

    queries = gt_seqiterator_sequence_buffer_new(
                                          arguments->queryname,
                                          err);
    /* creating the iterator can fail at run time, so this is reported as
       an error rather than asserted */
    if (queries == NULL)
    {
      had_err = 1;
    }
  }

  if (!had_err)
  {
    alphabet = gt_encseq_alphabet(encseq);
    /* makes assumption that alphabet is dna, it has to calculate the gc! */
    if (!gt_alphabet_is_dna(alphabet))
    {
      /* report through <err> so that the caller sees why we failed */
      gt_error_set(err, "Sequences need to be dna");
      had_err = 1;
    }
    else
    {
      symbolmap = gt_alphabet_symbolmap(alphabet);
      gt_seqiterator_set_symbolmap(queries, symbolmap);
      c_sym = gt_alphabet_encode(alphabet, 'c');
      g_sym = gt_alphabet_encode(alphabet, 'g');
    }
  }

  for (queryNo = 0; !had_err; queryNo++)
  {
    retval = gt_seqiterator_next(queries,
                                 &currentQuery,
                                 &queryLength,
                                 &description,
                                 err);
    if (retval != 1)
    {
      if (retval < 0)
      {
        /* a read error must not be mistaken for the end of the input */
        gt_free(description);
        had_err = 1;
      }
      break;
    }
    gt_logger_log(logger,
                  "found query of length: %lu",
                  queryLength);
    avgShuLength = 0.0;
    gc_query = 0.0;
    /* accumulate the shustring length of every suffix and count c/g */
    for (currentSuffix = 0; currentSuffix < queryLength; currentSuffix++)
    {
      currentShuLength = (double) gt_pck_getShuStringLength(
                    subjectindex,
                    &currentQuery[currentSuffix],
                    queryLength - currentSuffix);
      avgShuLength += currentShuLength;
      if (currentQuery[currentSuffix] == c_sym ||
          currentQuery[currentSuffix] == g_sym)
      {
        gc_query++;
      }
    }
    if (arguments->shulen_only)
    {
      printf("# Query %d sum of shulen:\n %.0f\n",
             (int) queryNo, avgShuLength);
    }
    else
    {
      double div, kr;

      /* turn the sums into averages over the query length */
      avgShuLength /= (double) queryLength;
      gc_query /= (double) queryLength;

      gt_logger_log(logger, "Query %d has an average SHUstring length "
                            "of\n# shulength: %f",
                            (int) queryNo, avgShuLength);
      gt_logger_log(logger, "Query description: %s", description);
      gt_log_log("Query (i): %s", description);

      /* TODO: add error checks for the computations below */

      gt_logger_log(logger, "shulen:\n%f", avgShuLength);
      gt_log_log("shu: %f, gc: %f, len: %lu",
          avgShuLength, gc_query, subjectLength);
      div =  gt_divergence(arguments->divergence_rel_err,
                           arguments->divergence_abs_err,
                           arguments->divergence_m,
                           arguments->divergence_threshold,
                           avgShuLength,
                           subjectLength,
                           gc_query,
                           ln_n_fac,
                           arguments->max_ln_n_fac);
      gt_logger_log(logger, "divergence:\n%f", div);

      kr = gt_calculateKr(div);

      printf("# Kr:\n%f\n", kr);
    }
  }
  gt_free(ln_n_fac);
  gt_seqiterator_delete(queries);
  genericindex_delete(genericindexSubject);
  return had_err;
}
Exemplo n.º 7
0
/*
 * Compute local alignments of all query sequences against an indexed
 * sequence, as selected by <idxlocalioptions>:
 *   - doonline:  load only the encoded sequence and run the Smith-Waterman
 *                based search (gt_multiapplysmithwaterman);
 *   - otherwise: load a generic index and run the index based search
 *                (gt_indexbasedlocali);
 *   - docompare: run both searches, store their matches and compare them
 *                per query with gt_checkandresetstorematch().
 * Unless docompare is set, matches are shown via showmatch().
 *
 * Returns 0 on success and -1 if the index/encoded sequence cannot be
 * loaded, the query iterator cannot be created, or reading a query fails.
 */
int gt_runidxlocali(const IdxlocaliOptions *idxlocalioptions,GtError *err)
{
  Genericindex *genericindex = NULL;
  bool haserr = false;
  GtLogger *logger;
  const GtEncseq *encseq = NULL;

  logger = gt_logger_new(idxlocalioptions->verbose,
                         GT_LOGGER_DEFLT_PREFIX, stdout);

  if (idxlocalioptions->doonline)
  {
    GtEncseqLoader *el;
    el = gt_encseq_loader_new();
    gt_encseq_loader_require_multiseq_support(el);
    gt_encseq_loader_drop_description_support(el);
    gt_encseq_loader_set_logger(el, logger);
    encseq = gt_encseq_loader_load(el, gt_str_get(idxlocalioptions->indexname),
                                   err);
    gt_encseq_loader_delete(el);
    if (encseq == NULL)
    {
      haserr = true;
    }
  } else
  {
    genericindex = genericindex_new(gt_str_get(idxlocalioptions->indexname),
                                    idxlocalioptions->withesa,
                                    idxlocalioptions->withesa ||
                                    idxlocalioptions->docompare,
                                    false,
                                    true,
                                    0,
                                    logger,
                                    err);
    if (genericindex == NULL)
    {
      haserr = true;
    } else
    {
      encseq = genericindex_getencseq(genericindex);
    }
  }
  if (!haserr)
  {
    GtSeqIterator *seqit;
    const GtUchar *query;
    unsigned long querylen;
    char *desc = NULL;
    int retval;
    Limdfsresources *limdfsresources = NULL;
    const AbstractDfstransformer *dfst;
    SWdpresource *swdpresource = NULL;
    Showmatchinfo showmatchinfo;
    ProcessIdxMatch processmatch;
    GtAlphabet *a;
    void *processmatchinfoonline, *processmatchinfooffline;
    Storematchinfo storeonline, storeoffline;

    a = gt_encseq_alphabet(encseq);
    if (idxlocalioptions->docompare)
    {
      /* matches of both searches are stored for the later comparison */
      processmatch = storematch;
      gt_initstorematch(&storeonline,encseq);
      gt_initstorematch(&storeoffline,encseq);
      processmatchinfoonline = &storeonline;
      processmatchinfooffline = &storeoffline;
    } else
    {
      processmatch = showmatch;
      showmatchinfo.encseq = encseq;
      showmatchinfo.characters = gt_alphabet_characters(a);
      showmatchinfo.wildcardshow = gt_alphabet_wildcard_show(a);
      showmatchinfo.showalignment = idxlocalioptions->showalignment;
      processmatchinfoonline = processmatchinfooffline = &showmatchinfo;
    }
    if (idxlocalioptions->doonline || idxlocalioptions->docompare)
    {
      swdpresource = gt_newSWdpresource(idxlocalioptions->matchscore,
                                     idxlocalioptions->mismatchscore,
                                     idxlocalioptions->gapextend,
                                     idxlocalioptions->threshold,
                                     idxlocalioptions->showalignment,
                                     processmatch,
                                     processmatchinfoonline);
    }
    dfst = gt_locali_AbstractDfstransformer();
    if (!idxlocalioptions->doonline || idxlocalioptions->docompare)
    {
      gt_assert(genericindex != NULL);
      limdfsresources = gt_newLimdfsresources(genericindex,
                                           true,
                                           0,
                                           0,    /* maxpathlength */
                                           true, /* keepexpandedonstack */
                                           processmatch,
                                           processmatchinfooffline,
                                           NULL, /* processresult */
                                           NULL, /* processresult info */
                                           dfst);
    }
    seqit = gt_seq_iterator_sequence_buffer_new(idxlocalioptions->queryfiles,
                                               err);
    if (!seqit)
      haserr = true;
    if (!haserr)
    {
      gt_seq_iterator_set_symbolmap(seqit, gt_alphabet_symbolmap(a));
      /* showmatchinfo.queryunit serves as the running query number */
      for (showmatchinfo.queryunit = 0; /* Nothing */;
           showmatchinfo.queryunit++)
      {
        retval = gt_seq_iterator_next(seqit,
                                     &query,
                                     &querylen,
                                     &desc,
                                     err);
        if (retval < 0)
        {
          haserr = true;
          break;
        }
        if (retval == 0)
        {
          break;
        }
        printf("process sequence " Formatuint64_t " of length %lu\n",
                PRINTuint64_tcast(showmatchinfo.queryunit),querylen);
        if (idxlocalioptions->doonline || idxlocalioptions->docompare)
        {
          gt_multiapplysmithwaterman(swdpresource,encseq,query,querylen);
        }
        if (!idxlocalioptions->doonline || idxlocalioptions->docompare)
        {
          gt_indexbasedlocali(limdfsresources,
                           idxlocalioptions->matchscore,
                           idxlocalioptions->mismatchscore,
                           idxlocalioptions->gapstart,
                           idxlocalioptions->gapextend,
                           idxlocalioptions->threshold,
                           query,
                           querylen,
                           dfst);
        }
        if (idxlocalioptions->docompare)
        {
          gt_checkandresetstorematch(showmatchinfo.queryunit,
                                  &storeonline,&storeoffline);
        }
      }
    }
    /* The search resources were created before the query iterator, so they
       have to be released even if the iterator could not be created. */
    if (limdfsresources != NULL)
    {
      gt_freeLimdfsresources(&limdfsresources,dfst);
    }
    if (swdpresource != NULL)
    {
      gt_freeSWdpresource(swdpresource);
      swdpresource = NULL;
    }
    if (seqit != NULL)
    {
      gt_seq_iterator_delete(seqit);
    }
    if (idxlocalioptions->docompare)
    {
      gt_freestorematch(&storeonline);
      gt_freestorematch(&storeoffline);
    }
  }
  /* without a generic index the encoded sequence was loaded by this
     function and is deleted here; otherwise the generic index is deleted */
  if (genericindex == NULL)
  {
    gt_encseq_delete((GtEncseq *) encseq);
    encseq = NULL;
  } else
  {
    genericindex_delete(genericindex);
  }
  gt_logger_delete(logger);
  logger = NULL;
  return haserr ? -1 : 0;
}
Exemplo n.º 8
0
/*
 * Search all sequences of <queryfilenames> in the tallymer index
 * <tyrindexname>.
 *
 * The index is loaded first (and shown if <verbose>, checked if
 * <performtest>). For a non-empty index the count information is loaded when
 * <showmode> contains SHOWCOUNTS, and the bucket information is always
 * loaded. Then every query sequence is passed to singleseqtyrsearch()
 * together with its running number and description.
 *
 * Returns 0 on success and -1 if one of the index parts or the query
 * iterator cannot be created, or if reading a query fails.
 */
int gt_tyrsearch(const char *tyrindexname,
                 const GtStrArray *queryfilenames,
                 unsigned int showmode,
                 unsigned int searchstrand,
                 bool verbose,
                 bool performtest,
                 GtError *err)
{
  Tyrindex *tyrindex;
  Tyrcountinfo *tyrcountinfo = NULL;
  Tyrbckinfo *tyrbckinfo = NULL;
  bool haserr = false;

  gt_error_check(err);
  tyrindex = gt_tyrindex_new(tyrindexname,err);
  if (tyrindex != NULL)
  {
    if (verbose)
    {
      gt_tyrindex_show(tyrindex);
    }
    if (performtest)
    {
      gt_tyrindex_check(tyrindex);
    }
    /* counts are only needed when requested and when there is something
       to count */
    if ((showmode & SHOWCOUNTS) && !gt_tyrindex_isempty(tyrindex))
    {
      tyrcountinfo = gt_tyrcountinfo_new(tyrindex,tyrindexname,err);
      haserr = (tyrcountinfo == NULL);
    }
  } else
  {
    haserr = true;
  }
  if (!haserr && !gt_tyrindex_isempty(tyrindex))
  {
    tyrbckinfo = gt_tyrbckinfo_new(tyrindexname,
                                   gt_tyrindex_alphasize(tyrindex),
                                   err);
    haserr = (tyrbckinfo == NULL);
  }
  if (!haserr)
  {
    Tyrsearchinfo searchinfo;
    GtSeqIterator *reader;

    gt_assert(tyrindex != NULL);
    gt_tyrsearchinfo_init(&searchinfo,tyrindex,showmode,searchstrand);
    reader = gt_seqiterator_sequence_buffer_new(queryfilenames, err);
    if (reader == NULL)
    {
      haserr = true;
    } else
    {
      const GtUchar *seq;
      unsigned long seqlen;
      char *header = NULL;
      uint64_t seqnum = 0;
      int status;

      gt_seqiterator_set_symbolmap(reader,
                                 gt_alphabet_symbolmap(searchinfo.dnaalpha));
      for (;;)
      {
        status = gt_seqiterator_next(reader,
                                     &seq,
                                     &seqlen,
                                     &header,
                                     err);
        if (status <= 0)
        {
          /* 0 is the regular end of the input, negative is an error */
          haserr = (status < 0);
          break;
        }
        singleseqtyrsearch(tyrindex,
                           tyrcountinfo,
                           &searchinfo,
                           tyrbckinfo,
                           seqnum,
                           seq,
                           seqlen,
                           header);
        seqnum++;
      }
      gt_seqiterator_delete(reader);
    }
    gt_tyrsearchinfo_delete(&searchinfo);
  }
  /* release the index parts in reverse order of their creation */
  if (tyrbckinfo != NULL)
  {
    gt_tyrbckinfo_delete(&tyrbckinfo);
  }
  if (tyrcountinfo != NULL)
  {
    gt_tyrcountinfo_delete(&tyrcountinfo);
  }
  if (tyrindex != NULL)
  {
    gt_tyrindex_delete(&tyrindex);
  }
  return haserr ? -1 : 0;
}
Exemplo n.º 9
0
/*
 * Read all sequences from <queryfiles> and, for each sequence of length at
 * least <userdefinedleastlength>, call <findquerymatches> on the forward
 * strand (if <forwardstrand>) and on the reverse complement (if
 * <reversestrand>). If <processquerybeforematching> is not NULL it is called
 * for each strand right before the matching, with the query description, the
 * sequence of that strand, its length and a flag that is true for the
 * forward strand.
 *
 * Processing stops at the first error. Returns 0 on success and -1 if the
 * query iterator cannot be created, reading a query fails, or
 * <findquerymatches> returns a non-zero value.
 */
static int gt_callenumquerymatches_withindex(
                            GtQuerysubstringmatchfunc findquerymatches,
                            const Suffixarray *suffixarray,
                            const GtStrArray *queryfiles,
                            bool forwardstrand,
                            bool reversestrand,
                            unsigned int userdefinedleastlength,
                            GtProcessquerybeforematching
                               processquerybeforematching,
                            GtProcessquerymatch processquerymatch,
                            void *processquerymatchinfo,
                            GtError *err)
{
  GtSeqIterator *seqit;
  bool haserr = false;

  seqit = gt_seq_iterator_sequence_buffer_new(queryfiles, err);
  if (seqit == NULL)
  {
    haserr = true;
  } else
  {
    GtQuerymatch *querymatchspaceptr = gt_querymatch_new();
    const GtUchar *query;
    unsigned long querylen;
    int retval;
    uint64_t queryunitnum;
    GtUchar *queryreverse = NULL;          /* buffer, grown on demand */
    unsigned long queryreverse_length = 0; /* current size of the buffer */
    char *desc = NULL;
    int mode;

    gt_seq_iterator_set_symbolmap(seqit,
                    gt_alphabet_symbolmap(gt_encseq_alphabet(
                                                        suffixarray->encseq)));
    /* The loop condition makes a failure of findquerymatches() end the
       whole iteration; the break below only leaves the strand loop. */
    for (queryunitnum = 0; !haserr; queryunitnum++)
    {
      retval = gt_seq_iterator_next(seqit, &query, &querylen, &desc, err);
      if (retval < 0)
      {
        haserr = true;
        break;
      }
      if (retval == 0)
      {
        break;
      }
      if (querylen >= (unsigned long) userdefinedleastlength)
      {
        GtQueryrep queryrep;

        queryrep.encseq = NULL;
        queryrep.readmode = GT_READMODE_FORWARD;
        queryrep.startpos = 0;
        queryrep.length = querylen;
        /* mode 0: forward strand, mode 1: reverse complement */
        for (mode = 0; mode <= 1; mode++)
        {
          if (mode == 0 && forwardstrand)
          {
            queryrep.sequence = query;
            queryrep.reversecopy = false;
            if (processquerybeforematching != NULL)
            {
              processquerybeforematching(processquerymatchinfo,desc,query,
                                         querylen,true);
            }
          } else
          {
            if (mode == 1 && reversestrand)
            {
              if (querylen > queryreverse_length)
              {
                queryreverse = gt_realloc(queryreverse,
                                          sizeof (*queryreverse) * querylen);
                queryreverse_length = querylen;
              }
              gt_copy_reversecomplement(queryreverse,query,querylen);
              queryrep.sequence = queryreverse;
              queryrep.reversecopy = true;
              if (processquerybeforematching != NULL)
              {
                processquerybeforematching(processquerymatchinfo,desc,
                                           queryreverse,querylen,false);
              }
            } else
            {
              /* this strand was not requested */
              queryrep.sequence = NULL;
              queryrep.reversecopy = false;
            }
          }
          if (queryrep.sequence != NULL)
          {
            int ret = findquerymatches(false,
                                       suffixarray,
                                       queryunitnum,
                                       &queryrep,
                                       (unsigned long) userdefinedleastlength,
                                       processquerymatch,
                                       processquerymatchinfo,
                                       querymatchspaceptr,
                                       err);
            if (ret != 0)
            {
              haserr = true;
              break;
            }
          }
        }
      }
    }
    gt_seq_iterator_delete(seqit);
    gt_free(queryreverse);
    gt_querymatch_delete(querymatchspaceptr);
  }
  return haserr ? -1 : 0;
}
Exemplo n.º 10
0
/*
  Read every query sequence from <queryfilenames> and hand each one to
  gmatchposinsinglesequence(), together with its running number (starting
  at 0) and its description.

  The index handle, the matching function <gmatchforward>, the alphabet and
  the encoded sequence are bundled into a Substringinfo; the length limits
  and the three show-flags are bundled into a Rangespecinfo which is attached
  as the processing info. showunitnum and showifinlengthrange are installed
  as the pre-processing and processing callbacks; no post-processing
  callback is installed.

  Returns 0 once all queries have been consumed, and -1 if the sequence
  iterator could not be created or reported an error while reading (in both
  cases <err> is passed to the failing call).
*/
int gt_findsubquerygmatchforward(const GtEncseq *encseq,
                                 const void *genericindex,
                                 unsigned long totallength,
                                 Greedygmatchforwardfunction gmatchforward,
                                 const GtAlphabet *alphabet,
                                 const GtStrArray *queryfilenames,
                                 Definedunsignedlong minlength,
                                 Definedunsignedlong maxlength,
                                 bool showsequence,
                                 bool showquerypos,
                                 bool showsubjectpos,
                                 GtError *err)
{
    Rangespecinfo rangespecinfo;
    Substringinfo substringinfo;
    GtSeqIterator *seqit;
    const GtUchar *query;
    unsigned long querylen;
    char *desc = NULL;
    uint64_t unitnum = 0;
    int retval;

    gt_error_check(err);

    /* Output options and length window consulted by the callbacks. */
    rangespecinfo.minlength = minlength;
    rangespecinfo.maxlength = maxlength;
    rangespecinfo.showsequence = showsequence;
    rangespecinfo.showquerypos = showquerypos;
    rangespecinfo.showsubjectpos = showsubjectpos;

    /* Everything the per-sequence matcher needs, in one place. */
    substringinfo.encseq = encseq;
    substringinfo.genericindex = genericindex;
    substringinfo.totallength = totallength;
    substringinfo.alphabet = alphabet;
    substringinfo.gmatchforward = gmatchforward;
    substringinfo.preprocessgmatchlength = showunitnum;
    substringinfo.processgmatchlength = showifinlengthrange;
    substringinfo.postprocessgmatchlength = NULL;
    substringinfo.processinfo = &rangespecinfo;

    seqit = gt_seqiterator_sequence_buffer_new(queryfilenames, err);
    if (seqit == NULL)
    {
        return -1;
    }
    gt_seqiterator_set_symbolmap(seqit, gt_alphabet_symbolmap(alphabet));

    /* A positive status means a query was delivered; 0 means the input is
       exhausted; a negative status is a read error. */
    while ((retval = gt_seqiterator_next(seqit, &query, &querylen, &desc,
                                         err)) > 0)
    {
        gmatchposinsinglesequence(&substringinfo, unitnum, query, querylen,
                                  desc);
        unitnum++;
    }
    gt_seqiterator_delete(seqit);
    return retval < 0 ? -1 : 0;
}
Exemplo n.º 11
0
/*
  Parse the <numofbytes> bytes at <filecontents> line by line and encode them
  in place into a newly allocated GtBareEncseq.

  - A line beginning with '>' is treated as a header and discarded. Every
    header except the first one encountered contributes one SEPARATOR symbol
    to the encoded sequence.
  - On every other line, whitespace is skipped and each remaining byte is
    translated through the symbol map of <alphabet>. A byte that maps to
    UNDEFCHAR is an error.
  - Maximal runs of special symbols (including separators) are recorded as
    (start,length) entries in bare_encseq->specialranges; non-special
    symbols are tallied in bare_encseq->charcount.

  The encoded symbols overwrite <filecontents> from its beginning (the write
  position never overtakes the read position), and the returned object's
  <sequence> field points at <filecontents> itself.

  Returns the new object, or NULL after setting <err> and calling
  gt_bare_encseq_delete() if an illegal input character was found.
*/
GtBareEncseq *gt_bare_encseq_parse_new(GtUchar *filecontents,size_t numofbytes,
                                       const GtAlphabet *alphabet,
                                       GtError *err)
{
  GtUchar *writeptr = filecontents, *readptr = filecontents;
  const GtUchar *endptr = filecontents + numofbytes;
  bool firstline = true, haserr = false;
  GtUword lastspecialrange_length = 0;
  GtBareSpecialrange *srptr = NULL;
  GtBareEncseq *bare_encseq = gt_malloc(sizeof *bare_encseq);
  const GtUchar *smap = gt_alphabet_symbolmap(alphabet);

  bare_encseq->specialcharacters = 0;
  bare_encseq->numofchars = (GtUword) gt_alphabet_num_of_chars(alphabet);
  bare_encseq->charcount = gt_calloc((size_t) bare_encseq->numofchars,
                                     sizeof *bare_encseq->charcount);
  GT_INITARRAY(&bare_encseq->specialranges,GtBareSpecialrange);
  while (!haserr && readptr < endptr)
  {
    if (*readptr == '>')
    {
      /* Header line: emit a separator between consecutive sequences, but
         not in front of the very first one. */
      if (!firstline)
      {
        if (lastspecialrange_length == 0)
        {
          /* The separator opens a new run of special symbols. */
          GT_GETNEXTFREEINARRAY(srptr,&bare_encseq->specialranges,
                                GtBareSpecialrange,128UL);
          srptr->start = (GtUword) (writeptr - filecontents);
        }
        lastspecialrange_length++;
        *writeptr++ = SEPARATOR;
        bare_encseq->specialcharacters++;
      } else
      {
        firstline = false;
      }
      /* Discard the remainder of the header line. */
      while (readptr < endptr && *readptr != '\n')
      {
        readptr++;
      }
      /* Step over the newline only if there is one; an unconditional
         increment would move the pointer beyond one-past-the-end of the
         buffer when the input lacks a trailing newline. */
      if (readptr < endptr)
      {
        readptr++;
      }
    } else
    {
      /* Sequence line: encode symbol by symbol. */
      while (readptr < endptr && *readptr != '\n')
      {
        if (!isspace(*readptr))
        {
          GtUchar cc = smap[*readptr];
          if (cc == UNDEFCHAR)
          {
            gt_error_set(err,"illegal input characters %c\n",*readptr);
            haserr = true;
            break;
          }
          if (ISSPECIAL(cc))
          {
            if (lastspecialrange_length == 0)
            {
              /* First special symbol after a non-special one: open a new
                 range at the current write position. */
              GT_GETNEXTFREEINARRAY(srptr,&bare_encseq->specialranges,
                                    GtBareSpecialrange,128UL);
              srptr->start = (GtUword) (writeptr - filecontents);
            }
            lastspecialrange_length++;
            bare_encseq->specialcharacters++;
          } else
          {
            gt_assert((GtUword) cc < bare_encseq->numofchars);
            bare_encseq->charcount[(int) cc]++;
            if (lastspecialrange_length > 0)
            {
              /* A non-special symbol closes the currently open range. */
              gt_assert(srptr != NULL);
              srptr->length = lastspecialrange_length;
            }
            lastspecialrange_length = 0;
          }
          *writeptr++ = cc;
        }
        readptr++;
      }
      /* Same guarded newline skip as for header lines. */
      if (readptr < endptr)
      {
        readptr++;
      }
    }
  }
  if (lastspecialrange_length > 0)
  {
    /* The input ended inside a run of special symbols: close it. */
    gt_assert(srptr != NULL);
    srptr->length = lastspecialrange_length;
  }
  bare_encseq->sequence = filecontents;
  bare_encseq->totallength = (GtUword) (writeptr - filecontents);
  if (haserr)
  {
    gt_bare_encseq_delete(bare_encseq);
    return NULL;
  }
  return bare_encseq;
}