Ejemplo n.º 1
0
int gt_echodescriptionandsequence(const GtStrArray *filenametab,GtError *err)
{
    GtSeqIterator *seqit;
    char *desc = NULL;
    const GtUchar *sequence;
    unsigned long seqlen;
    bool haserr = false;
    int retval;

    seqit = gt_seqiterator_sequence_buffer_new(filenametab, err);
    if (!seqit)
        return -1;
    while (true)
    {
        retval = gt_seqiterator_next(seqit,
                                     &sequence,
                                     &seqlen,
                                     &desc,
                                     err);
        if (retval < 0)
        {
            haserr = true;
            break;
        }
        if (retval == 0)
        {
            break;
        }
        gt_symbolstring2fasta(stdout,desc,NULL,sequence,seqlen,70UL);
    }
    gt_seqiterator_delete(seqit);
    return haserr ? -1 : 0;
}
Ejemplo n.º 2
0
int runsubstringiteration(Greedygmatchforwardfunction gmatchforward,
                          const void *genericindex,
                          unsigned long totalwidth,
                          const unsigned long *leftborder,
                          const unsigned long *countspecialcodes,
                          const Alphabet *alphabet,
                          unsigned int prefixlength,
                          const GtStrArray *queryfilenames,
                          GtError *err)
{
    GtSeqIterator *seqit;
    const GtUchar *query;
    unsigned long querylen;
    char *desc = NULL;
    Substriter *substriter;
    Substring substring;
    bool haserr = false;
    int retval;
    unsigned int numofchars;
    unsigned long gmatchlength, gmatchlength2;
    GtCodetype maxcode;
    GtBucketspecification bucketspec;
    bool haserr = false;

    seqit = gt_seqiterator_new(queryfilenames,getsymbolmapAlphabet(alphabet),
                               *                          true);
    for (unitnum = 0; ; unitnum++)
    {
        retval = gt_seqiterator_next(seqit,
                                     &query,
                                     &querylen,
                                     &desc,
                                     err);
        if (retval < 0)
        {
            haserr = true;
            break;
        }
        if (retval == 0)
        {
            break;
        }
    }
    gt_seqiterator_delete(seqit);
    return haserr ? -1 : 0;
}
Ejemplo n.º 3
0
int gt_genomediff_pck_shu_simple(GtLogger *logger,
                                 const GtGenomediffArguments *arguments,
                                 GtError *err)
{
  int had_err = 0;
  int retval;
  GtSeqIterator *queries = NULL;
  const GtUchar *symbolmap, *currentQuery;
  const GtAlphabet *alphabet;
  GtUchar c_sym = 0,
          g_sym = 0;
  uint64_t queryNo;
  char *description = NULL;
  unsigned long queryLength,
                subjectLength = 0,
                currentSuffix;
  double avgShuLength,
         currentShuLength = 0.0,
         /*gc_subject,*/
         gc_query /*, gc*/;
  const FMindex *subjectindex = NULL;
  Genericindex *genericindexSubject;
  const GtEncseq *encseq = NULL;
  double *ln_n_fac;

  /* get the precalculation of ln(n!) for 0<n<max_ln_n_fac */
  ln_n_fac = gt_get_ln_n_fac(arguments->max_ln_n_fac);
  gt_log_log("ln(max_ln_n_fac!) = %f\n",
             ln_n_fac[arguments->max_ln_n_fac]);

  genericindexSubject = genericindex_new(gt_str_get(
                                           arguments->indexname),
                                         arguments->with_esa,
                                         true,
                                         false,
                                         true,
                                         arguments->user_max_depth,
                                         logger,
                                         err);
  if (genericindexSubject == NULL)
  {
    had_err = 1;
  }
  else
  {
    encseq = genericindex_getencseq(genericindexSubject);
  }

  if (!had_err)
  {
    subjectLength = genericindex_get_totallength(genericindexSubject) - 1;
    /*subjectLength /= 2;*/
    /*gt_log_log("subject length: %lu", subjectLength);*/
    subjectindex = genericindex_get_packedindex(genericindexSubject);

    queries = gt_seqiterator_sequence_buffer_new(
                                          arguments->queryname,
                                          err);
    gt_assert(queries);
    alphabet = gt_encseq_alphabet(encseq);
    /* makes assumption that alphabet is dna, it has to calculate the gc! */
    if (!gt_alphabet_is_dna(alphabet))
    {
      fprintf(stderr, "error: Sequences need to be dna");
      had_err = 1;
    }
    else
    {
      symbolmap = gt_alphabet_symbolmap(alphabet);
      gt_seqiterator_set_symbolmap(queries, symbolmap);
      c_sym = gt_alphabet_encode(alphabet, 'c');
      g_sym = gt_alphabet_encode(alphabet, 'g');
    }
  }

  for (queryNo = 0; !had_err; queryNo++)
  {
    retval = gt_seqiterator_next(queries,
                                 &currentQuery,
                                 &queryLength,
                                 &description,
                                 err);
    if ( retval != 1)
    {
      if (retval < 0)
      {
        gt_free(description);
      }
      break;
    }
    gt_logger_log(logger,
                  "found query of length: %lu",
                  queryLength);
    avgShuLength = 0.0;
    gc_query = 0.0;
    for (currentSuffix = 0; currentSuffix < queryLength; currentSuffix++)
    {
      currentShuLength = (double) gt_pck_getShuStringLength(
                    subjectindex,
                    &currentQuery[currentSuffix],
                    queryLength - currentSuffix);
      avgShuLength += currentShuLength;
      if (currentQuery[currentSuffix] == c_sym ||
          currentQuery[currentSuffix] == g_sym)
      {
        gc_query++;
      }
    }
    if (arguments->shulen_only)
    {
      printf("# Query %d sum of shulen:\n %.0f\n",
             (int) queryNo, avgShuLength);
    }
    else
    {
      avgShuLength /= (double) queryLength;
      gc_query /= (double) queryLength;

      gt_logger_log(logger, "Query %d has an average SHUstring length "
                            "of\n# shulength: %f",
                            (int) queryNo, avgShuLength);
      gt_logger_log(logger, "Query description: %s", description);
      gt_log_log("Query (i): %s", description);

  /* XXX Fehlerabfragen einbauen */

      if ( !had_err )
      {
        double div, kr;

        gt_logger_log(logger, "shulen:\n%f", avgShuLength);
        gt_log_log("shu: %f, gc: %f, len: %lu",
            avgShuLength, gc_query, subjectLength);
        div =  gt_divergence(arguments->divergence_rel_err,
                             arguments->divergence_abs_err,
                             arguments->divergence_m,
                             arguments->divergence_threshold,
                             avgShuLength,
                             subjectLength,
                             gc_query,
                             ln_n_fac,
                             arguments->max_ln_n_fac);
        gt_logger_log(logger, "divergence:\n%f", div);

        kr = gt_calculateKr(div);

        printf("# Kr:\n%f\n", kr);
      }
    }
  }
  gt_free(ln_n_fac);
  gt_seqiterator_delete(queries);
  genericindex_delete(genericindexSubject);
  return had_err;
}
Ejemplo n.º 4
0
int gt_tyrsearch(const char *tyrindexname,
                 const GtStrArray *queryfilenames,
                 unsigned int showmode,
                 unsigned int searchstrand,
                 bool verbose,
                 bool performtest,
                 GtError *err)
{
  Tyrindex *tyrindex;
  Tyrcountinfo *tyrcountinfo = NULL;
  Tyrbckinfo *tyrbckinfo = NULL;
  bool haserr = false;

  gt_error_check(err);
  tyrindex = gt_tyrindex_new(tyrindexname,err);
  if (tyrindex == NULL)
  {
    haserr = true;
  } else
  {
    if (verbose)
    {
      gt_tyrindex_show(tyrindex);
    }
    if (performtest)
    {
      gt_tyrindex_check(tyrindex);
    }
  }
  if (!haserr)
  {
    gt_assert(tyrindex != NULL);
    if ((showmode & SHOWCOUNTS) && !gt_tyrindex_isempty(tyrindex))
    {
      tyrcountinfo = gt_tyrcountinfo_new(tyrindex,tyrindexname,err);
      if (tyrcountinfo == NULL)
      {
        haserr = true;
      }
    }
  }
  if (!haserr)
  {
    gt_assert(tyrindex != NULL);
    if (!gt_tyrindex_isempty(tyrindex))
    {
      tyrbckinfo = gt_tyrbckinfo_new(tyrindexname,
                                     gt_tyrindex_alphasize(tyrindex),
                                     err);
      if (tyrbckinfo == NULL)
      {
        haserr = true;
      }
    }
  }
  if (!haserr)
  {
    const GtUchar *query;
    unsigned long querylen;
    char *desc = NULL;
    uint64_t unitnum;
    int retval;
    Tyrsearchinfo tyrsearchinfo;
    GtSeqIterator *seqit;

    gt_assert(tyrindex != NULL);
    gt_tyrsearchinfo_init(&tyrsearchinfo,tyrindex,showmode,searchstrand);
    seqit = gt_seqiterator_sequence_buffer_new(queryfilenames, err);
    if (!seqit)
      haserr = true;
    if (!haserr)
    {
      gt_seqiterator_set_symbolmap(seqit,
                                 gt_alphabet_symbolmap(tyrsearchinfo.dnaalpha));
      for (unitnum = 0; /* Nothing */; unitnum++)
      {
        retval = gt_seqiterator_next(seqit,
                                     &query,
                                     &querylen,
                                     &desc,
                                     err);
        if (retval < 0)
        {
          haserr = true;
          break;
        }
        if (retval == 0)
        {
          break;
        }
        singleseqtyrsearch(tyrindex,
                           tyrcountinfo,
                           &tyrsearchinfo,
                           tyrbckinfo,
                           unitnum,
                           query,
                           querylen,
                           desc);
      }
      gt_seqiterator_delete(seqit);
    }
    gt_tyrsearchinfo_delete(&tyrsearchinfo);
  }
  if (tyrbckinfo != NULL)
  {
    gt_tyrbckinfo_delete(&tyrbckinfo);
  }
  if (tyrcountinfo != NULL)
  {
    gt_tyrcountinfo_delete(&tyrcountinfo);
  }
  if (tyrindex != NULL)
  {
    gt_tyrindex_delete(&tyrindex);
  }
  return haserr ? -1 : 0;
}
Ejemplo n.º 5
0
int gt_extractkeysfromfastafile(bool verbose,
                                GtFile *outfp,
                                unsigned long width,
                                const GtStr *fileofkeystoextract,
                                GtStrArray *referencefiletab,
                                GtError *err)
{
  GtSeqIterator *seqit;
  const GtUchar *sequence;
  char *desc, *headerbufferspace = NULL, *keyspace = NULL;
  const char *keyptr;
  unsigned long allockeyspace = 0, len, keylen, numofqueries, keyposition,
                countmarkhit = 0;
  int had_err = 0;
  off_t totalsize;
  Fastakeyquery *fastakeyqueries;
  size_t headerbuffersize = 0, headerlength;

  gt_error_check(err);
  fastakeyqueries = readfileofkeystoextract(verbose,&numofqueries,
                                            fileofkeystoextract,err);
  if (fastakeyqueries == NULL)
  {
    return -1;
  }
  totalsize = gt_files_estimate_total_size(referencefiletab);
  if (verbose)
  {
    printf("# estimated total size is " Formatuint64_t "\n",
            PRINTuint64_tcast(totalsize));
  }
  seqit = gt_seqiterator_sequence_buffer_new(referencefiletab, err);
  if (!seqit)
  {
    had_err = -1;
  }
  if (!had_err && verbose)
  {
    gt_progressbar_start(gt_seqiterator_getcurrentcounter(seqit,
                                                          (unsigned long long)
                                                          totalsize),
                                                          (unsigned long long)
                                                          totalsize);
  }
  while (had_err != -1 && countmarkhit < numofqueries)
  {
    had_err = gt_seqiterator_next(seqit, &sequence, &len, &desc, err);
    if (had_err != 1)
    {
      break;
    }
    keyptr = desc2key(&keylen,desc,err);
    if (keyptr == NULL)
    {
      had_err = -1;
    } else
    {
      if (allockeyspace < keylen)
      {
        keyspace = gt_realloc(keyspace,sizeof (*keyspace) * (keylen+1));
        allockeyspace = keylen;
      }
      gt_assert(keyspace != NULL);
      strncpy(keyspace,keyptr,(size_t) keylen);
      keyspace[keylen] = '\0';
      keyposition = searchdesinfastakeyqueries(keyspace,fastakeyqueries,
                                               numofqueries);
      if (keyposition < numofqueries)
      {
        while (keyposition < numofqueries &&
               strcmp(fastakeyqueries[keyposition].fastakey,keyspace) == 0)
        {
#ifndef NDEBUG
          if (fastakeyqueries[keyposition].markhit)
          {
            fprintf(stderr,"key %s was already found before\n",
                     fastakeyqueries[keyposition].fastakey);
            exit(GT_EXIT_PROGRAMMING_ERROR);
          }
#endif
          headerlength = strlen(desc);
          if (headerbuffersize < headerlength + EXTRABUF + 1)
          {
            headerbuffersize = headerlength + EXTRABUF + 1;
            headerbufferspace = gt_realloc(headerbufferspace,
                                           sizeof (*headerbufferspace)
                                           * headerbuffersize);
          }
          if (COMPLETE(fastakeyqueries + keyposition))
          {
            /*
            (void) snprintf(headerbufferspace,headerbuffersize,
                            "%*.*s complete %s",
                            (int) keylen,(int) keylen,keyspace,
                            desc);
            */
            gt_fasta_show_entry(desc, (const char *) sequence, len, width,
                                outfp);
          } else
          {
            (void) snprintf(headerbufferspace,headerbuffersize,
                            "%*.*s %lu %lu %s",
                            (int) keylen,(int) keylen,keyspace,
                            fastakeyqueries[keyposition].frompos,
                            fastakeyqueries[keyposition].topos,
                            desc);
            gt_fasta_show_entry(headerbufferspace,
                                (const char *)
                                (sequence+fastakeyqueries[keyposition].
                                                          frompos - 1),
                                fastakeyqueries[keyposition].topos -
                                fastakeyqueries[keyposition].frompos+1,
                                width, outfp);
          }
          fastakeyqueries[keyposition].markhit = true;
          countmarkhit++;
          keyposition++;
        }
      }
#ifdef SKDEBUG
      printf("%s 1 %lu\n",keyspace, len);
#endif
    }
  }
  gt_free(headerbufferspace);
  gt_free(keyspace);
  if (verbose)
  {
    gt_progressbar_stop();
  }
  if (verbose)
  {
    outputnonmarked(fastakeyqueries,numofqueries);
  }
  fastakeyqueries_delete(fastakeyqueries,numofqueries);
  gt_seqiterator_delete(seqit);
  return had_err;
}
Ejemplo n.º 6
0
int gt_findsubquerygmatchforward(const GtEncseq *encseq,
                                 const void *genericindex,
                                 unsigned long totallength,
                                 Greedygmatchforwardfunction gmatchforward,
                                 const GtAlphabet *alphabet,
                                 const GtStrArray *queryfilenames,
                                 Definedunsignedlong minlength,
                                 Definedunsignedlong maxlength,
                                 bool showsequence,
                                 bool showquerypos,
                                 bool showsubjectpos,
                                 GtError *err)
{
    Substringinfo substringinfo;
    Rangespecinfo rangespecinfo;
    bool haserr = false;
    GtSeqIterator *seqit;
    const GtUchar *query;
    unsigned long querylen;
    char *desc = NULL;
    int retval;
    uint64_t unitnum;

    gt_error_check(err);
    substringinfo.genericindex = genericindex;
    substringinfo.totallength = totallength;
    rangespecinfo.minlength = minlength;
    rangespecinfo.maxlength = maxlength;
    rangespecinfo.showsequence = showsequence;
    rangespecinfo.showquerypos = showquerypos;
    rangespecinfo.showsubjectpos = showsubjectpos;
    substringinfo.preprocessgmatchlength = showunitnum;
    substringinfo.processgmatchlength = showifinlengthrange;
    substringinfo.postprocessgmatchlength = NULL;
    substringinfo.alphabet = alphabet;
    substringinfo.processinfo = &rangespecinfo;
    substringinfo.gmatchforward = gmatchforward;
    substringinfo.encseq = encseq;
    seqit = gt_seqiterator_sequence_buffer_new(queryfilenames, err);
    if (!seqit)
        haserr = true;
    if (!haserr)
    {
        gt_seqiterator_set_symbolmap(seqit, gt_alphabet_symbolmap(alphabet));
        for (unitnum = 0; /* Nothing */; unitnum++)
        {
            retval = gt_seqiterator_next(seqit,
                                         &query,
                                         &querylen,
                                         &desc,
                                         err);
            if (retval < 0)
            {
                haserr = true;
                break;
            }
            if (retval == 0)
            {
                break;
            }
            gmatchposinsinglesequence(&substringinfo,
                                      unitnum,
                                      query,
                                      querylen,
                                      desc);
        }
        gt_seqiterator_delete(seqit);
    }
    return haserr ? -1 : 0;
}
Ejemplo n.º 7
0
int runidxlocali(const IdxlocaliOptions *idxlocalioptions,GtError *err)
{
  Genericindex *genericindex = NULL;
  bool haserr = false;
  Verboseinfo *verboseinfo;
  const Encodedsequence *encseq = NULL;

  verboseinfo = newverboseinfo(idxlocalioptions->verbose);

  if (idxlocalioptions->doonline)
  {
    encseq = mapencodedsequence (true,
                                 idxlocalioptions->indexname,
                                 true,
                                 false,
                                 false,
                                 true,
                                 verboseinfo,
                                 err);
    if (encseq == NULL)
    {
      haserr = true;
    }
  } else
  {
    genericindex = genericindex_new(idxlocalioptions->indexname,
                                    idxlocalioptions->withesa,
                                    idxlocalioptions->withesa ||
                                    idxlocalioptions->docompare,
                                    false,
                                    true,
                                    0,
                                    verboseinfo,
                                    err);
    if (genericindex == NULL)
    {
      haserr = true;
    } else
    {
      encseq = genericindex_getencseq(genericindex);
    }
  }
  if (!haserr)
  {
    GtSeqIterator *seqit;
    const GtUchar *query;
    unsigned long querylen;
    char *desc = NULL;
    int retval;
    Limdfsresources *limdfsresources = NULL;
    const AbstractDfstransformer *dfst;
    SWdpresource *swdpresource = NULL;
    Showmatchinfo showmatchinfo;
    Processmatch processmatch;
    void *processmatchinfoonline, *processmatchinfooffline;
    Storematchinfo storeonline, storeoffline;

    if (idxlocalioptions->docompare)
    {
      processmatch = storematch;
      initstorematch(&storeonline,encseq);
      initstorematch(&storeoffline,encseq);
      processmatchinfoonline = &storeonline;
      processmatchinfooffline = &storeoffline;
    } else
    {
      processmatch = showmatch;
      showmatchinfo.encseq = encseq;
      showmatchinfo.characters = getencseqAlphabetcharacters(encseq);
      showmatchinfo.wildcardshow = getencseqAlphabetwildcardshow(encseq);
      showmatchinfo.showalignment = idxlocalioptions->showalignment;
      processmatchinfoonline = processmatchinfooffline = &showmatchinfo;
    }
    if (idxlocalioptions->doonline || idxlocalioptions->docompare)
    {
      swdpresource = newSWdpresource(idxlocalioptions->matchscore,
                                     idxlocalioptions->mismatchscore,
                                     idxlocalioptions->gapextend,
                                     idxlocalioptions->threshold,
                                     idxlocalioptions->showalignment,
                                     processmatch,
                                     processmatchinfoonline);
    }
    dfst = locali_AbstractDfstransformer();
    if (!idxlocalioptions->doonline || idxlocalioptions->docompare)
    {
      gt_assert(genericindex != NULL);
      limdfsresources = newLimdfsresources(genericindex,
                                           true,
                                           0,
                                           0,    /* maxpathlength */
                                           true, /* keepexpandedonstack */
                                           processmatch,
                                           processmatchinfooffline,
                                           NULL, /* processresult */
                                           NULL, /* processresult info */
                                           dfst);
    }
    seqit = gt_seqiterator_new(idxlocalioptions->queryfiles, err);
    if (!seqit)
      haserr = true;
    if (!haserr)
    {
      gt_seqiterator_set_symbolmap(seqit, getencseqAlphabetsymbolmap(encseq));
      for (showmatchinfo.queryunit = 0; /* Nothing */;
           showmatchinfo.queryunit++)
      {
        retval = gt_seqiterator_next(seqit,
                                     &query,
                                     &querylen,
                                     &desc,
                                     err);
        if (retval < 0)
        {
          haserr = true;
          break;
        }
        if (retval == 0)
        {
          break;
        }
        printf("process sequence " Formatuint64_t " of length %lu\n",
                PRINTuint64_tcast(showmatchinfo.queryunit),querylen);
        if (idxlocalioptions->doonline || idxlocalioptions->docompare)
        {
          multiapplysmithwaterman(swdpresource,encseq,query,querylen);
        }
        if (!idxlocalioptions->doonline || idxlocalioptions->docompare)
        {
          indexbasedlocali(limdfsresources,
                           idxlocalioptions->matchscore,
                           idxlocalioptions->mismatchscore,
                           idxlocalioptions->gapstart,
                           idxlocalioptions->gapextend,
                           idxlocalioptions->threshold,
                           query,
                           querylen,
                           dfst);
        }
        if (idxlocalioptions->docompare)
        {
          checkandresetstorematch(showmatchinfo.queryunit,
                                  &storeonline,&storeoffline);
        }
        gt_free(desc);
      }
      if (limdfsresources != NULL)
      {
        freeLimdfsresources(&limdfsresources,dfst);
      }
      if (swdpresource != NULL)
      {
        freeSWdpresource(swdpresource);
        swdpresource = NULL;
      }
      gt_seqiterator_delete(seqit);
    }
    if (idxlocalioptions->docompare)
    {
      freestorematch(&storeonline);
      freestorematch(&storeoffline);
    }
  }
  if (genericindex == NULL)
  {
    gt_assert(encseq != NULL);
    encodedsequence_free((Encodedsequence **) &encseq);
  } else
  {
    genericindex_delete(genericindex);
  }
  freeverboseinfo(&verboseinfo);
  return haserr ? -1 : 0;
}