Beispiel #1
0
/*
  Allocate a new Enumcodeatposition for <encseq> and initialise all of its
  fields. The lookup tables are built by gt_initmultimappower() and
  gt_initfilltable() from <numofchars> and <prefixlength>.
  The moveforward flag is set exactly when <readmode> is a reverse
  direction; in that case the previous range starts out as [0,0], otherwise
  as [totallength,totallength]. A special range iterator is only created
  if <encseq> has special ranges; otherwise the sri field is NULL.
  The returned object is allocated with gt_malloc().
*/
Enumcodeatposition *gt_Enumcodeatposition_new(const GtEncseq *encseq,
                                              GtReadmode readmode,
                                              unsigned int prefixlength,
                                              unsigned int numofchars)
{
  Enumcodeatposition *ecp = gt_malloc(sizeof *ecp);

  ecp->encseq = encseq;
  ecp->readmode = readmode;
  ecp->prefixlength = prefixlength;
  ecp->exhausted = false;
  ecp->multimappower = gt_initmultimappower(numofchars,prefixlength);
  ecp->filltable = gt_initfilltable(numofchars,prefixlength);
  ecp->totallength = gt_encseq_total_length(encseq);
  if (GT_ISDIRREVERSE(readmode))
  {
    /* reverse readmodes: iterate forward, starting before position 0 */
    ecp->moveforward = true;
    ecp->previousrange.start = 0;
    ecp->previousrange.end = 0;
  } else
  {
    /* forward readmodes: iterate backward, starting at the very end */
    ecp->moveforward = false;
    ecp->previousrange.start = ecp->totallength;
    ecp->previousrange.end = ecp->totallength;
  }
  ecp->sri = gt_encseq_has_specialranges(encseq)
               ? gt_specialrangeiterator_new(encseq,ecp->moveforward)
               : NULL;
  return ecp;
}
/*
  Build a table with one Rankedbounds entry per special range of <encseq>.
  The ranges are enumerated with a special range iterator whose direction
  flag is true iff <readmode> is not a reverse direction. Each entry stores
  the bounds of the range and, as its rank, the accumulated length of all
  ranges enumerated before it.
  Returns NULL if <encseq> has no special ranges; otherwise an array of
  gt_encseq_realspecialranges(encseq) entries allocated with gt_malloc()
  (presumably to be released by the caller - confirm against call sites).
*/
Rankedbounds *gt_fillrankbounds(const GtEncseq *encseq,
                             GtReadmode readmode)
{
  if (gt_encseq_has_specialranges(encseq))
  {
    GtSpecialrangeiterator *sri;
    GtRange range;
    GtUword currentrank = 0, realspecialranges;
    Rankedbounds *rankedbounds, *rbptr;

    realspecialranges = gt_encseq_realspecialranges(encseq);
    rankedbounds = gt_malloc(sizeof (*rankedbounds) * realspecialranges);
    sri = gt_specialrangeiterator_new(encseq,
                                      GT_ISDIRREVERSE(readmode)
                                      ? false : true);
    for (rbptr = rankedbounds;
         gt_specialrangeiterator_next(sri,&range);
         rbptr++)
    {
      /* check before writing: the iterator must not deliver more ranges
         than the table was allocated for */
      gt_assert(rbptr < rankedbounds + realspecialranges);
      rbptr->lowerbound = range.start;
      rbptr->upperbound = range.end;
      rbptr->rank = currentrank;
      currentrank += rbptr->upperbound - rbptr->lowerbound;
    }
    /* every slot of the table must have been filled */
    gt_assert(rbptr == rankedbounds + realspecialranges);
    gt_specialrangeiterator_delete(sri);
    return rankedbounds;
  }
  return NULL;
}
/*
  Build a table with one Specialrank entry per special range of <encseq>,
  sorted with compareSpecialrank.
  For every range delivered by the iterator, the entry stores
    - specialrank: the accumulated length of all ranges up to and including
      this one, minus 1;
    - key: the value of <inversesuftab> at the end position of the range.
  The iterator direction flag is true iff <readmode> is not a reverse
  direction.
  Returns NULL if <encseq> has no special ranges; otherwise an array of
  gt_encseq_realspecialranges(encseq) entries allocated with gt_malloc().
*/
Specialrank *gt_fillspecialranklist(const GtEncseq *encseq,
                                 GtReadmode readmode,
                                 const GtUword *inversesuftab)
{
  GtSpecialrangeiterator *sri;
  GtRange range;
  GtUword numofranges, ranksum = 0;
  GT_UNUSED GtUword totallength; /* only read inside an assertion */
  Specialrank *specialranklist, *entry;

  if (!gt_encseq_has_specialranges(encseq))
  {
    return NULL;
  }
  totallength = gt_encseq_total_length(encseq);
  numofranges = gt_encseq_realspecialranges(encseq);
  specialranklist = gt_malloc(sizeof (Specialrank) * numofranges);
  sri = gt_specialrangeiterator_new(encseq,
                                    GT_ISDIRREVERSE(readmode)
                                    ? false : true);
  for (entry = specialranklist;
       gt_specialrangeiterator_next(sri,&range);
       entry++)
  {
    gt_assert(entry < specialranklist + numofranges);
    gt_assert(range.end<=totallength);
    ranksum += range.end - range.start;
    entry->specialrank = ranksum - 1;
    entry->key = inversesuftab[range.end];
  }
  /* every slot of the table must have been filled */
  gt_assert(entry == specialranklist + numofranges);
  gt_specialrangeiterator_delete(sri);
  qsort(specialranklist,(size_t) numofranges,
        sizeof (Specialrank),compareSpecialrank);
  return specialranklist;
}
Beispiel #4
0
/*
  Tool runner: loads the encoded sequence named by argv[parsed_args],
  optionally mirrors it, and then performs up to three independent checks
  selected by the tool arguments:
    - bitpos:        extract the two-bit encoding at that position and print
                     it together with unitsnotspecial, the position and the
                     return value of the extraction function;
    - stoppos:       print the position followed by the value delivered by
                     gt_getnexttwobitencodingstoppos() for a reader
                     positioned there;
    - specialranges: print all special ranges as "start:end" lines.
  Positions at or beyond the total length are reported via <err>.
  Returns 0 on success and a non-zero value on error.
*/
static int gt_encseq_bitextract_runner(GT_UNUSED int argc, const char **argv,
                                       GT_UNUSED int parsed_args,
                                       void *tool_arguments,
                                       GT_UNUSED GtError *err)
{
  GtEncseqBitextractArguments *arguments = tool_arguments;
  GtEncseqLoader *el;
  GtEncseq *encseq;
  GtEncseqReader *esr;
  GtReadmode rm;
  bool fwd;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  el = gt_encseq_loader_new();
  encseq = gt_encseq_loader_load(el, argv[parsed_args], err);
  if (encseq == NULL) {
    had_err = -1;
  } else if (arguments->mirror) {
    had_err = gt_encseq_mirror(encseq, err);
  }

  /* rm and fwd are only set, and only read below, when no error occurred */
  if (!had_err) {
    rm = gt_readmode_parse(gt_str_get(arguments->readmode), NULL);
    fwd = GT_ISDIRREVERSE(rm) ? false : true;
  }

  /* two-bit encoding extraction at a given position */
  if (!had_err && arguments->bitpos != GT_UNDEF_ULONG) {
    if (arguments->bitpos >= gt_encseq_total_length(encseq)) {
      gt_error_set(err, "position %lu exceeds encoded sequence length of %lu",
                   arguments->bitpos, gt_encseq_total_length(encseq));
      had_err = -1;
    } else {
      GtEndofTwobitencoding etbe;
      char buffer[BUFSIZ];
      unsigned long ret;

      esr = gt_encseq_create_reader_with_readmode(encseq, rm,
                                                  arguments->bitpos);
      ret = gt_encseq_extract2bitencwithtwobitencodingstoppos(&etbe, esr,
                                                        encseq,
                                                        rm, arguments->bitpos);
      gt_bitsequence_tostring(buffer, etbe.tbe);
      printf("Twobitencoding   %s\n"
             "unitsnotspecial  %u\n"
             "position         %lu\n"
             "returnvalue      %lu\n",
             buffer,
             etbe.unitsnotspecial,
             arguments->bitpos,
             ret);
      gt_encseq_reader_delete(esr);
    }
  }

  /* stop position check: the reader is created at 0 and then re-initialised
     at the requested position */
  if (!had_err && arguments->stoppos != GT_UNDEF_ULONG) {
    if (arguments->stoppos >= gt_encseq_total_length(encseq)) {
      gt_error_set(err, "position %lu exceeds encoded sequence length of %lu",
                   arguments->stoppos, gt_encseq_total_length(encseq));
      had_err = -1;
    } else {
      esr = gt_encseq_create_reader_with_readmode(encseq, rm, 0);
      gt_encseq_reader_reinit_with_readmode(esr, encseq, rm,
                                            arguments->stoppos);
      printf("%lu: %lu\n", arguments->stoppos,
                           gt_getnexttwobitencodingstoppos(fwd, esr));
      gt_encseq_reader_delete(esr);
    }
  }

  /* special range iterator check */
  if (!had_err && arguments->specialranges
        && gt_encseq_has_specialranges(encseq)) {
    GtRange srng;
    GtSpecialrangeiterator *sri = gt_specialrangeiterator_new(encseq, fwd);

    while (gt_specialrangeiterator_next(sri, &srng)) {
      printf("%lu:%lu\n", srng.start, srng.end);
    }
    gt_specialrangeiterator_delete(sri);
  }

  gt_encseq_delete(encseq);
  gt_encseq_loader_delete(el);
  return had_err;
}
Beispiel #5
0
/*
  Count, for each of the <numofchars> alphabet symbols, how often it occurs
  as the non-special character next to a special range of <encseq>, plus one
  extra count for the character at the sequence boundary.
  For non-reverse readmodes the character at position range.start-1 is
  inspected (read with <readmode>), and finally the last character of the
  sequence if the sequence has no special suffix.
  For reverse readmodes the character at position range.end is inspected,
  and finally the character at position 0 if the sequence has no special
  prefix. In that case the characters are read with a converted readmode:
  GT_READMODE_FORWARD for GT_READMODE_REVERSE, GT_READMODE_COMPL otherwise.
  Returns an array of <numofchars> counters allocated with gt_malloc().
*/
static GtUword *leftcontextofspecialchardist(unsigned int numofchars,
                                                   const GtEncseq *encseq,
                                                   GtReadmode readmode)
{
  GtUchar cc;
  unsigned int charidx;
  GtUword *dist, totallength = gt_encseq_total_length(encseq);
  const GtReadmode convertedreadmode = (readmode == GT_READMODE_REVERSE)
                                         ? GT_READMODE_FORWARD
                                         : GT_READMODE_COMPL;
  const bool reverse = GT_ISDIRREVERSE(readmode) ? true : false;

  dist = gt_malloc(sizeof (*dist) * numofchars);
  for (charidx = 0; charidx < numofchars; charidx++)
  {
    dist[charidx] = 0;
  }
  if (gt_encseq_has_specialranges(encseq))
  {
    GtRange range;
    GtSpecialrangeiterator *sri = gt_specialrangeiterator_new(encseq,true);

    while (gt_specialrangeiterator_next(sri,&range))
    {
      if (reverse)
      {
        if (range.end >= totallength)
        {
          continue; /* range touches the end: no neighbouring character */
        }
        cc = gt_encseq_get_encoded_char(encseq,range.end,convertedreadmode);
      } else
      {
        if (range.start < 1)
        {
          continue; /* range touches the start: no neighbouring character */
        }
        cc = gt_encseq_get_encoded_char(encseq,range.start-1,readmode);
      }
      if (ISNOTSPECIAL(cc))
      {
        dist[cc]++;
      }
    }
    gt_specialrangeiterator_delete(sri);
  }
  /* account for the character at the sequence boundary */
  if (reverse)
  {
    if (gt_encseq_lengthofspecialprefix(encseq) == 0)
    {
      cc = gt_encseq_get_encoded_char(encseq,0,convertedreadmode);
      gt_assert(ISNOTSPECIAL(cc));
      dist[cc]++;
    }
  } else
  {
    if (gt_encseq_lengthofspecialsuffix(encseq) == 0)
    {
      cc = gt_encseq_get_encoded_char(encseq,totallength-1,readmode);
      gt_assert(ISNOTSPECIAL(cc));
      dist[cc]++;
    }
  }
  return dist;
}
Beispiel #6
0
/*
  Tool runner: prints information about the encoded sequence index named by
  argv[parsed_args] to arguments->outfp.

  If arguments->nomap is set, only the metadata of the index is read
  (via GtEncseqMetadata) and a reduced report is printed: version, 64-bit
  flag, total length, number of sequences and files, shortest/longest
  sequence length, access type and alphabet information.
  Otherwise the encoded sequence is loaded (mirrored if arguments->mirror is
  set) and a full report is printed, which additionally contains the
  compressed size, original file names, character distribution, wildcard and
  special character statistics, bits per character and the support flags.

  The "index name" line is suppressed if arguments->noindexname is set; the
  alphabet definition is only shown if arguments->show_alphabet is set.

  Returns 0 on success, -1 if the metadata or the encoded sequence could not
  be loaded (the error is reported through <err> by the loading function).
*/
static int gt_encseq_info_runner(GT_UNUSED int argc, const char **argv,
                           int parsed_args, void *tool_arguments,
                           GtError *err)
{
  GtEncseqInfoArguments *arguments = tool_arguments;
  int had_err = 0;
  GtAlphabet *alpha;
  const GtUchar *chars;
  unsigned int numofchars;
  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->nomap) {
    GtEncseqMetadata *emd = gt_encseq_metadata_new(argv[parsed_args], err);
    if (!emd)
      had_err = -1;

    if (!had_err) {
      if (!arguments->noindexname) {
        gt_file_xprintf(arguments->outfp, "index name: ");
        gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]);
      }

      gt_file_xprintf(arguments->outfp, "file format version: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                          gt_encseq_metadata_version(emd));

      gt_file_xprintf(arguments->outfp, "64-bit file: ");
      gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_metadata_is64bit(emd)
                                                  ? "yes"
                                                  : "no");

      gt_file_xprintf(arguments->outfp, "total length: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_metadata_total_length(emd));

      gt_file_xprintf(arguments->outfp, "number of sequences: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                      gt_encseq_metadata_num_of_sequences(emd));

      gt_file_xprintf(arguments->outfp, "number of files: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_metadata_num_of_files(emd));

      gt_file_xprintf(arguments->outfp, "length of shortest/longest "
                                        "sequence: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n",
                                        gt_encseq_metadata_min_seq_length(emd),
                                        gt_encseq_metadata_max_seq_length(emd));

      gt_file_xprintf(arguments->outfp, "accesstype: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                 gt_encseq_access_type_str(gt_encseq_metadata_accesstype(emd)));

      alpha = gt_encseq_metadata_alphabet(emd);
      chars = gt_alphabet_characters(alpha);
      numofchars = gt_alphabet_num_of_chars(alpha);
      gt_file_xprintf(arguments->outfp, "alphabet size: ");
      gt_file_xprintf(arguments->outfp, "%u\n", numofchars);
      gt_file_xprintf(arguments->outfp, "alphabet characters: ");
      /* a '*' precision must be passed as int, hence the cast; chars is
         printed with an explicit length rather than relying on a
         terminating NUL */
      gt_file_xprintf(arguments->outfp, "%.*s", (int) numofchars,
                                        (char*) chars);
      if (gt_alphabet_is_dna(alpha))
        gt_file_xprintf(arguments->outfp, " (DNA)");
      if (gt_alphabet_is_protein(alpha))
        gt_file_xprintf(arguments->outfp, " (Protein)");
      gt_file_xprintf(arguments->outfp, "\n");
      if (arguments->show_alphabet) {
        GtStr *out = gt_str_new();
        gt_alphabet_to_str(alpha, out);
        gt_file_xprintf(arguments->outfp, "alphabet definition:\n");
        gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out));
        gt_str_delete(out);
      }

    }
    gt_encseq_metadata_delete(emd);
  } else {
    GtEncseqLoader *encseq_loader;
    GtEncseq *encseq;

    encseq_loader = gt_encseq_loader_new();
    if (arguments->mirror)
      gt_encseq_loader_mirror(encseq_loader);
    if (!(encseq = gt_encseq_loader_load(encseq_loader,
                                         argv[parsed_args], err)))
      had_err = -1;

    if (!had_err) {
      const GtStrArray *filenames;
      GtUword i;

      if (!arguments->noindexname) {
        gt_file_xprintf(arguments->outfp, "index name: ");
        gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]);
      }

      gt_file_xprintf(arguments->outfp, "file format version: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_version(encseq));

      gt_file_xprintf(arguments->outfp, "64-bit file: ");
      gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_is_64_bit(encseq)
                                                   ? "yes"
                                                   : "no");

      gt_file_xprintf(arguments->outfp, "total length: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_total_length(encseq));

      gt_file_xprintf(arguments->outfp, "compressed size: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" bytes\n",
                                        gt_encseq_sizeofrep(encseq));

      gt_file_xprintf(arguments->outfp, "number of sequences: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_num_of_sequences(encseq));

      gt_file_xprintf(arguments->outfp, "number of files: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                        gt_encseq_num_of_files(encseq));

      gt_file_xprintf(arguments->outfp, "length of shortest/longest "
                                        "sequence: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n",
                                      gt_encseq_min_seq_length(encseq),
                                      gt_encseq_max_seq_length(encseq));

      filenames = gt_encseq_filenames(encseq);
      gt_file_xprintf(arguments->outfp, "original filenames:\n");
      for (i = 0; i < gt_str_array_size(filenames); i++) {
        gt_file_xprintf(arguments->outfp, "\t%s ("GT_WU" characters)\n",
                                          gt_str_array_get(filenames, i),
                                          (GtUword)
                                     gt_encseq_effective_filelength(encseq, i));
      }

      alpha = gt_encseq_alphabet(encseq);
      chars = gt_alphabet_characters(alpha);
      numofchars = gt_alphabet_num_of_chars(alpha);
      gt_file_xprintf(arguments->outfp, "alphabet size: ");
      gt_file_xprintf(arguments->outfp, "%u\n", numofchars);
      gt_file_xprintf(arguments->outfp, "alphabet characters: ");
      /* a '*' precision must be passed as int, hence the cast */
      gt_file_xprintf(arguments->outfp, "%.*s", (int) numofchars,
                                        (char*) chars);
      if (gt_alphabet_is_dna(alpha))
        gt_file_xprintf(arguments->outfp, " (DNA)");
      if (gt_alphabet_is_protein(alpha))
        gt_file_xprintf(arguments->outfp, " (Protein)");
      gt_file_xprintf(arguments->outfp, "\n");
      if (arguments->show_alphabet) {
        GtStr *out = gt_str_new();
        gt_alphabet_to_str(alpha, out);
        gt_file_xprintf(arguments->outfp, "alphabet definition:\n");
        gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out));
        gt_str_delete(out);
      }

      gt_file_xprintf(arguments->outfp, "character distribution:\n");
      for (i = 0; i < numofchars; i++) {
        GtUword cc;
        cc = gt_encseq_charcount(encseq, gt_alphabet_encode(alpha, chars[i]));
        /* percentage relative to the total length minus
           (number of sequences - 1) */
        gt_file_xprintf(arguments->outfp, "\t%c: "GT_WU" (%.2f%%)\n",
                                          (char) chars[i],
                                          cc,
                             (cc /(double) (gt_encseq_total_length(encseq)
                                  - gt_encseq_num_of_sequences(encseq)+1))*100);
      }

      gt_file_xprintf(arguments->outfp, "number of wildcards: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n",
                                        gt_encseq_wildcards(encseq),
                                        gt_encseq_realwildcardranges(encseq));

      gt_file_xprintf(arguments->outfp, "number of special characters: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n",
                                        gt_encseq_specialcharacters(encseq),
                                        gt_encseq_realspecialranges(encseq));

      gt_file_xprintf(arguments->outfp, "length of longest non-special "
                                        "character stretch: ");
      gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                   gt_encseq_lengthoflongestnonspecial(encseq));

      gt_file_xprintf(arguments->outfp, "accesstype: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                   gt_encseq_access_type_str(gt_encseq_accesstype_get(encseq)));

      /* size of the representation in bits divided by the total length */
      gt_file_xprintf(arguments->outfp, "bits used per character: ");
      gt_file_xprintf(arguments->outfp, "%f\n",
        (double) ((uint64_t) CHAR_BIT *
                  (uint64_t) gt_encseq_sizeofrep(encseq)) /
        (double) gt_encseq_total_length(encseq));

      gt_file_xprintf(arguments->outfp, "has special ranges: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                        gt_encseq_has_specialranges(encseq)
                                          ? "yes"
                                          : "no");

      gt_file_xprintf(arguments->outfp, "has description support: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                       gt_encseq_has_description_support(encseq)
                                          ? "yes"
                                          : "no");

      if (gt_encseq_has_description_support(encseq)) {
        gt_file_xprintf(arguments->outfp, "length of longest description: ");
        gt_file_xprintf(arguments->outfp, ""GT_WU"\n",
                                          gt_encseq_max_desc_length(encseq));
      }

      gt_file_xprintf(arguments->outfp, "has multiple sequence support: ");
      gt_file_xprintf(arguments->outfp, "%s\n",
                                        gt_encseq_has_multiseq_support(encseq)
                                          ? "yes"
                                          : "no");
    }
    gt_encseq_delete(encseq);
    gt_encseq_loader_delete(encseq_loader);
  }

  return had_err;
}