Ejemplo n.º 1
0
/* Creates a wavelet tree of the encseq class over <encseq>.
   The new object takes its own reference on <encseq> and on the alphabet of
   <encseq>. A temporary uncompressed bit array with levels * length bits is
   allocated and filled by gt_wtree_encseq_fill_bits(), handed to
   gt_compressed_bitsequence_new() and then freed again, so that only the
   compressed representation stays attached to the returned tree. */
GtWtree* gt_wtree_encseq_new(GtEncseq *encseq)
{
  /* sample rate for compressed bitseq */
  const unsigned int samplerate = 32U;
  /* number of bits stored in a single GtBitsequence word */
  const size_t bits_per_word = sizeof (GtBitsequence) * CHAR_BIT;
  GtWtree *tree = gt_wtree_create(gt_wtree_encseq_class());
  GtWtreeEncseq *enc_tree = gt_wtree_encseq_cast(tree);

  enc_tree->encseq = gt_encseq_ref(encseq);
  enc_tree->alpha = gt_alphabet_ref(gt_encseq_alphabet(encseq));

  /* gt_alphabet_size covers the encoded chars plus the wildcard;
     UNDEFCHAR and SEPARATOR need a code as well, hence the + 2 */
  enc_tree->alpha_size = gt_alphabet_size(enc_tree->alpha) + 2;
  tree->members->num_of_symbols = (GtUword) enc_tree->alpha_size;

  /* levels in tree: \lceil log_2(\sigma)\rceil */
  enc_tree->levels = gt_determinebitspervalue((GtUword) enc_tree->alpha_size);

  enc_tree->root_fo = gt_wtree_encseq_fill_offset_new();
  enc_tree->current_fo = enc_tree->root_fo;

  tree->members->length = gt_encseq_total_length(encseq);

  /* every level stores one bit per sequence position */
  enc_tree->num_of_bits = enc_tree->levels * tree->members->length;

  /* number of words needed for num_of_bits bits, rounded up */
  enc_tree->bits_size = enc_tree->num_of_bits / bits_per_word;
  if (enc_tree->num_of_bits % bits_per_word != 0)
  {
    enc_tree->bits_size++;
  }

  enc_tree->bits = gt_calloc((size_t) enc_tree->bits_size,
                             sizeof (GtBitsequence));
  enc_tree->node_start = 0;
  gt_wtree_encseq_fill_bits(enc_tree);

  enc_tree->c_bits = gt_compressed_bitsequence_new(enc_tree->bits,
                                                   samplerate,
                                                   enc_tree->num_of_bits);

  /* the raw bit array is no longer needed once c_bits has been built */
  gt_free(enc_tree->bits);
  enc_tree->bits = NULL;

  return tree;
}
Ejemplo n.º 2
0
/* Adds one new symbol to <alphabet>. Every character of the non-empty,
   NUL-terminated string <characters> is mapped to the same new code, namely
   the value of mapsize on entry. The first character of <characters> is
   stored as the character used to show the new symbol. The characters are
   appended to mapdomain, and domainsize, mapsize and bitspersymbol are
   updated accordingly. */
void gt_alphabet_add_mapping(GtAlphabet *alphabet, const char *characters)
{
  size_t i, num_of_characters;
  gt_assert(alphabet && characters);
  num_of_characters = strlen(characters);
  gt_assert(num_of_characters);
  /* append the new characters to the domain of the symbol map */
  alphabet->mapdomain = gt_realloc(alphabet->mapdomain,
                                   (size_t) alphabet->domainsize
                                      + num_of_characters);
  memcpy(alphabet->mapdomain + alphabet->domainsize, characters,
         num_of_characters);
  alphabet->domainsize += num_of_characters;
  /* sized by domainsize; NOTE(review): this is large enough for index
     mapsize as long as every symbol has at least one domain character --
     confirm this invariant holds for all constructors */
  alphabet->characters = gt_realloc(alphabet->characters,
                                    (size_t) alphabet->domainsize);
  alphabet->characters[alphabet->mapsize] = (GtUchar) characters[0];
  /* Index through unsigned char: plain char may be signed, in which case
     bytes >= 0x80 would yield a negative array index. The loop starts at 0,
     so characters[0] needs no separate mapping.
     NOTE(review): assumes symbolmap has an entry for every unsigned char
     value -- confirm against the struct definition. */
  for (i = 0; i < num_of_characters; i++)
    alphabet->symbolmap[(unsigned char) characters[i]]
      = (GtUchar) alphabet->mapsize;
  alphabet->mapsize++;
  alphabet->bitspersymbol
    = gt_determinebitspervalue((unsigned long) alphabet->mapsize);
}
Ejemplo n.º 3
0
/* Initializes the symbol mapping of <alpha> from the alphabet definition
   given as <lines>.

   Every line is appended (followed by a newline) to alpha->alphadef. Empty
   lines are skipped. Leading lines starting with '#' form a preamble and are
   ignored. Each remaining line defines one symbol: all punctuation and
   alphanumeric characters up to the first blank are mapped to the current
   value of mapsize and appended to mapdomain. If a blank was found, the
   character following it is used to show the symbol, otherwise the first
   character of the line is used. For the line with the last index the show
   character is stored in wildcardshow instead of characters[]. After all
   lines have been processed successfully, every character mapped to
   mapsize-1 is remapped to WILDCARD and counted in mappedwildcards.

   <mapfile> is only used in error messages and may be NULL. The line numbers
   in error messages are the 0-based indices into <lines>.

   Returns 0 on success and -1 if an error occurred, in which case <err> is
   set. */
static int read_symbolmap_from_lines(GtAlphabet *alpha,
                                     const char *mapfile,
                                     const GtStrArray *lines,
                                     GtError *err)
{
  char cc;
  unsigned int cnum, allocateddomainsize = 0;
  unsigned long linecount, column;
  bool blankfound, ignore, preamble = true, haserr = false;
  const char *currentline;
  GtUchar chartoshow;

  gt_error_check(err);
  alpha->alphadef = gt_str_new();
  alpha->domainsize = alpha->mapsize = alpha->mappedwildcards = 0;
  /* start with every character unmapped */
  for (cnum=0; cnum<=(unsigned int) GT_MAXALPHABETCHARACTER; cnum++)
  {
    alpha->symbolmap[cnum] = (GtUchar) UNDEFCHAR;
  }
  alpha->mapdomain = NULL;
  /* one show character per line except the last one, which goes to
     wildcardshow.
     NOTE(review): if <lines> is empty the subtraction presumably wraps
     around (assuming an unsigned size) -- confirm callers never pass an
     empty array */
  alpha->characters = gt_malloc(sizeof (GtUchar) *
                                (gt_str_array_size(lines)-1));
  for (linecount = 0; linecount < gt_str_array_size(lines); linecount++)
  {
    currentline = gt_str_array_get(lines,linecount);
    gt_str_append_cstr(alpha->alphadef, currentline);
    gt_str_append_char(alpha->alphadef, '\n');
    ignore = false;
    if (currentline != NULL && currentline[0] != '\0')
    {
      if (preamble)
      {
        if (LINE(0) == (GtUchar) '#')
        {
          ignore = true;
        } else
        {
          /* first non-comment line ends the preamble */
          preamble = false;
        }
      }
      if (!ignore)
      {
        blankfound = false;
        for (column=0; LINE(column) != '\0'; column++)
        { /* for all chars in line */
          cc = LINE(column);
          /* <ctype.h> functions require a value representable as unsigned
             char (or EOF); passing a negative char is undefined behavior */
          if (ispunct((unsigned char) cc) || isalnum((unsigned char) cc))
          {
            /* index via unsigned char so that a negative char cannot turn
               into a huge out-of-bounds index.
               NOTE(review): assumes GT_MAXALPHABETCHARACTER >= UCHAR_MAX --
               confirm */
            if (alpha->symbolmap[(unsigned char) cc] != (GtUchar) UNDEFCHAR)
            {
              gt_error_set(err,"cannot map symbol '%c' to %u: "
                            "it is already mapped to %u",
                             cc,
                             alpha->mapsize,
                             (unsigned int) alpha->
                                            symbolmap[(unsigned char) cc]);
              haserr = true;
              break;
            }
            /* all characters of this line get the same value */
            alpha->symbolmap[(unsigned char) cc] = (GtUchar) alpha->mapsize;
            if (alpha->domainsize >= allocateddomainsize)
            {
              /* grow mapdomain in steps of 8 entries */
              allocateddomainsize += 8;
              alpha->mapdomain = gt_realloc(alpha->mapdomain,
                                        sizeof (GtUchar) * allocateddomainsize);
            }
            gt_assert(alpha->mapdomain != NULL);
            alpha->mapdomain[alpha->domainsize++] = (GtUchar) cc;
          } else
          {
            if (cc == (GtUchar) ' ')    /* first blank in line found */
            {
              blankfound = true;
              /*@innerbreak@*/ break;
            }
            if (mapfile != NULL) {
              gt_error_set(err,
                           "illegal character '%c' in line %lu of mapfile %s",
                           cc,linecount,mapfile);
            } else {
              gt_error_set(err,
                           "illegal character '%c' in line %lu of alphabet "
                           "definition",
                           cc,linecount);
            }
            haserr = true;
            break;
          }
        }
        if (haserr)
        {
          break;
        }
        if (blankfound)
        {
          /* column is the position of the blank; the character after it
             must not be white space */
          if (isspace((unsigned char) LINE(column+1)))
          {
            if (mapfile != NULL) {
              gt_error_set(err,"illegal character '%c' at the end of "
                            "line %lu in mapfile %s",
                            LINE(column+1),linecount,mapfile);
            } else {
              gt_error_set(err,"illegal character '%c' at the end of "
                            "line %lu of alphabet definition",
                            LINE(column+1),linecount);
            }
            haserr  = true;
            break;
          }
          /* use next character to display character */
          chartoshow = (GtUchar) LINE(column+1);
        } else
        {
          /* use first character of line to display character */
          chartoshow = (GtUchar) LINE(0);
        }
        if (linecount == gt_str_array_size(lines)-1)
        {
          /* the line with the last index defines the wildcard */
          alpha->wildcardshow = chartoshow;
        } else
        {
          alpha->characters[alpha->mapsize] = chartoshow;
        }
        alpha->mapsize++;
      }
    }
  }
  if (!haserr)
  {
    /* the symbol defined last (value mapsize-1) is the wildcard: remap all
       characters carrying that value to WILDCARD and count them */
    for (cnum=0;cnum<=(unsigned int) GT_MAXALPHABETCHARACTER; cnum++)
    {
      if (alpha->symbolmap[cnum] == (GtUchar) (alpha->mapsize - 1))
      {
        alpha->symbolmap[cnum] = (GtUchar) WILDCARD;
        alpha->mappedwildcards++;
      }
    }
  }
  /* there are mapsize-1 characters plus wildcard plus separator.
     hence there are mapsize+1 symbols in the range 0..mapsize.
     that is, mapsize is the largest symbol and we obtain */
  alpha->bitspersymbol
    = gt_determinebitspervalue((unsigned long) alpha->mapsize);
  return haserr ? -1 : 0;
}