예제 #1
1
/* Scan the UTF-8 range [S, END) and return the position just after its
   first grapheme cluster: the position of the first character for which
   uc_is_grapheme_break() reports a break with its predecessor, or END if
   no such character exists.  Return NULL if the range is empty.  */
const uint8_t *
u8_grapheme_next (const uint8_t *s, const uint8_t *end)
{
  ucs4_t left;

  if (s == end)
    return NULL;

  /* The first character always belongs to the cluster.  */
  s += u8_mbtouc (&left, s, end - s);

  while (s != end)
    {
      ucs4_t right;
      int width = u8_mbtouc (&right, s, end - s);

      /* A break between LEFT and RIGHT means the cluster ends at S.  */
      if (uc_is_grapheme_break (left, right))
        return s;

      left = right;
      s += width;
    }

  return s;
}
예제 #2
0
/* For each of the N bytes of the UTF-8 string S, store a flag in P.
   The flag for the first byte of a character is the result of
   uc_is_grapheme_break() applied to the preceding character (0 for the
   very first one) and this character; the flags for the remaining bytes
   of a multibyte character are 0.  */
void
u8_grapheme_breaks (const uint8_t *s, size_t n, char *p)
{
  ucs4_t last = 0;

  while (n > 0)
    {
      ucs4_t cur;
      int width = u8_mbtouc (&cur, s, n);
      int k;

      /* Continuation bytes can never start a cluster.  */
      for (k = width - 1; k >= 1; k--)
        p[k] = 0;
      p[0] = uc_is_grapheme_break (last, cur);

      last = cur;
      s += width;
      p += width;
      n -= width;
    }
}
/* Count the characters in the N bytes starting at S.
   Every u8_mbtoucr() attempt counts as one character, including a
   sequence it rejects and a trailing one for which it returns -2
   (presumably "incomplete" -- confirm against the libunistring manual),
   which ends the scan.  */
size_t
u8_mbsnlen (const uint8_t *s, size_t n)
{
  size_t total = 0;

  while (n > 0)
    {
      ucs4_t uc;
      int step = u8_mbtoucr (&uc, s, n);

      total++;

      switch (step)
        {
        case -2:
          /* Nothing more can be decoded from the remaining bytes.  */
          return total;

        case 0:
          /* Always make progress by at least one byte.  */
          step = 1;
          break;

        default:
          if (step < 0)
            /* Let the lenient decoder decide how many bytes to skip.  */
            step = u8_mbtouc (&uc, s, n);
          break;
        }

      s += step;
      n -= step;
    }

  return total;
}
예제 #4
0
/* Write the LEN bytes at DATA, interpreted as UTF-8, to STREAM's
   destination stream in HTML-escaped form.

   Bytes left over from a previous call (stream->buf, stream->buflen) are
   prepended to the new input.  The combined input is processed in chunks of
   at most BUFFERSIZE bytes.  A trailing sequence that is shorter than its
   lead byte announces is not converted; it is stored back into stream->buf
   so that the next call can complete it.

   Escaping rules, as implemented below:
     - '\n' becomes "<br/>",
     - '"', '&', '<', '>' and ' ' become named entities,
     - other characters in the range 0x20..0x7E are copied unchanged,
     - everything else is written as a decimal character reference.  */
static void
html_ostream::write_mem (html_ostream_t stream, const void *data, size_t len)
{
  if (len > 0)
    {
      #define BUFFERSIZE 2048
      char inbuffer[BUFFERSIZE];
      size_t inbufcount;

      /* Start with the incomplete bytes saved by the previous call.  */
      inbufcount = stream->buflen;
      if (inbufcount > 0)
        memcpy (inbuffer, stream->buf, inbufcount);
      for (;;)
        {
          /* At this point, inbuffer[0..inbufcount-1] is filled.  */
          {
            /* Combine the previous rest with a chunk of new input.  */
            size_t n =
              (len <= BUFFERSIZE - inbufcount ? len : BUFFERSIZE - inbufcount);

            if (n > 0)
              {
                memcpy (inbuffer + inbufcount, data, n);
                data = (char *) data + n;
                inbufcount += n;
                len -= n;
              }
          }
          {
            /* Handle complete UTF-8 characters.  */
            const char *inptr = inbuffer;
            size_t insize = inbufcount;

            while (insize > 0)
              {
                unsigned char c0;
                ucs4_t uc;
                int nbytes;

                /* Stop if fewer bytes remain than the lead byte announces;
                   the rest is kept for the next chunk or the next call.  */
                c0 = ((const unsigned char *) inptr)[0];
                if (insize < (c0 < 0xc0 ? 1 : c0 < 0xe0 ? 2 : c0 < 0xf0 ? 3 :
                              c0 < 0xf8 ? 4 : c0 < 0xfc ? 5 : 6))
                  break;

                nbytes = u8_mbtouc (&uc, (const unsigned char *) inptr, insize);

                if (uc == '\n')
                  {
                    /* Call emit_pending_spans with the current class stack
                       size temporarily set to 0, write the line break, then
                       restore the size.  NOTE(review): presumably this closes
                       the open spans before the <br/> so that they get
                       reopened on the next line -- confirm against
                       emit_pending_spans.  */
                    size_t prev_class_stack_size = stream->curr_class_stack_size;
                    stream->curr_class_stack_size = 0;
                    emit_pending_spans (stream, false);
                    ostream_write_str (stream->destination, "<br/>");
                    stream->curr_class_stack_size = prev_class_stack_size;
                  }
                else
                  {
                    emit_pending_spans (stream, true);

                    switch (uc)
                      {
                      case '"':
                        ostream_write_str (stream->destination, "&quot;");
                        break;
                      case '&':
                        ostream_write_str (stream->destination, "&amp;");
                        break;
                      case '<':
                        ostream_write_str (stream->destination, "&lt;");
                        break;
                      case '>':
                        /* Needed to avoid "]]>" in the output.  */
                        ostream_write_str (stream->destination, "&gt;");
                        break;
                      case ' ':
                        /* Needed because HTML viewers merge adjacent spaces
                           and drop spaces adjacent to <br> and similar.  */
                        ostream_write_str (stream->destination, "&nbsp;");
                        break;
                      default:
                        if (uc >= 0x20 && uc < 0x7F)
                          {
                            /* Output ASCII characters as such.  */
                            char bytes[1];
                            bytes[0] = uc;
                            ostream_write_mem (stream->destination, bytes, 1);
                          }
                        else
                          {
                            /* Output all other characters (control characters
                               and non-ASCII characters) in &#nnn; notation.  */
                            char bytes[32];
                            sprintf (bytes, "&#%d;", (int) uc);
                            ostream_write_str (stream->destination, bytes);
                          }
                        break;
                      }
                  }

                inptr += nbytes;
                insize -= nbytes;
              }
            /* Put back the unconverted part.  */
            /* NOTE(review): BUFSIZE is not the local BUFFERSIZE; it is not
               defined in this function.  Presumably it is the capacity of
               stream->buf -- confirm against its definition.  */
            if (insize > BUFSIZE)
              abort ();
            if (len == 0)
              {
                /* All input consumed: save the incomplete tail in the stream
                   for the next call.  */
                if (insize > 0)
                  memcpy (stream->buf, inptr, insize);
                stream->buflen = insize;
                break;
              }
            /* More input follows: move the incomplete tail to the front of
               the work buffer and continue with the next chunk.  */
            if (insize > 0)
              memmove (inbuffer, inptr, insize);
            inbufcount = insize;
          }
        }
      #undef BUFFERSIZE
    }
}
예제 #5
0
static void
check_segmentation (const char *input, size_t length, bool print_segments)
{
  size_t offset, line_number, line_offset;
  struct segmenter s;
  int prev_type;

  segmenter_init (&s, mode);

  line_number = 1;
  line_offset = 0;
  prev_type = -1;
  for (offset = 0; offset < length; )
    {
      enum segment_type type;
      const char *type_name, *p;
      int n;

      if (one_byte)
        {
          int n_newlines = 0;
          int i;

          for (i = 0; i <= length - offset; i++)
            {
              /* Make a copy to ensure that segmenter_push() isn't actually
                 looking ahead. */
              char *copy;

              if (i > 0 && input[offset + i - 1] == '\n')
                n_newlines++;

              copy = xmemdup (input + offset, i);
              n = segmenter_push (&s, copy, i, &type);
              free (copy);

              if (n >= 0)
                break;
            }
          assert (n_newlines <= 2);
        }
      else
        n = segmenter_push (&s, input + offset, length - offset, &type);

      if (n < 0)
        error (EXIT_FAILURE, 0, "segmenter_push returned -1 at offset %zu",
               offset);
      assert (offset + n <= length);

      if (type == SEG_NEWLINE)
        assert ((n == 1 && input[offset] == '\n')
                || (n == 2
                    && input[offset] == '\r' && input[offset + 1] == '\n'));
      else
        assert (memchr (&input[offset], '\n', n) == NULL);

      if (!print_segments)
        {
          offset += n;
          continue;
        }

      if (!verbose)
        {
          if (prev_type != SEG_SPACES && prev_type != -1
              && type == SEG_SPACES && n == 1 && input[offset] == ' ')
            {
              printf ("    space\n");
              offset++;
              prev_type = -1;
              continue;
            }
        }
      if (prev_type != -1)
        putchar ('\n');
      prev_type = type;

      if (verbose)
        printf ("%2zu:%2zu: ", line_number, offset - line_offset);

      type_name = segment_type_to_string (type);
      for (p = type_name; *p != '\0'; p++)
        putchar (tolower ((unsigned char) *p));
      if (n > 0)
        {
          int i;

          for (i = MIN (15, strlen (type_name)); i < 16; i++)
            putchar (' ');
          for (i = 0; i < n; )
            {
              const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
              ucs4_t uc;
              int mblen;

              mblen = u8_mbtoucr (&uc, u_input + (offset + i), n - i);
              if (mblen < 0)
                {
                  int j;

                  mblen = u8_mbtouc (&uc, u_input + (offset + i), n - i);
                  putchar ('<');
                  for (j = 0; j < mblen; j++)
                    {
                      if (j > 0)
                        putchar (' ');
                      printf ("%02x", input[offset + i + j]);
                    }
                  putchar ('>');
                }
              else
                {
                  switch (uc)
                    {
                    case ' ':
                      printf ("_");
                      break;

                    case '_':
                      printf ("\\_");
                      break;

                    case '\\':
                      printf ("\\\\");
                      break;

                    case '\t':
                      printf ("\\t");
                      break;

                    case '\r':
                      printf ("\\r");
                      break;

                    case '\n':
                      printf ("\\n");
                      break;

                    case '\v':
                      printf ("\\v");
                      break;

                    default:
                      if (uc < 0x20 || uc == 0x00a0)
                        printf ("<U+%04X>", uc);
                      else
                        fwrite (input + offset + i, 1, mblen, stdout);
                      break;
                    }
                }

              i += mblen;
            }
        }

      offset += n;
      if (type == SEG_NEWLINE)
        {
          enum prompt_style prompt;

          line_number++;
          line_offset = offset;

          prompt = segmenter_get_prompt (&s);
          printf (" (%s)\n", prompt_style_to_string (prompt));
        }
    }
예제 #6
0
파일: striconveh.c 프로젝트: TaylanUB/guile
/* utf8conv_carefully copies UTF-8 input to UTF-8 output with an
   iconv-like calling convention:
     - *INBUF / *INBYTESLEFT and *OUTBUF / *OUTBYTESLEFT are advanced past
       whatever was consumed and produced,
     - the return value is 0 on success and (size_t)(-1) with errno set
       (EINVAL, EILSEQ or E2BIG) at the first conversion error,
     - on error, *INCREMENTED tells whether the input pointer has been
       moved past the error location,
     - if ONE_CHARACTER_ONLY is true, at most one character is converted.  */
static size_t
utf8conv_carefully (bool one_character_only,
                    const char **inbuf, size_t *inbytesleft,
                    char **outbuf, size_t *outbytesleft,
                    bool *incremented)
{
  const char *src = *inbuf;
  size_t src_left = *inbytesleft;
  char *dst = *outbuf;
  size_t dst_left = *outbytesleft;
  size_t status = 0;

  for (;;)
    {
      ucs4_t uc;
      int consumed;
      int produced;

      consumed = u8_mbtoucr (&uc, (const uint8_t *) src, src_left);
      if (consumed < 0)
        {
          /* Bad input.  Classify it, then use the lenient decoder to find
             out how many bytes to step over.  */
          errno = (consumed == -2 ? EINVAL : EILSEQ);
          consumed = u8_mbtouc (&uc, (const uint8_t *) src, src_left);
          src += consumed;
          src_left -= consumed;
          *incremented = true;
          status = (size_t)(-1);
          break;
        }

      /* No room in the output, either none at all or not enough for this
         character: leave the input position untouched.  */
      if (dst_left == 0
          || (produced = u8_uctomb ((uint8_t *) dst, uc, dst_left)) == -2)
        {
          errno = E2BIG;
          *incremented = false;
          status = (size_t)(-1);
          break;
        }

      src += consumed;
      src_left -= consumed;

      if (produced == -1)
        {
          /* The character cannot be encoded; the input has already been
             advanced past it.  */
          errno = EILSEQ;
          *incremented = true;
          status = (size_t)(-1);
          break;
        }

      dst += produced;
      dst_left -= produced;

      if (one_character_only || src_left == 0)
        break;
    }

  *inbuf = src;
  *inbytesleft = src_left;
  *outbuf = dst;
  *outbytesleft = dst_left;
  return status;
}
예제 #7
0
/* Convert a resource name to a class name.
   Return a nonempty string consisting of alphanumerics and underscores
   and starting with a letter or underscore.  */
static char *
construct_class_name (const char *resource_name)
{
  /* This code must be kept consistent with intl.cs, function
     GettextResourceManager.ConstructClassName.  */
  /* We could just return an arbitrary fixed class name, like "Messages",
     assuming that every assembly will only ever contain one
     GettextResourceSet subclass, but this assumption would break the day
     we want to support multi-domain PO files in the same format...  */
  bool valid;
  const char *p;

  /* Test for a valid ASCII identifier:
     - nonempty,
     - first character is A..Za..z_ - see x-csharp.c:is_identifier_start.
     - next characters are A..Za..z_0..9 - see x-csharp.c:is_identifier_part.
   */
  valid = (resource_name[0] != '\0');
  for (p = resource_name; valid && *p != '\0'; p++)
    {
      char c = *p;
      if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c == '_')
            || (p > resource_name && c >= '0' && c <= '9')))
        valid = false;
    }
  if (valid)
    return xstrdup (resource_name);
  else
    {
      static const char hexdigit[] = "0123456789abcdef";
      const char *str = resource_name;
      const char *str_limit = str + strlen (str);
      char *class_name = XNMALLOC (12 + 6 * (str_limit - str) + 1, char);
      char *b;

      b = class_name;
      memcpy (b, "__UESCAPED__", 12); b += 12;
      while (str < str_limit)
        {
          ucs4_t uc;
          str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
          if (uc >= 0x10000)
            {
              *b++ = '_';
              *b++ = 'U';
              *b++ = hexdigit[(uc >> 28) & 0x0f];
              *b++ = hexdigit[(uc >> 24) & 0x0f];
              *b++ = hexdigit[(uc >> 20) & 0x0f];
              *b++ = hexdigit[(uc >> 16) & 0x0f];
              *b++ = hexdigit[(uc >> 12) & 0x0f];
              *b++ = hexdigit[(uc >> 8) & 0x0f];
              *b++ = hexdigit[(uc >> 4) & 0x0f];
              *b++ = hexdigit[uc & 0x0f];
            }
          else if (!((uc >= 'A' && uc <= 'Z') || (uc >= 'a' && uc <= 'z')
                     || (uc >= '0' && uc <= '9')))
            {
              *b++ = '_';
              *b++ = 'u';
              *b++ = hexdigit[(uc >> 12) & 0x0f];
              *b++ = hexdigit[(uc >> 8) & 0x0f];
              *b++ = hexdigit[(uc >> 4) & 0x0f];
              *b++ = hexdigit[uc & 0x0f];
            }
예제 #8
0
/* Find the end of the first sentence in STRING, in a similar way to
   'forward-sentence' in Emacs, which basically matches the regular
   expression:

     [.?!\u2026]
       []"'\u201d)}]*
         \($\|[ \u00a0]$\|\t\|[ \u00a0]\{REQUIRED_SPACES\}\)

   No Unicode-capable regular expression routine is used here (the one in
   gnulib-lib/lib/regex.c is locale-dependent, and depending on it is
   undesirable); instead the match is done by a hand-written DFA with
   8 states, 4 of which are terminal.

   Return a pointer to the end marker that starts the match and store that
   marker in *ENDING_CHARP.  If there is no match, return a pointer to the
   terminating NUL of STRING and store 0xfffd in *ENDING_CHARP.  */
const char *
sentence_end (const char *string, ucs4_t *ending_charp)
{
  const char *const limit = string + strlen (string);
  const char *cursor = string;
  /* Current non-terminal DFA state:
       0  searching for an end marker,
       1  just after the end marker,
       2  inside the run of closing punctuation,
       4  inside the run of spaces.
     The terminal states 3, 5, 6 and 7 are represented by returning.  */
  int state = 0;
  /* The end marker of the candidate match.  */
  ucs4_t marker = 0xfffd;
  /* Start of the candidate match, and where to resume scanning if the
     candidate turns out not to match.  */
  const char *candidate, *resume;
  /* Number of spaces seen after the candidate's end marker.  */
  int n_spaces;

  /* "<=" on purpose: the terminating NUL is fed to the DFA as well, since
     end of string is one of the accepted sentence terminators.  */
  while (cursor <= limit)
    {
      enum { STEP_ACCEPT, STEP_ADVANCE, STEP_RESTART } action;
      ucs4_t uc;
      size_t width;

      width = u8_mbtouc (&uc, (const unsigned char *) cursor, limit - cursor);

      if (state == 0)
        {
          if (uc == '.' || uc == '?' || uc == '!' || uc == 0x2026)
            {
              candidate = cursor;
              resume = cursor + width;
              marker = uc;
              n_spaces = 0;
              state = 1;
            }
          cursor += width;
          continue;
        }

      /* States 1, 2 and 4 differ only in whether closing punctuation and
         a tab are still acceptable (not once spaces have been seen).  */
      action = STEP_RESTART;
      switch (uc)
        {
        case '\0': case '\n':
          /* State 3 or 6.  */
          action = STEP_ACCEPT;
          break;

        case '\t':
          /* State 5.  */
          if (state != 4)
            action = STEP_ACCEPT;
          break;

        case ' ': case 0x00a0:
          if (++n_spaces == sentence_end_required_spaces)
            /* State 7.  */
            action = STEP_ACCEPT;
          else
            {
              state = 4;
              action = STEP_ADVANCE;
            }
          break;

        case ']': case '"': case '\'': case ')': case '}': case 0x201d:
          if (state != 4)
            {
              state = 2;
              action = STEP_ADVANCE;
            }
          break;

        default:
          break;
        }

      if (action == STEP_ACCEPT)
        {
          *ending_charp = marker;
          return candidate;
        }

      if (action == STEP_ADVANCE)
        cursor += width;
      else
        {
          /* The candidate failed; rescan from just after its end marker.  */
          cursor = resume;
          state = 0;
        }
    }

  *ending_charp = 0xfffd;
  return limit;
}
예제 #9
0
/* Return true if INPUT starts with an XML reference, i.e. with
     CharRef   ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
   or
     EntityRef ::= '&' Name ';'
   INPUT must start with '&' (this is asserted).  Only the prefix up to and
   including the terminating ';' is examined; what follows it is ignored.

   The Name is checked against the range tables name_chars1 (allowed for
   every character of the name, including the first) and name_chars2
   (additionally allowed after the first character).  Both tables are
   defined elsewhere in the file; presumably they hold the XML
   NameStartChar and the extra NameChar ranges -- confirm there.  */
static bool
is_reference (const char *input)
{
  const char *str = input;
  const char *str_limit = str + strlen (input);
  ucs4_t uc;
  size_t i;

  str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
  assert (uc == '&');

  str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);

  /* CharRef */
  if (uc == '#')
    {
      str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
      if (uc == 'x')
        {
          /* At least one hexadecimal digit is required: "&#x;" is not a
             reference.  */
          size_t n_digits = 0;

          while (str < str_limit)
            {
              str += u8_mbtouc (&uc, (const unsigned char *) str,
                                str_limit - str);
              if (!(('0' <= uc && uc <= '9')
                    || ('A' <= uc && uc <= 'F')
                    || ('a' <= uc && uc <= 'f')))
                break;
              n_digits++;
            }
          return n_digits > 0 && uc == ';';
        }
      else if ('0' <= uc && uc <= '9')
        {
          while (str < str_limit)
            {
              str += u8_mbtouc (&uc, (const unsigned char *) str,
                                str_limit - str);
              if (!('0' <= uc && uc <= '9'))
                break;
            }
          return uc == ';';
        }
    }
  else
    {
      /* EntityRef */
      /* The first character must be allowed at the start of a name.  */
      for (i = 0; i < SIZEOF (name_chars1); i++)
        if (name_chars1[i].start <= uc && uc <= name_chars1[i].end)
          break;

      if (i == SIZEOF (name_chars1))
        return false;

      while (str < str_limit)
        {
          str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
          for (i = 0; i < SIZEOF (name_chars1); i++)
            if (name_chars1[i].start <= uc && uc <= name_chars1[i].end)
              break;
          if (i == SIZEOF (name_chars1))
            {
              for (i = 0; i < SIZEOF (name_chars2); i++)
                if (name_chars2[i].start <= uc && uc <= name_chars2[i].end)
                  break;
              if (i == SIZEOF (name_chars2))
                /* The first character that cannot belong to the name ends
                   it.  Stop here, as the CharRef branches do, so that the
                   test below can recognize the terminating ';'.  Returning
                   false at this point would reject the ';' itself and
                   thereby every entity reference.  */
                break;
            }
        }
      return uc == ';';
    }

  return false;
}