/* Unit test for u8_wordbreaks.
   Runs it on an empty input, then on one mixed-script UTF-8 sample given
   first in precomposed and then in decomposed form, and checks every
   returned flag against a hand-written table of expected break positions.
   Each expectation is checked with ASSERT.  Returns 0 when it runs to
   completion.  */
int
main (void)
{
  /* Test case n = 0.  */
  u8_wordbreaks (NULL, 0, NULL);

  {
    /* The array holds exactly the 91 bytes of the literal; the terminating
       NUL is not part of the input.  */
    static const uint8_t input[91] =
      /* "Grüß Gott. Здравствуйте! x=(-b±sqrt(b²-4ac))/(2a)  日本語,中文,한글" */
      "Gr\303\274\303\237 Gott. \320\227\320\264\321\200\320\260\320\262\321\201\321\202\320\262\321\203\320\271\321\202\320\265! x=(-b\302\261sqrt(b\302\262-4ac))/(2a)  \346\227\245\346\234\254\350\252\236,\344\270\255\346\226\207,\355\225\234\352\270\200\n";
    char *p = (char *) malloc (SIZEOF (input));
    size_t i;

    /* u8_wordbreaks stores one flag per input byte into p, so p must be a
       valid buffer before the call.  */
    ASSERT (p != NULL);

    u8_wordbreaks (input, SIZEOF (input), p);

    /* Iterate over exactly the bytes that were allocated and analyzed.  */
    for (i = 0; i < SIZEOF (input); i++)
      {
        ASSERT (p[i] == ((i >= 6 && i <= 7)
                         || (i >= 11 && i <= 13)
                         || (i >= 37 && i <= 44)
                         || i == 46 || (i >= 50 && i <= 52)
                         || (i >= 54 && i <= 55)
                         || (i >= 58 && i <= 62) || (i >= 64 && i <= 67)
                         || i == 70 || i == 73 || i == 76
                         || i == 77 || i == 80 || i == 83
                         || i == 84 || i == 90
                         ? 1 : 0));
      }
    free (p);
  }

  {
    /* Same input string, decomposed.  */
    static const uint8_t input[106] =
      /* "Grüß Gott. Здравствуйте! x=(-b±sqrt(b²-4ac))/(2a)  日本語,中文,한글" */
      "Gru\314\210\303\237 Gott. \320\227\320\264\321\200\320\260\320\262\321\201\321\202\320\262\321\203\320\270\314\206\321\202\320\265! x=(-b\302\261sqrt(b\302\262-4ac))/(2a)  \346\227\245\346\234\254\350\252\236,\344\270\255\346\226\207,\341\204\222\341\205\241\341\206\253\341\204\200\341\205\263\341\206\257\n";
    char *p = (char *) malloc (SIZEOF (input));
    size_t i;

    /* As above: the flag buffer must exist before it is written to.  */
    ASSERT (p != NULL);

    u8_wordbreaks (input, SIZEOF (input), p);

    for (i = 0; i < SIZEOF (input); i++)
      {
        ASSERT (p[i] == ((i >= 7 && i <= 8)
                         || (i >= 12 && i <= 14)
                         || (i >= 40 && i <= 47)
                         || i == 49 || (i >= 53 && i <= 55)
                         || (i >= 57 && i <= 58)
                         || (i >= 61 && i <= 65) || (i >= 67 && i <= 70)
                         || i == 73 || i == 76 || i == 79
                         || i == 80 || i == 83 || i == 86
                         || i == 87 || i == 105
                         ? 1 : 0));
      }
    free (p);
  }

  return 0;
}
/*
 * tracker_parser_reset:
 * @parser: parser to re-initialize; must not be NULL.
 * @txt: text to parse; must not be NULL. Only the pointer is stored, the
 *       text itself is not copied here.
 * @txt_size: number of bytes of @txt to analyze; must not be negative.
 * @max_word_length: stored in the parser as-is.
 * @enable_stemmer: stored in the parser as-is.
 * @enable_unaccent: stored in the parser as-is.
 * @ignore_stop_words: stored in the parser as-is.
 * @ignore_reserved_words: stored in the parser as-is.
 * @ignore_numbers: stored in the parser; when FALSE, UC_NUMBER is added to
 *                  the allowed start category next to UC_LETTER.
 *
 * Points @parser at a new text: stores the options, drops the previously
 * held word and word-break flag array, rewinds cursor and word position,
 * and computes a fresh word-break flag array (one byte per input byte)
 * with u8_wordbreaks().
 *
 * If any precondition fails the function returns before touching @parser.
 */
void
tracker_parser_reset (TrackerParser *parser,
                      const gchar   *txt,
                      gint           txt_size,
                      guint          max_word_length,
                      gboolean       enable_stemmer,
                      gboolean       enable_unaccent,
                      gboolean       ignore_stop_words,
                      gboolean       ignore_reserved_words,
                      gboolean       ignore_numbers)
{
	g_return_if_fail (parser != NULL);
	g_return_if_fail (txt != NULL);
	/* txt_size is signed but is used below as an allocation size and
	 * as a size_t byte count; a negative value would turn into a huge
	 * unsigned length. Reject it before any state is modified. */
	g_return_if_fail (txt_size >= 0);

	parser->max_word_length = max_word_length;
	parser->enable_stemmer = enable_stemmer;
	parser->enable_unaccent = enable_unaccent;
	parser->ignore_stop_words = ignore_stop_words;
	parser->ignore_reserved_words = ignore_reserved_words;
	parser->ignore_numbers = ignore_numbers;

	/* Note: We're forcing some unicode characters to behave
	 * as wordbreakers: e.g, the '.' The main reason for this
	 * is to enable FTS searches matching file extension. */
	parser->enable_forced_wordbreaks = TRUE;

	parser->txt_size = txt_size;
	parser->txt = txt;

	/* Discard the word left over from the previous text. */
	g_free (parser->word);
	parser->word = NULL;

	parser->word_position = 0;

	parser->cursor = 0;

	/* Discard the flags computed for the previous text. */
	g_free (parser->word_break_flags);

	/* Create array of flags, same size as original text. */
	parser->word_break_flags = g_malloc (txt_size);

	/* Get wordbreak flags in the whole string */
	u8_wordbreaks ((const uint8_t *)txt,
	               (size_t) txt_size,
	               (char *)parser->word_break_flags);

	/* Prepare a custom category which is a combination of the
	 * desired ones */
	parser->allowed_start = UC_LETTER;
	if (!parser->ignore_numbers) {
		parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
	}
}
Esempio n. 3
0
/* Word-break display tool.
   When run without command-line arguments, reads all of standard input
   via read_file, computes its word breaks with u8_wordbreaks and copies
   the input to standard output, writing U+2027 (as UTF-8) in front of
   every byte flagged as a break.  Returns 0 in that case.
   Returns 1 when any argument is given or when the flag buffer cannot be
   allocated.  Aborts if u8_wordbreaks yields a flag other than 0 or 1.  */
int
main (int argc, char * argv[])
{
  if (argc == 1)
    {
      /* Display all the word breaks in the input string.  */
      char *input = read_file (stdin);
      /* Keep the length in size_t: strlen returns size_t, and both malloc
         and u8_wordbreaks take a size_t count.  */
      size_t length = strlen (input);
      char *breaks = malloc (length);
      size_t i;

      /* malloc (0) may legitimately return NULL, so only a NULL result
         for a non-empty input is an allocation failure.  */
      if (breaks == NULL && length > 0)
        {
          fprintf (stderr, "out of memory\n");
          return 1;
        }

      u8_wordbreaks ((uint8_t *) input, length, breaks);

      for (i = 0; i < length; i++)
        {
          switch (breaks[i])
            {
            case 1:
              /* U+2027 in UTF-8 encoding */
              putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
              break;
            case 0:
              break;
            default:
              /* Only the flag values 0 and 1 are expected here.  */
              abort ();
            }
          putc (input[i], stdout);
        }

      free (breaks);

      return 0;
    }
  else
    return 1;
}
Esempio n. 4
0
/* Determine the word breaks of the N bytes at S, which are in the encoding
   reported by locale_charset, and store one flag per input byte in
   P[0..N-1].  Does nothing when N is 0.

   If the encoding is UTF-8, u8_wordbreaks is applied to S directly.
   Otherwise S is converted to UTF-8, the breaks are computed on the
   converted text, and each flag is copied back to the input byte whose
   offset maps to it; all other bytes get 0.
   If the conversion or an allocation fails, the flags are computed
   directly when C_CTYPE_ASCII is set and S is pure ASCII, and are all 0
   otherwise.  */
void
ulc_wordbreaks (const char *s, size_t n, char *p)
{
  if (n > 0)
    {
      const char *encoding = locale_charset ();

      if (is_utf8_encoding (encoding))
        u8_wordbreaks ((const uint8_t *) s, n, p);
      else
        {
          /* Convert the string to UTF-8 and build a translation table
             from offsets into s to offsets into the translated string.
             Refuse sizes for which n * sizeof (size_t) would wrap around;
             an undersized table would be overrun when it is filled in.
             Such a request is handled like an allocation failure.  */
          size_t *offsets =
            (n <= (size_t)(-1) / sizeof (size_t)
             ? (size_t *) malloc (n * sizeof (size_t))
             : NULL);

          if (offsets != NULL)
            {
              uint8_t *t;
              size_t m;

              t = u8_conv_from_encoding (encoding, iconveh_question_mark,
                                         s, n, offsets, NULL, &m);
              if (t != NULL)
                {
                  /* q stays NULL when the converted string is empty.  */
                  char *q = (char *) (m > 0 ? malloc (m) : NULL);

                  if (m == 0 || q != NULL)
                    {
                      size_t i;

                      /* Determine the word breaks of the UTF-8 string.  */
                      u8_wordbreaks (t, m, q);

                      /* Translate the result back to the original string.
                         Bytes whose offset is the (size_t)(-1) marker keep
                         the flag 0.  Offsets that do not fall inside
                         q[0..m-1] are skipped as well, so that q is never
                         read out of bounds (in particular not when m is 0
                         and q is NULL).  */
                      memset (p, 0, n);
                      for (i = 0; i < n; i++)
                        if (offsets[i] != (size_t)(-1) && offsets[i] < m)
                          p[i] = q[offsets[i]];

                      free (q);
                      free (t);
                      free (offsets);
                      return;
                    }
                  free (t);
                }
              free (offsets);
            }

          /* Impossible to convert.  */
#if C_CTYPE_ASCII
          if (is_all_ascii (s, n))
            {
              /* ASCII is a subset of UTF-8.  */
              u8_wordbreaks ((const uint8_t *) s, n, p);
              return;
            }
#endif
          /* We have a non-ASCII string and cannot convert it.
             Don't produce any word breaks.  */
          memset (p, 0, n);
        }
    }
}