Beispiel #1
0
/* Walk the context's iterator backward over one token and return the
   text of that token.

   DATA is a smie_gtk_source_buffer_context_t.  NULL is returned when
   the iterator already sits at the start of the buffer.  Otherwise
   the iterator is moved back past any whitespace and comment
   characters, then past one string literal, one run of punctuation,
   or one run of ordinary characters, and the result of
   gtk_text_iter_get_slice() for that range is returned.  */
static gchar *
smie_gtk_source_buffer_backward_token (gpointer data)
{
  smie_gtk_source_buffer_context_t *context = data;
  GtkTextIter *cursor = &context->iter;
  GtkTextIter token_last, token_first;

  if (gtk_text_iter_is_start (cursor))
    return NULL;

  /* Step onto the previous character, then keep going back while it
     is whitespace or part of a comment.  */
  gtk_text_iter_backward_char (cursor);
  for (;;)
    {
      if (gtk_text_iter_is_start (cursor))
        break;
      if (!gtk_source_buffer_iter_has_context_class (context->buffer,
                                                     cursor,
                                                     "comment")
          && !g_unichar_isspace (gtk_text_iter_get_char (cursor)))
        break;
      gtk_text_iter_backward_char (cursor);
    }

  /* Remember the last character of the token.  */
  gtk_text_iter_assign (&token_last, cursor);

  if (gtk_source_buffer_iter_has_context_class (context->buffer,
                                                cursor,
                                                "string"))
    {
      /* String literal: back up until we leave the "string" class.  */
      for (;;)
        {
          if (gtk_text_iter_is_start (cursor))
            break;
          if (!gtk_source_buffer_iter_has_context_class (context->buffer,
                                                         cursor,
                                                         "string"))
            break;
          gtk_text_iter_backward_char (cursor);
        }
    }
  else if (g_unichar_ispunct (gtk_text_iter_get_char (cursor)))
    {
      /* Punctuation: back up over the whole punctuation run.  */
      for (;;)
        {
          if (gtk_text_iter_is_start (cursor))
            break;
          if (!g_unichar_ispunct (gtk_text_iter_get_char (cursor)))
            break;
          gtk_text_iter_backward_char (cursor);
        }
    }
  else
    {
      /* Ordinary token: back up until a comment, a string, a
         punctuation character or a whitespace character is met.  */
      while (!gtk_text_iter_is_start (cursor)
             && !gtk_source_buffer_iter_has_context_class (context->buffer,
                                                           cursor,
                                                           "comment")
             && !gtk_source_buffer_iter_has_context_class (context->buffer,
                                                           cursor,
                                                           "string")
             && !g_unichar_ispunct (gtk_text_iter_get_char (cursor))
             && !g_unichar_isspace (gtk_text_iter_get_char (cursor)))
        gtk_text_iter_backward_char (cursor);
    }

  /* The cursor now rests one character before the token, unless the
     scan ran into the buffer start, in which case the start position
     itself is used as the beginning of the slice.  */
  gtk_text_iter_assign (&token_first, cursor);
  if (!gtk_text_iter_is_start (&token_first))
    gtk_text_iter_forward_char (&token_first);

  /* Make the end bound exclusive before slicing.  */
  gtk_text_iter_forward_char (&token_last);
  return gtk_text_iter_get_slice (&token_first, &token_last);
}
Beispiel #2
0
/*
 *  call-seq:
 *    utf8_titleize(string)
 *
 *  Returns a title case string: the first alphabetic character of every
 *  word is converted with g_unichar_totitle(); a new word starts after
 *  any whitespace or punctuation character.
 *
 *  Returns nil when the input cannot be converted from UTF-8 to UCS-4,
 *  or when the result cannot be converted back to UTF-8.
 *
 *    Glib.utf8_titleize('привет всем') #=> Привет Всем
 */
static VALUE utf8_titleize(VALUE self, VALUE string)
{
  VALUE result;
  gchar *temp;
  long index, length_in_bytes, length_in_chars;
  gunichar *chars_as_ucs4, current_char;
  gboolean first_character_of_word = TRUE;

  Check_Type(string, T_STRING);

  length_in_bytes = RSTRING_LEN(string);
  chars_as_ucs4 = g_utf8_to_ucs4(StringValuePtr(string), length_in_bytes, NULL, &length_in_chars, NULL);
  if (chars_as_ucs4 == NULL) {
    return Qnil;
  }

  for (index = 0; index < length_in_chars; index++) {
    current_char = chars_as_ucs4[index];
    if (first_character_of_word && g_unichar_isalpha(current_char)) {
      chars_as_ucs4[index] = g_unichar_totitle(current_char);
      first_character_of_word = FALSE;
    }

    /* The test uses the original character, so a separator always
     * re-arms the flag for the character that follows it. */
    if (g_unichar_isspace(current_char) || g_unichar_ispunct(current_char)) {
      first_character_of_word = TRUE;
    }
  }

  temp = g_ucs4_to_utf8(chars_as_ucs4, -1, NULL, NULL, NULL);
  /* Release the UCS-4 buffer before calling into Ruby, so it cannot be
   * leaked if string creation raises. */
  g_free(chars_as_ucs4);

  if (temp == NULL) {
    /* Conversion back to UTF-8 failed; report it like the other
     * conversion failure instead of handing NULL to rb_str_new2(). */
    return Qnil;
  }

  result = rb_str_new2(temp);
  g_free(temp);

  return result;
}
Beispiel #3
0
void TextEdit::setFlags(int new_flags, bool revalidate)
{
  if (new_flags == flags)
    return;

  flags = new_flags;

  if (flags && revalidate) {
    bool valid = true;
    const char *p = getTextStart();
    while (p < bufend - 1) {
      gunichar uc = g_utf8_get_char(p);
      if ((flags & FLAG_ALPHABETIC) && !g_unichar_isalpha(uc)) {
        valid = false;
        break;
      }
      if ((flags & FLAG_NUMERIC) && !g_unichar_isdigit(uc)) {
        valid = false;
        break;
      }
      if ((flags & FLAG_NOSPACE) && g_unichar_isspace(uc)) {
        valid = false;
        break;
      }
      if ((flags & FLAG_NOPUNCTUATION) && g_unichar_ispunct(uc)) {
        valid = false;
        break;
      }
      p = nextChar(p);
    }
    if (!valid)
      clear();
  }
}
Beispiel #4
0
/* Decode the first character of the NUL-terminated string `text` with
 * g_utf8_get_char_validated() and return the result of
 * g_unichar_ispunct() for the decoded value. */
static int
str_utf8_ispunct (const char *text)
{
    return g_unichar_ispunct (g_utf8_get_char_validated (text, -1));
}
Beispiel #5
0
/*
 * Decide whether the character at *cur needs special treatment and, if
 * so, append its replacement to gstr.
 *
 * gstr        - output string that receives the replacement
 * cur         - in/out pointer to the current position in the input; it
 *               is advanced by one character only when a ".." range
 *               operator is consumed
 * uc          - the Unicode character decoded at *cur
 * query_esc   - when TRUE, the characters : ( ) * & " are copied through
 *               unchanged
 * range_field - when TRUE (together with query_esc), exactly two
 *               consecutive dots are copied through as ".."
 *
 * Any other punctuation character (according to g_unichar_ispunct) or
 * blank is replaced with '_'.
 *
 * Returns TRUE when something was appended to gstr, FALSE when the
 * caller still has to handle the character.
 */
static gboolean
handle_esc_maybe (GString *gstr, char **cur, gunichar uc,
		  gboolean query_esc, gboolean range_field)
{
	char kar;

	/* first byte of the current character */
	kar = (*cur)[0];

	if (query_esc) {
		switch (kar) {
		case ':':
		case '(':
		case ')':
		case '*':
		case '&':
		case '"':
			g_string_append_c (gstr, kar);
			return TRUE;
		case '.':
			if (!range_field)
				break;

			/* exactly two dots: keep them as a range operator
			 * and skip over the first one */
			if ((*cur)[1] == '.' && (*cur)[2] != '.') {
				g_string_append (gstr, "..");
				*cur = g_utf8_next_char (*cur);
				return TRUE;
			}
			/* fall through */
		default: break;
		}
	}

	/* <ctype.h> functions require a value representable as unsigned
	 * char (or EOF); a UTF-8 lead byte stored in a signed char would
	 * be negative, so convert explicitly. */
	if (g_unichar_ispunct(uc) || isblank((unsigned char)kar)) {
		g_string_append_c (gstr, '_');
		return TRUE;
	}

	return FALSE;
}
Beispiel #6
0
/*
 * Extract the next word token from `buf`, resuming at `*cur`.
 *
 * buf             - input text (begin/len); FALSE is returned if NULL
 * cur             - in/out scan position; a NULL `*cur` means "start at
 *                   buf->begin". Updated to the position after the token.
 * token           - output; begin/len of the token that was found
 * exceptions      - optional list of `struct process_exception`; when the
 *                   scan position equals the head exception's `pos`, the
 *                   placeholder token "!!EX!!" is produced, `*cur` is moved
 *                   past the exception and the list head is advanced
 * is_utf          - must be TRUE (asserted)
 * rl              - optional output: number of characters counted while in
 *                   the feed_token state, or the length of the placeholder
 *                   when an exception was hit
 * check_signature - when TRUE, a '_' or '-' delimiter found at a non-zero
 *                   starting offset begins a signature check
 *
 * Returns FALSE when `buf` is NULL, when the position is at or past the end
 * of the buffer, when the next UTF-8 sequence would extend past the
 * remaining bytes, or when a signature line is detected; TRUE otherwise.
 */
static gboolean
rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
		gchar const **cur, rspamd_ftok_t * token,
		GList **exceptions, gboolean is_utf, gsize *rl,
		gboolean check_signature)
{
	gsize remain, pos, siglen = 0;
	const gchar *p, *next_p, *sig = NULL;
	gunichar uc;
	guint processed = 0;
	struct process_exception *ex = NULL;
	/* Scanner states; transitions use `continue`, which re-examines the
	 * same character under the new state without advancing `p`. */
	enum {
		skip_delimiters = 0,
		feed_token,
		skip_exception,
		process_signature
	} state = skip_delimiters;

	if (buf == NULL) {
		return FALSE;
	}

	/* Only the head of the exceptions list is considered in this call */
	if (exceptions != NULL && *exceptions != NULL) {
		ex = (*exceptions)->data;
	}

	g_assert (is_utf);
	g_assert (cur != NULL);

	if (*cur == NULL) {
		*cur = buf->begin;
	}

	token->len = 0;

	pos = *cur - buf->begin;
	if (pos >= buf->len) {
		return FALSE;
	}

	remain = buf->len - pos;
	p = *cur;
	token->begin = p;

	while (remain > 0) {
		uc = g_utf8_get_char (p);
		next_p = g_utf8_next_char (p);

		/* The encoded character would run past the end of the buffer */
		if (next_p - p > (gint)remain) {
			return FALSE;
		}

		switch (state) {
		case skip_delimiters:
			if (ex != NULL && p - buf->begin == (gint)ex->pos) {
				/* Exception starts here: emit the placeholder token */
				token->begin = "!!EX!!";
				token->len = sizeof ("!!EX!!") - 1;
				processed = token->len;
				state = skip_exception;
				continue;
			}
			else if (g_unichar_isgraph (uc)) {
				if (!g_unichar_ispunct (uc)) {
					/* First character of a word */
					state = feed_token;
					token->begin = p;
					continue;
				}
				else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) {
					/* Possible start of a signature separator line */
					sig = p;
					siglen = remain;
					state = process_signature;
					continue;
				}
			}
			/* Anything else is a delimiter: fall out and advance */
			break;
		case feed_token:
			/* The word ends at an exception, a non-graphical character
			 * or a punctuation character */
			if (ex != NULL && p - buf->begin == (gint)ex->pos) {
				goto set_token;
			}
			else if (!g_unichar_isgraph (uc) || g_unichar_ispunct (uc)) {
				goto set_token;
			}
			processed ++;
			break;
		case skip_exception:
			/* Jump over the excepted region and consume the list head.
			 * token->len is already non-zero here, so set_token does
			 * not overwrite *cur. */
			*cur = p + ex->len;
			*exceptions = g_list_next (*exceptions);
			goto set_token;
			break;
		case process_signature:
			if (*p == '\r' || *p == '\n') {
				/* The line consisted only of ' ', '-' and '_' characters.
				 * NOTE(review): standard printf treats "%*s" as a minimum
				 * width, not a length limit; presumably msg_debug uses the
				 * project's own formatter - confirm. */
				msg_debug ("signature found: %*s", (gint)siglen, sig);
				return FALSE;
			}
			else if (*p != ' ' && *p != '-' && *p != '_') {
				/* Not a signature line after all; resume normal scanning */
				state = skip_delimiters;
				continue;
			}
			break;
		}

		remain -= next_p - p;
		p = next_p;
	}

set_token:
	if (rl) {
		*rl = processed;
	}

	/* For a regular token (not the exception placeholder) the length is
	 * the distance scanned from token->begin, and the caller's position
	 * is moved to where scanning stopped. */
	if (token->len == 0) {
		token->len = p - token->begin;
		g_assert (token->len > 0);
		*cur = p;
	}

	return TRUE;
}
Beispiel #7
0
/* Tell whether `c` separates words: TRUE for any character that GLib
 * classifies as whitespace or as punctuation, FALSE otherwise. */
static gboolean is_word_sep(gunichar c)
{
	if (g_unichar_isspace(c))
		return TRUE;

	return g_unichar_ispunct(c) ? TRUE : FALSE;
}
Beispiel #8
0
/*
 * Compare two UTF-8 strings according to `options`.
 *
 * s1, s2  - strings to compare; NULL is treated like the empty string
 * options - bit mask of DonnaSortOptions:
 *   DONNA_SORT_DOT_FIRST        a string starting with '.' sorts before one
 *                               that does not; if both start with '.', the
 *                               dot is skipped
 *   DONNA_SORT_DOT_MIXED        (only looked at when DOT_FIRST is not set)
 *                               a leading '.' is skipped on either string
 *   DONNA_SORT_IGNORE_SPUNCT    whitespace and punctuation characters are
 *                               skipped while comparing text
 *   DONNA_SORT_NATURAL_ORDER    runs of digits present in both strings are
 *                               compared by numeric value
 *   DONNA_SORT_CASE_INSENSITIVE characters are compared after
 *                               g_unichar_toupper(); the case-sensitive
 *                               difference is kept as a tie-breaker
 *
 * An empty/NULL string sorts before a non-empty one, a shorter string before
 * a longer one with the same prefix, and a digit before a non-digit.
 *
 * Returns -1, 0 or 1. When the main comparison finds no difference, the
 * fallback result (first case-sensitive difference, or fewer leading zeros
 * first) is returned instead.
 */
gint
donna_strcmp (const gchar *s1, const gchar *s2, DonnaSortOptions options)
{
    gboolean is_string = TRUE; /* TRUE: comparing text; FALSE: comparing numbers */
    gint     res_fb = 0; /* fallback */
    gint     res_cs = 0; /* case-sensitive */
    gint     res = 0;

    /* if at least one string is NULL or empty, we have a result */
    if (!s1 || *s1 == '\0')
    {
        if (s2 && *s2 != '\0')
            return -1;
        else
            return 0;
    }
    else if (!s2 || *s2 == '\0')
        return 1;

    if (options & DONNA_SORT_DOT_FIRST)
    {
        if (*s1 == '.')
        {
            if (*s2 != '.')
                /* only s1 is dotted, it comes first */
                return -1;
            else
            {
                /* both are dotted, skip the dot */
                ++s1;
                ++s2;
            }
        }
        else if (*s2 == '.')
            /* only s2 is dotted, it comes first */
            return 1;
    }
    else if (options & DONNA_SORT_DOT_MIXED)
    {
        /* ignore a leading dot on either string */
        if (*s1 == '.')
            ++s1;
        if (*s2 == '.')
            ++s2;
    }

    /* each iteration handles one character pair (string mode) or one run of
     * digits on each side (number mode) */
    for (;;)
    {
        gunichar c1, c2;

        /* is at least one string over? */
        if (!*s1)
        {
            if (!*s2)
                res = 0;
            else
                /* shorter first */
                res = -1;
            goto done;
        }
        else if (!*s2)
        {
            /* shorter first */
            res = 1;
            goto done;
        }

        c1 = g_utf8_get_char (s1);
        c2 = g_utf8_get_char (s2);

        if (is_string)
        {
            if (options & DONNA_SORT_IGNORE_SPUNCT)
            {
                /* skip whitespace/punctuation; c becomes 0 at the end of the
                 * string, which stops the loop */
                while (g_unichar_isspace (c1) || g_unichar_ispunct (c1))
                {
                    s1 = g_utf8_next_char (s1);
                    c1 = (*s1) ? g_utf8_get_char (s1) : 0;
                }
                while (g_unichar_isspace (c2) || g_unichar_ispunct (c2))
                {
                    s2 = g_utf8_next_char (s2);
                    c2 = (*s2) ? g_utf8_get_char (s2) : 0;
                }
                /* did we reach the end of a string? if so, let the check at
                 * the top of the loop produce the result */
                if (!*s1 || !*s2)
                    continue;
            }

            /* is at least one string a number? */
            if (g_unichar_isdigit (c1))
            {
                if (g_unichar_isdigit (c2))
                {
                    if (options & DONNA_SORT_NATURAL_ORDER)
                    {
                        /* switch to number comparison */
                        is_string = FALSE;
                        continue;
                    }
                    /* otherwise the digits are compared as plain chars
                     * below */
                }
                else
                {
                    /* number first */
                    res = -1;
                    goto done;
                }
            }
            else if (g_unichar_isdigit (c2))
            {
                /* number first */
                res = 1;
                goto done;
            }

            /* compare chars */
            if (c1 > c2)
                res_cs = 1;
            else if (c1 < c2)
                res_cs = -1;

            if (options & DONNA_SORT_CASE_INSENSITIVE)
            {
                /* compare upper-cased chars */
                c1 = g_unichar_toupper (c1);
                c2 = g_unichar_toupper (c2);

                if (c1 > c2)
                {
                    res = 1;
                    goto done;
                }
                else if (c1 < c2)
                {
                    res = -1;
                    goto done;
                }
                else if (res_fb == 0)
                    /* set the case-sensitive result in case strings end up
                     * being the same otherwise */
                    res_fb = res_cs;
            }
            /* do we have a res_cs yet? */
            else if (res_cs != 0)
            {
                res = res_cs;
                goto done;
            }

            /* next chars */
            s1 = g_utf8_next_char (s1);
            s2 = g_utf8_next_char (s2);
        }
        /* mode number */
        else
        {
            unsigned long n1, n2;

            /* leading zeros are only inspected while no fallback result has
             * been recorded yet */
            if (res_fb == 0)
            {
                /* count number of leading zeros */
                for (n1 = 0; *s1 == '0'; ++n1, ++s1)
                    ;
                for (n2 = 0; *s2 == '0'; ++n2, ++s2)
                    ;
                /* try to set a fallback to put less leading zeros first */
                if (n1 > n2)
                    res_fb = 1;
                else if (n1 < n2)
                    res_fb = -1;

                /* the pointers moved, so reload the current characters */
                if (n1 > 0)
                    c1 = g_utf8_get_char (s1);
                if (n2 > 0)
                    c2 = g_utf8_get_char (s2);
            }

            /* accumulate the decimal value of the digit run in s1
             * NOTE(review): no overflow check on very long digit runs */
            n1 = 0;
            while (g_unichar_isdigit (c1))
            {
                int d;

                d = g_unichar_digit_value (c1);
                n1 *= 10;
                n1 += (unsigned long) d;
                s1 = g_utf8_next_char (s1);
                if (*s1)
                    c1 = g_utf8_get_char (s1);
                else
                    break;
            }

            /* same for s2 */
            n2 = 0;
            while (g_unichar_isdigit (c2))
            {
                int d;

                d = g_unichar_digit_value (c2);
                n2 *= 10;
                n2 += (unsigned long) d;
                s2 = g_utf8_next_char (s2);
                if (*s2)
                    c2 = g_utf8_get_char (s2);
                else
                    break;
            }

            if (n1 > n2)
            {
                res = 1;
                goto done;
            }
            else if (n1 < n2)
            {
                res = -1;
                goto done;
            }

            /* back to string comparison */
            is_string = TRUE;
        }
    }

done:
    /* use the fallback only when the main comparison found no difference */
    return (res != 0) ? res : res_fb;
}
Beispiel #9
0
/* Tell whether `c` separates words: whitespace and punctuation do,
 * except for the apostrophe, which is never treated as a separator. */
static gboolean is_word_sep(gunichar c)
{
	if (c == (gunichar)'\'')
		return FALSE;

	return g_unichar_isspace(c) || g_unichar_ispunct(c);
}
//!
//! @brief Analyzes a sentence for misspellings, positions, and stem forms of words
//! @param engine The morphology engine to use; NULL (or an engine without
//!        a hunspell handle) makes the function return NULL
//! @param TEXT The text to analyze; NULL makes the function return NULL
//! @param include_spellcheck When TRUE, a spellcheck result is also
//!        requested for every analyzed word
//! @returns A GList of LwMorphology entries, one per analyzed word, in the
//!          order the words were matched; NULL when nothing was produced
//!
GList*
lw_morphologyengine_hunspell_analyze (LwMorphologyEngine *engine, 
                                      const gchar        *TEXT, 
                                      gboolean            include_spellcheck)
{
    //Sanity checks
    if (engine == NULL) return NULL;
    if (engine->hunspell == NULL) return NULL;
    if (TEXT == NULL) return NULL;

    //Declarations
    gint start_offset = 0, end_offset = 0;
    GMatchInfo *match_info = NULL;
    GList *list = NULL;
    gchar *word = NULL;
    LwMorphology *morphology = NULL;

    //Initializations
    gchar *shortened = lw_regex_remove_parenthesis (TEXT);

    //Body
    lw_regex_get_contiguous (shortened, &match_info);

    while (g_match_info_matches (match_info))
    {
        word = g_match_info_fetch (match_info, 0);
        //Words that start with punctuation or contain Japanese are skipped
        if (word != NULL && !g_unichar_ispunct (g_utf8_get_char (word)) && !lw_util_string_has_japanese (word))
        {
          g_match_info_fetch_pos (match_info, 0, &start_offset, &end_offset);

          gchar *normalized = NULL, *stem = NULL, *canonical = NULL, *spellcheck = NULL;

          //Generate the forms
          normalized = lw_util_normalize_string (word, TRUE, FALSE);    
          stem = lw_morphologyengine_hunspell_stem (engine, word);
          if (stem != NULL) canonical = lw_util_normalize_string (stem, TRUE, FALSE); //You don't want to case fold before hunspell works
          if (include_spellcheck) spellcheck = lw_morphologyengine_hunspell_spellcheck (engine, word);

          //Cleanup identicals
          if (normalized != NULL && strcmp(normalized, word) == 0)
          {
            g_free (normalized); normalized = NULL;
          }
          if (stem != NULL)
          {
            //Canonical is built on stem, so it must be compared while stem is
            //still valid; stem itself may be released right after.
            if (canonical != NULL && strcmp(stem, canonical) == 0) { g_free (canonical); canonical = NULL; }
            if (strcmp(stem, word) == 0) { g_free (stem); stem = NULL; }
          }

          //NOTE(review): the strings are handed over here and not freed in
          //this function; presumably lw_morphology_new takes ownership - confirm
          morphology = lw_morphology_new (
            word,
            normalized,
            stem,
            canonical,
            spellcheck,
            NULL,
            start_offset,
            end_offset
          );
          if (morphology != NULL)
          {
            list = g_list_append (list, morphology);
            morphology = NULL;
          }
          word = NULL;
        }
        else if (word != NULL)
        {
          g_free (word); word = NULL;
        }
        g_match_info_next (match_info, NULL);
    }

    //Cleanup
    if (match_info != NULL)
    {
      g_match_info_free (match_info);
    }
    match_info = NULL;

    if (shortened != NULL)
    {
      g_free (shortened);
    }
    shortened = NULL;

    if (morphology != NULL)
    {
      lw_morphology_free (morphology);
    }
    morphology = NULL;

    if (word != NULL)
    {
      g_free (word);
    }
    word = NULL;

    return list;
}