Beispiel #1
0
/*
 * Compare two UTF-16 code units for the invariant culture.
 *
 * With CompareOptions_Ordinal the raw difference c1 - c2 is returned.
 * Otherwise the result is normalised to -1, 0 or 1; with
 * CompareOptions_IgnoreCase each unit that is not already a lowercase
 * letter is lowercased before the comparison.
 */
static gint32 string_invariant_compare_char (gunichar2 c1, gunichar2 c2,
					     gint32 options)
{
	gunichar left = c1;
	gunichar right = c2;

	/* Ordinal cannot be mixed with other options and must yield the difference itself. */
	if (options & CompareOptions_Ordinal)
		return (gint32) c1 - c2;

	if (options & CompareOptions_IgnoreCase) {
		if (g_unichar_type (c1) != G_UNICODE_LOWERCASE_LETTER)
			left = g_unichar_tolower (c1);
		if (g_unichar_type (c2) != G_UNICODE_LOWERCASE_LETTER)
			right = g_unichar_tolower (c2);
	}

	/*
	 * Kana, symbol and spacing options do not apply to the invariant
	 * culture, so nothing else is adjusted here.
	 *
	 * FIXME: even the invariant culture sorts by collation order rather
	 * than by raw code unit value; the character types should be used to
	 * look up the proper sort key.
	 */
	if (left < right)
		return -1;
	if (left > right)
		return 1;
	return 0;
}
Beispiel #2
0
/*
 * Report whether a UTF-8 word is title-cased.
 *
 * word: UTF-8 text, must be non-NULL and non-empty.
 * len:  number of bytes of word to examine.
 *
 * Returns 1 when the first character is an uppercase or titlecase letter
 * that equals its own titlecase form and no later character within len
 * bytes is an uppercase or titlecase letter; returns 0 otherwise.
 */
static int enchant_is_title_case(const char*const word, size_t len)
{
	const char* cursor;
	const char* end;
	gunichar first;
	GUnicodeType first_type;

	g_return_val_if_fail (word && *word, 0);

	first = g_utf8_get_char(word);
	first_type = g_unichar_type(first);

	if(first_type != G_UNICODE_UPPERCASE_LETTER && first_type != G_UNICODE_TITLECASE_LETTER)
		return 0;

	if(first != g_unichar_totitle(first))
		return 0;

	/* Every remaining character must be something other than a capital. */
	end = word + len;
	cursor = g_utf8_next_char(word);
	while(cursor < end)
		{
			switch(g_unichar_type(g_utf8_get_char(cursor)))
				{
				case G_UNICODE_UPPERCASE_LETTER:
				case G_UNICODE_TITLECASE_LETTER:
					return 0;
				default:
					break;
				}
			cursor = g_utf8_next_char(cursor);
		}

	return 1;
}
Beispiel #3
0
/*
 * g_unichar_type
 */
RESULT
test_g_unichar_type ()
{
	if (g_unichar_type ('A') != G_UNICODE_UPPERCASE_LETTER)
		return FAILED ("#1");
	if (g_unichar_type ('a') != G_UNICODE_LOWERCASE_LETTER)
		return FAILED ("#2");
	if (g_unichar_type ('1') != G_UNICODE_DECIMAL_NUMBER)
		return FAILED ("#3");
	if (g_unichar_type (0xA3) != G_UNICODE_CURRENCY_SYMBOL)
		return FAILED ("#4");
	return NULL;
}
Beispiel #4
0
/* Return 1 when uni is a combining, enclosing or non-spacing mark, 0 otherwise. */
static int
str_unichar_iscombiningmark (gunichar uni)
{
    switch (g_unichar_type (uni))
    {
    case G_UNICODE_COMBINING_MARK:
    case G_UNICODE_ENCLOSING_MARK:
    case G_UNICODE_NON_SPACING_MARK:
        return 1;
    default:
        return 0;
    }
}
Beispiel #5
0
/*
 * Approximation of the RFC 3454, Appendix C prohibited-character tables.
 *
 * Returns TRUE for every control, format, unassigned, private-use,
 * surrogate and separator character, plus a handful of individually
 * listed symbols (U+FFFC, U+FFFD, U+2FF0..U+2FFB) and non-spacing marks
 * (U+0340, U+0341). Everything else yields FALSE.
 */
static inline gboolean
idna_is_prohibited (gunichar ch)
{
  GUnicodeType category = g_unichar_type (ch);

  /* Categories where only specific code points are prohibited. */
  if (category == G_UNICODE_OTHER_SYMBOL)
    return ch == 0xFFFC || ch == 0xFFFD || (ch >= 0x2FF0 && ch <= 0x2FFB);

  if (category == G_UNICODE_NON_SPACING_MARK)
    return ch == 0x0340 || ch == 0x0341;

  /* Categories that are prohibited wholesale. */
  switch (category)
    {
    case G_UNICODE_CONTROL:
    case G_UNICODE_FORMAT:
    case G_UNICODE_UNASSIGNED:
    case G_UNICODE_PRIVATE_USE:
    case G_UNICODE_SURROGATE:
    case G_UNICODE_LINE_SEPARATOR:
    case G_UNICODE_PARAGRAPH_SEPARATOR:
    case G_UNICODE_SPACE_SEPARATOR:
      return TRUE;

    default:
      return FALSE;
    }
}
/**
 * stripped_char:
 * @ch: the character to reduce
 *
 * Reduces @ch to a bare form: the character is lowercased (unless it is
 * already a lowercase letter) and the first code point of its canonical
 * decomposition is returned.
 *
 * Returns: the first decomposed code point, or 0 when @ch is a control,
 * format, unassigned or combining-mark character, or when no
 * decomposition is available.
 **/
static gunichar
stripped_char (gunichar ch)
{
  gunichar *pieces;
  gunichar base;
  gsize n_pieces;
  GUnicodeType category;

  category = g_unichar_type (ch);

  /* These categories carry no base character worth keeping. */
  if (category == G_UNICODE_CONTROL ||
      category == G_UNICODE_FORMAT ||
      category == G_UNICODE_UNASSIGNED ||
      category == G_UNICODE_COMBINING_MARK)
    return 0;

  /* Lowercase letters are used as they are; anything else is folded first. */
  if (category != G_UNICODE_LOWERCASE_LETTER)
    ch = g_unichar_tolower (ch);

  pieces = g_unicode_canonical_decomposition (ch, &n_pieces);
  if (pieces == NULL)
    return 0;

  base = pieces[0];
  g_free (pieces);

  return base;
}
Beispiel #7
0
/* Tell whether uni belongs to one of the three Unicode mark categories
 * (combining, enclosing, non-spacing). */
static gboolean
str_unichar_iscombiningmark (gunichar uni)
{
    switch (g_unichar_type (uni))
    {
    case G_UNICODE_COMBINING_MARK:
    case G_UNICODE_ENCLOSING_MARK:
    case G_UNICODE_NON_SPACING_MARK:
        return TRUE;
    default:
        return FALSE;
    }
}
// Decide whether currentChar separates words.
//
// Letters, marks and numbers never delimit. Among connector punctuation only
// '_' is reported as a delimiter. Quotes, apostrophes and a few abbreviation
// marks are not delimiters when both prevChar and followChar are alphabetic;
// all other punctuation and every remaining category delimit.
bool UT_isWordDelimiter(UT_UCSChar currentChar, UT_UCSChar followChar, UT_UCSChar prevChar)
{
	// Shortcut for ASCII letters, which never need the Unicode table lookup.
	const bool isAsciiLower = ('a' <= currentChar && currentChar <= 'z');
	if (isAsciiLower)
		return false;

	const bool isAsciiUpper = ('A' <= currentChar && currentChar <= 'Z');
	if (isAsciiUpper)
		return false;

	switch (g_unichar_type(currentChar))
	{
		case G_UNICODE_MODIFIER_LETTER:
		case G_UNICODE_LOWERCASE_LETTER:
		case G_UNICODE_TITLECASE_LETTER:
		case G_UNICODE_UPPERCASE_LETTER:
		case G_UNICODE_OTHER_LETTER:
		case G_UNICODE_COMBINING_MARK:
		case G_UNICODE_ENCLOSING_MARK:
		case G_UNICODE_NON_SPACING_MARK:
		case G_UNICODE_DECIMAL_NUMBER:
		case G_UNICODE_LETTER_NUMBER:
		case G_UNICODE_OTHER_NUMBER:
			return false;

		case G_UNICODE_CONNECT_PUNCTUATION:
			// The underscore counts as a separator; other connectors do not.
			return (currentChar == '_');

		case G_UNICODE_OTHER_PUNCTUATION:
		case G_UNICODE_INITIAL_PUNCTUATION:
		case G_UNICODE_FINAL_PUNCTUATION:
			// Handled below: some of these may sit inside a word.
			break;

		default:
			return true;
	}

	switch (currentChar)
	{
		case 0x0022:           // QUOTATION MARK
		case 0x0027:           // APOSTROPHE
		case UCS_LDBLQUOTE:    // smart quote, open double
		case UCS_RDBLQUOTE:    // smart quote, close double
		case UCS_LQUOTE:       // smart quote, open single
		case UCS_RQUOTE:       // smart quote, close single
		case 0x055F:           // ARMENIAN ABBREVIATION MARK
		case 0x070A:           // SYRIAC CONTRACTION
		case 0x070F:           // SYRIAC ABBREVIATION MARK
		case 0x0970:           // DEVANAGARI ABBREVIATION SIGN
			// Word-internal only when flanked by letters on both sides.
			return !(UT_UCS4_isalpha(followChar) &&
					 UT_UCS4_isalpha(prevChar));

		default:
			return true;
	}
}
Beispiel #9
0
/*
 * Decide whether uc may be part of a word.
 *
 * uc: the character to classify.
 * n:  value compared against zero for dashes; a dash is accepted only when
 *     n > 0 (i.e. not as the first character of a word).
 *
 * Returns 1 for apostrophes (ASCII and U+2019), letters, marks, numbers and
 * connector punctuation, and for dashes when n > 0; returns 0 otherwise.
 */
static int
is_word_char (gunichar uc, size_t n)
{
	/* Both apostrophe forms are always word characters. */
	if (uc == g_utf8_get_char("'") || uc == g_utf8_get_char("’"))
		return 1;

	switch (g_unichar_type(uc)) {
	case G_UNICODE_MODIFIER_LETTER:
	case G_UNICODE_LOWERCASE_LETTER:
	case G_UNICODE_TITLECASE_LETTER:
	case G_UNICODE_UPPERCASE_LETTER:
	case G_UNICODE_OTHER_LETTER:
	case G_UNICODE_COMBINING_MARK: /* Older name for G_UNICODE_SPACING_MARK; deprecated since glib 2.30 */
	case G_UNICODE_ENCLOSING_MARK:
	case G_UNICODE_NON_SPACING_MARK:
	case G_UNICODE_DECIMAL_NUMBER:
	case G_UNICODE_LETTER_NUMBER:
	case G_UNICODE_OTHER_NUMBER:
	case G_UNICODE_CONNECT_PUNCTUATION:
		/* Same set of word characters as Enchant 1.3.0. */
		return 1;

	case G_UNICODE_DASH_PUNCTUATION:
		/* Hyphens are only accepted within a word. */
		return (n > 0) ? 1 : 0;

	default:
		/* Controls, separators, symbols and all other punctuation. */
		return 0;
	}
}
Beispiel #10
0
/* Tell whether c is a spacing mark. Always FALSE when built without
 * HAVE_CHARSET or when the view is not in UTF-8 mode. */
static gboolean
mcview_is_spacing_mark (const WView * view, int c)
{
    gboolean is_spacing = FALSE;

#ifdef HAVE_CHARSET
    if (view->utf8 && g_unichar_type (c) == SPACING_MARK)
        is_spacing = TRUE;
#else
    (void) view;
    (void) c;
#endif /* HAVE_CHARSET */

    return is_spacing;
}
Beispiel #11
0
/*
 * Report whether a UTF-8 word is written entirely in capitals.
 *
 * word: UTF-8 text, must be non-NULL and non-empty.
 * len:  number of bytes of word to examine.
 *
 * Returns 1 when at least one uppercase letter is present and no lowercase
 * or titlecase letter occurs; characters of any other category (digits,
 * punctuation, marks, ...) are ignored. Returns 0 otherwise.
 */
static int enchant_is_all_caps(const char*const word, size_t len)
{
	const char* cursor;
	const char* end;
	int seen_upper = 0;

	g_return_val_if_fail (word && *word, 0);

	end = word + len;
	cursor = word;
	while(cursor < end)
		{
			GUnicodeType type = g_unichar_type(g_utf8_get_char(cursor));

			if(type == G_UNICODE_TITLECASE_LETTER || type == G_UNICODE_LOWERCASE_LETTER)
				return 0;

			if(type == G_UNICODE_UPPERCASE_LETTER)
				seen_upper = 1;

			cursor = g_utf8_next_char(cursor);
		}

	return seen_upper;
}
/*
 * Count the consecutive line-break characters ('\n' or any character of
 * type G_UNICODE_LINE_SEPARATOR) found at the very end of the buffer.
 *
 * The buffer is walked backwards chunk by chunk starting from its end
 * iterator; counting stops at the first character that is not a line break.
 * The buffer encoding is asserted to be UTF-8. Returns 0 when no end
 * iterator can be created.
 */
static guint
infinoted_plugin_linekeeper_count_lines(InfTextBuffer* buffer)
{
  InfTextBufferIter* iter;
  guint n_lines = 0;

  guint length;
  gsize bytes;
  gchar* text;
  gchar* pos;
  gchar* new_pos;
  gunichar c;

  g_assert(strcmp(inf_text_buffer_get_encoding(buffer), "UTF-8") == 0);

  iter = inf_text_buffer_create_end_iter(buffer);
  if(iter == NULL)
    return 0;

  do
  {
    length = inf_text_buffer_iter_get_length(buffer, iter);
    bytes = inf_text_buffer_iter_get_bytes(buffer, iter);
    text = inf_text_buffer_iter_get_text(buffer, iter);

    /* Step backwards one character at a time from the end of the chunk. */
    for(pos = text + bytes; length > 0; pos = new_pos)
    {
      new_pos = g_utf8_prev_char(pos);
      g_assert(bytes >= (pos - new_pos));

      c = g_utf8_get_char(new_pos);
      if(c != '\n' && g_unichar_type(c) != G_UNICODE_LINE_SEPARATOR)
        break;

      ++n_lines;
      --length;
      bytes -= (pos - new_pos);
    }

    g_free(text);

    /* length reaches 0 only when the whole chunk consisted of line breaks,
     * in which case the preceding chunk has to be inspected as well. */
  } while(length == 0 && inf_text_buffer_iter_prev(buffer, iter));

  inf_text_buffer_destroy_iter(buffer, iter);
  return n_lines;
}
Beispiel #13
0
    bool IsFirstLetterCapitalOrTitleCase(const std::string& word)
    {
	    gunichar ch;
	    GUnicodeType type;

        ch = g_utf8_get_char(word.c_str());
    	
	    type = g_unichar_type(ch);
	    if(type == G_UNICODE_UPPERCASE_LETTER || type == G_UNICODE_TITLECASE_LETTER)
		    return true;

	    return false;
    }
Beispiel #14
0
/**
 * tracker_text_normalize:
 * @text: the text to normalize
 * @max_words: the maximum words of @text to normalize
 * @n_words: the number of words actually normalized
 *
 * This function iterates through @text checking for UTF-8 validity
 * using g_utf8_get_char_validated(). Processing stops at the first
 * invalid or truncated UTF-8 sequence. For each character found, the
 * %GUnicodeType is checked to make sure it is one of the following
 * values:
 * <itemizedlist>
 *  <listitem><para>%G_UNICODE_LOWERCASE_LETTER</para></listitem>
 *  <listitem><para>%G_UNICODE_MODIFIER_LETTER</para></listitem>
 *  <listitem><para>%G_UNICODE_OTHER_LETTER</para></listitem>
 *  <listitem><para>%G_UNICODE_TITLECASE_LETTER</para></listitem>
 *  <listitem><para>%G_UNICODE_UPPERCASE_LETTER</para></listitem>
 * </itemizedlist>
 *
 * All other symbols, punctuation, marks, numbers and separators are
 * stripped. A regular space (i.e. " ") is used to separate the words
 * in the returned string.
 *
 * The word limit is checked each time a word has been completed, so a
 * @max_words of 0 still yields the first word.
 *
 * The @n_words can be %NULL. If specified, it will be populated with
 * the number of words that were normalized in the result.
 *
 * Returns: a newly-allocated string holding the result which should
 * be freed with g_free() when finished with, otherwise %NULL (when
 * @text is %NULL).
 *
 * Since: 0.8
 *
 * Deprecated: 0.10: Use tracker_text_validate_utf8() instead.
 **/
gchar *
tracker_text_normalize (const gchar *text,
                        guint        max_words,
                        guint       *n_words)
{
	GString *string;
	gboolean in_break = TRUE;
	gunichar ch;
	guint words = 0;

	g_return_val_if_fail (text != NULL, NULL);

	string = g_string_new (NULL);

	while ((ch = g_utf8_get_char_validated (text, -1)) != 0) {
		GUnicodeType type;

		/* gunichar is unsigned, so the error returns have to be
		 * compared explicitly; a "> 0" test never catches them. */
		if (ch == (gunichar) -1 || ch == (gunichar) -2) {
			break;
		}

		type = g_unichar_type (ch);

		if (type == G_UNICODE_LOWERCASE_LETTER ||
		    type == G_UNICODE_MODIFIER_LETTER ||
		    type == G_UNICODE_OTHER_LETTER ||
		    type == G_UNICODE_TITLECASE_LETTER ||
		    type == G_UNICODE_UPPERCASE_LETTER) {
			/* Append regular chars */
			g_string_append_unichar (string, ch);
			in_break = FALSE;
		} else if (!in_break) {
			/* Non-regular char found, treat as word break */
			g_string_append_c (string, ' ');
			in_break = TRUE;
			words++;

			/* Stop once max_words words are complete; ">" would
			 * let one word too many through. */
			if (words >= max_words) {
				break;
			}
		}

		text = g_utf8_find_next_char (text, NULL);
	}

	if (n_words) {
		if (!in_break) {
			/* Count the last word */
			words += 1;
		}
		*n_words = words;
	}

	return g_string_free (string, FALSE);
}
Beispiel #15
0
/* Tell whether c is a non-spacing mark or an enclosing mark (the name only
 * mentions the former). Always FALSE when built without HAVE_CHARSET or
 * when the view is not in UTF-8 mode. */
static gboolean
mcview_is_non_spacing_mark (const WView * view, int c)
{
#ifdef HAVE_CHARSET
    if (view->utf8)
    {
        switch (g_unichar_type (c))
        {
        case G_UNICODE_NON_SPACING_MARK:
        case G_UNICODE_ENCLOSING_MARK:
            return TRUE;
        default:
            return FALSE;
        }
    }
#else
    (void) view;
    (void) c;
#endif /* HAVE_CHARSET */
    return FALSE;
}
Beispiel #16
0
/*
 * Check that string starts with the first prefix_len bytes of prefix and
 * that the match ends on a complete character.
 *
 * Returns FALSE when the bytes differ, or when the character following the
 * prefix in string is a combining, enclosing or non-spacing mark (so that,
 * for example, a trailing 'a' of the prefix is not really the base of an
 * accented letter in string). Returns TRUE otherwise, including when string
 * ends right after the prefix.
 */
static gboolean
exact_prefix_cmp (const gchar *string,
                  const gchar *prefix,
                  guint        prefix_len)
{
    const gchar *rest;

    if (strncmp (string, prefix, prefix_len) != 0)
        return FALSE;

    rest = string + prefix_len;
    if (*rest == '\0')
        return TRUE;

    switch (g_unichar_type (g_utf8_get_char (rest)))
    {
    case G_UNICODE_COMBINING_MARK:
    case G_UNICODE_ENCLOSING_MARK:
    case G_UNICODE_NON_SPACING_MARK:
        return FALSE;
    default:
        return TRUE;
    }
}
Beispiel #17
0
    bool IsWordAllCaps(const std::string& word)
    {
	    const char* it, *itEnd;
	    bool hasCap = false;

        for(it = word.c_str(), itEnd = it+word.length();
            it < itEnd; it = g_utf8_next_char(it))
		    {
			    GUnicodeType type = g_unichar_type(g_utf8_get_char(it));
			    switch(type)
				    {
					    case G_UNICODE_UPPERCASE_LETTER:
						    hasCap = true;
						    break;
					    case G_UNICODE_TITLECASE_LETTER:
					    case G_UNICODE_LOWERCASE_LETTER:
						    return false;
				    }
		    }

	    return hasCap;
    }
Beispiel #18
0
/*
 * Read up to four hexadecimal digits from the scanner and assemble them,
 * most significant digit first, into a code point. Reading stops early at
 * the first character that is not a hex digit; the digits read so far keep
 * their positional weight. The result is asserted to be either a valid
 * Unicode character or a surrogate.
 */
static gunichar
json_scanner_get_unichar (JsonScanner *scanner,
                          guint       *line_p,
                          guint       *position_p)
{
  gunichar uchar = 0;
  gint shift;

  /* Shifts of 12, 8, 4 and 0 bits place the four nibbles in order. */
  for (shift = 12; shift >= 0; shift -= 4)
    {
      gchar digit = json_scanner_get_char (scanner, line_p, position_p);

      if (!is_hex_digit (digit))
        break;

      uchar += ((gunichar) to_hex_digit (digit) << shift);
    }

  g_assert (g_unichar_validate (uchar) || g_unichar_type (uchar) == G_UNICODE_SURROGATE);

  return uchar;
}
/**
 * Apply the transformations selected by aTransformFlags to aInput and store
 * the result in _retval.
 *
 * The input is converted to UTF-8 and the enabled stages then run in this
 * fixed order, each working on the output of the previous one:
 *   1. TRANSFORM_LOWERCASE  - g_utf8_strdown()
 *   2. TRANSFORM_UPPERCASE  - g_utf8_strup()
 *   3. TRANSFORM_IGNORE_NONSPACE - drop non-spacing, combining and
 *      enclosing marks
 *   4. TRANSFORM_IGNORE_SYMBOLS - drop currency, modifier, math and other
 *      symbols
 *   5. TRANSFORM_IGNORE_NONALPHANUM / ..._IGNORE_SPACE - keep only letters
 *      and numbers (and ' ' unless the _IGNORE_SPACE variant is set)
 *
 * Modifier flags:
 *   TRANSFORM_IGNORE_LEADING - a filtering stage stops filtering once it
 *     has kept its first character.
 *   TRANSFORM_IGNORE_KEEPNUMBERSYMBOLS - in stages 4 and 5, a run reported
 *     by SB_ExtractLeadingNumber() is copied through unfiltered.
 *
 * aCharset is not referenced by this implementation.
 *
 * Returns NS_OK on success; each GLib allocation is checked with
 * NS_ENSURE_TRUE against NS_ERROR_OUT_OF_MEMORY.
 *
 * NOTE(review): `workingStr += unichar` appends a 32-bit gunichar to an
 * nsString; presumably code points above U+FFFF do not survive as a single
 * 16-bit unit - confirm.
 */
NS_IMETHODIMP 
sbStringTransformImpl::NormalizeString(const nsAString & aCharset, 
                                       PRUint32 aTransformFlags, 
                                       const nsAString & aInput, 
                                       nsAString & _retval)
{
  // All GLib processing below operates on UTF-8.
  nsCString str;
  CopyUTF16toUTF8(aInput, str);

  // Stage 1: lower-casing.
  if(aTransformFlags & sbIStringTransform::TRANSFORM_LOWERCASE) {
    gchar* lowercaseStr = g_utf8_strdown(str.BeginReading(), str.Length());
    NS_ENSURE_TRUE(lowercaseStr, NS_ERROR_OUT_OF_MEMORY);
    str.Assign(lowercaseStr);
    g_free(lowercaseStr);
  }

  // Stage 2: upper-casing. Tested independently of stage 1, so when both
  // flags are set the upper-cased form is what remains.
  if(aTransformFlags & sbIStringTransform::TRANSFORM_UPPERCASE) {
    gchar* uppercaseStr = g_utf8_strup(str.BeginReading(), str.Length());
    NS_ENSURE_TRUE(uppercaseStr, NS_ERROR_OUT_OF_MEMORY);
    str.Assign(uppercaseStr);
    g_free(uppercaseStr);
  }

  // Stage 3: strip marks.
  if(aTransformFlags & sbIStringTransform::TRANSFORM_IGNORE_NONSPACE) {
    nsString workingStr;

    PRBool leadingOnly = aTransformFlags & 
                         sbIStringTransform::TRANSFORM_IGNORE_LEADING;
    // Once set, every remaining character is copied without testing.
    PRBool bypassTest = PR_FALSE;

    // Normalize first (G_NORMALIZE_ALL) and then filter the marks by type.
    gchar* nonspaceStr = g_utf8_normalize(str.BeginReading(), 
                                          str.Length(), 
                                          G_NORMALIZE_ALL);
    NS_ENSURE_TRUE(nonspaceStr, NS_ERROR_OUT_OF_MEMORY);

    glong strLen = g_utf8_strlen(nonspaceStr, -1);
    
    for(glong currentChar = 0; currentChar < strLen; ++currentChar) {

      // The character is located by its index from the start of the string
      // on every iteration.
      gchar* offset = g_utf8_offset_to_pointer(nonspaceStr, currentChar);
      gunichar unichar = g_utf8_get_char(offset);
      GUnicodeType unicharType = g_unichar_type(unichar);

      if(bypassTest ||
         (unicharType != G_UNICODE_NON_SPACING_MARK && 
          unicharType != G_UNICODE_COMBINING_MARK &&
          unicharType != G_UNICODE_ENCLOSING_MARK)) {
        workingStr += unichar;
        // In leading-only mode the first kept character ends the filtering.
        if(leadingOnly)
          bypassTest = PR_TRUE;
      }
    }

    g_free(nonspaceStr);
    CopyUTF16toUTF8(workingStr, str);
  }

  // Stage 4: strip symbols.
  if(aTransformFlags & sbIStringTransform::TRANSFORM_IGNORE_SYMBOLS) {
    nsString workingStr;

    PRBool leadingOnly = aTransformFlags & 
                         sbIStringTransform::TRANSFORM_IGNORE_LEADING;
    PRBool bypassTest = PR_FALSE;

    gchar* nosymbolsStr = g_utf8_normalize(str.BeginReading(), 
                                           str.Length(), 
                                           G_NORMALIZE_ALL);
    NS_ENSURE_TRUE(nosymbolsStr, NS_ERROR_OUT_OF_MEMORY);

    glong strLen = g_utf8_strlen(nosymbolsStr, -1);
    
    for(glong currentChar = 0; currentChar < strLen; ++currentChar) {
      gchar* offset = g_utf8_offset_to_pointer(nosymbolsStr, currentChar);
      gunichar unichar = g_utf8_get_char(offset);
      GUnicodeType unicharType = g_unichar_type(unichar);

      // Optionally let a number starting at this position pass through
      // untouched, including any symbols it contains.
      if (aTransformFlags & sbIStringTransform::TRANSFORM_IGNORE_KEEPNUMBERSYMBOLS) {
        PRInt32 numberLength;
        SB_ExtractLeadingNumber((const gchar *)offset, NULL, NULL, &numberLength);
        if (numberLength > 0) {
          // numberLength is used here as a count of characters.
          for (glong copychar=0;copychar < numberLength;copychar++) {
            gchar* copyoffset = g_utf8_offset_to_pointer(nosymbolsStr, currentChar+copychar);
            gunichar unichar = g_utf8_get_char(copyoffset);
            workingStr += unichar;
          }
          // Skip past the copied run; the loop's ++ accounts for the last one.
          currentChar += numberLength-1;
          if(leadingOnly)
            bypassTest = PR_TRUE;
          continue;
        }
      }

      if(bypassTest ||
         (unicharType != G_UNICODE_CURRENCY_SYMBOL &&
          unicharType != G_UNICODE_MODIFIER_SYMBOL &&
          unicharType != G_UNICODE_MATH_SYMBOL &&
          unicharType != G_UNICODE_OTHER_SYMBOL)) {
        workingStr += unichar;
        if(leadingOnly)
          bypassTest = PR_TRUE;
      }
    }

    g_free(nosymbolsStr);
    CopyUTF16toUTF8(workingStr, str); 
  }

  // Stage 5: keep only letters and numbers (optionally spaces too).
  if((aTransformFlags & sbIStringTransform::TRANSFORM_IGNORE_NONALPHANUM) ||
     (aTransformFlags & sbIStringTransform::TRANSFORM_IGNORE_NONALPHANUM_IGNORE_SPACE)) {
    nsString workingStr;

    PRBool leadingOnly = aTransformFlags & 
                         sbIStringTransform::TRANSFORM_IGNORE_LEADING;
    PRBool bypassTest = PR_FALSE;

    gchar* nosymbolsStr = g_utf8_normalize(str.BeginReading(), 
                                           str.Length(), 
                                           G_NORMALIZE_ALL);
    NS_ENSURE_TRUE(nosymbolsStr, NS_ERROR_OUT_OF_MEMORY);

    glong strLen = g_utf8_strlen(nosymbolsStr, -1);
    
    for(glong currentChar = 0; currentChar < strLen; ++currentChar) {

      gchar* offset = g_utf8_offset_to_pointer(nosymbolsStr, currentChar);
      gunichar unichar = g_utf8_get_char(offset);
      GUnicodeType unicharType = g_unichar_type(unichar);

      // Same number pass-through as in the symbol-stripping stage.
      if (aTransformFlags & sbIStringTransform::TRANSFORM_IGNORE_KEEPNUMBERSYMBOLS) {
        PRInt32 numberLength;
        SB_ExtractLeadingNumber((const gchar *)offset, NULL, NULL, &numberLength);
        if (numberLength > 0) {
          for (glong copychar=0;copychar < numberLength;copychar++) {
            gchar* copyoffset = g_utf8_offset_to_pointer(nosymbolsStr, currentChar+copychar);
            gunichar unichar = g_utf8_get_char(copyoffset);
            workingStr += unichar;
          }
          currentChar += numberLength-1;
          if(leadingOnly)
            bypassTest = PR_TRUE;
          continue;
        }
      }

      // Keep letters and numbers; a plain space is kept only when the
      // _IGNORE_SPACE variant of the flag is not set.
      if(bypassTest ||
         (unicharType == G_UNICODE_LOWERCASE_LETTER ||
          unicharType == G_UNICODE_MODIFIER_LETTER ||
          unicharType == G_UNICODE_OTHER_LETTER ||
          unicharType == G_UNICODE_TITLECASE_LETTER ||
          unicharType == G_UNICODE_UPPERCASE_LETTER ||
          unicharType == G_UNICODE_DECIMAL_NUMBER ||
          unicharType == G_UNICODE_LETTER_NUMBER ||
          unicharType == G_UNICODE_OTHER_NUMBER) ||
          (!(aTransformFlags & sbIStringTransform::TRANSFORM_IGNORE_NONALPHANUM_IGNORE_SPACE) && 
            unichar == ' ')) {
        workingStr += unichar;
        if(leadingOnly)
          bypassTest = PR_TRUE;
      }
    }

    g_free(nosymbolsStr);
    CopyUTF16toUTF8(workingStr, str);
  }

  // Hand the final UTF-8 text back as UTF-16.
  CopyUTF8toUTF16(str, _retval);

  return NS_OK;
}
Beispiel #20
0
/*
 * Shape a run of UTF-8 text into glyphs, producing one glyph per character.
 *
 * engine:   not referenced in this function.
 * font:     font used to look up glyphs and glyph extents; must not be NULL.
 * text:     UTF-8 text to shape; must not be NULL.
 * length:   length of text passed to g_utf8_strlen(); must be >= 0.
 * analysis: supplies the bidi level; an odd level selects mirroring and
 *           the final reordering step. Must not be NULL.
 * glyphs:   glyph string that is resized and filled in.
 *
 * When built with HAVE_USP10_H and have_uniscribe is set, text that
 * text_is_simple() rejects is first offered to uniscribe_shape(); if that
 * succeeds nothing else is done.
 */
static void
basic_engine_shape (PangoEngineShape *engine,
                    PangoFont        *font,
                    const char       *text,
                    gint              length,
                    PangoAnalysis    *analysis,
                    PangoGlyphString *glyphs)
{
    int n_chars;
    int i;
    const char *p;

    g_return_if_fail (font != NULL);
    g_return_if_fail (text != NULL);
    g_return_if_fail (length >= 0);
    g_return_if_fail (analysis != NULL);

#ifdef HAVE_USP10_H

    /* Try Uniscribe for non-simple text; fall through when it declines. */
    if (have_uniscribe &&
            !text_is_simple (text, length) &&
            uniscribe_shape (font, text, length, analysis, glyphs))
        return;

#endif

    n_chars = g_utf8_strlen (text, length);

    /* One glyph slot per character. */
    pango_glyph_string_set_size (glyphs, n_chars);

    p = text;
    for (i = 0; i < n_chars; i++)
    {
        gunichar wc;
        gunichar mirrored_ch;
        PangoGlyph index;

        wc = g_utf8_get_char (p);

        /* At odd levels, substitute the mirrored form where one exists. */
        if (analysis->level % 2)
            if (pango_get_mirror_char (wc, &mirrored_ch))
                wc = mirrored_ch;

        if (wc == 0xa0)	/* non-break-space */
            wc = 0x20;

        if (pango_is_zero_width (wc))
        {
            /* Zero-width characters get an empty glyph. */
            set_glyph (font, glyphs, i, p - text, PANGO_GLYPH_EMPTY);
        }
        else
        {
            index = find_char (font, wc);
            if (index)
            {
                set_glyph (font, glyphs, i, p - text, index);

                if (g_unichar_type (wc) == G_UNICODE_NON_SPACING_MARK)
                {
                    if (i > 0)
                    {
                        PangoRectangle logical_rect, ink_rect;

                        /* Merge the mark into the previous glyph's cluster:
                         * the mark takes the wider of the two widths and the
                         * previous glyph's width is zeroed. */
                        glyphs->glyphs[i].geometry.width = MAX (glyphs->glyphs[i-1].geometry.width,
                                                                glyphs->glyphs[i].geometry.width);
                        glyphs->glyphs[i-1].geometry.width = 0;
                        glyphs->log_clusters[i] = glyphs->log_clusters[i-1];

                        /* Some heuristics to try to guess how overstrike glyphs are
                         * done and compensate
                         */
                        /* FIXME: (alex) Is this double call to get_glyph_extents really necessary? */
                        pango_font_get_glyph_extents (font, glyphs->glyphs[i].glyph, &ink_rect, &logical_rect);
                        /* Shift the mark by half the difference between the
                         * cluster width and its ink width. */
                        if (logical_rect.width == 0 && ink_rect.x == 0)
                            glyphs->glyphs[i].geometry.x_offset = (glyphs->glyphs[i].geometry.width - ink_rect.width) / 2;
                    }
                }
            }
            else
                /* No glyph in the font: use the unknown-glyph placeholder. */
                set_glyph (font, glyphs, i, p - text, PANGO_GET_UNKNOWN_GLYPH (wc));
        }

        p = g_utf8_next_char (p);
    }

    /* Simple bidi support... may have separate modules later */

    if (analysis->level % 2)
    {
        int start, end;

        /* Swap all glyphs */
        swap_range (glyphs, 0, n_chars);

        /* Now reorder glyphs within each cluster back to LTR */
        for (start = 0; start < n_chars;)
        {
            /* Extend end over every glyph sharing start's cluster. */
            end = start;
            while (end < n_chars &&
                    glyphs->log_clusters[end] == glyphs->log_clusters[start])
                end++;

            swap_range (glyphs, start, end);
            start = end;
        }
    }
}
Beispiel #21
0
/**
 * gsdl_tokenizer_next:
 * @self: A valid %GSDLTokenizer.
 * @result: (out callee-allocates): A %GSDLToken to initialize and fill in.
 * @err: (out) (allow-none): Location to store any error, may be %NULL.
 *
 * Fetches the next token from the input. Depending on the source of input, may set an error in one
 * of the %GSDL_SYNTAX_ERROR, %G_IO_CHANNEL_ERROR, or %G_CONVERT_ERROR domains.
 *
 * Line comments (`//`, `--`, `#`), block comments, spaces, tabs and backslash line continuations
 * are skipped. A lone `\r` or a `\r\n` pair is reported as a single `'\n'` token.
 *
 * Returns: Whether a token could be successfully read.
 */
bool gsdl_tokenizer_next(GSDLTokenizer *self, GSDLToken **result, GError **err) {
	gunichar c, nc;
	int line;
	int col;

	retry:
	/* Remember where the token starts before anything is read. */
	line = self->line;
	col = self->col;
	if (!_read(self, &c, err)) return false;

	if (G_UNLIKELY(c == EOF)) {
		*result = _maketoken(T_EOF, line, col);
		return true;
	} else if (c == '\r') {
		/* Fold "\r\n" into one newline token. */
		if (_peek(self, &c, err) && c == '\n') _consume(self);

		*result = _maketoken('\n', line, col);
		FAIL_IF_ERR();

		return true;
	} else if ((c == '/' && _peek(self, &nc, err) && nc == '/') || (c == '-' && _peek(self, &nc, err) && nc == '-') || c == '#') {
		/* Line comment: drop the second marker character, then everything up to the newline. */
		if (c != '#') _consume(self);
		while (_peek(self, &c, err) && !(c == '\n' || c == EOF)) _consume(self);

		goto retry;
	} else if (c == '/' && _peek(self, &nc, err) && nc == '*') {
		/* Block comment: read until the closing star-slash pair. */
		while (_read(self, &c, err)) {
			if (c == EOF) {
				_set_error(err,
					self,
					GSDL_SYNTAX_ERROR_UNEXPECTED_CHAR,
					"Unterminated comment"
				);

				return false;
			} else if (c == '*' && _peek(self, &c, err) && c == '/') {
				_consume(self);
				break;
			}
		}

		goto retry;
	/* NOTE(review): strchr() also matches the terminating NUL, so c == 0 presumably produces a
	 * token of type 0 here - confirm whether a NUL character can reach this point. */
	} else if (c < 256 && strchr("-+:;./{}=\n", (char) c)) {
		/* Single-character punctuation tokens use the character itself as their type. */
		*result = _maketoken(c, line, col);
		return true;
	} else if (c >= '0' && c <= '9') {
		/* Plain range test: isdigit() on a (char) cast is undefined for values 128..255. */
		*result = _maketoken(T_NUMBER, line, col);
		return _tokenize_number(self, *result, c, err);
	} else if (g_unichar_isalpha(c) || g_unichar_type(c) == G_UNICODE_CONNECT_PUNCTUATION || g_unichar_type(c) == G_UNICODE_CURRENCY_SYMBOL) {
		*result = _maketoken(T_IDENTIFIER, line, col);
		return _tokenize_identifier(self, *result, c, err);
	} else if (c == '[') {
		*result = _maketoken(T_BINARY, line, col);
		if (!_tokenize_binary(self, *result, err)) return false;

		REQUIRE(_read(self, &c, err));
		if (c == ']') {
			return true;
		} else {
			_set_error(err,
				self,
				GSDL_SYNTAX_ERROR_MISSING_DELIMITER,
				"Missing ']'"
			);
			return false;
		}
	} else if (c == '"') {
		*result = _maketoken(T_STRING, line, col);
		if (!_tokenize_string(self, *result, err)) return false;

		REQUIRE(_read(self, &c, err));
		if (c == '"') {
			return true;
		} else {
			_set_error(err,
				self,
				GSDL_SYNTAX_ERROR_MISSING_DELIMITER,
				"Missing '\"'"
			);
			return false;
		}
	} else if (c == '`') {
		*result = _maketoken(T_STRING, line, col);
		if (!_tokenize_backquote_string(self, *result, err)) return false;

		REQUIRE(_read(self, &c, err));
		if (c == '`') {
			return true;
		} else {
			_set_error(err,
				self,
				GSDL_SYNTAX_ERROR_MISSING_DELIMITER,
				"Missing '`'"
			);
			return false;
		}
	} else if (c == '\'') {
		*result = _maketoken(T_CHAR, line, col);
		/* g_unichar_to_utf8() may write up to 6 bytes; the 7th zeroed byte keeps val NUL-terminated. */
		(*result)->val = g_malloc0(7);

		/* A failed read must not fall through with a stale character. */
		REQUIRE(_read(self, &c, err));

		if (c == '\\') {
			REQUIRE(_read(self, &c, err));

			/* Translate the recognised escapes; any other character is kept as is. */
			switch (c) {
				case 'n': c = '\n'; break;
				case 'r': c = '\r'; break;
				case 't': c = '\t'; break;
				case '"': c = '"'; break;
				case '\'': c = '\''; break;
				case '\\': c = '\\'; break;
			}
		}

		g_unichar_to_utf8(c, (*result)->val);

		REQUIRE(_read(self, &c, err));
		if (c == '\'') {
			return true;
		} else {
			_set_error(err,
				self,
				GSDL_SYNTAX_ERROR_MISSING_DELIMITER,
				"Missing \"'\""
			);
			return false;
		}
	} else if (c == '\\' && _peek(self, &nc, err) && (nc == '\r' || nc == '\n')) {
		/* Line continuation: swallow the line break that follows the backslash. */
		_consume(self);

		/* The break character is in nc (c still holds the backslash). After a '\r', also
		 * swallow a directly following '\n', but nothing else. */
		if (nc == '\r' && _peek(self, &c, err) && c == '\n') _consume(self);
		FAIL_IF_ERR();

		goto retry;
	} else if (c == ' ' || c == '\t') {
		// Do nothing
		goto retry;
	} else {
		/* g_strdup_printf() copies its arguments, so the converted character can be freed
		 * right after the message is built. */
		gchar *utf8 = g_ucs4_to_utf8(&c, 1, NULL, NULL, NULL);
		gchar *message = g_strdup_printf("Invalid character '%s'(%d)", utf8, c);

		g_free(utf8);

		/* NOTE(review): whether _set_error() takes ownership of message is not visible here;
		 * if it copies, message should be freed after the call - confirm. */
		_set_error(err,
			self,
			GSDL_SYNTAX_ERROR_UNEXPECTED_CHAR,
			message
		);
		return false;
	}
}
Beispiel #22
0
/* True when c may appear after the first character of an identifier: '-', '.', letters,
 * digits, currency symbols, connector punctuation, letter numbers and spacing or
 * non-spacing marks. */
static bool _is_identifier_continuation(gunichar c) {
	GUnicodeType type;

	if (c == '-' || c == '.' || g_unichar_isalpha(c) || g_unichar_isdigit(c)) return true;

	type = g_unichar_type(c);
	return type == G_UNICODE_CURRENCY_SYMBOL
		|| type == G_UNICODE_CONNECT_PUNCTUATION
		|| type == G_UNICODE_LETTER_NUMBER
		|| type == G_UNICODE_SPACING_MARK
		|| type == G_UNICODE_NON_SPACING_MARK;
}

/* Collect an identifier that starts with c into result->val as UTF-8 text. The words
 * "true", "on", "false" and "off" retype the token as T_BOOLEAN and "null" as T_NULL;
 * any other text leaves the token type as the caller set it. */
static bool _tokenize_identifier(GSDLTokenizer *self, GSDLToken *result, gunichar c, GError **err) {
	static const char *const boolean_words[] = { "true", "on", "false", "off" };
	int length = 7;
	char *output = result->val = g_malloc(length);
	int i = g_unichar_to_utf8(c, output);
	size_t w;

	for (;;) {
		if (!_peek(self, &c, err)) break;
		if (!_is_identifier_continuation(c)) break;

		GROW_IF_NEEDED(output = result->val, i + 5, length);

		_consume(self);
		i += g_unichar_to_utf8(c, output + i);
	}

	FAIL_IF_ERR();
	output[i] = '\0';

	for (w = 0; w < sizeof boolean_words / sizeof boolean_words[0]; w++) {
		if (strcmp(output, boolean_words[w]) == 0) {
			result->type = T_BOOLEAN;
			return true;
		}
	}

	if (strcmp(output, "null") == 0) {
		result->type = T_NULL;
	}

	return true;
}
Beispiel #23
0
/**
 * Fetch one base character together with the combining / spacing marks that belong to it.
 *
 * A spacing mark widens a width-1 base character into a width-2 cluster whose two cells must
 * stay together (e.g. Devanagari <U+0939><U+094B>).
 *
 * Callers need the whole cluster at once: to decide whether it still fits on the current line,
 * and because the cluster is meant to be printed in a single call.
 *
 * A lone CR and a CR+LF pair are both reported as a single LF.
 * TODO this conversion should probably happen later, at display time?
 *
 * On success cs (and color, if requested) are filled in, state is advanced past the consumed
 * characters and the number of entries in cs (>= 1) is returned. At EOF state is left alone,
 * cs and color are undefined and 0 is returned.
 *
 * @param view ...
 * @param state the parser-formatter state machine's state, updated
 * @param cs output array for the characters
 * @param clen capacity of cs (so at most clen-1 combining marks are stored); must be at least 2
 * @param color if non-NULL, receives the color of the first codepoint
 * @return the number of entries placed in cs, or 0 on EOF
 */
static int
mcview_next_combining_char_sequence (WView * view, mcview_state_machine_t * state, int *cs,
                                     int clen, int *color)
{
    int count;

    if (!mcview_get_next_maybe_nroff_char (view, state, cs, color))
        return 0;

    /* Fold "\r" and "\r\n" into a single "\n". */
    if (cs[0] == '\r')
    {
        int following;
        mcview_state_machine_t lookahead = *state;

        /* Commit the lookahead only if it really swallowed the LF of a CR+LF pair. */
        if (mcview_get_next_maybe_nroff_char (view, &lookahead, &following, NULL)
            && following == '\n')
            *state = lookahead;
        cs[0] = '\n';
        return 1;
    }

    /* Nothing may combine over a non-printable character ('\n' and '\t' included). */
    if (!mcview_isprint (view, cs[0]))
        return 1;

    count = 1;
    if (mcview_ismark (view, cs[0]))
    {
        /* The sequence starts with a mark: either hand it back on its own ... */
        if (!state->print_lonely_combining)
            return 1;

        /* ... or put it (and any marks that follow) on top of a placeholder base. */
        cs[1] = cs[0];
        cs[0] = BASE_CHARACTER_FOR_LONELY_COMBINING;
        count = 2;
    }

    /* Wide base characters get no combining or spacing marks attached, is this okay? */
    if (mcview_wcwidth (view, cs[0]) == 2)
        return 1;

    /* Collect further marks: up to clen-1 zero-width combining characters, or a single
     * spacing mark. Is this logic correct? */
    while (count < clen)
    {
        mcview_state_machine_t lookahead = *state;

        if (!mcview_get_next_maybe_nroff_char (view, &lookahead, &cs[count], NULL))
            break;
        if (!(mcview_ismark (view, cs[count]) && mcview_isprint (view, cs[count])))
            break;

        if (g_unichar_type (cs[count]) == SPACING_MARK)
        {
            /* Accepted only directly after the base character; the sequence ends here
             * whether or not the mark was taken. */
            if (count == 1)
            {
                *state = lookahead;
                count = 2;
            }
            break;
        }

        *state = lookahead;
        count++;
    }

    return count;
}
Beispiel #24
0
/*
 * Low-level tokenizer: read one token from the scanner's input.
 *
 * scanner:    the scanner to read from; its config selects which token kinds
 *             are recognized
 * token_p:    receives the token type
 * value_p:    receives the token value; its v_int64 member is zeroed on entry
 * line_p:     line counter, handed to json_scanner_get_char()
 * position_p: position counter, handed to json_scanner_get_char(); it is also
 *             bumped here when EOF is hit and on some error paths
 *
 * If the input is already exhausted (or the scanner's current token is EOF)
 * only *token_p is set to G_TOKEN_EOF. Otherwise a character is read and
 * dispatched on; every path through the switch ends with ch == 0, which the
 * trailing g_assert() checks, so the enclosing do/while body runs once.
 *
 * Unterminated comments and strings yield G_TOKEN_ERROR. A "\u" escape inside
 * a double-quoted string that does not decode to a valid code point (lone
 * surrogate, surrogates in the wrong order, ...) also yields G_TOKEN_ERROR,
 * reported with G_ERR_UNEXP_EOF_IN_STRING because scanning of the string stops
 * at the offending escape. The input is untrusted, so this must not be a
 * g_assert().
 */
static void
json_scanner_get_token_ll (JsonScanner *scanner,
                           GTokenType  *token_p,
                           GTokenValue *value_p,
                           guint       *line_p,
                           guint       *position_p)
{
  JsonScannerConfig *config;
  GTokenType	   token;
  gboolean	   in_comment_multi;
  gboolean	   in_comment_single;
  gboolean	   in_string_sq;
  gboolean	   in_string_dq;
  gboolean	   invalid_escape;
  GString	  *gstring;
  GTokenValue	   value;
  guchar	   ch;
  
  config = scanner->config;
  (*value_p).v_int64 = 0;
  
  if ((scanner->text >= scanner->text_end && scanner->input_fd < 0) ||
      scanner->token == G_TOKEN_EOF)
    {
      *token_p = G_TOKEN_EOF;
      return;
    }
  
  in_comment_multi = FALSE;
  in_comment_single = FALSE;
  in_string_sq = FALSE;
  in_string_dq = FALSE;
  invalid_escape = FALSE;
  gstring = NULL;
  
  do /* while (ch != 0) */
    {
      gboolean dotted_float = FALSE;
      
      ch = json_scanner_get_char (scanner, line_p, position_p);
      
      value.v_int64 = 0;
      token = G_TOKEN_NONE;
      
      /* this is *evil*, but needed ;(
       * we first check for identifier first character, because	 it
       * might interfere with other key chars like slashes or numbers
       */
      if (config->scan_identifier &&
	  ch && strchr (config->cset_identifier_first, ch))
	goto identifier_precedence;
      
      switch (ch)
	{
	case 0:
	  token = G_TOKEN_EOF;
	  (*position_p)++;
	  /* ch = 0; */
	  break;
	  
	case '/':
	  if (!config->scan_comment_multi ||
	      json_scanner_peek_next_char (scanner) != '*')
	    goto default_case;
	  json_scanner_get_char (scanner, line_p, position_p);
	  token = G_TOKEN_COMMENT_MULTI;
	  in_comment_multi = TRUE;
	  gstring = g_string_new (NULL);
	  while ((ch = json_scanner_get_char (scanner, line_p, position_p)) != 0)
	    {
	      if (ch == '*' && json_scanner_peek_next_char (scanner) == '/')
		{
		  json_scanner_get_char (scanner, line_p, position_p);
		  in_comment_multi = FALSE;
		  break;
		}
	      else
		gstring = g_string_append_c (gstring, ch);
	    }
	  ch = 0;
	  break;
	  
	case '\'':
	  if (!config->scan_string_sq)
	    goto default_case;
	  token = G_TOKEN_STRING;
	  in_string_sq = TRUE;
	  gstring = g_string_new (NULL);
	  while ((ch = json_scanner_get_char (scanner, line_p, position_p)) != 0)
	    {
	      if (ch == '\'')
		{
		  in_string_sq = FALSE;
		  break;
		}
	      else
		gstring = g_string_append_c (gstring, ch);
	    }
	  ch = 0;
	  break;
	  
	case '"':
	  if (!config->scan_string_dq)
	    goto default_case;
	  token = G_TOKEN_STRING;
	  in_string_dq = TRUE;
	  gstring = g_string_new (NULL);
	  while ((ch = json_scanner_get_char (scanner, line_p, position_p)) != 0)
	    {
	      if (ch == '"')
		{
		  in_string_dq = FALSE;
		  break;
		}
	      else
		{
		  if (ch == '\\')
		    {
		      ch = json_scanner_get_char (scanner, line_p, position_p);
		      switch (ch)
			{
			  guint	i;
			  guint	fchar;
			  
			case 0:
			  break;
			  
			case '\\':
			  gstring = g_string_append_c (gstring, '\\');
			  break;
			  
			case 'n':
			  gstring = g_string_append_c (gstring, '\n');
			  break;
			  
			case 't':
			  gstring = g_string_append_c (gstring, '\t');
			  break;
			  
			case 'r':
			  gstring = g_string_append_c (gstring, '\r');
			  break;
			  
			case 'b':
			  gstring = g_string_append_c (gstring, '\b');
			  break;
			  
			case 'f':
			  gstring = g_string_append_c (gstring, '\f');
			  break;

                        case 'u':
                          fchar = json_scanner_peek_next_char (scanner);
                          if (is_hex_digit (fchar))
                            {
                              gunichar ucs;

                              ucs = json_scanner_get_unichar (scanner, line_p, position_p);

                              if (g_unichar_type (ucs) == G_UNICODE_SURROGATE)
                                {
                                  gunichar ucs_lo = 0;

                                  /* read next surrogate */
                                  if ('\\' == json_scanner_get_char (scanner, line_p, position_p)
                                      && 'u' == json_scanner_get_char (scanner, line_p, position_p))
                                    ucs_lo = json_scanner_get_unichar (scanner, line_p, position_p);

                                  /* Only a high surrogate (0xd800-0xdbff) followed by a
                                   * low surrogate (0xdc00-0xdfff) encodes a code point.
                                   * Anything else is malformed input and is rejected
                                   * rather than asserted on.
                                   */
                                  if (ucs >= 0xd800 && ucs <= 0xdbff &&
                                      ucs_lo >= 0xdc00 && ucs_lo <= 0xdfff)
                                    ucs = (((ucs & 0x3ff) << 10) | (ucs_lo & 0x3ff)) + 0x10000;
                                  else
                                    invalid_escape = TRUE;
                                }

                              if (!invalid_escape && g_unichar_validate (ucs))
                                gstring = g_string_append_unichar (gstring, ucs);
                              else
                                invalid_escape = TRUE;
                            }
                          break;
			  
			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			  i = ch - '0';
			  fchar = json_scanner_peek_next_char (scanner);
			  if (fchar >= '0' && fchar <= '7')
			    {
			      ch = json_scanner_get_char (scanner, line_p, position_p);
			      i = i * 8 + ch - '0';
			      fchar = json_scanner_peek_next_char (scanner);
			      if (fchar >= '0' && fchar <= '7')
				{
				  ch = json_scanner_get_char (scanner, line_p, position_p);
				  i = i * 8 + ch - '0';
				}
			    }
			  gstring = g_string_append_c (gstring, i);
			  break;
			  
			default:
			  gstring = g_string_append_c (gstring, ch);
			  break;
			}

		      /* Stop scanning the string at a malformed \u escape.
		       * in_string_dq is still TRUE, so the code after the
		       * main loop turns this into G_TOKEN_ERROR and frees
		       * the partial string.
		       */
		      if (invalid_escape)
			break;
		    }
		  else
		    gstring = g_string_append_c (gstring, ch);
		}
	    }
	  ch = 0;
	  break;
	  
	case '.':
	  if (!config->scan_float)
	    goto default_case;
	  token = G_TOKEN_FLOAT;
	  dotted_float = TRUE;
	  ch = json_scanner_get_char (scanner, line_p, position_p);
	  goto number_parsing;
	  
	case '$':
	  if (!config->scan_hex_dollar)
	    goto default_case;
	  token = G_TOKEN_HEX;
	  ch = json_scanner_get_char (scanner, line_p, position_p);
	  goto number_parsing;
	  
	case '0':
	  if (config->scan_octal)
	    token = G_TOKEN_OCTAL;
	  else
	    token = G_TOKEN_INT;
	  ch = json_scanner_peek_next_char (scanner);
	  if (config->scan_hex && (ch == 'x' || ch == 'X'))
	    {
	      token = G_TOKEN_HEX;
	      json_scanner_get_char (scanner, line_p, position_p);
	      ch = json_scanner_get_char (scanner, line_p, position_p);
	      if (ch == 0)
		{
		  token = G_TOKEN_ERROR;
		  value.v_error = G_ERR_UNEXP_EOF;
		  (*position_p)++;
		  break;
		}
	      if (json_scanner_char_2_num (ch, 16) < 0)
		{
		  token = G_TOKEN_ERROR;
		  value.v_error = G_ERR_DIGIT_RADIX;
		  ch = 0;
		  break;
		}
	    }
	  else if (config->scan_binary && (ch == 'b' || ch == 'B'))
	    {
	      token = G_TOKEN_BINARY;
	      json_scanner_get_char (scanner, line_p, position_p);
	      ch = json_scanner_get_char (scanner, line_p, position_p);
	      if (ch == 0)
		{
		  token = G_TOKEN_ERROR;
		  value.v_error = G_ERR_UNEXP_EOF;
		  (*position_p)++;
		  break;
		}
	      if (json_scanner_char_2_num (ch, 10) < 0)
		{
		  token = G_TOKEN_ERROR;
		  value.v_error = G_ERR_NON_DIGIT_IN_CONST;
		  ch = 0;
		  break;
		}
	    }
	  else
	    ch = '0';
	  /* fall through */
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':
	number_parsing:
	{
          gboolean in_number = TRUE;
	  gchar *endptr;
	  
	  if (token == G_TOKEN_NONE)
	    token = G_TOKEN_INT;
	  
	  gstring = g_string_new (dotted_float ? "0." : "");
	  gstring = g_string_append_c (gstring, ch);
	  
	  do /* while (in_number) */
	    {
	      gboolean is_E;
	      
	      is_E = token == G_TOKEN_FLOAT && (ch == 'e' || ch == 'E');
	      
	      ch = json_scanner_peek_next_char (scanner);
	      
	      if (json_scanner_char_2_num (ch, 36) >= 0 ||
		  (config->scan_float && ch == '.') ||
		  (is_E && (ch == '+' || ch == '-')))
		{
		  ch = json_scanner_get_char (scanner, line_p, position_p);
		  
		  switch (ch)
		    {
		    case '.':
		      if (token != G_TOKEN_INT && token != G_TOKEN_OCTAL)
			{
			  value.v_error = token == G_TOKEN_FLOAT ? G_ERR_FLOAT_MALFORMED : G_ERR_FLOAT_RADIX;
			  token = G_TOKEN_ERROR;
			  in_number = FALSE;
			}
		      else
			{
			  token = G_TOKEN_FLOAT;
			  gstring = g_string_append_c (gstring, ch);
			}
		      break;
		      
		    case '0':
		    case '1':
		    case '2':
		    case '3':
		    case '4':
		    case '5':
		    case '6':
		    case '7':
		    case '8':
		    case '9':
		      gstring = g_string_append_c (gstring, ch);
		      break;
		      
		    case '-':
		    case '+':
		      if (token != G_TOKEN_FLOAT)
			{
			  token = G_TOKEN_ERROR;
			  value.v_error = G_ERR_NON_DIGIT_IN_CONST;
			  in_number = FALSE;
			}
		      else
			gstring = g_string_append_c (gstring, ch);
		      break;
		      
		    case 'e':
		    case 'E':
		      if ((token != G_TOKEN_HEX && !config->scan_float) ||
			  (token != G_TOKEN_HEX &&
			   token != G_TOKEN_OCTAL &&
			   token != G_TOKEN_FLOAT &&
			   token != G_TOKEN_INT))
			{
			  token = G_TOKEN_ERROR;
			  value.v_error = G_ERR_NON_DIGIT_IN_CONST;
			  in_number = FALSE;
			}
		      else
			{
			  if (token != G_TOKEN_HEX)
			    token = G_TOKEN_FLOAT;
			  gstring = g_string_append_c (gstring, ch);
			}
		      break;
		      
		    default:
		      if (token != G_TOKEN_HEX)
			{
			  token = G_TOKEN_ERROR;
			  value.v_error = G_ERR_NON_DIGIT_IN_CONST;
			  in_number = FALSE;
			}
		      else
			gstring = g_string_append_c (gstring, ch);
		      break;
		    }
		}
	      else
		in_number = FALSE;
	    }
	  while (in_number);
	  
	  /* Convert the collected digits; endptr left pointing at a
	   * non-NUL character means part of the text was not consumed.
	   */
	  endptr = NULL;
	  if (token == G_TOKEN_FLOAT)
	    value.v_float = g_strtod (gstring->str, &endptr);
	  else
	    {
	      guint64 ui64 = 0;
	      switch (token)
		{
		case G_TOKEN_BINARY:
		  ui64 = g_ascii_strtoull (gstring->str, &endptr, 2);
		  break;
		case G_TOKEN_OCTAL:
		  ui64 = g_ascii_strtoull (gstring->str, &endptr, 8);
		  break;
		case G_TOKEN_INT:
		  ui64 = g_ascii_strtoull (gstring->str, &endptr, 10);
		  break;
		case G_TOKEN_HEX:
		  ui64 = g_ascii_strtoull (gstring->str, &endptr, 16);
		  break;
		default: ;
		}
	      if (scanner->config->store_int64)
		value.v_int64 = ui64;
	      else
		value.v_int = ui64;
	    }
	  if (endptr && *endptr)
	    {
	      token = G_TOKEN_ERROR;
	      if (*endptr == 'e' || *endptr == 'E')
		value.v_error = G_ERR_NON_DIGIT_IN_CONST;
	      else
		value.v_error = G_ERR_DIGIT_RADIX;
	    }
	  g_string_free (gstring, TRUE);
	  gstring = NULL;
	  ch = 0;
	} /* number_parsing:... */
	break;
	
	default:
	default_case:
	{
	  if (config->cpair_comment_single &&
	      ch == config->cpair_comment_single[0])
	    {
	      token = G_TOKEN_COMMENT_SINGLE;
	      in_comment_single = TRUE;
	      gstring = g_string_new (NULL);
	      ch = json_scanner_get_char (scanner, line_p, position_p);
	      while (ch != 0)
		{
		  if (ch == config->cpair_comment_single[1])
		    {
		      in_comment_single = FALSE;
		      ch = 0;
		      break;
		    }
		  
		  gstring = g_string_append_c (gstring, ch);
		  ch = json_scanner_get_char (scanner, line_p, position_p);
		}
	      /* ignore a missing newline at EOF for single line comments */
	      if (in_comment_single &&
		  config->cpair_comment_single[1] == '\n')
		in_comment_single = FALSE;
	    }
	  else if (config->scan_identifier && ch &&
		   strchr (config->cset_identifier_first, ch))
	    {
	    identifier_precedence:
	      
	      if (config->cset_identifier_nth && ch &&
		  strchr (config->cset_identifier_nth,
			  json_scanner_peek_next_char (scanner)))
		{
		  token = G_TOKEN_IDENTIFIER;
		  gstring = g_string_new (NULL);
		  gstring = g_string_append_c (gstring, ch);
		  do
		    {
		      ch = json_scanner_get_char (scanner, line_p, position_p);
		      gstring = g_string_append_c (gstring, ch);
		      ch = json_scanner_peek_next_char (scanner);
		    }
		  while (ch && strchr (config->cset_identifier_nth, ch));
		  ch = 0;
		}
	      else if (config->scan_identifier_1char)
		{
		  token = G_TOKEN_IDENTIFIER;
		  value.v_identifier = g_new0 (gchar, 2);
		  value.v_identifier[0] = ch;
		  ch = 0;
		}
	    }
	  if (ch)
	    {
	      if (config->char_2_token)
		token = ch;
	      else
		{
		  token = G_TOKEN_CHAR;
		  value.v_char = ch;
		}
	      ch = 0;
	    }
	} /* default_case:... */
	break;
	}
      g_assert (ch == 0 && token != G_TOKEN_NONE); /* paranoid */
    }
  while (ch != 0);
  
  /* A comment or string still open here either ran into the end of the
   * input or (double-quoted strings only) was abandoned at an invalid \u
   * escape; both are reported as an error token.
   */
  if (in_comment_multi || in_comment_single ||
      in_string_sq || in_string_dq)
    {
      token = G_TOKEN_ERROR;
      if (gstring)
	{
	  g_string_free (gstring, TRUE);
	  gstring = NULL;
	}
      (*position_p)++;
      if (in_comment_multi || in_comment_single)
	value.v_error = G_ERR_UNEXP_EOF_IN_COMMENT;
      else /* (in_string_sq || in_string_dq) */
	value.v_error = G_ERR_UNEXP_EOF_IN_STRING;
    }
  
  /* Hand the accumulated text over to the token value. */
  if (gstring)
    {
      value.v_string = g_string_free (gstring, FALSE);
      gstring = NULL;
    }
  
  if (token == G_TOKEN_IDENTIFIER)
    {
      if (config->scan_symbols)
	{
	  JsonScannerKey *key;
	  guint scope_id;
	  
	  scope_id = scanner->scope_id;
	  key = json_scanner_lookup_internal (scanner, scope_id, value.v_identifier);
	  if (!key && scope_id && scanner->config->scope_0_fallback)
	    key = json_scanner_lookup_internal (scanner, 0, value.v_identifier);
	  
	  if (key)
	    {
	      g_free (value.v_identifier);
	      token = G_TOKEN_SYMBOL;
	      value.v_symbol = key->value;
	    }
	}
      
      if (token == G_TOKEN_IDENTIFIER &&
	  config->scan_identifier_NULL &&
	  strlen (value.v_identifier) == 4)
	{
	  gchar *null_upper = "NULL";
	  gchar *null_lower = "null";
	  
	  if (scanner->config->case_sensitive)
	    {
	      if (value.v_identifier[0] == null_upper[0] &&
		  value.v_identifier[1] == null_upper[1] &&
		  value.v_identifier[2] == null_upper[2] &&
		  value.v_identifier[3] == null_upper[3])
		token = G_TOKEN_IDENTIFIER_NULL;
	    }
	  else
	    {
	      if ((value.v_identifier[0] == null_upper[0] ||
		   value.v_identifier[0] == null_lower[0]) &&
		  (value.v_identifier[1] == null_upper[1] ||
		   value.v_identifier[1] == null_lower[1]) &&
		  (value.v_identifier[2] == null_upper[2] ||
		   value.v_identifier[2] == null_lower[2]) &&
		  (value.v_identifier[3] == null_upper[3] ||
		   value.v_identifier[3] == null_lower[3]))
		token = G_TOKEN_IDENTIFIER_NULL;
	    }
	}
    }
  
  *token_p = token;
  *value_p = value;
}
Beispiel #25
0
/*
 * Shape a run of UTF-8 text with the Syriac OpenType rules.
 *
 * Each character of text is turned into a glyph of font and added to an
 * OpenType buffer together with the per-character properties computed by
 * syriac_assign_properties(); the GSUB/GPOS rulesets for the run's script and
 * language are then applied and the result is written to glyphs.
 *
 * Nothing is done if any of font, text or analysis is NULL, if length is
 * negative, or if the font's face cannot be locked.
 */
static void
syriac_engine_shape (PangoEngineShape *engine,
		     PangoFont        *font,
		     const char       *text,
		     gint              length,
		     const PangoAnalysis *analysis,
		     PangoGlyphString *glyphs)
{
  PangoFcFont *fc_font;
  FT_Face face;
  PangoOTBuffer *buffer;
  PangoOTRulesetDescription desc;
  const PangoOTRuleset *ruleset;
  gulong *properties = NULL;
  gunichar *wcs;
  glong n_chars;
  const char *cursor;
  int cluster = 0;
  int idx;

  g_return_if_fail (font != NULL);
  g_return_if_fail (text != NULL);
  g_return_if_fail (length >= 0);
  g_return_if_fail (analysis != NULL);

  fc_font = PANGO_FC_FONT (font);
  face = pango_fc_font_lock_face (fc_font);
  if (!face)
    return;

  buffer = pango_ot_buffer_new (fc_font);
  /* An odd embedding level is treated as a right-to-left run. */
  pango_ot_buffer_set_rtl (buffer, analysis->level % 2 != 0);
  pango_ot_buffer_set_zero_width_marks (buffer, TRUE);

  /* The UCS-4 copy is only needed to compute the per-character properties. */
  wcs = g_utf8_to_ucs4_fast (text, length, &n_chars);
  properties = g_new0 (gulong, n_chars);
  syriac_assign_properties (wcs, properties, n_chars);
  g_free (wcs);

  cursor = text;
  idx = 0;
  while (idx < n_chars)
    {
      PangoGlyph glyph;
      gunichar wc = g_utf8_get_char (cursor);

      /* Non-spacing marks keep the cluster offset of the preceding character. */
      if (g_unichar_type (wc) != G_UNICODE_NON_SPACING_MARK)
	cluster = cursor - text;

      if (!pango_is_zero_width (wc))
	{
	  gunichar lookup = wc;

	  /* In a right-to-left run look up the mirrored form, if there is one. */
	  if (analysis->level % 2)
	    g_unichar_get_mirror_char (lookup, &lookup);

	  glyph = pango_fc_font_get_glyph (fc_font, lookup);
	}
      else
	glyph = PANGO_GLYPH_EMPTY;

      if (!glyph)
	glyph = PANGO_GET_UNKNOWN_GLYPH (wc);

      pango_ot_buffer_add_glyph (buffer, glyph, properties[idx], cluster);

      cursor = g_utf8_next_char (cursor);
      idx++;
    }

  g_free (properties);

  /* Describe the ruleset to use: script and language of the run plus the
   * module's static feature tables. */
  desc.script = analysis->script;
  desc.language = analysis->language;

  desc.static_gsub_features = gsub_features;
  desc.n_static_gsub_features = G_N_ELEMENTS (gsub_features);
  desc.static_gpos_features = gpos_features;
  desc.n_static_gpos_features = G_N_ELEMENTS (gpos_features);

  /* TODO populate other_features from analysis->extra_attrs */
  desc.other_features = NULL;
  desc.n_other_features = 0;

  ruleset = pango_ot_ruleset_get_for_description (pango_ot_info_get (face), &desc);

  pango_ot_ruleset_substitute (ruleset, buffer);
  pango_ot_ruleset_position (ruleset, buffer);
  pango_ot_buffer_output (buffer, glyphs);

  pango_ot_buffer_destroy (buffer);

  pango_fc_font_unlock_face (fc_font);
}
Beispiel #26
0
/** Sets the value of the property by parsing str. Note: this should
 * only be called once on an instance of TransProperty, as calling it
 * more than once can cause memory leaks.
 *
 * Dates are stored as a newly allocated time_t; descriptions and
 * numbers as a copy of str; balances, deposits and withdrawals as a
 * newly allocated gnc_numeric. For the amount types the first currency
 * symbol found in str is ignored, and an amount whose magnitude is not
 * above 0.00001 leaves prop->value untouched while still reporting
 * success.
 *
 * @param prop The property being set
 * @param str The string to be parsed
 * @return TRUE on success, FALSE on failure
 */
static gboolean trans_property_set(TransProperty* prop, char* str)
{
    char *endptr, *possible_currency_symbol, *str_dupe;
    double value;
    switch (prop->type)
    {
    case GNC_CSV_DATE:
        prop->value = g_new(time_t, 1);
        *((time_t*)(prop->value)) = parse_date(str, prop->list->date_format);
        return *((time_t*)(prop->value)) != -1;

    case GNC_CSV_DESCRIPTION:
    case GNC_CSV_NUM:
        prop->value = g_strdup(str);
        return TRUE;

    case GNC_CSV_BALANCE:
    case GNC_CSV_DEPOSIT:
    case GNC_CSV_WITHDRAWAL:
        str_dupe = g_strdup(str); /* First, we make a copy so we can't mess up real data. */

        /* Go through str_dupe looking for currency symbols. */
        for (possible_currency_symbol = str_dupe; *possible_currency_symbol;
             possible_currency_symbol = g_utf8_next_char(possible_currency_symbol))
        {
            if (g_unichar_type(g_utf8_get_char(possible_currency_symbol)) == G_UNICODE_CURRENCY_SYMBOL)
            {
                /* If we find a currency symbol, save the position just ahead
                 * of the currency symbol (next_symbol), and find the null
                 * terminator of the string (last_symbol). */
                char *next_symbol = g_utf8_next_char(possible_currency_symbol), *last_symbol = next_symbol;
                while (*last_symbol)
                    last_symbol = g_utf8_next_char(last_symbol);

                /* Move all of the string (including the null byte, which is
                 * why we have +1 in the size parameter) following the
                 * currency symbol back one character, thereby overwriting the
                 * currency symbol. */
                memmove(possible_currency_symbol, next_symbol, last_symbol - next_symbol + 1);
                break;
            }
        }

        /* Translate the string (now clean of currency symbols) into a number. */
        value = strtod(str_dupe, &endptr);

        /* If this isn't a valid numeric string, this is an error. */
        if (endptr != str_dupe + strlen(str_dupe))
        {
            g_free(str_dupe);
            return FALSE;
        }

        g_free(str_dupe);

        /* Compare in floating point: the integer abs() would truncate the
         * amount first, so anything with a magnitude below 1 (e.g. 0.50)
         * would wrongly be treated as zero and dropped. */
        if (value > 0.00001 || value < -0.00001)
        {
            prop->value = g_new(gnc_numeric, 1);
            *((gnc_numeric*)(prop->value)) =
                double_to_gnc_numeric(value, xaccAccountGetCommoditySCU(prop->list->account),
                                      GNC_RND_ROUND);
        }
        return TRUE;
    }
    return FALSE; /* We should never actually get here. */
}
Beispiel #27
0
/*
 * Walk the characters of text and call func once per character.
 *
 * For every character the callback receives the character itself, its
 * predecessor and successor (0 when there is none), the Unicode types of the
 * three, pointers to the matching entries of attrs (NULL for a missing
 * neighbour) and data.
 *
 * While iterating, the file-level variables offset, line, line_start,
 * line_end and current_wc are kept up to date. They are set before func is
 * called for the current character and advanced afterwards; a '\n' starts a
 * new line. An empty text returns without touching any of them.
 */
static void
log_attr_foreach (const char     *text,
		  PangoLogAttr   *attrs,
		  CharForeachFunc func,
		  gpointer        data)
{
  gint length = strlen (text);
  const gchar *limit = text + length;
  const gchar *cursor = text;
  gunichar before_wc = 0;
  gunichar after_wc;
  GUnicodeType before_type = (GUnicodeType) -1;
  GUnicodeType after_type;
  gint idx;

  if (cursor == limit)
    return;

  offset = 0;
  line = 1;

  after_wc = g_utf8_get_char (cursor);
  after_type = g_unichar_type (after_wc);

  line_start = text;
  line_end = text;

  for (idx = 0; after_wc != 0; ++idx)
    {
      const gunichar here_wc = after_wc;
      const GUnicodeType here_type = after_type;

      current_wc = here_wc;

      cursor = g_utf8_next_char (cursor);
      line_end = cursor;

      /* Look one character ahead. At the end of the text there is no
       * successor and after_type keeps its previous value. */
      after_wc = (cursor >= limit) ? 0 : g_utf8_get_char (cursor);
      if (after_wc)
	after_type = g_unichar_type (after_wc);

      (* func) (here_wc, before_wc, after_wc,
		here_type, before_type, after_type,
		&attrs[idx],
		idx != 0 ? &attrs[idx - 1] : NULL,
		after_wc != 0 ? &attrs[idx + 1] : NULL,
		data);

      before_wc = here_wc;
      before_type = here_type;
      ++offset;

      if (here_wc == '\n')
	{
	  /* The next character opens a new line. */
	  ++line;
	  offset = 0;
	  line_start = cursor;
	  line_end = cursor;
	}
    }
}
Beispiel #28
0
/*
 * Convert the fortune files named on the command line into omikuji files.
 *
 * Each input file is read with slurpfile() and split at the matches of the
 * regular expression from compileRegularExpression(). The text between two
 * consecutive matches becomes either a comment or a fortune, depending on the
 * character that follows the opening match. Without an output file name every
 * input is written to "<basename>.omi"; otherwise everything is collected and
 * written to that one file after all inputs have been processed.
 *
 * Returns 0 on success, 1 if the command line cannot be parsed, 2 if the
 * regular expression cannot be compiled, and 3 if an input file cannot be
 * read while an output file name is set.
 */
int main(int argc, char *argv[])
{
  // Set locale required for Glib-2.0:
  setlocale(LC_ALL, "");

  GError *error = NULL;
  GOptionContext *context;
  
  // Process command line arguments.
  context = g_option_context_new(" - convert fortune file to omikuji file");
  g_option_context_add_main_entries(context, options, NULL);
  if (!g_option_context_parse(context, &argc, &argv, &error)) {
    g_printerr("%s\n", error->message);
    return 1;
  }
  else if (error != NULL) {
    g_printerr("%s\n", error->message);
    g_clear_error(&error);
  }

  if (inputEncoding == NULL)
    g_get_charset(&inputEncoding);

  GRegex *regex = compileRegularExpression(&error);
  if (regex == NULL) {
    g_printerr("%s\n", error->message);
    return 2;
  }

  GList *comments = NULL, *fortunes = NULL;
  gchar *buf;
  // Process input files.
  for (gint i = 1; i < argc; i++) {
    buf = slurpfile(argv[i], &error);
    if (buf == NULL) {
      if (error->domain == G_FILE_ERROR)
        g_printerr("%s\n", error->message);
      else 
        g_printerr("Failed to parse %s: %s\n", argv[i], error->message);
      if (outputFilename)
        return 3;
      else {
        g_clear_error(&error);
        continue;
      }
    }
    GMatchInfo *match_info = NULL;
    gint start = 0, end = 0, mstart, mend;
    gboolean isComment = FALSE;
    gboolean matched = g_regex_match(regex, buf, 0, &match_info);
    if (matched) {
      while (g_match_info_matches(match_info)) {
        if (g_match_info_fetch_pos(match_info, 0, &mstart, &mend)) {
          if (start == 0) {
            // Opening delimiter: the entry begins right after it.
            start = mend;
            gunichar next = g_utf8_get_char(&buf[mend]);
            if (g_unichar_isspace(next)) {
              // NOTE(review): this skips one byte, so it assumes the
              // whitespace character is encoded in a single byte.
              start++;
              if (g_unichar_type(next) == G_UNICODE_SPACE_SEPARATOR)
                isComment = TRUE;
              else
                isComment = FALSE;
            }
            else
              isComment = TRUE;
            if (!g_match_info_next(match_info, &error))
              g_clear_error(&error);
          }
          else {
            // Closing delimiter: the entry ends where this match starts.
            // The match is not advanced here, so the same delimiter also
            // opens the next entry on the following iteration.
            if ((isComment && !skipComments) || !isComment) {
              end = mstart;
              GString *string = g_string_sized_new(end - start);
              for (gint j = start; j < end; j++)
                g_string_append_c(string, buf[j]);
              g_string_append_c(string, '\0');
              if (isComment)
                comments = g_list_append(comments, string);
              else
                fortunes = g_list_append(fortunes, string);
            }
            start = 0; end = 0;
          }
        }
      }
    }
    // g_regex_match() creates match_info even when nothing matched, so it
    // has to be released on both paths to avoid leaking it per input file.
    g_match_info_free(match_info);
    if (matched && outputFilename == NULL) {
      // No explicit output file: write "<basename>.omi" for this input.
      outputFilename = g_path_get_basename(argv[i]);
      if (outputFilename) {
        // Room for ".omi" plus the terminating NUL.
        gpointer temp = g_try_realloc(outputFilename, strlen(outputFilename) + 5);
        if (temp) {
          gboolean delStr = TRUE;
          outputFilename = (gchar *) temp;
          strcat(outputFilename, ".omi");
          writeOmikujiFile(outputFilename, comments, fortunes);
          if (comments) {
            g_list_foreach(comments, stringDelete, &delStr);
            g_list_free(comments);
            comments = NULL;
          }
          g_list_foreach(fortunes, stringDelete, &delStr);
          g_list_free(fortunes);
          fortunes = NULL;
        }
        g_free(outputFilename);
        outputFilename = NULL;
      }
    }
    g_free(buf);
  }

  g_regex_unref(regex);

  // Explicit output file: everything collected above goes into it.
  if (outputFilename) {
    gboolean delStr = TRUE;
    writeOmikujiFile(outputFilename, comments, fortunes);
    if (comments) {
      g_list_foreach(comments, stringDelete, &delStr);
      g_list_free(comments);
      comments = NULL;
    }
    g_list_foreach(fortunes, stringDelete, &delStr);
    g_list_free(fortunes);
    fortunes = NULL;
  }

  return 0;
}
Beispiel #29
0
/** Sets the value of the property by parsing str. Note: this should
 * only be called once on an instance of TransProperty, as calling it
 * more than once can cause memory leaks.
 *
 * What is stored in prop->value depends on prop->type:
 *  - GNC_CSV_DATE: a newly allocated time64 produced by parse_date()
 *    using the owning list's date format (allocated even when parsing
 *    fails);
 *  - GNC_CSV_DESCRIPTION / GNC_CSV_NOTES / GNC_CSV_NUM: a copy of str;
 *  - GNC_CSV_BALANCE / GNC_CSV_DEPOSIT / GNC_CSV_WITHDRAWAL: a newly
 *    allocated gnc_numeric parsed according to the owning list's
 *    currency format, after the first currency symbol has been removed.
 *    A cell that contains no digit at all is treated as "0". Nothing is
 *    stored when the amount cannot be parsed.
 *
 * @param prop The property being set
 * @param str The string to be parsed
 * @return TRUE on success, FALSE on failure
 */
static gboolean trans_property_set (TransProperty* prop, char* str)
{
    char *endptr, *possible_currency_symbol, *str_dupe;
    gnc_numeric val;
    regex_t regex;
    switch (prop->type)
    {
    case GNC_CSV_DATE:
        prop->value = g_new(time64, 1);
        *((time64*)(prop->value)) = parse_date(str, prop->list->date_format);
        return *((time64*)(prop->value)) != -1;

    case GNC_CSV_DESCRIPTION:
    case GNC_CSV_NOTES:
    case GNC_CSV_NUM:
        prop->value = g_strdup (str);
        return TRUE;

    case GNC_CSV_BALANCE:
    case GNC_CSV_DEPOSIT:
    case GNC_CSV_WITHDRAWAL:
        str_dupe = g_strdup (str); /* First, we make a copy so we can't mess up real data. */

        /* If a cell is empty or just spaces (i.e. contains no digit) make
         * its value = "0". The compiled pattern is only used when
         * regcomp() succeeded, and is always released afterwards. */
        if (regcomp (&regex, "[0-9]", 0) == 0)
        {
            int reti = regexec (&regex, str_dupe, 0, NULL, 0);
            regfree (&regex);
            if (reti == REG_NOMATCH)
            {
                g_free (str_dupe);
                str_dupe = g_strdup ("0");
            }
        }

        /* Go through str_dupe looking for currency symbols. */
        for (possible_currency_symbol = str_dupe; *possible_currency_symbol;
                possible_currency_symbol = g_utf8_next_char (possible_currency_symbol))
        {
            if (g_unichar_type (g_utf8_get_char (possible_currency_symbol)) == G_UNICODE_CURRENCY_SYMBOL)
            {
                /* If we find a currency symbol, save the position just ahead
                 * of the currency symbol (next_symbol), and find the null
                 * terminator of the string (last_symbol). */
                char *next_symbol = g_utf8_next_char (possible_currency_symbol), *last_symbol = next_symbol;
                while (*last_symbol)
                    last_symbol = g_utf8_next_char (last_symbol);

                /* Move all of the string (including the null byte, which is
                 * why we have +1 in the size parameter) following the
                 * currency symbol back one character, thereby overwriting the
                 * currency symbol. Only the first symbol found is removed. */
                memmove (possible_currency_symbol, next_symbol, last_symbol - next_symbol + 1);
                break;
            }
        }

        /* Currency format */
        switch (prop->list->currency_format)
        {
        case 0:
            /* Currency locale */
            if (!(xaccParseAmount (str_dupe, TRUE, &val, &endptr)))
            {
                g_free (str_dupe);
                return FALSE;
            }
            break;
        case 1:
            /* Currency decimal period */
            if (!(xaccParseAmountExtended (str_dupe, TRUE, '-', '.', ',', "\003\003", "$+", &val, &endptr)))
            {
                g_free (str_dupe);
                return FALSE;
            }
            break;
        case 2:
            /* Currency decimal comma */
            if (!(xaccParseAmountExtended (str_dupe, TRUE, '-', ',', '.', "\003\003", "$+", &val, &endptr)))
            {
                g_free (str_dupe);
                return FALSE;
            }
            break;
        default:
            /* Unknown currency format: val was never assigned, so report
             * failure instead of storing an indeterminate amount. */
            g_free (str_dupe);
            return FALSE;
        }

        prop->value = g_new (gnc_numeric, 1);
        *((gnc_numeric*)(prop->value)) = val;
        g_free (str_dupe);
        return TRUE;

    }
    return FALSE; /* We should never actually get here. */
}
/*
 * Builds a simplified form of a UTF-8 string for search matching.
 *
 * The input is normalized with G_NORMALIZE_DEFAULT and then processed
 * one code point at a time:
 *  - combining/enclosing/non-spacing marks and all punctuation are dropped;
 *  - letters are converted to lower case and kept;
 *  - numbers and symbols are kept unchanged;
 *  - everything else (including unassigned code points) is kept unchanged.
 *
 * original: the string to fold; must not be NULL.
 *
 * Returns a newly allocated string that the caller must release with
 * g_free(). Returns NULL if original is NULL, and an empty string if
 * original is not valid UTF-8 and therefore cannot be normalized.
 */
gchar*
rb_search_fold (const char *original)
{
	GString *string;
	gchar *normalized;
	gunichar *unicode, *cur;

	g_return_val_if_fail (original != NULL, NULL);

	/* old behaviour is equivalent to: return g_utf8_casefold (original, -1); */

	/* g_utf8_normalize() returns NULL for invalid UTF-8; bail out here
	 * rather than walking a NULL code point buffer below. */
	normalized = g_utf8_normalize (original, -1, G_NORMALIZE_DEFAULT);
	if (normalized == NULL) {
		rb_debug ("string is not valid UTF-8, unable to fold it");
		return g_strdup ("");
	}

	string = g_string_new (NULL);
	unicode = g_utf8_to_ucs4_fast (normalized, -1, NULL);

	for (cur = unicode; *cur != 0; cur++) {
		switch (g_unichar_type (*cur)) {
		case G_UNICODE_COMBINING_MARK:
		case G_UNICODE_ENCLOSING_MARK:
		case G_UNICODE_NON_SPACING_MARK:
		case G_UNICODE_CONNECT_PUNCTUATION:
		case G_UNICODE_DASH_PUNCTUATION:
		case G_UNICODE_CLOSE_PUNCTUATION:
		case G_UNICODE_FINAL_PUNCTUATION:
		case G_UNICODE_INITIAL_PUNCTUATION:
		case G_UNICODE_OTHER_PUNCTUATION:
		case G_UNICODE_OPEN_PUNCTUATION:
			/* remove these */
			break;

		case G_UNICODE_LOWERCASE_LETTER:
		case G_UNICODE_MODIFIER_LETTER:
		case G_UNICODE_OTHER_LETTER:
		case G_UNICODE_TITLECASE_LETTER:
		case G_UNICODE_UPPERCASE_LETTER:
			/* convert to lower case */
			*cur = g_unichar_tolower (*cur);
			/* ... and fall through */
		case G_UNICODE_DECIMAL_NUMBER:
		case G_UNICODE_LETTER_NUMBER:
		case G_UNICODE_OTHER_NUMBER:
		/* should we keep symbols? */
		case G_UNICODE_CURRENCY_SYMBOL:
		case G_UNICODE_MODIFIER_SYMBOL:
		case G_UNICODE_MATH_SYMBOL:
		case G_UNICODE_OTHER_SYMBOL:
			g_string_append_unichar (string, *cur);
			break;

		case G_UNICODE_UNASSIGNED:
			rb_debug ("unassigned unicode character type found");
			/* fall through */

		default:
			/* leave these in */
			g_string_append_unichar (string, *cur);
		}
	}

	g_free (unicode);
	g_free (normalized);

	/* Hand the accumulated character data to the caller and free only
	 * the GString wrapper. */
	return g_string_free (string, FALSE);
}