コード例 #1
0
/*
 * Build a Ruby string holding the encoding of a single code point.
 *
 * code_param is converted to a C int with NUM2INT.  If
 * utf8proc_codepoint_valid() rejects the value, rb_eArgError is raised;
 * otherwise the bytes produced by utf8proc_encode_char() are returned as
 * a new Ruby string.  self is not used.
 */
static VALUE utf8proc_ruby_char(VALUE self, VALUE code_param) {
  char encoded[4];
  ssize_t length;
  int codepoint = NUM2INT(code_param);

  if (!utf8proc_codepoint_valid(codepoint)) {
    rb_raise(rb_eArgError, "Invalid Unicode code point");
  }

  /* length is whatever the encoder reports it wrote into encoded. */
  length = utf8proc_encode_char(codepoint, encoded);
  return rb_str_new(encoded, length);
}
コード例 #2
0
ファイル: utf8proc.c プロジェクト: SeamusConnor/tmux
/*
 * Encode the wide character wc into the buffer s.
 *
 * Returns 0 when s is NULL, -1 when utf8proc_codepoint_valid() rejects
 * wc, and otherwise the value returned by utf8proc_encode_char().
 */
int
utf8proc_wctomb(char *s, wchar_t wc)
{
	/* Nothing to write into. */
	if (s == NULL)
		return (0);

	if (utf8proc_codepoint_valid(wc))
		return (utf8proc_encode_char(wc, s));

	return (-1);
}
コード例 #3
0
ファイル: init.c プロジェクト: AlexShiLucky/rtems
/*
 * Check utf8proc_encode_char() for every value in 0..127: each one must
 * be reported as a single byte, and that byte must equal the value.
 */
static void
test_utf8proc_encode_char ( void )
{
  uint8_t encoded[4];
  int32_t code_point = 0;

  while ( code_point < 128 ) {
    ssize_t length = utf8proc_encode_char ( code_point, encoded );

    rtems_test_assert ( length == 1 );
    rtems_test_assert ( encoded[0] == (uint8_t)code_point );

    ++code_point;
  }
}
コード例 #4
0
ファイル: tmarcrec.cpp プロジェクト: NatLibFi/usemarcon
/**
 * Return a copy of a_str with XML character references unescaped.
 *
 * Numeric references ("&#NNN;" decimal, "&#xHHH;" hexadecimal) are
 * replaced by the bytes utf8proc_encode_char() produces for the parsed
 * value.  Afterwards the five named entities &lt; &gt; &apos; &quot; and
 * &amp; are replaced by their literal characters.
 *
 * A "&#..." sequence that has no terminating ';' anywhere after it is
 * left untouched.
 */
typestr TMarcRecord::unescape_xml(const typestr & a_str)
{
    typestr new_str = a_str;

    // Convert numeric entities
    char* p = new_str.str();
    while ((p = strchr(p, '&')) != NULL)
    {
        char* p2 = p + 1;
        if (*p2 == '#')
        {
            ++p2;
            // Both %u and %x store through an unsigned int pointer.
            unsigned int value = 0;
            const char* mask = "%u;";
            if (*p2 == 'x')
            {
                ++p2;
                mask = "%x;";
            }
            if (sscanf(p2, mask, &value) == 1)
            {
                // sscanf() reports 1 as soon as the number is converted,
                // whether or not the ';' in the mask matched, so the
                // terminator has to be located with a NUL-bounded scan.
                const char* end = p;
                while (*end != ';' && *end != '\0')
                {
                    ++end;
                }
                if (*end == '\0')
                {
                    // No ';' remains after this '&', so neither this nor
                    // any later numeric reference can be complete.
                    break;
                }

                // Collect the full reference text, "&#...;".
                typestr source;
                for (const char* q = p; q != end; ++q)
                {
                    source.append_char(*q);
                }
                source.append_char(';');

                char utf8[10];
                int chars = utf8proc_encode_char((int32_t)value, (uint8_t *)utf8);
                if (chars < 0)
                {
                    chars = 0;
                }
                utf8[chars] = '\0';
                if (utf8[0] == '\0')
                {
                    // Nothing is inserted (value rejected, or value 0
                    // encoded as a NUL byte), so do not step over the
                    // character that now sits at p.
                    chars = 0;
                }

                // NOTE(review): assumes replace() leaves the buffer that p
                // points into in place - confirm against typestr.
                new_str.replace(source.cstr(), utf8, p - new_str.str());
                p += chars;
                continue;
            }
        }
        ++p;
    }

    // &amp; goes last so the '&' it produces is not picked up by the
    // replacements above it.
    new_str.replace("&lt;", "<");
    new_str.replace("&gt;", ">");
    new_str.replace("&apos;", "'");
    new_str.replace("&quot;", "\"");
    new_str.replace("&amp;", "&");

    return new_str;
}
コード例 #5
0
/*
 * Build a string tree from a simplified regex of len bytes.
 *
 * The input is decoded one code point at a time:
 *   - "[...]" collects code points into a set; at the closing bracket
 *     each member is added as an alternative and the token is finalized.
 *   - "{...}" adds the enclosed bytes as one string.
 *   - "(" and ")" are skipped.
 *   - any other code point is added as its own finalized token.
 *
 * Returns the new tree, or NULL (after destroying the partial tree) when
 * utf8proc_iterate() reports an error or a zero length.  A zero-length
 * input yields a tree holding a single empty token.
 */
string_tree_t *regex_string_tree(char *regex, size_t len) {
    uint8_t *char_ptr = (uint8_t *)regex;
    bool in_set = false;
    bool in_brackets = false;

    int32_t codepoint;
    // NOTE(review): last_codepoint is never assigned after this
    // initialization, so every "last_codepoint != BACKSLASH_CODEPOINT"
    // test below is always true and a backslash does not escape the
    // character after it. Left unchanged here; confirm the intent before
    // altering it, since existing rule data may rely on this.
    int32_t last_codepoint = 0;
    ssize_t char_len;

    size_t bracket_start = 0;
    size_t bracket_len = 0;

    char temp_char[MAX_UTF8_CHAR_SIZE];
    ssize_t temp_char_len;

    string_tree_t *tree = string_tree_new();

    if (len == 0) {
        // Single token with zero-length
        string_tree_add_string_len(tree, regex, len);
        string_tree_finalize_token(tree);
        return tree;
    }

    uint32_array *char_set = uint32_array_new();

    size_t idx = 0;

    int j;

    bool add_to_index = false;

    while (idx < len) {
        // Only the bytes that remain may be read: passing the total
        // length would let a truncated trailing sequence be decoded from
        // memory beyond the end of the buffer.
        char_len = utf8proc_iterate(char_ptr, (ssize_t)(len - idx), &codepoint);
        if (char_len <= 0) {
            uint32_array_destroy(char_set);
            string_tree_destroy(tree);
            return NULL;
        }

        // Skip over anything that decoded to an invalid code point.
        if (!(utf8proc_codepoint_valid(codepoint))) {
            idx += char_len;
            char_ptr += char_len;
            continue;
        }

        add_to_index = true;

        if (codepoint == LSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            log_debug("begin set\n");
            in_set = true;
            codepoint = BEGIN_SET_CODEPOINT;
            uint32_array_clear(char_set);
        } else if (codepoint == RSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT && in_set) {
            log_debug("end set");

            // Flush the queued set members as alternatives of one token.
            for (j = 0; j < char_set->n; j++) {
                temp_char_len = utf8proc_encode_char(char_set->a[j], (uint8_t *)temp_char);
                log_debug("Adding string %.*s\n", (int)temp_char_len, temp_char);
                string_tree_add_string_len(tree, temp_char, temp_char_len);
            }
            string_tree_finalize_token(tree);

            uint32_array_clear(char_set);
            // Add a special codepoint to the sequence to distinguish from an escaped square bracket
            codepoint = END_SET_CODEPOINT;
            in_set = false;
        } else if (codepoint == LCURLY_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            in_brackets = true;
            // The bracketed string starts right after the opening brace.
            bracket_start = idx + char_len;
            bracket_len = 0;
            add_to_index = false;
        } else if (codepoint == RCURLY_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT && in_brackets) {
            log_debug("Adding bracketed string: %.*s\n", (int) bracket_len, regex + bracket_start);
            string_tree_add_string_len(tree, regex + bracket_start, bracket_len);
            in_brackets = false;
        } else if ((codepoint == LPAREN_CODEPOINT || codepoint == RPAREN_CODEPOINT) && last_codepoint != BACKSLASH_CODEPOINT) {
            log_debug("group\n");
            add_to_index = false;
        } else if (in_set) {
            log_debug("in set\n");
            // Queue node, we'll add them to the trie
            uint32_array_push(char_set, codepoint);
            add_to_index = false;
        } else if (in_brackets) {
            add_to_index = false;
            bracket_len += char_len;
        } else if (codepoint == BACKSLASH_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            add_to_index = false;
        }

        log_debug("codepoint = %d\n", codepoint);

        if (add_to_index) {
            temp_char_len = utf8proc_encode_char(codepoint, (uint8_t *)temp_char);
            log_debug("char = %.*s\n", (int)temp_char_len, temp_char);
            string_tree_add_string_len(tree, temp_char, temp_char_len);
            string_tree_finalize_token(tree);
        }

        idx += char_len;
        char_ptr += char_len;
    }

    uint32_array_destroy(char_set);

    return tree;

}
コード例 #6
0
ファイル: houdini_html_u.c プロジェクト: bobfridley/cmark
/*
 * Try to unescape one HTML entity whose text (after the '&') starts at src.
 *
 * Numeric form: "#" followed by decimal digits, or "#x"/"#X" followed by
 * hex digits, then ';'.  Between 1 and 8 digits are accepted.  A value of
 * 0, a value in 0xD800..0xDFFF, or a value of 0x110000 and above is
 * replaced by 0xFFFD before being passed to utf8proc_encode_char().
 *
 * Named form: the bytes up to the first ';' are looked up with
 * find_entity(); on a hit, up to 4 bytes of the entity's utf8 field are
 * appended to ob.  The search stops at a space or after MAX_WORD_LENGTH
 * bytes.
 *
 * Returns the number of bytes of src consumed (including the ';'), or 0
 * when nothing was unescaped.
 */
size_t
houdini_unescape_ent(cmark_strbuf *ob, const uint8_t *src, size_t size)
{
	size_t pos = 0;
	int codepoint = 0;
	int num_digits = 0;

	if (size < 3 || src[0] != '#') {
		/* Named entity. */
		if (size > MAX_WORD_LENGTH)
			size = MAX_WORD_LENGTH;

		for (pos = MIN_WORD_LENGTH; pos < size; ++pos) {
			const struct html_ent *entity;
			int len;

			if (src[pos] == ' ')
				return 0;

			if (src[pos] != ';')
				continue;

			entity = find_entity((char *)src, pos);
			if (entity == NULL)
				return 0;

			/* Length of the utf8 field, capped at 4 bytes. */
			for (len = 0; len < 4; ++len) {
				if (entity->utf8[len] == '\0')
					break;
			}
			cmark_strbuf_put(ob, entity->utf8, len);
			return pos + 1;
		}

		return 0;
	}

	/* Numeric entity. */
	if (_isdigit(src[1])) {
		pos = 1;
		while (pos < size && _isdigit(src[pos])) {
			codepoint = (codepoint * 10) + (src[pos] - '0');

			/* Keep counting digits but avoid integer overflow. */
			if (codepoint >= 0x110000)
				codepoint = 0x110000;

			++pos;
		}

		num_digits = pos - 1;
	} else if (src[1] == 'x' || src[1] == 'X') {
		pos = 2;
		while (pos < size && _isxdigit(src[pos])) {
			/* Maps '0'-'9' to 0-9 and 'a'-'f'/'A'-'F' to 10-15. */
			codepoint = (codepoint * 16) + ((src[pos] | 32) % 39 - 9);

			/* Keep counting digits but avoid integer overflow. */
			if (codepoint >= 0x110000)
				codepoint = 0x110000;

			++pos;
		}

		num_digits = pos - 2;
	}

	if (num_digits < 1 || num_digits > 8)
		return 0;

	if (pos >= size || src[pos] != ';')
		return 0;

	if (codepoint == 0 ||
	    (codepoint >= 0xD800 && codepoint < 0xE000) ||
	    codepoint >= 0x110000) {
		codepoint = 0xFFFD;
	}

	utf8proc_encode_char(codepoint, ob);
	return pos + 1;
}
コード例 #7
0
ファイル: render.c プロジェクト: apache/lucy-clownfish
/*
 * Pass the code point c and the renderer's buffer to
 * utf8proc_encode_char(), then advance the renderer's column by one.
 */
void
cmark_render_code_point(cmark_renderer *renderer, uint32_t c)
{
	utf8proc_encode_char(c, renderer->buffer);
	++renderer->column;
}