示例#1
0
// return NFC-normalized UTF8-encoded version of s
static char *normalize(char *s)
{
    static size_t buflen = 0;
    static void *buf = NULL; // persistent buffer (avoid repeated malloc/free)
    // options equivalent to utf8proc_NFC:
    const int options = UTF8PROC_NULLTERM|UTF8PROC_STABLE|UTF8PROC_COMPOSE;
    ssize_t result;
    size_t newlen;
    result = utf8proc_decompose((uint8_t*) s, 0, NULL, 0, options);
    if (result < 0) goto error;
    newlen = result * sizeof(int32_t) + 1;
    if (newlen > buflen) {
        buflen = newlen * 2;
        buf = realloc(buf, buflen);
        if (!buf) lerror(MemoryError, "error allocating UTF8 buffer");
    }
    result = utf8proc_decompose((uint8_t*)s,0, (int32_t*)buf,result, options);
    if (result < 0) goto error;
    result = utf8proc_reencode((int32_t*)buf,result, options);
    if (result < 0) goto error;
    return (char*) buf;
error:
    lerrorf(symbol("error"), "error normalizing identifier %s: %s", s,
            utf8proc_errmsg(result));
}
示例#2
0
文件: init.c 项目: AlexShiLucky/rtems
/*
 * Round-trips a plain ASCII string through utf8proc_decompose() and
 * utf8proc_reencode() and checks that the bytes come back unchanged.
 */
static void
test_utf8proc_reencode ( void )
{
  char         string_simple[]    = "The quick brown.fox";
  uint8_t     *string_simple_utf8 = (uint8_t*)(&string_simple[0]);
  int32_t      string_decomposed[sizeof ( string_simple ) * 4];
  /* The re-encoded UTF-8 is read back from the start of the code point
   * buffer, so this pointer deliberately aliases string_decomposed. */
  uint8_t     *string_reencoded   = (uint8_t*)(&string_decomposed[0]);
  ssize_t      chars_written;
  unsigned int index;

  memset (&string_decomposed[0], 0, sizeof ( string_decomposed ) );

  /* The buffer size is given in code points (array elements), not bytes.
   * One element is held back so the terminator written when re-encoding
   * always fits. */
  chars_written = utf8proc_decompose (
    string_simple_utf8,
    sizeof ( string_simple ),
    &string_decomposed[0],
    sizeof ( string_decomposed ) / sizeof ( string_decomposed[0] ) - 1,
    UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE );
  rtems_test_assert ( chars_written == (ssize_t) strlen ( string_simple ) );

  chars_written = utf8proc_reencode (
    &string_decomposed[0],
    chars_written,
    UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE );
  rtems_test_assert ( chars_written == (ssize_t) strlen ( string_simple ) );
  /* Our source string contains only very simple characters. Thus the above
   * decomposition should result in exactly the same string
   * (the loop bound includes the terminating NUL).
   */
  for ( index = 0; index < sizeof ( string_simple ); ++index ) {
    rtems_test_assert ( string_simple_utf8[index] == string_reencoded[index] );
  }
}
示例#3
0
/*
 * Rewrites the text of every token in `inversion` by running it through
 * utf8proc_decompose() and utf8proc_reencode() with self->options.
 * Tokens for which either step reports an error (negative result) are left
 * as they are. Afterwards the inversion is reset and returned with an
 * additional reference.
 */
Inversion*
Normalizer_transform(Normalizer *self, Inversion *inversion) {
    // One slot beyond the advertised capacity: re-encoding appends a
    // terminating null char to the buffer.
    int32_t  stack_codepoints[INITIAL_BUFSIZE + 1];
    int32_t *codepoints = stack_codepoints;
    ssize_t  capacity   = INITIAL_BUFSIZE;
    Token   *tok;

    for (tok = Inversion_Next(inversion);
         tok != NULL;
         tok = Inversion_Next(inversion)
        ) {
        ssize_t count;
        ssize_t nbytes;

        count = utf8proc_decompose((uint8_t*)tok->text, tok->len,
                                   codepoints, capacity, self->options);

        if (count > capacity) {
            // Did not fit: switch to a heap buffer with INITIAL_BUFSIZE
            // items of headroom and decompose again.
            if (codepoints != stack_codepoints) {
                FREEMEM(codepoints);
            }
            capacity   = count + INITIAL_BUFSIZE;
            codepoints = (int32_t*)MALLOCATE((capacity + 1) * sizeof(int32_t));
            count = utf8proc_decompose((uint8_t*)tok->text, tok->len,
                                       codepoints, capacity, self->options);
        }
        if (count < 0) {
            continue;
        }

        nbytes = utf8proc_reencode(codepoints, count, self->options);
        if (nbytes < 0) {
            continue;
        }

        // The token's text buffer is replaced only when the result is
        // longer than the current text.
        if (nbytes > (ssize_t)tok->len) {
            FREEMEM(tok->text);
            tok->text = (char*)MALLOCATE(nbytes + 1);
        }
        // nbytes + 1 also copies the terminating null char.
        memcpy(tok->text, codepoints, nbytes + 1);
        tok->len = nbytes;
    }

    if (codepoints != stack_codepoints) {
        FREEMEM(codepoints);
    }

    Inversion_Reset(inversion);
    return (Inversion*)INCREF(inversion);
}
示例#4
0
/*
 * Runs str_param through utf8proc_decompose() and utf8proc_reencode() with
 * the flags given in options_param and returns the result as a new Ruby
 * string. Errors are handed to utf8proc_ruby_map_error(); Qnil is returned
 * if that call comes back.
 */
VALUE utf8proc_ruby_map(VALUE self, VALUE str_param, VALUE options_param) {
  utf8proc_ruby_mapenv_t *env;
  VALUE env_obj;
  VALUE input;
  VALUE mapped;
  ssize_t count;
  int flags;

  input = StringValue(str_param);
  /* The length is always passed explicitly, so NULLTERM is masked out. */
  flags = NUM2INT(options_param) & ~UTF8PROC_NULLTERM;
  /* NOTE(review): env is wrapped in a Ruby object, presumably so that
   * utf8proc_ruby_mapenv_free can release the buffer if an exception
   * unwinds this frame -- confirm against that function. */
  env_obj = Data_Make_Struct(rb_cObject, utf8proc_ruby_mapenv_t, NULL,
    utf8proc_ruby_mapenv_free, env);

  /* Sizing pass without an output buffer. */
  count = utf8proc_decompose(RSTRING_PTR(input), RSTRING_LEN(input),
    NULL, 0, flags);
  if (count < 0) {
    utf8proc_ruby_map_error(count);
    return Qnil;  /* needed to prevent problems with optimization */
  }

  env->buffer = ALLOC_N(int32_t, count+1);
  count = utf8proc_decompose(RSTRING_PTR(input), RSTRING_LEN(input),
    env->buffer, count, flags);
  if (count >= 0) {
    /* Re-encode only when the decomposition succeeded. */
    count = utf8proc_reencode(env->buffer, count, flags);
  }
  if (count < 0) {
    /* Either step failed: drop the buffer before reporting the error. */
    free(env->buffer);
    env->buffer = 0;
    utf8proc_ruby_map_error(count);
    return Qnil;  /* needed to prevent problems with optimization */
  }

  mapped = rb_str_new((char *)env->buffer, count);
  free(env->buffer);
  env->buffer = 0;
  return mapped;
}
示例#5
0
文件: vmod_vsf.c 项目: aaam/VSF
/*
 * Normalizes the NUL-terminated UTF-8 string s with a fixed option set
 * (compatibility composition, ignorable code points removed, newlines
 * folded, lumping, marks stripped).
 *
 * The result is built in the reserved workspace of ctx and returned from
 * there. Returns NULL, after logging an SLT_Error record, when s is NULL or
 * empty, when the workspace is too small, or when utf8proc reports an
 * error. An input that normalizes to nothing yields an empty string.
 */
vmod_normalize(VRT_CTX, VCL_STRING s)
{
	char *p;
	utf8proc_ssize_t len, cap;
	unsigned u;
	int options;

	CHECK_OBJ_NOTNULL(ctx, VRT_CTX_MAGIC);
	if (!s || !*s) {
		VSLb(ctx->vsl, SLT_Error, "vsf.normalize: No input");
		return (NULL);
	}
	len = strlen(s);
	assert(len > 0);

	u = WS_Reserve(ctx->ws, 0);
	if (u < len * sizeof(utf8proc_int32_t) + 1) {
		VSLb(ctx->vsl, SLT_Error, "vsf.normalize: Out of workspace");
		WS_Release(ctx->ws, 0);
		return (NULL);
	}
	p = ctx->ws->f;
	/*
	 * Capacity of the reservation in code points, keeping one byte back
	 * for the terminating NUL. u >= 5 here, so u - 1 cannot wrap.
	 */
	cap = (u - 1) / sizeof(utf8proc_int32_t);

	options = UTF8PROC_STABLE | UTF8PROC_COMPAT | UTF8PROC_COMPOSE |
	    UTF8PROC_IGNORE | UTF8PROC_NLF2LF | UTF8PROC_LUMP |
	    UTF8PROC_STRIPMARK;

	len = utf8proc_decompose((utf8proc_uint8_t *)s, len,
	    (utf8proc_int32_t *)p, cap, options);
	if (len < 0) {
		VSLb(ctx->vsl, SLT_Error,
		    "vsf.normalize: utf8proc_decompose: %s",
		    utf8proc_errmsg(len));
		WS_Release(ctx->ws, 0);
		return (NULL);
	}
	if (len > cap) {
		/*
		 * A result larger than the buffer is the required size; the
		 * buffer does not hold the complete decomposition (one code
		 * point can expand to several), so it must not be reencoded.
		 */
		VSLb(ctx->vsl, SLT_Error, "vsf.normalize: Out of workspace");
		WS_Release(ctx->ws, 0);
		return (NULL);
	}

	len = utf8proc_reencode((utf8proc_int32_t *)p, len, options);
	if (len < 0) {
		VSLb(ctx->vsl, SLT_Error,
		    "vsf.normalize: utf8proc_reencode: %s",
		    utf8proc_errmsg(len));
		WS_Release(ctx->ws, 0);
		return (NULL);
	}

	/*
	 * len may be 0 when every code point was ignored or stripped; that
	 * is a valid (empty) result, not a reason to abort.
	 */
	WS_Release(ctx->ws, len + 1);
	return (p);
}
示例#6
0
/*
 * Transforms the NUL-terminated UTF-8 string s with the caller-supplied
 * utf8proc option bits.
 *
 * UTF8PROC_COMPOSE is added when UTF8PROC_STRIPMARK is requested without
 * either COMPOSE or DECOMPOSE, and UTF8PROC_NULLTERM is always set.
 *
 * The result is built in the reserved workspace of ctx and returned from
 * there. Returns NULL, after logging an SLT_Error record, when s is NULL,
 * when the workspace is too small, or when utf8proc reports an error.
 * An empty input yields an empty string.
 */
vmod_transform(VRT_CTX, VCL_STRING s, VCL_INT options)
{
	char *p;
	utf8proc_ssize_t len, cap;
	unsigned u;

	CHECK_OBJ_NOTNULL(ctx, VRT_CTX_MAGIC);

	if (!s) {
		VSLb(ctx->vsl, SLT_Error, "utf8.transform: No input");
		return (NULL);
	}

	/* Use composed if not specified. */
	if ((options & UTF8PROC_STRIPMARK) &&
	    (options & (UTF8PROC_COMPOSE | UTF8PROC_DECOMPOSE)) == 0)
		options |= UTF8PROC_COMPOSE;
	/* Input is NULL terminated. */
	options |= UTF8PROC_NULLTERM;

	u = WS_Reserve(ctx->ws, 0);
	if (u == 0) {
		/* Not even room for the terminating NUL. */
		VSLb(ctx->vsl, SLT_Error, "utf8.transform: Out of workspace");
		WS_Release(ctx->ws, 0);
		return (NULL);
	}
	p = ctx->ws->f;
	/*
	 * utf8proc counts the buffer in code points, not bytes: convert the
	 * reservation, keeping one byte back for the terminating NUL.
	 */
	cap = (u - 1) / sizeof(utf8proc_int32_t);

	len = utf8proc_decompose((utf8proc_uint8_t *)s, 0 /* IGNORED */,
	    (utf8proc_int32_t *)p, cap, options);
	if (len < 0) {
		VSLb(ctx->vsl, SLT_Error,
		    "utf8.transform: utf8proc_decompose: %s",
		    utf8proc_errmsg(len));
		WS_Release(ctx->ws, 0);
		return (NULL);
	}
	if (len > cap) {
		/*
		 * A result larger than the buffer is the required size; the
		 * buffer holds only a partial decomposition.
		 */
		VSLb(ctx->vsl, SLT_Error, "utf8.transform: Out of workspace");
		WS_Release(ctx->ws, 0);
		return (NULL);
	}

	len = utf8proc_reencode((utf8proc_int32_t *)p, len, options);
	if (len < 0) {
		VSLb(ctx->vsl, SLT_Error,
		    "utf8.transform: utf8proc_reencode: %s",
		    utf8proc_errmsg(len));
		WS_Release(ctx->ws, 0);
		return (NULL);
	}

	/* len <= 4 * cap <= u - 1, so the string and its NUL fit in u. */
	WS_Release(ctx->ws, len + 1);
	return (p);
}