// return NFC-normalized UTF8-encoded version of s static char *normalize(char *s) { static size_t buflen = 0; static void *buf = NULL; // persistent buffer (avoid repeated malloc/free) // options equivalent to utf8proc_NFC: const int options = UTF8PROC_NULLTERM|UTF8PROC_STABLE|UTF8PROC_COMPOSE; ssize_t result; size_t newlen; result = utf8proc_decompose((uint8_t*) s, 0, NULL, 0, options); if (result < 0) goto error; newlen = result * sizeof(int32_t) + 1; if (newlen > buflen) { buflen = newlen * 2; buf = realloc(buf, buflen); if (!buf) lerror(MemoryError, "error allocating UTF8 buffer"); } result = utf8proc_decompose((uint8_t*)s,0, (int32_t*)buf,result, options); if (result < 0) goto error; result = utf8proc_reencode((int32_t*)buf,result, options); if (result < 0) goto error; return (char*) buf; error: lerrorf(symbol("error"), "error normalizing identifier %s: %s", s, utf8proc_errmsg(result)); }
/*
 * Decompose a plain ASCII string with utf8proc_decompose(), re-encode it
 * with utf8proc_reencode(), and check that the round trip reproduces the
 * input byte for byte (including the terminating NUL).
 */
static void test_utf8proc_reencode ( void )
{
  char         string_simple[]    = "The quick brown.fox";
  uint8_t     *string_simple_utf8 = (uint8_t*)(&string_simple[0]);
  int32_t      string_decomposed[sizeof ( string_simple ) * 4];
  /* utf8proc_reencode() works in place, so the UTF-8 output shares storage
   * with the code point array. */
  uint8_t     *string_reencoded   = (uint8_t*)(&string_decomposed[0]);
  ssize_t      chars_written;
  unsigned int index;

  memset ( &string_decomposed[0], 0, sizeof ( string_decomposed ) );

  /* The buffer size argument is a number of code points, not of bytes. */
  chars_written = utf8proc_decompose (
    string_simple_utf8,
    sizeof ( string_simple ),
    &string_decomposed[0],
    sizeof ( string_decomposed ) / sizeof ( string_decomposed[0] ),
    UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE );
  rtems_test_assert ( chars_written == (ssize_t) strlen ( string_simple ) );

  chars_written = utf8proc_reencode (
    &string_decomposed[0],
    chars_written,
    UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE );
  rtems_test_assert ( chars_written == (ssize_t) strlen ( string_simple ) );

  /* Our source string contains only very simple characters. Thus the above
   * decomposition should result in exactly the same string */
  for ( index = 0; index < sizeof ( string_simple ); ++index ) {
    rtems_test_assert ( string_simple_utf8[index] == string_reencoded[index] );
  }
}
/*
 * Run every token of `inversion` through utf8proc_decompose() followed by
 * utf8proc_reencode(), both driven by self->options, and store the result
 * back into the token. Tokens for which either step reports an error keep
 * their original text. The inversion is reset and returned via INCREF().
 */
Inversion*
Normalizer_transform(Normalizer *self, Inversion *inversion) {
    // One slot more than the advertised capacity: utf8proc_reencode adds a
    // terminating null char.
    int32_t  stack_buf[INITIAL_BUFSIZE + 1];
    int32_t *codepoints = stack_buf;
    ssize_t  capacity   = INITIAL_BUFSIZE;
    Token   *token;

    for (;;) {
        ssize_t count;

        token = Inversion_Next(inversion);
        if (token == NULL) {
            break;
        }

        count = utf8proc_decompose((uint8_t*)token->text, token->len,
                                   codepoints, capacity, self->options);

        if (count > capacity) {
            // Scratch space too small: drop any previous heap buffer and
            // get one with INITIAL_BUFSIZE items of headroom, then retry.
            if (codepoints != stack_buf) {
                FREEMEM(codepoints);
            }
            capacity   = count + INITIAL_BUFSIZE;
            codepoints = (int32_t*)MALLOCATE((capacity + 1) * sizeof(int32_t));
            count = utf8proc_decompose((uint8_t*)token->text, token->len,
                                       codepoints, capacity, self->options);
        }
        if (count < 0) {
            continue;
        }

        count = utf8proc_reencode(codepoints, count, self->options);
        if (count < 0) {
            continue;
        }

        // Only replace the token's storage when the result grew.
        if (count > (ssize_t)token->len) {
            FREEMEM(token->text);
            token->text = (char*)MALLOCATE(count + 1);
        }
        // count + 1 also copies the terminating null char.
        memcpy(token->text, codepoints, count + 1);
        token->len = count;
    }

    if (codepoints != stack_buf) {
        FREEMEM(codepoints);
    }

    Inversion_Reset(inversion);
    return (Inversion*)INCREF(inversion);
}
/*
 * Apply the utf8proc transformation selected by options_param to str_param
 * and return the result as a new Ruby String.
 *
 * str_param     - converted with StringValue().
 * options_param - converted with NUM2INT(); UTF8PROC_NULLTERM is masked out
 *                 because the explicit string length is always passed.
 *
 * utf8proc errors are handed to utf8proc_ruby_map_error(). The scratch
 * buffer is owned by a Data object (env_obj) so that it can be released by
 * utf8proc_ruby_mapenv_free if the function is left abnormally.
 */
VALUE utf8proc_ruby_map(VALUE self, VALUE str_param, VALUE options_param) {
  VALUE str;
  int options;
  VALUE env_obj;
  utf8proc_ruby_mapenv_t *env;
  ssize_t result;
  VALUE retval;

  str = StringValue(str_param);
  options = NUM2INT(options_param) & ~UTF8PROC_NULLTERM;
  env_obj = Data_Make_Struct(rb_cObject, utf8proc_ruby_mapenv_t, NULL,
    utf8proc_ruby_mapenv_free, env);
  env->buffer = 0;

  /* First pass without a buffer: only determines the code point count. */
  result = utf8proc_decompose((uint8_t *)RSTRING_PTR(str), RSTRING_LEN(str),
    NULL, 0, options);
  if (result < 0) goto fail;

  /* One extra element for the terminator written by utf8proc_reencode. */
  env->buffer = ALLOC_N(int32_t, result+1);
  result = utf8proc_decompose((uint8_t *)RSTRING_PTR(str), RSTRING_LEN(str),
    env->buffer, result, options);
  if (result < 0) goto fail;

  result = utf8proc_reencode(env->buffer, result, options);
  if (result < 0) goto fail;

  retval = rb_str_new((char *)env->buffer, result);
  /* ALLOC_N memory comes from Ruby's allocator and is returned with xfree. */
  xfree(env->buffer);
  env->buffer = 0;
  /* Keep env_obj visible to the GC until env is no longer touched;
     otherwise it could be collected (and env freed) while still in use. */
  RB_GC_GUARD(env_obj);
  return retval;

fail:
  if (env->buffer) {
    xfree(env->buffer);
    env->buffer = 0;
  }
  RB_GC_GUARD(env_obj);
  utf8proc_ruby_map_error(result);
  return Qnil; /* needed to prevent problems with optimization */
}
vmod_normalize(VRT_CTX, VCL_STRING s) { char *p; utf8proc_ssize_t len; unsigned u; int options; CHECK_OBJ_NOTNULL(ctx, VRT_CTX_MAGIC); if (!s || !*s) { VSLb(ctx->vsl, SLT_Error, "vsf.normalize: No input"); return (NULL); } len = strlen(s); assert(len > 0); u = WS_Reserve(ctx->ws, 0); if (u < len * sizeof(utf8proc_int32_t) + 1) { VSLb(ctx->vsl, SLT_Error, "vsf.normalize: Out of workspace"); WS_Release(ctx->ws, 0); return (NULL); } p = ctx->ws->f; options = UTF8PROC_STABLE | UTF8PROC_COMPAT | UTF8PROC_COMPOSE | UTF8PROC_IGNORE | UTF8PROC_NLF2LF | UTF8PROC_LUMP | UTF8PROC_STRIPMARK; len = utf8proc_decompose((utf8proc_uint8_t *)s, len, (utf8proc_int32_t *)p, len, options); if (len < 0) { VSLb(ctx->vsl, SLT_Error, "vsf.normalize: utf8proc_decompose: %s", utf8proc_errmsg(len)); WS_Release(ctx->ws, 0); return (NULL); } assert(len * sizeof(utf8proc_int32_t) + 1 < u); len = utf8proc_reencode((utf8proc_int32_t *)p, len, options); assert(len > 0); WS_Release(ctx->ws, len + 1); return (p); }
vmod_transform(VRT_CTX, VCL_STRING s, VCL_INT options) { char *p; utf8proc_ssize_t len; unsigned u; CHECK_OBJ_NOTNULL(ctx, VRT_CTX_MAGIC); if (!s) { VSLb(ctx->vsl, SLT_Error, "utf8.transform: No input"); return (NULL); } /* Use composed if not specified. */ if ((options & UTF8PROC_STRIPMARK) && (options & (UTF8PROC_COMPOSE | UTF8PROC_DECOMPOSE)) == 0) options |= UTF8PROC_COMPOSE; /* Input is NULL terminated. */ options |= UTF8PROC_NULLTERM; u = WS_Reserve(ctx->ws, 0); p = ctx->ws->f; len = utf8proc_decompose((utf8proc_uint8_t *)s, 0 /* IGNORED */, (utf8proc_int32_t *)p, u, options); if (len < 0) { VSLb(ctx->vsl, SLT_Error, "utf8.transform: utf8proc_decompose: %s", utf8proc_errmsg(len)); WS_Release(ctx->ws, 0); return (NULL); } len = utf8proc_reencode((utf8proc_int32_t *)p, len, options); assert(len > 0); assert(len < u); WS_Release(ctx->ws, len + 1); return (p); }