/*
 * Decode the single UTF-8 character starting at p into a UCS-4 value
 * (handy when characters have to be compared).
 *
 * Returns LDAP_UCS4_INVALID when LDAP_UTF8_CHARLEN2 yields a length of
 * zero for the sequence, or when any byte after the first is not of the
 * form 10xxxxxx.  p is dereferenced unconditionally, so it must not be
 * NULL.
 */
ldap_ucs4_t
ldap_x_utf8_to_ucs4( const char * p )
{
	/* payload bits carried by the first byte, indexed by sequence length */
	static unsigned char mask[] = {
		0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
	const unsigned char *bytes = (const unsigned char *) p;
	const unsigned char *tail;
	ldap_ucs4_t ucs;
	int nbytes, remaining;

	nbytes = LDAP_UTF8_CHARLEN2(p, nbytes);
	if ( nbytes == 0 ) {
		return LDAP_UCS4_INVALID;
	}

	/* strip the length tag from the lead byte */
	ucs = bytes[0] & mask[nbytes];

	/* every continuation byte contributes its low six bits */
	tail = bytes + 1;
	for ( remaining = nbytes - 1; remaining > 0; remaining-- ) {
		if ( (*tail & 0xc0) != 0x80 ) {
			return LDAP_UCS4_INVALID;
		}
		ucs <<= 6;
		ucs |= *tail & 0x3f;
		tail++;
	}

	return ucs;
}
/*-----------------------------------------------------------------------------
   Decode one UTF-8 character into a wide character.

   wchar     receives the decoded character; may be NULL when only the
             byte length is wanted.
   utf8char  points at the first byte of the UTF-8 character.

   Returns the number of bytes the UTF-8 character occupies, or -1 when
   utf8char is NULL, the sequence length is 0 or exceeds
   LDAP_MAX_UTF8_LEN, or a continuation byte is not of the form 10xxxxxx.

   NOTE(review): 'mask' is not declared in this function; it is presumably
   a file-scope table of lead-byte payload masks indexed by length.
*/
int
ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
{
	int nbytes, pos;
	wchar_t wc;

	if ( !utf8char ) {
		return -1;
	}

	/* The first byte determines how long the sequence is */
	nbytes = LDAP_UTF8_CHARLEN2(utf8char, nbytes);
	if ( nbytes == 0 ) {
		return -1;
	}
	if ( nbytes > (int)LDAP_MAX_UTF8_LEN ) {
		return -1;
	}

	/* Keep only the payload bits of the first byte */
	wc = (wchar_t)(utf8char[0] & mask[nbytes]);

	pos = 1;
	while ( pos < nbytes ) {
		/* Continuation bytes carry the 10 tag plus six data bits */
		if ( (utf8char[pos] & 0xc0) != 0x80 ) {
			return -1;
		}
		wc <<= 6;
		wc |= (wchar_t)(utf8char[pos] & 0x3f);
		pos++;
	}

	if ( wchar != NULL ) {
		*wchar = wc;
	}

	return nbytes;
}
/*-----------------------------------------------------------------------------
   Decode a UTF-8 string into a wide character string.

   wcstr    output buffer; may be NULL, in which case nothing is stored
            and 'count' is ignored (only the length is computed).
   utf8str  NUL-terminated UTF-8 input; NULL is treated like "".
   count    maximum number of wide chars written to wcstr.

   Returns the number of wide chars converted, not counting the null
   terminator, or -1 on an invalid UTF-8 sequence.  A terminator is
   stored only when fewer than 'count' wide chars were written.

   NOTE(review): 'mask' is not declared in this function; it is presumably
   a file-scope table of lead-byte payload masks indexed by length.
*/
int
ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
{
	const char *src = utf8str;
	size_t nwide = 0;

	/* Nothing to convert: hand back an empty string */
	if ( src == NULL || *src == '\0' ) {
		if ( wcstr != NULL ) {
			*wcstr = 0;
		}
		return 0;
	}

	while ( *src != '\0' ) {
		int nbytes, k;
		wchar_t wc;

		/* The output limit only applies when there is an output buffer */
		if ( wcstr != NULL && nwide >= count ) {
			break;
		}

		/* The first byte determines how long the sequence is */
		nbytes = LDAP_UTF8_CHARLEN2(src, nbytes);
		if ( nbytes == 0 || nbytes > (int)LDAP_MAX_UTF8_LEN ) {
			return -1;
		}

		/* Keep only the payload bits of the first byte */
		wc = (wchar_t)(src[0] & mask[nbytes]);

		for ( k = 1; k < nbytes; k++ ) {
			/* Continuation bytes carry the 10 tag plus six data bits */
			if ( (src[k] & 0xc0) != 0x80 ) {
				return -1;
			}
			wc <<= 6;
			wc |= (wchar_t)(src[k] & 0x3f);
		}

		if ( wcstr != NULL ) {
			wcstr[nwide] = wc;
		}

		src += nbytes;	/* step over the character just decoded */
		nwide++;	/* one more wide char stored/required */
	}

	/* Terminate the output only if the buffer still has a free slot */
	if ( wcstr != NULL && nwide < count ) {
		wcstr[nwide] = 0;
	}

	return (int)nwide;
}
/*
 * Make sure at least 'need' bytes are available in *out past 'outpos'.
 * Returns 0 on success.  Returns -1 if realloc fails; in that case *out
 * and *outsize are left untouched and the caller still owns *out.
 */
static int
utf8bv_reserve( char **out, int *outsize, int outpos, int need )
{
	char *tmp;
	int newsize;

	if ( *outsize - outpos >= need ) {
		return 0;
	}

	newsize = outpos + need;
	tmp = (char *) realloc( *out, newsize );
	if ( tmp == NULL ) {
		return -1;
	}

	*out = tmp;
	*outsize = newsize;
	return 0;
}

/*
 * Normalize the UTF-8 string in bv and return the result via ber_str2bv()
 * (or ber_dupbv() for an empty input).
 *
 * flags:
 *   LDAP_UTF8_CASEFOLD  lower-case the text (TOLOWER for ASCII bytes,
 *                       uctolower for decoded code points).
 *   LDAP_UTF8_APPROX    after compatibility decomposition keep only code
 *                       points below 0x80; otherwise the decomposed text
 *                       is passed through uccanoncomp() and re-encoded
 *                       as UTF-8.
 *
 * Runs of ASCII bytes are copied straight through.  The ASCII character
 * immediately preceding a non-ASCII run is decoded together with that
 * run.
 *
 * Returns NULL if bv is NULL, on allocation failure, or when the input
 * contains a malformed UTF-8 sequence.
 *
 * NOTE(review): bv->bv_val is read one byte past a truncated trailing
 * sequence; this presumably relies on bv_val being NUL-terminated.
 * NOTE(review): the result of uccompatdecomp() is not checked here;
 * confirm how it reports failure.
 */
struct berval *
UTF8bvnormalize( struct berval *bv, struct berval *newbv, unsigned flags )
{
	int i, j, len, clen, outpos, ucsoutlen, outsize;
	char *out, *s;
	unsigned long *ucs, *p, *ucsout;
	unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
	unsigned approx = flags & LDAP_UTF8_APPROX;
	/* payload bits of the lead byte, indexed by sequence length */
	static const unsigned char mask[] = {
		0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };

	if ( bv == NULL ) {
		return NULL;
	}

	s = bv->bv_val;
	len = bv->bv_len;

	if ( len == 0 ) {
		return ber_dupbv( newbv, bv );
	}

	/* FIXME: Should first check to see if string is already in
	 * proper normalized form. This is almost as time consuming
	 * as the normalization though.
	 */

	/* finish off everything up to character before first non-ascii */
	if ( LDAP_UTF8_ISASCII( s ) ) {
		if ( casefold ) {
			outsize = len + 7;
			out = (char *) malloc( outsize );
			if ( out == NULL ) {
				return NULL;
			}
			outpos = 0;

			for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
				out[outpos++] = TOLOWER( s[i-1] );
			}
			if ( i == len ) {
				/* pure ASCII input: done */
				out[outpos++] = TOLOWER( s[len - 1] );
				out[outpos] = '\0';
				return ber_str2bv( out, outpos, 0, newbv);
			}
		} else {
			for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
				/* empty */
			}

			if ( i == len ) {
				/* pure ASCII input, no folding: plain copy */
				return ber_str2bv( s, len, 1, newbv );
			}

			outsize = len + 7;
			out = (char *) malloc( outsize );
			if ( out == NULL ) {
				return NULL;
			}
			/* s[i - 1] is held back; it is decoded with the non-ascii run */
			outpos = i - 1;
			memcpy(out, s, outpos);
		}
	} else {
		outsize = len + 7;
		out = (char *) malloc( outsize );
		if ( out == NULL ) {
			return NULL;
		}
		outpos = 0;
		i = 0;
	}

	/* every decoded character consumes at least one input byte */
	p = ucs = malloc( len * sizeof(*ucs) );
	if ( ucs == NULL ) {
		free(out);
		return NULL;
	}

	/* convert character before first non-ascii to ucs-4 */
	if ( i > 0 ) {
		*p = casefold ? TOLOWER( s[i - 1] ) : s[i - 1];
		p++;
	}

	/* s[i] is now first non-ascii character */
	for (;;) {
		/* s[i] is non-ascii */
		/* convert everything up to next ascii to ucs-4 */
		while ( i < len ) {
			clen = LDAP_UTF8_CHARLEN2( s + i, clen );
			if ( clen == 0 ) {
				free( ucs );
				free( out );
				return NULL;
			}
			if ( clen == 1 ) {
				/* ascii */
				break;
			}
			*p = s[i] & mask[clen];
			i++;
			for( j = 1; j < clen; j++ ) {
				if ( (s[i] & 0xc0) != 0x80 ) {
					free( ucs );
					free( out );
					return NULL;
				}
				*p <<= 6;
				*p |= s[i] & 0x3f;
				i++;
			}
			if ( casefold ) {
				*p = uctolower( *p );
			}
			p++;
		}

		/* normalize ucs of length p - ucs */
		uccompatdecomp( ucs, p - ucs, &ucsout, &ucsoutlen );
		if ( approx ) {
			/*
			 * At most ucsoutlen bytes are emitted here.  Decomposition
			 * can yield more ASCII characters than the input had bytes,
			 * so the initial size estimate is not a safe bound; keep one
			 * extra byte for the terminator.
			 */
			if ( utf8bv_reserve( &out, &outsize, outpos,
				( ucsoutlen > 0 ? ucsoutlen : 0 ) + 1 ) != 0 )
			{
				free( out );
				free( ucs );
				free( ucsout );
				return NULL;
			}
			for ( j = 0; j < ucsoutlen; j++ ) {
				if ( ucsout[j] < 0x80 ) {
					out[outpos++] = ucsout[j];
				}
			}
		} else {
			ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
			/* convert ucs to utf-8 and store in out */
			for ( j = 0; j < ucsoutlen; j++ ) {
				/* allocate more space if not enough room for
				   6 bytes and terminator */
				if ( outsize - outpos < 7 ) {
					if ( utf8bv_reserve( &out, &outsize, outpos,
						ucsoutlen - j + 6 ) != 0 )
					{
						free( out );
						free( ucs );
						free( ucsout );
						return NULL;
					}
				}
				outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
			}
		}

		free( ucsout );
		ucsout = NULL;

		if ( i == len ) {
			break;
		}

		/* s[i] is ascii */
		/*
		 * The ASCII run copied below plus the terminator needs up to
		 * len - i + 1 bytes.  The normalized text written so far may be
		 * longer than the input it came from, so make sure the buffer
		 * really has that much room before copying.
		 */
		if ( utf8bv_reserve( &out, &outsize, outpos, len - i + 1 ) != 0 ) {
			free( out );
			free( ucs );
			return NULL;
		}

		/* finish off everything up to char before next non-ascii */
		for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
			out[outpos++] = casefold ? TOLOWER( s[i-1] ) : s[i-1];
		}
		if ( i == len ) {
			out[outpos++] = casefold ? TOLOWER( s[len - 1] ) : s[len - 1];
			break;
		}

		/* convert character before next non-ascii to ucs-4 */
		*ucs = casefold ? TOLOWER( s[i - 1] ) : s[i - 1];
		p = ucs + 1;
	}

	free( ucs );
	out[outpos] = '\0';
	return ber_str2bv( out, outpos, 0, newbv );
}