/*
 * Advances *str_p past any leading whitespace in a UTF-8 string.
 *
 * Bytes without the high bit are classified directly; anything else is
 * decoded with unicode_utf8_to_ucs4() first. Scanning stops at the first
 * non-whitespace character (which includes the terminating 0-byte) or at
 * the first sequence that fails to decode.
 *
 * Returns 0 on success, or the decoder's non-zero result if an invalid
 * sequence was hit; *str_p is then left pointing at that sequence.
 */
static int eatwhitespace_c(const char **str_p)
{
    const char *cursor = *str_p;
    int status = 0;

    while (1) {
        unsigned char lead = (unsigned char)*cursor;

        if (lead & 0x80) {
            /* multi-byte sequence: decode before classifying */
            ucs4_t wc;
            size_t nbytes;

            status = unicode_utf8_to_ucs4(&wc, cursor, &nbytes);
            if (status != 0) {
                log_warning("illegal character sequence in UTF8 string: %s\n", cursor);
                break;
            }
            if (!iswspace((wint_t)wc)) {
                break;
            }
            cursor += nbytes;
        }
        else {
            /* single-byte (ASCII) character */
            if (!iswspace(lead)) {
                break;
            }
            ++cursor;
        }
    }

    *str_p = cursor;
    return status;
}
/*
 * Looks up `key` in the token trie rooted at `root`.
 *
 * The key is consumed one UTF-8 character at a time; each character selects
 * a hash bucket in the current node and the bucket's chain is searched for a
 * reference carrying exactly that character.
 *
 * On a match the id of the node that was reached is stored in *result and
 * E_TOK_SUCCESS is returned. E_TOK_NOMATCH is returned for a missing root,
 * a NULL or empty key, a key that is not valid UTF-8, or a character with no
 * matching reference.
 */
int findtoken(const void *root, const char *key, variant *result)
{
    const tnode *node = (const tnode *)root;
    const char *pos = key;

    if (node == NULL || pos == NULL || *pos == '\0') {
        return E_TOK_NOMATCH;
    }

    for (;;) {
        const tref *link;
        ucs4_t wc;
        size_t nbytes;
        int slot;

        if (unicode_utf8_to_ucs4(&wc, pos, &nbytes) != 0) {
            /* encoding is broken. youch */
            log_error("findtoken | encoding error in '%s'\n", key);
            return E_TOK_NOMATCH;
        }
#if NODEHASHSIZE == 8
        slot = wc & 7;
#else
        slot = wc % NODEHASHSIZE;
#endif
        /* walk the bucket's chain until the character matches */
        for (link = node->next[slot]; link; link = link->nexthash) {
            if (link->ucs == wc) {
                break;
            }
        }
        pos += nbytes;
        if (link == NULL) {
            log_debug("findtoken | token not found '%s'\n", key);
            return E_TOK_NOMATCH;
        }
        node = link->node;
        if (*pos == '\0') {
            break;
        }
    }

    if (node == NULL) {
        log_debug("findtoken | token not found '%s'\n", key);
        return E_TOK_NOMATCH;
    }
    *result = node->id;
    return E_TOK_SUCCESS;
}
/*
 * Counts the non-ASCII (multi-byte UTF-8) characters in `s`.
 *
 * Returns the number of multi-byte characters seen. If an invalid sequence
 * is encountered, an error is logged and the count accumulated so far is
 * returned.
 */
static int count_umlaut(const char *s)
{
    int result = 0;
    const char *cp = s;

    while (*cp) {
        if ((unsigned char)*cp & 0x80) {
            ucs4_t ucs;
            size_t size;
            int err = unicode_utf8_to_ucs4(&ucs, cp, &size);

            if (err != 0) {
                log_error("illegal utf8 encoding %s at %s", s, cp);
                return result;
            }
            /* step over exactly the decoded sequence; advancing one byte
             * more would skip the next character, or run past the
             * terminating 0-byte when the string ends in a multi-byte
             * character */
            cp += size;
            ++result;
        }
        else {
            ++cp;
        }
    }
    return result;
}
/*
 * Advances states->current_token past the next token without copying it.
 *
 * Leading whitespace is skipped first. An unquoted token ends at the first
 * whitespace character (which is consumed); a quoted token ends at the
 * matching closing quote (also consumed). A character preceded by
 * ESCAPE_CHAR never terminates the token.
 *
 * Invalid UTF-8 is reported with a warning and skipped one byte at a time,
 * so the function always makes progress and never reads past the
 * terminating 0-byte.
 */
void skip_token(void)
{
    char quotechar = 0;
    int escaped = 0;

    eatwhitespace_c(&states->current_token);

    while (*states->current_token) {
        ucs4_t ucs;
        size_t len;
        unsigned char utf8_character = (unsigned char)states->current_token[0];

        if (~utf8_character & 0x80) {
            ucs = utf8_character;
            ++states->current_token;
        }
        else {
            int ret = unicode_utf8_to_ucs4(&ucs, states->current_token, &len);
            if (ret == 0) {
                states->current_token += len;
            }
            else {
                log_warning("illegal character sequence in UTF8 string: %s\n", states->current_token);
                /* drop the offending byte; without this the loop would
                 * spin on the same position forever */
                ++states->current_token;
                escaped = 0;
                continue;
            }
        }

        if (escaped) {
            /* the escaped character has been consumed as a whole and is
             * never treated as whitespace, quote or escape */
            escaped = 0;
            continue;
        }
        if (iswspace((wint_t)ucs) && quotechar == 0) {
            return;
        }
        switch (utf8_character) {
        case '"':
        case '\'':
            if (utf8_character == quotechar) {
                return;
            }
            /* only the first quote opens a quoted token; a different quote
             * character inside it is literal, as in parse_token() */
            if (quotechar == 0) {
                quotechar = (char)utf8_character;
            }
            break;
        case ESCAPE_CHAR:
            escaped = 1;
            break;
        default:
            break;
        }
    }
}
char *parse_token(const char **str, char *lbuf, size_t buflen) { char *cursor = lbuf; char quotechar = 0; bool escape = false; const char *ctoken = *str; if (!ctoken) { return 0; } eatwhitespace_c(&ctoken); if (!*ctoken) { if (buflen > 0) { *cursor = 0; } return 0; } while (*ctoken) { ucs4_t ucs; size_t len; bool copy = false; unsigned char utf8_character = *(unsigned char *)ctoken; if (~utf8_character & 0x80) { ucs = utf8_character; len = 1; } else { int ret = unicode_utf8_to_ucs4(&ucs, ctoken, &len); if (ret != 0) { log_warning("illegal character sequence in UTF8 string: %s\n", ctoken); break; } } if (escape) { copy = true; escape = false; } else if (iswspace((wint_t)ucs)) { if (quotechar == 0) break; copy = true; } else if (utf8_character == '"' || utf8_character == '\'') { if (utf8_character == quotechar) { ++ctoken; break; } else if (quotechar == 0) { quotechar = utf8_character; ++ctoken; } else { if (cursor - buflen < lbuf - len) { *cursor++ = *ctoken++; } } } else if (utf8_character == SPACE_REPLACEMENT) { if (cursor - buflen < lbuf - len) { *cursor++ = ' '; } ++ctoken; } else if (utf8_character == ESCAPE_CHAR) { escape = true; ++ctoken; } else { copy = true; } if (copy) { if (cursor - buflen < lbuf - len) { memcpy(cursor, ctoken, len); cursor += len; } ctoken += len; } } *cursor = '\0'; *str = ctoken; return lbuf; }
char * transliterate(char * out, size_t size, const char * in) { const char *src = in; char *dst = out; assert(in && size > 0); --size; /* need space for a final 0-byte */ while (*src && size) { size_t len; const char * p = src; while ((p + size > src) && *src && (~*src & 0x80)) { *dst++ = (char)tolower(*src++); } len = src - p; size -= len; while (size > 0 && *src && (*src & 0x80)) { unsigned int advance = 2; if (src[0] == '\xc3') { if (src[1] == '\xa4' || src[1] == '\x84') { memcpy(dst, "ae", 2); } else if (src[1] == '\xb6' || src[1] == '\x96') { memcpy(dst, "oe", 2); } else if (src[1] == '\xbc' || src[1] == '\x9c') { memcpy(dst, "ue", 2); } else if (src[1] == '\x9f') { memcpy(dst, "ss", 2); } else { advance = 0; } } else if (src[0] == '\xe1') { if (src[1] == '\xba' && src[2] == '\x9e') { memcpy(dst, "ss", 2); ++src; } else { advance = 0; } } else { advance = 0; } if (advance && advance <= size) { src += advance; dst += advance; size -= advance; } else { ucs4_t ucs; int ret = unicode_utf8_to_ucs4(&ucs, src, &len); if (ret != 0) { /* encoding is broken. yikes */ log_error("transliterate | encoding error in '%s'\n", src); return NULL; } src += len; *dst++ = '?'; --size; } } } *dst = 0; return *src ? 0 : out; }
/*
 * Inserts the UTF-8 string `str` with value `id` into the token trie at
 * *root, creating the root node if it does not exist yet.
 *
 * For every character a reference is added for the upper-case form and,
 * when it differs, for the lower-case form; both share one child node. If a
 * character's lower-case form appears in the replacement table (umlauts and
 * similar letters), the token is additionally inserted with that character
 * spelled as its two-letter transcription.
 *
 * An empty `str` marks the current node as LEAF and stores `id` in it.
 * Invalid UTF-8 trips an assertion in debug builds; in release builds the
 * remainder of the token is ignored.
 */
void addtoken(tnode **root, const char *str, variant id)
{
    tnode *tk;
    static const struct replace {
        ucs4_t ucs;
        const char str[3];
    } replace[] = {
        /* match lower-case (!) umlauts and others to transcriptions */
        { 228, "AE" },
        { 246, "OE" },
        { 252, "UE" },
        { 223, "SS" },
        { 230, "AE" },
        { 248, "OE" },
        { 229, "AA" },
        { 0, "" }
    };

    assert(root && str);
    if (!*root) {
        tk = *root = mknode();
    }
    else {
        tk = *root;
    }
    assert(tk && tk == *root);
    if (!*str) {
        tk->id = id;
        tk->flags |= LEAF;
    }
    else {
        tref *next;
        int ret, index, i = 0;
        ucs4_t ucs, lcs;
        size_t len;

        ret = unicode_utf8_to_ucs4(&ucs, str, &len);
        assert(ret == 0 || !"invalid utf8 string");
        if (ret != 0) {
            /* with NDEBUG the assert is gone; ucs and len are not known to
             * be valid after a failed decode, so stop here */
            return;
        }
        lcs = ucs;

#if NODEHASHSIZE == 8
        index = ucs & 7;
#else
        index = ucs % NODEHASHSIZE;
#endif
        assert(index >= 0);
        next = tk->next[index];
        if (!(tk->flags & LEAF)) {
            tk->id = id;
        }
        while (next && next->ucs != ucs) {
            next = next->nexthash;
        }
        if (!next) {
            tref *ref;
            tnode *node = mknode(); /* TODO: what is the reason for this empty node to exist? */

            if (ucs < 'a' || ucs > 'z') {
                lcs = towlower((wint_t)ucs);
            }
            if (ucs == lcs) {
                ucs = towupper((wint_t)ucs);
            }

            ref = (tref *)malloc(sizeof(tref));
            assert_alloc(ref);
            ref->ucs = ucs;
            ref->node = node;
            ref->nexthash = tk->next[index];
            tk->next[index] = ref;

            /* try lower/upper casing the character, and try again */
            if (ucs != lcs) {
#if NODEHASHSIZE == 8
                index = lcs & 7;
#else
                index = lcs % NODEHASHSIZE;
#endif
                ref = (tref *)malloc(sizeof(tref));
                assert_alloc(ref);
                ref->ucs = lcs;
                ref->node = node;
                ++node->refcount;
                ref->nexthash = tk->next[index];
                tk->next[index] = ref;
            }
            next = ref;
        }
        else {
            tnode *next_node = (tnode *)next->node;
            next_node->flags |= SHARED;
            if ((next_node->flags & LEAF) == 0) {
                next_node->id.v = NULL; /* why? */
            }
        }
        addtoken(&next->node, str + len, id);

        /* also register the transcribed spelling, e.g. "AE" for a-umlaut */
        while (replace[i].str[0]) {
            if (lcs == replace[i].ucs) {
                char zText[1024];
                char *text = zText;
                size_t rest = strlen(str + len);

                /* two transcription letters + remainder + terminator; use
                 * the heap when that does not fit the stack buffer */
                if (rest + 3 > sizeof(zText)) {
                    text = (char *)malloc(rest + 3);
                    assert_alloc(text);
                }
                memcpy(text, replace[i].str, 2);
                memcpy(text + 2, str + len, rest + 1);
                addtoken(root, text, id);
                if (text != zText) {
                    free(text);
                }
                break;
            }
            ++i;
        }
    }
}
/*
 * Abbreviates `s` to roughly `maxchars` characters by keeping the first few
 * characters of each word.
 *
 * s        - 0-terminated input; expected to be valid UTF-8 (invalid input
 *            trips an assertion in debug builds and is not handled
 *            otherwise).
 * buf      - destination buffer for the abbreviation.
 * buflen   - size of buf in bytes, including room for the terminator.
 * maxchars - target length in characters.
 *
 * A word is a run of characters for which iswalnum() is true. Each word
 * contributes up to max(1, maxchars / number_of_words) characters.
 *
 * Returns `s` itself if strlen(s) <= maxchars or if there is no buffer to
 * write to; otherwise returns `buf`, which is always 0-terminated and never
 * written beyond buflen bytes.
 */
const char *abkz(const char *s, char *buf, size_t buflen, size_t maxchars)
{
    const char *p = s;
    char *bufp;
    unsigned int c = 0;
    size_t bpt, i;
    ucs4_t ucs;
    size_t size;
    int result;
    int full = 0;

    /* check whether it is short enough already */
    if (strlen(s) <= maxchars) {
        return s;
    }
    /* nowhere to store an abbreviation, not even the terminator */
    if (!buf || buflen == 0) {
        return s;
    }

    /* determine the number of words */
    while (*p != 0) {
        result = unicode_utf8_to_ucs4(&ucs, p, &size);
        assert(result == 0 || !"damnit, we're not handling invalid input here!");

        /* skip separators */
        while (*p != 0 && !iswalnum((wint_t)ucs)) {
            p += size;
            result = unicode_utf8_to_ucs4(&ucs, p, &size);
            assert(result == 0 || !"damnit, we're not handling invalid input here!");
        }

        /* count the word */
        if (*p != 0) {
            ++c;
        }

        /* skip alphanumerics */
        while (*p != 0 && iswalnum((wint_t)ucs)) {
            p += size;
            result = unicode_utf8_to_ucs4(&ucs, p, &size);
            assert(result == 0 || !"damnit, we're not handling invalid input here!");
        }
    }

    /* characters per word = max(1, maxchars / words); an input without any
     * word must not divide by zero */
    bpt = (c > 0) ? _max(1, maxchars / c) : 1;

    /* visit each word and copy its first bpt characters */
    p = s;
    c = 0;
    bufp = buf;

    result = unicode_utf8_to_ucs4(&ucs, p, &size);
    assert(result == 0 || !"damnit, we're not handling invalid input here!");

    while (*p != 0 && c < maxchars && !full) {
        /* skip separators */
        while (*p != 0 && !iswalnum((wint_t)ucs)) {
            p += size;
            result = unicode_utf8_to_ucs4(&ucs, p, &size);
            assert(result == 0 || !"damnit, we're not handling invalid input here!");
        }

        /* copy alphanumerics */
        for (i = 0; i < bpt && *p != 0 && iswalnum((wint_t)ucs); ++i) {
            /* keep one byte free for the terminator */
            if (size >= buflen - (size_t)(bufp - buf)) {
                full = 1;
                break;
            }
            memcpy(bufp, p, size);
            p += size;
            bufp += size;
            ++c;

            result = unicode_utf8_to_ucs4(&ucs, p, &size);
            assert(result == 0 || !"damnit, we're not handling invalid input here!");
        }
        if (full) {
            break;
        }

        /* skip ahead to the next separator */
        while (c < maxchars && *p != 0 && iswalnum((wint_t)ucs)) {
            p += size;
            result = unicode_utf8_to_ucs4(&ucs, p, &size);
            assert(result == 0 || !"damnit, we're not handling invalid input here!");
        }
    }

    *bufp = 0;

    return buf;
}