Beispiel #1
0
/**
 * Advances *str_p past any leading whitespace.
 *
 * ASCII bytes are tested directly; bytes with the high bit set are decoded
 * with unicode_utf8_to_ucs4() first and the resulting code point is tested.
 *
 * @param str_p in: start of the text; out: first non-whitespace character,
 *              or the position of a sequence that failed to decode.
 * @return 0 on success, otherwise the non-zero result of
 *         unicode_utf8_to_ucs4() for the offending sequence.
 */
static int eatwhitespace_c(const char **str_p)
{
    const char *cursor = *str_p;
    int status = 0;
    int done = 0;

    while (!done) {
        unsigned char lead = (unsigned char)*cursor;

        if (lead & 0x80) {
            /* multi-byte sequence: decode before classifying */
            ucs4_t wc;
            size_t nbytes;

            status = unicode_utf8_to_ucs4(&wc, cursor, &nbytes);
            if (status != 0) {
                log_warning("illegal character sequence in UTF8 string: %s\n", cursor);
                done = 1;
            }
            else if (iswspace((wint_t)wc)) {
                cursor += nbytes;
            }
            else {
                done = 1;
            }
        }
        else if (iswspace(lead)) {
            ++cursor;
        }
        else {
            /* also reached at the terminating NUL */
            done = 1;
        }
    }

    *str_p = cursor;
    return status;
}
Beispiel #2
0
/**
 * Looks up a UTF-8 key in a token tree.
 *
 * Each decoded character of key selects a hash bucket in the current node;
 * the bucket's chain is searched for a reference carrying that character and
 * the walk continues in the node it points to.
 *
 * @param root   tree root (a tnode), may be null.
 * @param key    NUL-terminated UTF-8 string to look up, may be null.
 * @param result receives the id of the node reached after the last character.
 * @return E_TOK_SUCCESS if every character was matched, E_TOK_NOMATCH if the
 *         tree or key is null/empty, the key fails to decode, or a character
 *         has no matching reference.
 */
int findtoken(const void * root, const char *key, variant * result)
{
    const tnode *node = (const tnode *)root;
    const char *pos = key;

    if (!node || !pos || *pos == 0) {
        return E_TOK_NOMATCH;
    }

    /* the guard above guarantees at least one character to process */
    while (*pos) {
        ucs4_t ch;
        size_t nbytes;
        const tref *link;
        int slot;

        if (unicode_utf8_to_ucs4(&ch, pos, &nbytes) != 0) {
            /* encoding is broken. youch */
            log_error("findtoken | encoding error in '%s'\n", key);
            return E_TOK_NOMATCH;
        }

#if NODEHASHSIZE == 8
        slot = ch & 7;
#else
        slot = ch % NODEHASHSIZE;
#endif
        /* walk the bucket's chain until the character matches */
        for (link = node->next[slot]; link; link = link->nexthash) {
            if (link->ucs == ch) {
                break;
            }
        }
        if (!link) {
            log_debug("findtoken | token not found '%s'\n", key);
            return E_TOK_NOMATCH;
        }

        pos += nbytes;
        node = link->node;
    }

    if (!node) {
        log_debug("findtoken | token not found '%s'\n", key);
        return E_TOK_NOMATCH;
    }
    *result = node->id;
    return E_TOK_SUCCESS;
}
Beispiel #3
0
/**
 * Counts the non-ASCII (multi-byte UTF-8) characters in a string.
 *
 * @param s NUL-terminated string.
 * @return the number of multi-byte characters seen before the end of the
 *         string, or before the first sequence that fails to decode (an
 *         error is logged in that case).
 */
static int count_umlaut(const char *s)
{
    int result = 0;
    const char *cp = s;

    while (*cp) {
        ucs4_t ucs;
        size_t size;

        if ((*cp & 0x80) == 0) {
            /* plain ASCII byte */
            ++cp;
            continue;
        }
        if (unicode_utf8_to_ucs4(&ucs, cp, &size) != 0) {
            log_error("illegal utf8 encoding %s at %s", s, cp);
            return result;
        }
        /* Advance by exactly the decoded length. Stepping one byte further
         * (as a for-loop increment would) skips the character that follows
         * and can move past the terminating NUL. */
        cp += size;
        ++result;
    }
    return result;
}
Beispiel #4
0
/**
 * Advances states->current_token past the next token without copying it.
 *
 * Leading whitespace is skipped first. The token ends at the first
 * whitespace character outside quotes (which is consumed), at the quote that
 * closes an opened quotation (also consumed), or at the end of the string.
 * A character following ESCAPE_CHAR is always treated as part of the token.
 * If a byte sequence fails to decode, a warning is logged and the cursor is
 * left at the offending sequence.
 */
void skip_token(void)
{
    char quotechar = 0;
    int escaped = 0;

    eatwhitespace_c(&states->current_token);

    while (*states->current_token) {
        ucs4_t ucs;
        size_t len;
        unsigned char utf8_character = (unsigned char)states->current_token[0];

        if (~utf8_character & 0x80) {
            ucs = utf8_character;
            len = 1;
        }
        else if (unicode_utf8_to_ucs4(&ucs, states->current_token, &len) != 0) {
            /* Stop here: continuing would neither advance the cursor nor
             * have a valid code point to classify. */
            log_warning("illegal character sequence in UTF8 string: %s\n", states->current_token);
            return;
        }
        states->current_token += len;

        if (escaped) {
            /* The escaped character (of any byte length) belongs to the
             * token. Using a flag instead of a blind increment also keeps a
             * trailing ESCAPE_CHAR from stepping over the terminating NUL. */
            escaped = 0;
            continue;
        }
        if (iswspace((wint_t)ucs) && quotechar == 0) {
            return;
        }
        switch (utf8_character) {
        case '"':
        case '\'':
            if (utf8_character == quotechar) {
                return;
            }
            /* only the first quote opens a quotation; the other kind of
             * quote inside it is an ordinary character, as in parse_token */
            if (quotechar == 0) {
                quotechar = (char)utf8_character;
            }
            break;
        case ESCAPE_CHAR:
            escaped = 1;
            break;
        default:
            break;
        }
    }
}
Beispiel #5
0
/**
 * Copies the next token from *str into lbuf.
 *
 * Leading whitespace is skipped. A token ends at whitespace outside quotes,
 * at the quote that closes an opened quotation, at the end of the string, or
 * at a byte sequence that fails to decode (a warning is logged). Inside the
 * token, SPACE_REPLACEMENT is written as a space, ESCAPE_CHAR makes the
 * following character literal, and the opening/closing quotes are dropped.
 * Characters that do not fit into lbuf are consumed but not stored.
 *
 * @param str    in: text to parse; out: position after the token. Left
 *               untouched when no token is found.
 * @param lbuf   destination buffer.
 * @param buflen size of lbuf in bytes; the result is NUL-terminated whenever
 *               buflen > 0.
 * @return lbuf, or a null pointer if *str is null or holds only whitespace.
 */
char *parse_token(const char **str, char *lbuf, size_t buflen)
{
    char *cursor = lbuf;
    char quotechar = 0;
    bool escape = false;
    const char *ctoken = *str;

    if (!ctoken) {
        return 0;
    }
    eatwhitespace_c(&ctoken);
    if (!*ctoken) {
        if (buflen > 0) {
            *cursor = 0;
        }
        return 0;
    }
    while (*ctoken) {
        ucs4_t ucs;
        size_t len;
        bool copy = false;
        bool fits;
        unsigned char utf8_character = (unsigned char)*ctoken;

        if (~utf8_character & 0x80) {
            ucs = utf8_character;
            len = 1;
        }
        else {
            int ret = unicode_utf8_to_ucs4(&ucs, ctoken, &len);
            if (ret != 0) {
                log_warning("illegal character sequence in UTF8 string: %s\n", ctoken);
                break;
            }
        }
        /* room for this character plus the terminating NUL? Computed on
         * offsets so no out-of-range pointer is ever formed. */
        fits = (size_t)(cursor - lbuf) + len < buflen;

        if (escape) {
            copy = true;
            escape = false;
        }
        else if (iswspace((wint_t)ucs)) {
            if (quotechar == 0) {
                break;
            }
            copy = true;
        }
        else if (utf8_character == '"' || utf8_character == '\'') {
            if (utf8_character == quotechar) {
                ++ctoken;
                break;
            }
            else if (quotechar == 0) {
                quotechar = (char)utf8_character;
                ++ctoken;
            }
            else {
                /* the other kind of quote inside a quotation is literal; the
                 * shared copy path advances the input even when the buffer
                 * is full, so the loop cannot stall here */
                copy = true;
            }
        }
        else if (utf8_character == SPACE_REPLACEMENT) {
            if (fits) {
                *cursor++ = ' ';
            }
            ++ctoken;
        }
        else if (utf8_character == ESCAPE_CHAR) {
            escape = true;
            ++ctoken;
        }
        else {
            copy = true;
        }
        if (copy) {
            if (fits) {
                memcpy(cursor, ctoken, len);
                cursor += len;
            }
            ctoken += len;
        }
    }

    if (buflen > 0) {
        *cursor = '\0';
    }
    *str = ctoken;
    return lbuf;
}
Beispiel #6
0
/**
 * Writes a lower-case ASCII transliteration of a UTF-8 string.
 *
 * ASCII bytes are lower-cased. The two-byte sequences C3 A4/84, C3 B6/96,
 * C3 BC/9C and C3 9F become "ae", "oe", "ue" and "ss"; the three-byte
 * sequence E1 BA 9E becomes "ss". Every other multi-byte character is
 * replaced by '?'. A replaceable character for which only one output byte is
 * left is also written as '?'.
 *
 * @param out  destination buffer.
 * @param size size of out in bytes, must be > 0; out is always
 *             NUL-terminated.
 * @param in   NUL-terminated UTF-8 input, must not be null.
 * @return out if the whole input was converted; a null pointer if the input
 *         did not fit or contained a sequence that failed to decode (an
 *         error is logged in the latter case).
 */
char * transliterate(char * out, size_t size, const char * in)
{
    const char *src = in;
    char *dst = out;

    assert(in && size > 0);
    --size; /* need space for a final 0-byte */
    while (*src && size) {
        size_t len;
        const char * p = src;

        /* run of ASCII bytes */
        while ((p + size > src) && *src && (~*src & 0x80)) {
            *dst++ = (char)tolower((unsigned char)*src++);
        }
        len = (size_t)(src - p);
        size -= len;

        /* run of multi-byte characters */
        while (size > 0 && *src && (*src & 0x80)) {
            const char *repl = NULL; /* two-letter replacement, if any */
            size_t consumed = 0;     /* input bytes the replacement covers */

            if (src[0] == '\xc3') {
                consumed = 2;
                if (src[1] == '\xa4' || src[1] == '\x84') {
                    repl = "ae";
                }
                else if (src[1] == '\xb6' || src[1] == '\x96') {
                    repl = "oe";
                }
                else if (src[1] == '\xbc' || src[1] == '\x9c') {
                    repl = "ue";
                }
                else if (src[1] == '\x9f') {
                    repl = "ss";
                }
            }
            else if (src[0] == '\xe1') {
                if (src[1] == '\xba' && src[2] == '\x9e') {
                    repl = "ss";
                    consumed = 3;
                }
            }

            if (repl && size >= 2) {
                /* src is only moved once we know the replacement fits, so
                 * the fallback below always decodes from the first byte of
                 * the character */
                memcpy(dst, repl, 2);
                src += consumed;
                dst += 2;
                size -= 2;
            }
            else {
                ucs4_t ucs;
                int ret = unicode_utf8_to_ucs4(&ucs, src, &len);
                if (ret != 0) {
                    /* encoding is broken. yikes */
                    log_error("transliterate | encoding error in '%s'\n", src);
                    *dst = 0;
                    return NULL;
                }
                src += len;
                *dst++ = '?';
                --size;
            }
        }
    }
    *dst = 0;
    return *src ? 0 : out;
}
Beispiel #7
0
/**
 * Inserts a UTF-8 token into a token tree, one character per tree level.
 *
 * For a character that is not yet present in the current node, references
 * for both its upper- and lower-case form are added and share one child
 * node. If the character matches an entry of the transcription table
 * (umlauts and similar), the token is additionally inserted with the
 * character replaced by its two-letter transcription.
 *
 * @param root address of the tree root; a node is created if *root is null.
 * @param str  NUL-terminated UTF-8 token (or remaining suffix of it).
 * @param id   value stored at the node that ends the token.
 */
void addtoken(tnode ** root, const char *str, variant id)
{
    tnode * tk;
    static const struct replace {
        ucs4_t ucs;
        const char str[3];
    } replace[] = {
        /* match lower-case (!) umlauts and others to transcriptions */
        { 228, "AE" }, { 246, "OE" }, { 252, "UE" }, { 223, "SS" },
        { 230, "AE" }, { 248, "OE" }, { 229, "AA" }, { 0, "" }
    };

    assert(root && str);
    if (!*root) {
        tk = *root = mknode();
    }
    else {
        tk = *root;
    }
    assert(tk && tk == *root);
    if (!*str) {
        /* end of the token: this node carries the id */
        tk->id = id;
        tk->flags |= LEAF;
    }
    else {
        tref *next;
        int ret, index, i = 0;
        ucs4_t ucs, lcs;
        size_t len;

        ret = unicode_utf8_to_ucs4(&ucs, str, &len);
        assert(ret == 0 || !"invalid utf8 string");
        if (ret != 0) {
            /* with assertions disabled ucs and len cannot be trusted */
            return;
        }
        lcs = ucs;

#if NODEHASHSIZE == 8
        index = ucs & 7;
#else
        index = ucs % NODEHASHSIZE;
#endif
        assert(index >= 0);
        next = tk->next[index];
        if (!(tk->flags & LEAF))
            tk->id = id;
        while (next && next->ucs != ucs)
            next = next->nexthash;
        if (!next) {
            tref *ref;
            tnode *node = mknode(); // TODO: what is the reason for this empty node to exist?

            /* make ucs the upper-case and lcs the lower-case form */
            if (ucs < 'a' || ucs > 'z') {
                lcs = towlower((wint_t)ucs);
            }
            if (ucs == lcs) {
                ucs = towupper((wint_t)ucs);
            }

            ref = malloc(sizeof *ref);
            assert_alloc(ref);
            ref->ucs = ucs;
            ref->node = node;
            ref->nexthash = tk->next[index];
            tk->next[index] = ref;

            /* try lower/upper casing the character, and try again */
            if (ucs != lcs) {
#if NODEHASHSIZE == 8
                index = lcs & 7;
#else
                index = lcs % NODEHASHSIZE;
#endif
                ref = malloc(sizeof *ref);
                assert_alloc(ref);
                ref->ucs = lcs;
                ref->node = node;
                ++node->refcount;
                ref->nexthash = tk->next[index];
                tk->next[index] = ref;
            }
            next = ref;
        }
        else {
            tnode * next_node = (tnode *)next->node;
            next_node->flags |= SHARED;
            if ((next_node->flags & LEAF) == 0)
                next_node->id.v = NULL;        /* why? */
        }
        addtoken(&next->node, str + len, id);
        while (replace[i].str[0]) {
            if (lcs == replace[i].ucs) {
                char zText[1024];
                const char *rest = str + len;
                size_t restlen = strlen(rest);

                /* two transcription letters + rest + NUL must fit; a token
                 * too long for the buffer gets no transcribed variant */
                if (restlen + 3 <= sizeof(zText)) {
                    memcpy(zText, replace[i].str, 2);
                    memcpy(zText + 2, rest, restlen + 1);
                    addtoken(root, zText, id);
                }
                break;
            }
            ++i;
        }
    }
}
Beispiel #8
0
/* Decodes the UTF-8 character at p into *ucs and its byte length into *size.
 * A sequence that fails to decode is reported as one non-alphanumeric byte,
 * so callers always advance and never read indeterminate values. */
static void abkz_decode(const char *p, ucs4_t *ucs, size_t *size)
{
    if (unicode_utf8_to_ucs4(ucs, p, size) != 0) {
        *ucs = 0;
        *size = 1;
    }
}

/**
 * Abbreviates a string by keeping the first letters of each word.
 *
 * Words are runs of alphanumeric characters. Each word contributes up to
 * max(1, maxchars / number_of_words) characters, and at most maxchars
 * characters are copied in total.
 *
 * @param s        NUL-terminated UTF-8 input.
 * @param buf      destination buffer for the abbreviation.
 * @param buflen   size of buf in bytes; output is truncated to fit and is
 *                 NUL-terminated.
 * @param maxchars maximum number of characters in the abbreviation.
 * @return s itself if strlen(s) <= maxchars or if buflen is 0 (nothing can
 *         be stored); otherwise buf.
 */
const char *abkz(const char *s, char *buf, size_t buflen, size_t maxchars)
{
    const char *p = s;
    char *bufp = buf;
    size_t words = 0;
    size_t copied = 0;
    size_t bpt, i;
    ucs4_t ucs;
    size_t size;
    int full = 0;

    /* check whether it is short enough already */
    if (strlen(s) <= maxchars) {
        return s;
    }
    if (buflen == 0) {
        /* no room even for a terminator */
        return s;
    }

    /* determine the number of words */
    while (*p != 0) {
        abkz_decode(p, &ucs, &size);

        /* skip separators */
        while (*p != 0 && !iswalnum((wint_t)ucs)) {
            p += size;
            abkz_decode(p, &ucs, &size);
        }

        /* count the word */
        if (*p != 0) {
            ++words;
        }

        /* skip alnums */
        while (*p != 0 && iswalnum((wint_t)ucs)) {
            p += size;
            abkz_decode(p, &ucs, &size);
        }
    }

    if (words == 0) {
        /* nothing to abbreviate, and avoids dividing by zero below */
        *bufp = 0;
        return buf;
    }

    /* letters per word = max(1, maxchars / words) */
    bpt = maxchars / words;
    if (bpt < 1) {
        bpt = 1;
    }

    /* visit each word and copy its first bpt characters */
    p = s;
    abkz_decode(p, &ucs, &size);

    while (!full && *p != 0 && copied < maxchars) {
        /* skip separators */
        while (*p != 0 && !iswalnum((wint_t)ucs)) {
            p += size;
            abkz_decode(p, &ucs, &size);
        }

        /* copy alnums */
        for (i = 0; i < bpt && *p != 0 && iswalnum((wint_t)ucs); ++i) {
            /* keep one byte free for the terminating NUL */
            if (size >= buflen - (size_t)(bufp - buf)) {
                full = 1;
                break;
            }
            memcpy(bufp, p, size);
            p += size;
            bufp += size;
            ++copied;

            abkz_decode(p, &ucs, &size);
        }
        if (full) {
            break;
        }

        /* advance to the end of the current word */
        while (copied < maxchars && *p != 0 && iswalnum((wint_t)ucs)) {
            p += size;
            abkz_decode(p, &ucs, &size);
        }
    }

    *bufp = 0;

    return buf;
}