Ejemplos de ubrk_getRuleStatus en C++ (Cpp)

Ejemplo n.º 1

0

Mostrar archivo

Archivo: split.hpp Proyecto: reverbrain/ribosome

	std::vector<lstring> convert_split_words(const lstring &lt) {
		std::vector<lstring> ret;

		UBreakIterator* bi;
		int prev = -1, pos;

		UErrorCode err = U_ZERO_ERROR;
		bi = ubrk_open(UBRK_WORD, get_locale(), (UChar *)lt.data(), lt.size(), &err);
		if (U_FAILURE(err))
			return ret;

		pos = ubrk_first(bi);
		while (pos != UBRK_DONE) {
			int rules = ubrk_getRuleStatus(bi);
			if ((rules == UBRK_WORD_NONE) || (prev == -1)) {
				prev = pos;
			} else {
				ret.emplace_back(lt.substr(prev, pos - prev));

				prev = -1;
			}

			pos = ubrk_next(bi);
		}

		ubrk_close(bi);

		return ret;
	}

Ejemplo n.º 2

0

Mostrar archivo

Archivo: icu.c Proyecto: IvoNet/calibre

// BreakIterator.split {{{
static PyObject *
icu_BreakIterator_split(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
    int32_t prev = 0, p = 0, sz = 0;
    PyObject *ans = NULL, *token = NULL;
  
    ans = PyList_New(0);
    if (ans == NULL) return PyErr_NoMemory();

    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
        prev = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) 
            continue;  // We are not at the start of a word
        sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
        if (sz > 0) {
            token = icu_to_python(self->text + prev, sz);
            if (token == NULL) {
                Py_DECREF(ans); ans = NULL; break; 
            }
            if (PyList_Append(ans, token) != 0) {
                Py_DECREF(token); Py_DECREF(ans); ans = NULL; break; 
            }
            Py_DECREF(token);
        }
    }

    return ans;

} // }}}

Ejemplo n.º 3

0

Mostrar archivo

Archivo: icu.c Proyecto: artbycrunk/calibre

// BreakIterator.index {{{
static PyObject *
icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif

    UChar *buf = NULL, *needle = NULL;
    int32_t word_start = 0, p = 0, sz = 0, ans = -1, leading_hyphen = 0, trailing_hyphen = 0;

    buf = python_to_icu(token, &sz);
    if (buf == NULL) return NULL;
    if (sz < 1) goto end;
    needle = buf;
    if (sz > 1 && IS_HYPHEN_CHAR(buf[0])) { needle = buf + 1; leading_hyphen = 1; sz -= 1; }
    if (sz > 1 && IS_HYPHEN_CHAR(buf[sz-1])) trailing_hyphen = 1;

    Py_BEGIN_ALLOW_THREADS;
    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
        word_start = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
            continue;  // We are not at the start of a word

        if (self->text_len >= word_start + sz && memcmp(self->text + word_start, needle, sz * sizeof(UChar)) == 0) {
            if (word_start > 0 && (
                    (leading_hyphen && !IS_HYPHEN_CHAR(self->text[word_start-1])) ||
                    (!leading_hyphen && IS_HYPHEN_CHAR(self->text[word_start-1]))
            )) continue;
            if (!trailing_hyphen && IS_HYPHEN_CHAR(self->text[word_start + sz])) continue;

            if (p == UBRK_DONE || self->text_len <= word_start + sz) { ans = word_start; break; }

            if (
                    // Check that the found word is followed by a word boundary
                    ubrk_isBoundary(self->break_iterator, word_start + sz) &&
                    // If there is a leading hyphen check  that the leading
                    // hyphen is preceded by a word boundary
                    (!leading_hyphen || (word_start > 1 && ubrk_isBoundary(self->break_iterator, word_start - 2))) &&
                    // Check that there is a word boundary *after* the trailing
                    // hyphen. We cannot rely on ubrk_isBoundary() as that
                    // always returns true because of the trailing hyphen.
                    (!trailing_hyphen || ubrk_following(self->break_iterator, word_start + sz) == UBRK_DONE || ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
            ) { ans = word_start; break; }

            if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary()
        }
    }
    if (leading_hyphen && ans > -1) ans -= 1;
#ifdef Py_UNICODE_WIDE
    if (ans > 0) ans = u_countChar32(self->text, ans);
#endif
    Py_END_ALLOW_THREADS;

end:
    free(buf);
    return Py_BuildValue("l", (long)ans);

} // }}}

Ejemplo n.º 4

0

Mostrar archivo

Archivo: i18n_string.cpp Proyecto: alepharchives/i18n

/**
 * Is current element in this iterator valid?
 * If result is false, the element will be skipped.
 * Used for word_only. 
 */
inline int is_valid_elem(cloner* res, UBreakIterator* iter)
{
    cloner_break* res_brk = (cloner_break*) res;
    int32_t type;

    if (res_brk->skip_elem == NULL)
        return 1;

    type = ubrk_getRuleStatus(iter);
    return !((res_brk->skip_elem)(type));
}

Ejemplo n.º 5

0

Mostrar archivo

Archivo: cbiapts.c Proyecto: Epictetus/build-couchdb

/*
 *  TestBreakIteratorRules - Verify that a break iterator can be created from
 *                           a set of source rules.
 */
static void TestBreakIteratorRules() {
    /*  Rules will keep together any run of letters not including 'a', OR
     *             keep together 'abc', but only when followed by 'def', OTHERWISE
     *             just return one char at a time.
     */
    char         rules[]  = "abc{666}/def;\n   [\\p{L} - [a]]* {2};  . {1};";
    /*                        0123456789012345678 */
    char         data[]   =  "abcdex abcdefgh-def";     /* the test data string                     */
    char         breaks[] =  "**    **  *    **  *";    /*  * the expected break positions          */
    char         tags[]   =  "01    21  6    21  2";    /*  expected tag values at break positions  */
    int32_t      tagMap[] = {0, 1, 2, 3, 4, 5, 666};

    UChar       *uData;
    void        *freeHook = NULL;
    UErrorCode   status   = U_ZERO_ERROR;
    int32_t      pos;
    int          i;

    UBreakIterator *bi = testOpenRules(rules);
    if (bi == NULL) {return;}
    uData = toUChar(data, &freeHook);
    ubrk_setText(bi,  uData, -1, &status);

    pos = ubrk_first(bi);
    for (i=0; i<sizeof(breaks); i++) {
        if (pos == i && breaks[i] != '*') {
            log_err("FAIL: unexpected break at position %d found\n", pos);
            break;
        }
        if (pos != i && breaks[i] == '*') {
            log_err("FAIL: expected break at position %d not found.\n", i);
            break;
        }
        if (pos == i) {
            int32_t tag, expectedTag;
            tag = ubrk_getRuleStatus(bi);
            expectedTag = tagMap[tags[i]&0xf];
            if (tag != expectedTag) {
                log_err("FAIL: incorrect tag value.  Position = %d;  expected tag %d, got %d",
                    pos, expectedTag, tag);
                break;
            }
            pos = ubrk_next(bi);
        }
    }

    freeToUCharStrings(&freeHook);
    ubrk_close(bi);
}

Ejemplo n.º 6

0

Mostrar archivo

Archivo: MojDbTextTokenizer.cpp Proyecto: webOS-ports/db8

MojErr MojDbTextTokenizer::tokenize(const MojString& text, MojDbTextCollator* collator, KeySet& keysOut) const
{
    LOG_TRACE("Entering function %s", __FUNCTION__);
    MojAssert(m_ubrk.get());

    // convert to UChar from str
    MojDbTextUtils::UnicodeVec unicodeStr;
    MojErr err = MojDbTextUtils::strToUnicode(text, unicodeStr);
    MojErrCheck(err);

    // clone break iterator and set text
    MojByte buf[U_BRK_SAFECLONE_BUFFERSIZE];
    UErrorCode status = U_ZERO_ERROR;
    MojInt32 size = sizeof(buf);
    IterPtr ubrk(ubrk_safeClone(m_ubrk.get(), buf, &size, &status));
    MojUnicodeErrCheck(status);
    MojAssert(ubrk.get());
    ubrk_setText(ubrk.get(), unicodeStr.begin(), (MojInt32) unicodeStr.size(), &status);
    MojUnicodeErrCheck(status);

    MojInt32 tokBegin = -1;
    MojInt32 pos = ubrk_first(ubrk.get());
    while (pos != UBRK_DONE) {
        UWordBreak status = (UWordBreak) ubrk_getRuleStatus(ubrk.get());
        if (status != UBRK_WORD_NONE) {
            MojAssert(tokBegin != -1);
            MojDbKey key;
            const UChar* tokChars = unicodeStr.begin() + tokBegin;
            MojSize tokSize = (MojSize) (pos - tokBegin);
            if (collator) {
                err = collator->sortKey(tokChars, tokSize, key);
                MojErrCheck(err);
            } else {
                MojString tok;
                err = MojDbTextUtils::unicodeToStr(tokChars, tokSize, tok);
                MojErrCheck(err);
                err = key.assign(tok);
                MojErrCheck(err);
            }
            err = keysOut.put(key);
            MojErrCheck(err);
        }
        tokBegin = pos;
        pos = ubrk_next(ubrk.get());
    }
    return MojErrNone;
}

Ejemplo n.º 7

0

Mostrar archivo

Archivo: icu.c Proyecto: IvoNet/calibre

// BreakIterator.index {{{
static PyObject *
icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
#if PY_VERSION_HEX >= 0x03030000 
#error Not implemented for python >= 3.3
#endif

    UChar *buf = NULL;
    int32_t prev = 0, p = 0, sz = 0, tsz = 0, ans = -1;
    PyObject *token = NULL;
  
    if (!PyArg_ParseTuple(args, "O", &token)) return NULL;
    buf = python_to_icu(token, &sz, 1);
    if (buf == NULL) return NULL;
    if (sz < 1) goto end;

    Py_BEGIN_ALLOW_THREADS;
    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
        prev = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) 
            continue;  // We are not at the start of a word
        tsz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
        if (sz == tsz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) { 
#ifdef PY_UNICODE_WIDE
            ans = u_countChar32(self->text, prev);
#else
            ans = prev; 
#endif
            break;
        }
    }
    Py_END_ALLOW_THREADS;

end:
    free(buf);
    return Py_BuildValue("i", ans);

} // }}}

Ejemplo n.º 8

0

Mostrar archivo

Archivo: icu.c Proyecto: IvoNet/calibre

// BreakIterator.split2 {{{
static PyObject *
icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
#if PY_VERSION_HEX >= 0x03030000 
#error Not implemented for python >= 3.3
#endif

    int32_t prev = 0, p = 0, sz = 0;
    PyObject *ans = NULL, *temp = NULL;
  
    ans = PyList_New(0);
    if (ans == NULL) return PyErr_NoMemory();

    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
        prev = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) 
            continue;  // We are not at the start of a word
        sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
        if (sz > 0) {
#ifdef Py_UNICODE_WIDE
            sz = u_countChar32(self->text + prev, sz);
            prev = u_countChar32(self->text, prev);
#endif
            temp = Py_BuildValue("II", prev, sz); 
            if (temp == NULL) {
                Py_DECREF(ans); ans = NULL; break; 
            } 
            if (PyList_Append(ans, temp) != 0) {
                Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break; 
            }
            Py_DECREF(temp);
        }
    }

    return ans;

} // }}}

Ejemplo n.º 9

0

Mostrar archivo

Archivo: icu.c Proyecto: artbycrunk/calibre

// BreakIterator.split2 {{{
static PyObject *
icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif

    int32_t word_start = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0;
    int is_hyphen_sep = 0, leading_hyphen = 0, trailing_hyphen = 0;
    UChar sep = 0;
    PyObject *ans = NULL, *temp = NULL, *t = NULL;

    ans = PyList_New(0);
    if (ans == NULL) return PyErr_NoMemory();

    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
        word_start = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
            continue;  // We are not at the start of a word
        sz = (p == UBRK_DONE) ? self->text_len - word_start : p - word_start;
        if (sz > 0) {
            // ICU breaks on words containing hyphens, we do not want that, so we recombine manually
            is_hyphen_sep = 0; leading_hyphen = 0; trailing_hyphen = 0;
            if (word_start > 0) { // Look for a leading hyphen
                sep = *(self->text + word_start - 1);
                if (IS_HYPHEN_CHAR(sep)) {
                    leading_hyphen = 1;
                    if (last_pos > 0 && word_start - last_pos == 1) is_hyphen_sep = 1;
                }
            }
            if (word_start + sz < self->text_len) { // Look for a trailing hyphen
                sep = *(self->text + word_start + sz);
                if (IS_HYPHEN_CHAR(sep)) trailing_hyphen = 1;
            }
            last_pos = p;
#ifdef Py_UNICODE_WIDE
            sz = u_countChar32(self->text + word_start, sz);
            word_start = u_countChar32(self->text, word_start);
#endif
            if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) {
                sz = last_sz + sz + trailing_hyphen;
                last_sz = sz;
                t = PyInt_FromLong((long)sz);
                if (t == NULL) { Py_DECREF(ans); ans = NULL; break; }
                temp = PyList_GET_ITEM(ans, PyList_GET_SIZE(ans) - 1);
                Py_DECREF(PyTuple_GET_ITEM(temp, 1));
                PyTuple_SET_ITEM(temp, 1, t);
            } else {
                sz += leading_hyphen + trailing_hyphen;
                last_sz = sz;
                temp = Py_BuildValue("ll", (long)(word_start - leading_hyphen), (long)sz);
                if (temp == NULL) {
                    Py_DECREF(ans); ans = NULL; break;
                }
                if (PyList_Append(ans, temp) != 0) {
                    Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break;
                }
                Py_DECREF(temp);
            }
        }
    }

    return ans;

} // }}}

Ejemplo n.º 10

0

Mostrar archivo

Archivo: TextBreakIterator.cpp Proyecto: AndriyKalashnykov/webkit

bool isWordTextBreak(TextBreakIterator* iterator)
{
    int ruleStatus = ubrk_getRuleStatus(reinterpret_cast<UBreakIterator*>(iterator));
    return ruleStatus != UBRK_WORD_NONE;
}

Ejemplo n.º 11

0

Mostrar archivo

Archivo: TextBreakIterator.cpp Proyecto: wolfviking0/webcl-webkit

bool isWordTextBreak(UBreakIterator* iterator)
{
    int ruleStatus = ubrk_getRuleStatus(iterator);
    return ruleStatus != UBRK_WORD_NONE;
}

Ejemplo n.º 12

0

Mostrar archivo

Archivo: fixed.c Proyecto: julp/ugrep

static engine_return_t engine_fixed_match(error_t **error, void *data, const UString *subject)
{
    int32_t ret;
    UErrorCode status;
    FETCH_DATA(data, p, fixed_pattern_t);

    status = U_ZERO_ERROR;
    if (ustring_empty(p->pattern)) {
        if (IS_WORD_BOUNDED(p->flags)) {
            if (ustring_empty(subject)) {
                return ENGINE_MATCH_FOUND;
            } else {
                int32_t l, u, lastState, state;

                ubrk_setText(p->ubrk, subject->ptr, subject->len, &status);
                if (U_FAILURE(status)) {
                    icu_error_set(error, FATAL, status, "ubrk_setText");
                    return ENGINE_FAILURE;
                }
                if (UBRK_DONE != (l = ubrk_first(p->ubrk))) {
                    lastState = ubrk_getRuleStatus(p->ubrk);
                    while (UBRK_DONE != (u = ubrk_next(p->ubrk))) {
                        state = ubrk_getRuleStatus(p->ubrk);
                        if (UBRK_WORD_NONE == lastState && lastState == state) {
                            return ENGINE_MATCH_FOUND;
                        }
                        lastState = state;
                        l = u;
                    }
                }
                return ENGINE_NO_MATCH;
            }
        } else {
            return ENGINE_MATCH_FOUND;
        }
    } else if (NULL != p->usearch) {
        if (subject->len > 0) {
            usearch_setText(p->usearch, subject->ptr, subject->len, &status);
            if (U_FAILURE(status)) {
                icu_error_set(error, FATAL, status, "usearch_setText");
                return ENGINE_FAILURE;
            }
            ret = usearch_first(p->usearch, &status);
            if (U_FAILURE(status)) {
                icu_error_set(error, FATAL, status, "usearch_first");
                return ENGINE_FAILURE;
            }
            usearch_unbindText(p->usearch);

            return (ret != USEARCH_DONE ? ENGINE_MATCH_FOUND : ENGINE_NO_MATCH);
        } else {
            return ENGINE_NO_MATCH;
        }
    } else {
        UChar *m;
        int32_t pos;

        pos = 0;
        ret = ENGINE_NO_MATCH;
        if (NULL != p->ubrk) {
            ubrk_setText(p->ubrk, subject->ptr, subject->len, &status);
            if (U_FAILURE(status)) {
                icu_error_set(error, FATAL, status, "ubrk_setText");
                return ENGINE_FAILURE;
            }
        }
        while (NULL != (m = u_strFindFirst(subject->ptr + pos, subject->len - pos, p->pattern->ptr, p->pattern->len))) {
            pos = m - subject->ptr;
            if (NULL == p->ubrk || (ubrk_isBoundary(p->ubrk, pos) && ubrk_isBoundary(p->ubrk, pos + p->pattern->len))) {
                ret = ENGINE_MATCH_FOUND;
            }
            pos += p->pattern->len;
        }
        ubrk_unbindText(p->ubrk);

        return ret;
    }
}

Ejemplo n.º 13

0

Mostrar archivo

Archivo: fixed.c Proyecto: julp/ugrep

static engine_return_t engine_fixed_match_all(error_t **error, void *data, const UString *subject, interval_list_t *intervals)
{
    int32_t matches;
    UErrorCode status;
    FETCH_DATA(data, p, fixed_pattern_t);

    matches = 0;
    status = U_ZERO_ERROR;
    if (ustring_empty(p->pattern)) {
        if (IS_WORD_BOUNDED(p->flags)) {
            if (ustring_empty(subject)) {
                return ENGINE_MATCH_FOUND;
            } else {
                int32_t l, u, lastState, state;

                ubrk_setText(p->ubrk, subject->ptr, subject->len, &status);
                if (U_FAILURE(status)) {
                    icu_error_set(error, FATAL, status, "ubrk_setText");
                    return ENGINE_FAILURE;
                }
                if (UBRK_DONE != (l = ubrk_first(p->ubrk))) {
                    lastState = ubrk_getRuleStatus(p->ubrk);
                    while (UBRK_DONE != (u = ubrk_next(p->ubrk))) {
                        state = ubrk_getRuleStatus(p->ubrk);
                        if (UBRK_WORD_NONE == lastState && lastState == state) {
                            return ENGINE_MATCH_FOUND;
                        }
                        lastState = state;
                        l = u;
                    }
                }
                return ENGINE_NO_MATCH;
            }
        } else {
            return ENGINE_MATCH_FOUND;
        }
    } else if (NULL != p->usearch) {
        int32_t l, u;

        if (subject->len > 0) {
            usearch_setText(p->usearch, subject->ptr, subject->len, &status);
            if (U_FAILURE(status)) {
                icu_error_set(error, FATAL, status, "usearch_setText");
                return ENGINE_FAILURE;
            }
            for (l = usearch_first(p->usearch, &status); U_SUCCESS(status) && USEARCH_DONE != l; l = usearch_next(p->usearch, &status)) {
                matches++;
                u = l + usearch_getMatchedLength(p->usearch);
                if (interval_list_add(intervals, subject->len, l, u)) {
                    return ENGINE_WHOLE_LINE_MATCH;
                }
            }
            if (U_FAILURE(status)) {
                icu_error_set(error, FATAL, status, "usearch_[first|next]");
                return ENGINE_FAILURE;
            }
            usearch_unbindText(p->usearch);

            return (matches ? ENGINE_MATCH_FOUND : ENGINE_NO_MATCH);
        } else {
            return ENGINE_NO_MATCH;
        }
    } else {
        UChar *m;
        int32_t pos;

        pos = 0;
        if (NULL != p->ubrk) {
            ubrk_setText(p->ubrk, subject->ptr, subject->len, &status);
            if (U_FAILURE(status)) {
                icu_error_set(error, FATAL, status, "ubrk_setText");
                return ENGINE_FAILURE;
            }
        }
        while (NULL != (m = u_strFindFirst(subject->ptr + pos, subject->len - pos, p->pattern->ptr, p->pattern->len))) {
            pos = m - subject->ptr;
            if (NULL == p->ubrk || (ubrk_isBoundary(p->ubrk, pos) && ubrk_isBoundary(p->ubrk, pos + p->pattern->len))) {
                matches++;
                if (interval_list_add(intervals, subject->len, pos, pos + p->pattern->len)) {
                    return ENGINE_WHOLE_LINE_MATCH;
                }
            }
            pos += p->pattern->len;
        }
        ubrk_unbindText(p->ubrk);

        return (matches ? ENGINE_MATCH_FOUND : ENGINE_NO_MATCH);
    }
}

Ejemplo n.º 14

0

Mostrar archivo

Archivo: justenoughicu.c Proyecto: alerque/sile

int icu_breakpoints(lua_State *L) {
  const char* input = luaL_checkstring(L, 1);
  int input_l = strlen(input);
  const char* locale = luaL_checkstring(L, 2);
  UChar *buffer;
  int32_t l, breakcount = 0;
  UErrorCode err = U_ZERO_ERROR;
  u_strFromUTF8(NULL, 0, &l, input, input_l, &err);
  /* Above call returns an error every time. */
  err = U_ZERO_ERROR;
  buffer = malloc(l * sizeof(UChar));
  u_strFromUTF8(buffer, l, &l, input, input_l, &err);

  UBreakIterator* wordbreaks, *linebreaks;
  int32_t i, previous;
  wordbreaks = ubrk_open(UBRK_WORD, locale, buffer, l, &err);
  if(U_FAILURE(err)) {
    luaL_error(L, "Word break parser failure: %s", u_errorName(err));
  }

  linebreaks = ubrk_open(UBRK_LINE, locale, buffer, l, &err);
  if(U_FAILURE(err)) {
    luaL_error(L, "Line break parser failure: %s", u_errorName(err));
  }

  previous = 0;
  i = 0;
  while (i <= l) {
    int32_t out_l;
    int32_t type;
    if (!ubrk_isBoundary(linebreaks, i) && !ubrk_isBoundary(wordbreaks,i)) {
      i++; continue;
    }
    lua_checkstack(L, 3);
    /* At some kind of boundary */
    lua_newtable(L);
    lua_pushstring(L, "type");
    lua_pushstring(L, ubrk_isBoundary(linebreaks,i) ? "line" : "word");
    lua_settable(L, -3);

    int32_t utf8_index = 0;
    err = U_ZERO_ERROR;
    u_strToUTF8(NULL, 0, &utf8_index, buffer, i, &err);
    assert(U_SUCCESS(err) || err == U_BUFFER_OVERFLOW_ERROR);

    lua_pushstring(L, "index");
    lua_pushinteger(L, utf8_index);
    lua_settable(L, -3);

    if (ubrk_isBoundary(linebreaks, i)) {
      lua_pushstring(L, "subtype");
      type = ubrk_getRuleStatus(linebreaks);
      if (type >= UBRK_LINE_SOFT && type < UBRK_LINE_SOFT_LIMIT) {
        lua_pushstring(L, "soft");
      } else {
        lua_pushstring(L, "hard");
      }
      lua_settable(L, -3);
    }
    lua_pushstring(L, "token");
    lua_pushlstring(L, input+previous, utf8_index-previous);

    lua_settable(L, -3);

    previous = utf8_index;
    breakcount++;
    i++;
  }
  ubrk_close(wordbreaks);
  ubrk_close(linebreaks);
  return breakcount;
}

Ejemplo n.º 15

0

Mostrar archivo

Archivo: text_icu.c Proyecto: bos/text-icu

int32_t __hs_ubrk_getRuleStatus(UBreakIterator *bi)
{
    return ubrk_getRuleStatus(bi);
}