Example #1
0
File: icu.c Project: IvoNet/calibre
// string_length {{{
/*
 * string_length(src) -> int
 *
 * Converts `src` to a UChar buffer with python_to_icu() and returns the
 * number of code points u_countChar32() finds in it.
 * Returns NULL if argument parsing or the conversion fails.
 */
static PyObject *
icu_string_length(PyObject *self, PyObject *args) {
    PyObject *input = NULL;
    UChar *buf = NULL;
    int32_t unit_count = 0, cp_count = 0;

    if (!PyArg_ParseTuple(args, "O", &input)) {
        return NULL;
    }

    buf = python_to_icu(input, &unit_count, 1);
    if (!buf) {
        return NULL;
    }

    // The buffer is only needed for counting; release it before building the result
    cp_count = u_countChar32(buf, unit_count);
    free(buf);

    return Py_BuildValue("i", cp_count);
} // }}}
Example #2
0
// Collator.find {{{
/*
 * Collator.find(a, b) -> (pos, length)
 *
 * Opens an ICU string search on self->collator with `a` as the pattern and
 * `b` as the text, and reports the first match as (position, matched length).
 * The result is (-1, -1) when there is no match or the search could not be
 * opened. Returns NULL if a Python error is pending when the function exits.
 */
static PyObject *
icu_Collator_find(icu_Collator *self, PyObject *args) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif
    PyObject *py_pattern = NULL, *py_text = NULL;
    UChar *pattern = NULL, *text = NULL;
    int32_t pattern_sz = 0, text_sz = 0;
    int32_t match_pos = -1, match_len = -1;
    UErrorCode status = U_ZERO_ERROR;
    UStringSearch *searcher = NULL;

    if (!PyArg_ParseTuple(args, "OO", &py_pattern, &py_text)) return NULL;

    // Convert the text only if the pattern converted successfully
    pattern = python_to_icu(py_pattern, &pattern_sz);
    if (pattern != NULL) {
        text = python_to_icu(py_text, &text_sz);
    }

    if (pattern != NULL && text != NULL) {
        searcher = usearch_openFromCollator(pattern, pattern_sz, text, text_sz, self->collator, NULL, &status);
        if (U_SUCCESS(status)) {
            match_pos = usearch_first(searcher, &status);
            if (match_pos == USEARCH_DONE) {
                match_pos = -1;
            } else {
                match_len = usearch_getMatchedLength(searcher);
#ifdef Py_UNICODE_WIDE
                // Wide python builds represent a surrogate pair as one character,
                // so report the match in code points rather than UChar units
                match_len = u_countChar32(text + match_pos, match_len);
                match_pos = u_countChar32(text, match_pos);
#endif
            }
        }
    }

    // Common cleanup; free(NULL) is a no-op
    if (searcher != NULL) usearch_close(searcher);
    free(pattern);
    free(text);

    if (PyErr_Occurred()) {
        return NULL;
    }
    return Py_BuildValue("ll", (long)match_pos, (long)match_len);
} // }}}
Example #3
0
// Matcher.calculate_scores {{{
/*
 * Matcher.calculate_scores(needle) -> (scores, positions)
 *
 * Runs match() for `needle` against all items held by this Matcher and
 * returns a 2-tuple:
 *   scores    - tuple of item_count floats, taken from matches[i].score
 *   positions - tuple of item_count tuples, each holding needle_char_len
 *               ints copied from the final_positions array
 * Returns NULL on failure. A false return from match() is reported as
 * MemoryError.
 */
static PyObject *
Matcher_calculate_scores(Matcher *self, PyObject *args) {
    int32_t *final_positions = NULL, *p;
    Match *matches = NULL;
    bool ok = FALSE;
    uint32_t i = 0, needle_char_len = 0, j = 0;
    size_t num_matches = 0, num_positions = 0;
    PyObject *items = NULL, *score = NULL, *positions = NULL, *pneedle = NULL;
    UChar *needle = NULL;

    if (!PyArg_ParseTuple(args, "O", &pneedle)) return NULL;

    needle = python_to_icu(pneedle, NULL);
    if (needle == NULL) return NULL;
    needle_char_len = u_countChar32(needle, -1);
    items = PyTuple_New(self->item_count);
    positions = PyTuple_New(self->item_count);

    // calloc() is allowed to return NULL for a zero element count, which would
    // be misreported as out-of-memory below when there are no items or the
    // needle is empty. Always request at least one element. The product is
    // computed in size_t instead of uint32_t.
    num_matches = (size_t)self->item_count;
    num_positions = (size_t)needle_char_len * (size_t)self->item_count;
    matches = (Match*)calloc(num_matches ? num_matches : 1, sizeof(Match));
    final_positions = (int32_t*) calloc(num_positions ? num_positions : 1, sizeof(int32_t));
    if (items == NULL || matches == NULL || final_positions == NULL || positions == NULL) {PyErr_NoMemory(); goto end;}

    // Pre-create the per-item position tuples while the GIL is still held
    for (i = 0; i < self->item_count; i++) {
        score = PyTuple_New(needle_char_len);
        if (score == NULL) { PyErr_NoMemory(); goto end; }
        PyTuple_SET_ITEM(positions, (Py_ssize_t)i, score);  // steals the reference
    }

    // The matching itself touches no Python objects, so run it without the GIL
    Py_BEGIN_ALLOW_THREADS;
    ok = match(self->items, self->item_lengths, self->item_count, needle, matches, final_positions, needle_char_len, self->collator, self->level1, self->level2, self->level3);
    Py_END_ALLOW_THREADS;

    if (ok) {
        for (i = 0; i < self->item_count; i++) {
            score = PyFloat_FromDouble(matches[i].score);
            if (score == NULL) { PyErr_NoMemory(); goto end; }
            PyTuple_SET_ITEM(items, (Py_ssize_t)i, score);
            // Row i of final_positions holds needle_char_len entries for item i
            p = final_positions + (i * needle_char_len);
            for (j = 0; j < needle_char_len; j++) {
                score = PyLong_FromLong((long)p[j]);
                if (score == NULL) { PyErr_NoMemory(); goto end; }
                PyTuple_SET_ITEM(PyTuple_GET_ITEM(positions, (Py_ssize_t)i), (Py_ssize_t)j, score);
            }
        }
    } else { PyErr_NoMemory(); goto end; }

end:
    nullfree(needle);
    nullfree(matches);
    nullfree(final_positions);
    // On any error drop the partially built results; "N" below hands the references over on success
    if (PyErr_Occurred()) { Py_XDECREF(items); items = NULL; Py_XDECREF(positions); positions = NULL; return NULL; }
    return Py_BuildValue("NN", items, positions);
} // }}}
Example #4
0
/*
 * Compute the best score for matching m->needle against m->haystack.
 *
 * Explores candidate alignments with an explicit stack of
 * (hidx, nidx, last_idx, score, positions) states instead of recursion.
 * Whenever a needle character is matched, an alternative state that resumes
 * the haystack search just past that match is pushed, provided enough
 * haystack remains.
 *
 * final_positions receives the haystack offsets (one per needle position) of
 * the best-scoring alignment found. The working `positions` array is taken
 * from the memory immediately after the first m->needle_len entries of
 * final_positions, so the caller must supply a buffer that large.
 *
 * Returns the highest score found; 0.0 if no alignment scored above zero.
 */
static double process_item(MatchInfo *m, Stack *stack, int32_t *final_positions) {
    UChar32 nc, hc, lc;
    UChar *p;
    double final_score = 0.0, score = 0.0, score_for_char = 0.0;
    int32_t pos, i, j, hidx, nidx, last_idx, distance, *positions = final_positions + m->needle_len;
    MemoryItem mem = {0};

    // Seed the search: start of haystack, start of needle, zero score
    stack_push(stack, 0, 0, 0, 0.0, final_positions);

    while (stack->pos >= 0) {
        stack_pop(stack, &hidx, &nidx, &last_idx, &score, positions);
        mem = m->memo[hidx][nidx][last_idx];
        if (mem.score == DBL_MAX) {
            // No memoized result, calculate the score
            for (i = nidx; i < m->needle_len;) {
                nidx = i;
                U16_NEXT(m->needle, i, m->needle_len, nc); // i now points to next char in needle 
                // Not enough haystack left to fit the rest of the needle
                if (m->haystack_len - hidx < m->needle_len - nidx) { score = 0.0; break; }
                p = u_strchr32(m->haystack + hidx, nc);  // TODO: Use primary collation for the find
                if (p == NULL) { score = 0.0; break; }
                pos = (int32_t)(p - m->haystack);
                // Number of code points between the previous match and this one
                distance = u_countChar32(m->haystack + last_idx, pos - last_idx);  
                if (distance <= 1) score_for_char = m->max_score_per_char;
                else {
                    U16_GET(m->haystack, 0, pos, m->haystack_len, hc); 
                    j = pos;
                    U16_PREV(m->haystack, 0, j, lc); // lc is the prev character
                    score_for_char = calc_score_for_char(m, lc, hc, distance);
                }
                // Advance hidx past the matched haystack character
                j = pos;
                U16_NEXT(m->haystack, j, m->haystack_len, hc); 
                hidx = j;
                // Queue an alternative: look for this same needle char again further along the haystack
                if (m->haystack_len - hidx >= m->needle_len - nidx) stack_push(stack, hidx, nidx, last_idx, score, positions);
                last_idx = pos; 
                positions[nidx] = pos; 
                score += score_for_char;
            } // for(i) iterate over needle
            // NOTE(review): mem is a by-value copy of the memo entry, so this score
            // assignment does not update m->memo. Whether the memcpy reaches the
            // shared entry depends on whether MemoryItem.positions is a pointer or
            // an inline array -- confirm against MemoryItem's definition.
            mem.score = score; memcpy(mem.positions, positions, sizeof(*positions) * m->needle_len);

        } else {
            score = mem.score; memcpy(positions, mem.positions, sizeof(*positions) * m->needle_len);
        }
        // We have calculated the score for this hidx, nidx, last_idx combination, update final_score and final_positions, if needed
        if (score > final_score) {
            final_score = score;
            memcpy(final_positions, positions, sizeof(*positions) * m->needle_len);
        }
    }
    return final_score;
}
Example #5
0
File: icu.c Project: IvoNet/calibre
// BreakIterator.split2 {{{
/*
 * BreakIterator.split2() -> list of (start, size) tuples
 *
 * Walks the boundaries reported by self->break_iterator and appends one
 * (start, size) pair for every non-empty segment. For word iterators,
 * segments whose rule status is UBRK_WORD_NONE are skipped. On wide python
 * builds both numbers are converted to code point counts.
 * Returns NULL if the list or one of its entries cannot be created.
 */
static PyObject *
icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif

    int32_t start = 0, next = 0, len = 0;
    int append_rc = 0;
    PyObject *result = NULL, *pair = NULL;

    result = PyList_New(0);
    if (result == NULL) return PyErr_NoMemory();

    for (next = ubrk_first(self->break_iterator); next != UBRK_DONE; ) {
        start = next;
        next = ubrk_next(self->break_iterator);

        // We are not at the start of a word
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) {
            continue;
        }

        // The final segment runs to the end of the text
        if (next == UBRK_DONE) {
            len = self->text_len - start;
        } else {
            len = next - start;
        }
        if (len <= 0) {
            continue;
        }

#ifdef Py_UNICODE_WIDE
        len = u_countChar32(self->text + start, len);
        start = u_countChar32(self->text, start);
#endif
        pair = Py_BuildValue("II", start, len);
        if (pair == NULL) {
            Py_DECREF(result);
            result = NULL;
            break;
        }

        // The list takes its own reference, so ours is dropped either way
        append_rc = PyList_Append(result, pair);
        Py_DECREF(pair);
        if (append_rc != 0) {
            Py_DECREF(result);
            result = NULL;
            break;
        }
    }

    return result;

} // }}}
Example #6
0
/*
 * Fill final_positions (char_len entries) from the sparse positions array.
 *
 * A score of exactly 0.0 means "no match": every entry is set to -1.
 * Otherwise each entry of positions that is not -1 is an offset into string;
 * it is converted to a code point count with u_countChar32() and stored in
 * the next free slot. Stops after byte_len inputs or char_len outputs,
 * whichever comes first.
 */
static void convert_positions(int32_t *positions, int32_t *final_positions, UChar *string, int32_t char_len, int32_t byte_len, double score) {
    int32_t src = 0, dst = 0;

    if (score == 0.0) {
        while (dst < char_len) {
            final_positions[dst] = -1;
            dst++;
        }
        return;
    }

    for (src = 0; src < byte_len && dst < char_len; src++) {
        if (positions[src] != -1) {
            final_positions[dst] = u_countChar32(string, positions[src]);
            dst++;
        }
    }
}
Example #7
0
// string_length {{{
/*
 * string_length(src) -> int
 *
 * Returns the number of code points in `src`, counted with u_countChar32()
 * over the UChar buffer produced by python_to_icu().
 * Returns NULL if argument parsing or the conversion fails.
 */
static PyObject *
icu_string_length(PyObject *self, PyObject *args) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif

    PyObject *py_text = NULL;
    UChar *units = NULL;
    int32_t n_units = 0;
    int32_t n_code_points = 0;

    if (!PyArg_ParseTuple(args, "O", &py_text)) return NULL;

    units = python_to_icu(py_text, &n_units, 1);
    if (units == NULL) return NULL;

    // Count first, then release the temporary buffer
    n_code_points = u_countChar32(units, n_units);
    free(units);

    return Py_BuildValue("i", n_code_points);
} // }}}
Example #8
0
/*
 * Fill final_positions (char_len entries) from the sparse positions array.
 *
 * A score of exactly 0.0 means "no match": every entry is set to -1.
 * Otherwise each entry of positions that is not -1 is copied to the next free
 * output slot. Only on python < 3.3 wide builds is the offset first converted
 * to a code point count with u_countChar32(); all other builds store it as is.
 */
static void convert_positions(int32_t *positions, int32_t *final_positions, UChar *string, int32_t char_len, int32_t byte_len, double score) {
    int32_t src = 0, written = 0;

    if (score == 0.0) {
        for (src = 0; src < char_len; src++) {
            final_positions[src] = -1;
        }
        return;
    }

    for (src = 0; src < byte_len && written < char_len; src++) {
        const int32_t offset = positions[src];
        if (offset == -1) {
            continue;
        }
#if PY_VERSION_HEX < 0x03030000 && defined(Py_UNICODE_WIDE)
        final_positions[written] = u_countChar32(string, offset);
#else
        final_positions[written] = offset;
#endif
        written++;
    }
}
Example #9
0
    UnicodeSetPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status)
            : UPerfTest(argc, argv, options, LENGTHOF(options), unisetperf_usage, status),
              utf8(NULL), utf8Length(0), countInputCodePoints(0), spanCount(0) {
        if (U_SUCCESS(status)) {
            UnicodeString pattern=UnicodeString(options[SET_PATTERN].value, -1, US_INV).unescape();
            set.applyPattern(pattern, status);
            prefrozen=set;
            if(0==strcmp(options[FAST_TYPE].value, "fast")) {
                set.freeze();
            }

            int32_t inputLength;
            UPerfTest::getBuffer(inputLength, status);
            if(U_SUCCESS(status) && inputLength>0) {
                countInputCodePoints = u_countChar32(buffer, bufferLen);

                countSpans();

                // Preflight the UTF-8 length and allocate utf8.
                u_strToUTF8(NULL, 0, &utf8Length, buffer, bufferLen, &status);
                if(status==U_BUFFER_OVERFLOW_ERROR) {
                    utf8=(char *)malloc(utf8Length);
                    if(utf8!=NULL) {
                        status=U_ZERO_ERROR;
                        u_strToUTF8(utf8, utf8Length, NULL, buffer, bufferLen, &status);
                    } else {
                        status=U_MEMORY_ALLOCATION_ERROR;
                    }
                }

                if(verbose) {
                    printf("code points:%ld  len16:%ld  len8:%ld  spans:%ld  "
                           "cp/span:%.3g  UChar/span:%.3g  B/span:%.3g  B/cp:%.3g\n",
                           (long)countInputCodePoints, (long)bufferLen, (long)utf8Length, (long)spanCount,
                           (double)countInputCodePoints/spanCount, (double)bufferLen/spanCount, (double)utf8Length/spanCount,
                           (double)utf8Length/countInputCodePoints);
                }
            }
        }
    }
Example #10
0
/*
 * Convert a code point offset into a UTF-16 code unit offset.
 *
 * ustring / ustring_len: the UTF-16 text and its length in code units.
 * cp_offset:  offset in code points; a negative value counts back from the
 *             end of the string.
 * cu_offset:  receives the matching code unit offset on success; left
 *             untouched on failure.
 * status:     set to U_INDEX_OUTOFBOUNDS_ERROR when cp_offset is out of range.
 *
 * Accepted range is -count <= cp_offset < count, where count is the number
 * of code points in the string; cp_offset == 0 is always accepted.
 * Returns SUCCESS or FAILURE.
 *
 * *cu_offset is written on every successful path. Previously it was left
 * unchanged for cp_offset == 0 and used as the starting point of the forward
 * walk, so the result depended on the caller having zeroed it beforehand.
 */
int utf16_cp_to_cu(const UChar *ustring, int32_t ustring_len, long cp_offset, int32_t *cu_offset, UErrorCode *status)
{
    int32_t cp_count;

    if (0 == cp_offset) {
        *cu_offset = 0;
        return SUCCESS;
    }

    cp_count = u_countChar32(ustring, ustring_len);
    if (cp_offset < 0) {
        if (cp_offset < -cp_count) {
            *status = U_INDEX_OUTOFBOUNDS_ERROR;
            return FAILURE;
        }
        // Walk backwards from the end of the string
        *cu_offset = ustring_len;
        U16_BACK_N(ustring, 0, *cu_offset, -cp_offset);
    } else {
        if (cp_offset >= cp_count) {
            *status = U_INDEX_OUTOFBOUNDS_ERROR;
            return FAILURE;
        }
        // Walk forwards from the start of the string
        *cu_offset = 0;
        U16_FWD_N(ustring, *cu_offset, ustring_len, cp_offset);
    }

    return SUCCESS;
}
Example #11
0
File: icu.c Project: IvoNet/calibre
// BreakIterator.index {{{
/*
 * BreakIterator.index(token) -> int
 *
 * Iterates over the segments of self->text produced by self->break_iterator
 * and returns the start of the first segment whose UChar contents equal
 * `token`, or -1 if there is none (an empty token also gives -1). For word
 * iterators, segments whose rule status is UBRK_WORD_NONE are skipped.
 * On wide python builds the result is a code point index; otherwise it is
 * the UChar offset.
 * Returns NULL if argument parsing or the conversion of `token` fails.
 */
static PyObject *
icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
#if PY_VERSION_HEX >= 0x03030000 
#error Not implemented for python >= 3.3
#endif

    UChar *buf = NULL;
    int32_t prev = 0, p = 0, sz = 0, tsz = 0, ans = -1;
    PyObject *token = NULL;
  
    if (!PyArg_ParseTuple(args, "O", &token)) return NULL;
    buf = python_to_icu(token, &sz, 1);
    if (buf == NULL) return NULL;
    if (sz < 1) goto end;

    // The scan below touches no Python objects, so it runs without the GIL
    Py_BEGIN_ALLOW_THREADS;
    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
        prev = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) 
            continue;  // We are not at the start of a word
        tsz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
        if (sz == tsz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) { 
            // The macro is spelled Py_UNICODE_WIDE; the previous PY_UNICODE_WIDE
            // spelling was never defined, so the conversion to a code point
            // index was silently skipped on wide builds.
#ifdef Py_UNICODE_WIDE
            ans = u_countChar32(self->text, prev);
#else
            ans = prev; 
#endif
            break;
        }
    }
    Py_END_ALLOW_THREADS;

end:
    free(buf);
    return Py_BuildValue("i", ans);

} // }}}
Example #12
0
// BreakIterator.split2 {{{
/*
 * BreakIterator.split2() -> list of (start, size) tuples
 *
 * Walks the boundaries reported by self->break_iterator and records one
 * (start, size) pair per non-empty segment. For word iterators, segments
 * whose rule status is UBRK_WORD_NONE are skipped. Segments separated by a
 * single hyphen character are merged into the previous entry, and leading or
 * trailing hyphens are included in the reported span. On wide python builds
 * positions and sizes are converted to code point counts.
 * Returns NULL if the list or one of its entries cannot be created.
 */
static PyObject *
icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif

    int32_t word_start = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0;
    int is_hyphen_sep = 0, leading_hyphen = 0, trailing_hyphen = 0;
    UChar sep = 0;
    PyObject *ans = NULL, *temp = NULL, *t = NULL;

    ans = PyList_New(0);
    if (ans == NULL) return PyErr_NoMemory();

    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
        word_start = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
            continue;  // We are not at the start of a word
        // The final segment runs to the end of the text
        sz = (p == UBRK_DONE) ? self->text_len - word_start : p - word_start;
        if (sz > 0) {
            // ICU breaks on words containing hyphens, we do not want that, so we recombine manually
            is_hyphen_sep = 0; leading_hyphen = 0; trailing_hyphen = 0;
            if (word_start > 0) { // Look for a leading hyphen
                sep = *(self->text + word_start - 1);
                if (IS_HYPHEN_CHAR(sep)) {
                    leading_hyphen = 1;
                    // Exactly one UChar (the hyphen) lies between the end of the
                    // previously recorded segment and this one: treat it as a joiner
                    if (last_pos > 0 && word_start - last_pos == 1) is_hyphen_sep = 1;
                }
            }
            if (word_start + sz < self->text_len) { // Look for a trailing hyphen
                sep = *(self->text + word_start + sz);
                if (IS_HYPHEN_CHAR(sep)) trailing_hyphen = 1;
            }
            // Remember the end of this segment in UChar units; this must happen
            // before word_start and sz are converted to code point counts below
            last_pos = p;
#ifdef Py_UNICODE_WIDE
            sz = u_countChar32(self->text + word_start, sz);
            word_start = u_countChar32(self->text, word_start);
#endif
            if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) {
                // Extend the previous entry instead of adding a new one
                sz = last_sz + sz + trailing_hyphen;
                last_sz = sz;
                t = PyInt_FromLong((long)sz);
                if (t == NULL) { Py_DECREF(ans); ans = NULL; break; }
                // temp is a borrowed reference to the last tuple in the list
                temp = PyList_GET_ITEM(ans, PyList_GET_SIZE(ans) - 1);
                // PyTuple_SET_ITEM does not release the item it overwrites, so
                // drop the old size object explicitly before storing the new one
                Py_DECREF(PyTuple_GET_ITEM(temp, 1));
                PyTuple_SET_ITEM(temp, 1, t);
            } else {
                // New entry; widen it to cover any adjacent hyphens
                sz += leading_hyphen + trailing_hyphen;
                last_sz = sz;
                temp = Py_BuildValue("ll", (long)(word_start - leading_hyphen), (long)sz);
                if (temp == NULL) {
                    Py_DECREF(ans); ans = NULL; break;
                }
                if (PyList_Append(ans, temp) != 0) {
                    Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break;
                }
                // The list holds its own reference now
                Py_DECREF(temp);
            }
        }
    }

    return ans;

} // }}}