// string_length {{{ static PyObject * icu_string_length(PyObject *self, PyObject *args) { int32_t sz = 0; UChar *icu = NULL; PyObject *src = NULL; if (!PyArg_ParseTuple(args, "O", &src)) return NULL; icu = python_to_icu(src, &sz, 1); if (icu == NULL) return NULL; sz = u_countChar32(icu, sz); free(icu); return Py_BuildValue("i", sz); } // }}}
// Collator.find {{{ static PyObject * icu_Collator_find(icu_Collator *self, PyObject *args) { #if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif PyObject *a_ = NULL, *b_ = NULL; UChar *a = NULL, *b = NULL; int32_t asz = 0, bsz = 0, pos = -1, length = -1; UErrorCode status = U_ZERO_ERROR; UStringSearch *search = NULL; if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL; a = python_to_icu(a_, &asz); if (a == NULL) goto end; b = python_to_icu(b_, &bsz); if (b == NULL) goto end; search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status); if (U_SUCCESS(status)) { pos = usearch_first(search, &status); if (pos != USEARCH_DONE) { length = usearch_getMatchedLength(search); #ifdef Py_UNICODE_WIDE // We have to return number of unicode characters since the string // could contain surrogate pairs which are represented as a single // character in python wide builds length = u_countChar32(b + pos, length); pos = u_countChar32(b, pos); #endif } else pos = -1; } end: if (search != NULL) usearch_close(search); if (a != NULL) free(a); if (b != NULL) free(b); return (PyErr_Occurred()) ? NULL : Py_BuildValue("ll", (long)pos, (long)length); } // }}}
// Matcher.calculate_scores {{{ static PyObject * Matcher_calculate_scores(Matcher *self, PyObject *args) { int32_t *final_positions = NULL, *p; Match *matches = NULL; bool ok = FALSE; uint32_t i = 0, needle_char_len = 0, j = 0; PyObject *items = NULL, *score = NULL, *positions = NULL, *pneedle = NULL; UChar *needle = NULL; if (!PyArg_ParseTuple(args, "O", &pneedle)) return NULL; needle = python_to_icu(pneedle, NULL); if (needle == NULL) return NULL; needle_char_len = u_countChar32(needle, -1); items = PyTuple_New(self->item_count); positions = PyTuple_New(self->item_count); matches = (Match*)calloc(self->item_count, sizeof(Match)); final_positions = (int32_t*) calloc(needle_char_len * self->item_count, sizeof(int32_t)); if (items == NULL || matches == NULL || final_positions == NULL || positions == NULL) {PyErr_NoMemory(); goto end;} for (i = 0; i < self->item_count; i++) { score = PyTuple_New(needle_char_len); if (score == NULL) { PyErr_NoMemory(); goto end; } PyTuple_SET_ITEM(positions, (Py_ssize_t)i, score); } Py_BEGIN_ALLOW_THREADS; ok = match(self->items, self->item_lengths, self->item_count, needle, matches, final_positions, needle_char_len, self->collator, self->level1, self->level2, self->level3); Py_END_ALLOW_THREADS; if (ok) { for (i = 0; i < self->item_count; i++) { score = PyFloat_FromDouble(matches[i].score); if (score == NULL) { PyErr_NoMemory(); goto end; } PyTuple_SET_ITEM(items, (Py_ssize_t)i, score); p = final_positions + (i * needle_char_len); for (j = 0; j < needle_char_len; j++) { score = PyLong_FromLong((long)p[j]); if (score == NULL) { PyErr_NoMemory(); goto end; } PyTuple_SET_ITEM(PyTuple_GET_ITEM(positions, (Py_ssize_t)i), (Py_ssize_t)j, score); } } } else { PyErr_NoMemory(); goto end; } end: nullfree(needle); nullfree(matches); nullfree(final_positions); if (PyErr_Occurred()) { Py_XDECREF(items); items = NULL; Py_XDECREF(positions); positions = NULL; return NULL; } return Py_BuildValue("NN", items, positions); } // }}}
/*
 * Score one haystack (described by m) against the needle by exploring
 * candidate match positions with an explicit stack, and return the best
 * score found (0.0 when no candidate scored above zero).
 *
 * final_positions receives the needle positions (offsets into m->haystack)
 * belonging to the best score. The working array `positions` lives directly
 * after it, at final_positions + m->needle_len, so the buffer passed in must
 * provide room for at least 2 * m->needle_len int32_t values.
 *
 * NOTE(review): `mem` is a by-value copy of the memo entry, so the
 * `mem.score = score` assignment below modifies only the local copy and is
 * never written back to m->memo. Whether mem.positions aliases storage in
 * the memo table depends on the MemoryItem definition, which is not visible
 * here -- confirm whether memoization is actually effective.
 */
static double process_item(MatchInfo *m, Stack *stack, int32_t *final_positions) {
    UChar32 nc, hc, lc;
    UChar *p;
    double final_score = 0.0, score = 0.0, score_for_char = 0.0;
    int32_t pos, i, j, hidx, nidx, last_idx, distance, *positions = final_positions + m->needle_len;
    MemoryItem mem = {0};

    // Seed the search: start of haystack, start of needle, zero score
    stack_push(stack, 0, 0, 0, 0.0, final_positions);

    while (stack->pos >= 0) {
        stack_pop(stack, &hidx, &nidx, &last_idx, &score, positions);
        mem = m->memo[hidx][nidx][last_idx];
        if (mem.score == DBL_MAX) {
            // No memoized result, calculate the score
            for (i = nidx; i < m->needle_len;) {
                nidx = i;
                U16_NEXT(m->needle, i, m->needle_len, nc); // i now points to next char in needle
                // Not enough haystack left to hold the rest of the needle
                if (m->haystack_len - hidx < m->needle_len - nidx) { score = 0.0; break; }
                p = u_strchr32(m->haystack + hidx, nc); // TODO: Use primary collation for the find
                if (p == NULL) { score = 0.0; break; }
                pos = (int32_t)(p - m->haystack);
                // Code points between the previous match and this one
                distance = u_countChar32(m->haystack + last_idx, pos - last_idx);
                if (distance <= 1) score_for_char = m->max_score_per_char;
                else {
                    U16_GET(m->haystack, 0, pos, m->haystack_len, hc);
                    j = pos;
                    U16_PREV(m->haystack, 0, j, lc); // lc is the prev character
                    score_for_char = calc_score_for_char(m, lc, hc, distance);
                }
                j = pos;
                U16_NEXT(m->haystack, j, m->haystack_len, hc);
                hidx = j; // continue searching just past the matched character
                // Queue the alternative: look for this same needle character
                // again, starting after the match just found
                if (m->haystack_len - hidx >= m->needle_len - nidx) stack_push(stack, hidx, nidx, last_idx, score, positions);
                last_idx = pos;
                positions[nidx] = pos;
                score += score_for_char;
            } // for(i) iterate over needle
            mem.score = score; memcpy(mem.positions, positions, sizeof(*positions) * m->needle_len);
        } else {
            score = mem.score; memcpy(positions, mem.positions, sizeof(*positions) * m->needle_len);
        }
        // We have calculated the score for this hidx, nidx, last_idx combination, update final_score and final_positions, if needed
        if (score > final_score) {
            final_score = score;
            memcpy(final_positions, positions, sizeof(*positions) * m->needle_len);
        }
    }
    return final_score;
}
// BreakIterator.split2 {{{ static PyObject * icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) { #if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif int32_t prev = 0, p = 0, sz = 0; PyObject *ans = NULL, *temp = NULL; ans = PyList_New(0); if (ans == NULL) return PyErr_NoMemory(); p = ubrk_first(self->break_iterator); while (p != UBRK_DONE) { prev = p; p = ubrk_next(self->break_iterator); if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev; if (sz > 0) { #ifdef Py_UNICODE_WIDE sz = u_countChar32(self->text + prev, sz); prev = u_countChar32(self->text, prev); #endif temp = Py_BuildValue("II", prev, sz); if (temp == NULL) { Py_DECREF(ans); ans = NULL; break; } if (PyList_Append(ans, temp) != 0) { Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break; } Py_DECREF(temp); } } return ans; } // }}}
/*
 * Convert match positions from offsets into `string` (counted in UChar
 * units) to code point offsets.
 *
 * positions       - byte_len entries; entries equal to -1 are skipped
 * final_positions - output array with room for char_len entries
 * score           - when exactly 0.0, every output entry is set to -1
 *
 * At most char_len entries are written.
 */
static void convert_positions(int32_t *positions, int32_t *final_positions, UChar *string, int32_t char_len, int32_t byte_len, double score) {
    int32_t src = 0, written = 0;

    if (score == 0.0) {
        /* No match at all: mark every slot as "not found". */
        while (written < char_len) {
            final_positions[written] = -1;
            written++;
        }
        return;
    }

    for (src = 0; src < byte_len && written < char_len; src++) {
        if (positions[src] != -1) {
            final_positions[written] = u_countChar32(string, positions[src]);
            written++;
        }
    }
}
// string_length {{{ static PyObject * icu_string_length(PyObject *self, PyObject *args) { #if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif int32_t sz = 0; UChar *icu = NULL; PyObject *src = NULL; if (!PyArg_ParseTuple(args, "O", &src)) return NULL; icu = python_to_icu(src, &sz, 1); if (icu == NULL) return NULL; sz = u_countChar32(icu, sz); free(icu); return Py_BuildValue("i", sz); } // }}}
/*
 * Copy match positions into final_positions, converting them to code point
 * offsets where the Python build requires it.
 *
 * positions       - byte_len entries; entries equal to -1 are skipped
 * final_positions - output array with room for char_len entries
 * score           - when exactly 0.0, every output entry is set to -1
 *
 * The offsets are translated with u_countChar32() only on Python < 3.3
 * wide-unicode builds; in every other configuration they are copied as is.
 * At most char_len entries are written.
 */
static void convert_positions(int32_t *positions, int32_t *final_positions, UChar *string, int32_t char_len, int32_t byte_len, double score) {
    int32_t src = 0, written = 0;

    if (score == 0.0) {
        /* No match at all: mark every slot as "not found". */
        while (written < char_len) {
            final_positions[written] = -1;
            written++;
        }
        return;
    }

    for (src = 0; src < byte_len && written < char_len; src++) {
        if (positions[src] == -1) {
            continue;
        }
#if PY_VERSION_HEX < 0x03030000 && defined(Py_UNICODE_WIDE)
        final_positions[written] = u_countChar32(string, positions[src]);
#else
        final_positions[written] = positions[src];
#endif
        written++;
    }
}
UnicodeSetPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status) : UPerfTest(argc, argv, options, LENGTHOF(options), unisetperf_usage, status), utf8(NULL), utf8Length(0), countInputCodePoints(0), spanCount(0) { if (U_SUCCESS(status)) { UnicodeString pattern=UnicodeString(options[SET_PATTERN].value, -1, US_INV).unescape(); set.applyPattern(pattern, status); prefrozen=set; if(0==strcmp(options[FAST_TYPE].value, "fast")) { set.freeze(); } int32_t inputLength; UPerfTest::getBuffer(inputLength, status); if(U_SUCCESS(status) && inputLength>0) { countInputCodePoints = u_countChar32(buffer, bufferLen); countSpans(); // Preflight the UTF-8 length and allocate utf8. u_strToUTF8(NULL, 0, &utf8Length, buffer, bufferLen, &status); if(status==U_BUFFER_OVERFLOW_ERROR) { utf8=(char *)malloc(utf8Length); if(utf8!=NULL) { status=U_ZERO_ERROR; u_strToUTF8(utf8, utf8Length, NULL, buffer, bufferLen, &status); } else { status=U_MEMORY_ALLOCATION_ERROR; } } if(verbose) { printf("code points:%ld len16:%ld len8:%ld spans:%ld " "cp/span:%.3g UChar/span:%.3g B/span:%.3g B/cp:%.3g\n", (long)countInputCodePoints, (long)bufferLen, (long)utf8Length, (long)spanCount, (double)countInputCodePoints/spanCount, (double)bufferLen/spanCount, (double)utf8Length/spanCount, (double)utf8Length/countInputCodePoints); } } } }
/*
 * Translate a code point offset into a UTF-16 code unit offset.
 *
 * ustring, ustring_len - the UTF-16 string and its length
 * cp_offset            - code point offset; negative values count back
 *                        from the end of the string
 * cu_offset            - in/out code unit offset. It is left untouched when
 *                        cp_offset is 0, reset to ustring_len before walking
 *                        backwards, and advanced from its incoming value
 *                        when walking forwards.
 * status               - set to U_INDEX_OUTOFBOUNDS_ERROR when cp_offset
 *                        lies outside the string
 *
 * Returns SUCCESS, or FAILURE when the offset is out of bounds.
 *
 * NOTE(review): the forward case relies on the caller having initialised
 * *cu_offset (presumably to 0) -- confirm against the callers.
 */
int utf16_cp_to_cu(const UChar *ustring, int32_t ustring_len, long cp_offset, int32_t *cu_offset, UErrorCode *status)
{
    int32_t total_code_points;
    int32_t unit_offset;

    if (cp_offset == 0) {
        return SUCCESS;
    }

    total_code_points = u_countChar32(ustring, ustring_len);

    if (cp_offset < 0) {
        if (cp_offset < -total_code_points) {
            *status = U_INDEX_OUTOFBOUNDS_ERROR;
            return FAILURE;
        }
        /* Start at the end and step back over |cp_offset| code points. */
        unit_offset = ustring_len;
        U16_BACK_N(ustring, 0, unit_offset, -cp_offset);
        *cu_offset = unit_offset;
        return SUCCESS;
    }

    if (cp_offset >= total_code_points) {
        *status = U_INDEX_OUTOFBOUNDS_ERROR;
        return FAILURE;
    }
    unit_offset = *cu_offset;
    U16_FWD_N(ustring, unit_offset, ustring_len, cp_offset);
    *cu_offset = unit_offset;
    return SUCCESS;
}
// BreakIterator.index {{{ static PyObject * icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) { #if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif UChar *buf = NULL; int32_t prev = 0, p = 0, sz = 0, tsz = 0, ans = -1; PyObject *token = NULL; if (!PyArg_ParseTuple(args, "O", &token)) return NULL; buf = python_to_icu(token, &sz, 1); if (buf == NULL) return NULL; if (sz < 1) goto end; Py_BEGIN_ALLOW_THREADS; p = ubrk_first(self->break_iterator); while (p != UBRK_DONE) { prev = p; p = ubrk_next(self->break_iterator); if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word tsz = (p == UBRK_DONE) ? self->text_len - prev : p - prev; if (sz == tsz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) { #ifdef PY_UNICODE_WIDE ans = u_countChar32(self->text, prev); #else ans = prev; #endif break; } } Py_END_ALLOW_THREADS; end: free(buf); return Py_BuildValue("i", ans); } // }}}
// BreakIterator.split2 {{{ static PyObject * icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) { #if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif int32_t word_start = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0; int is_hyphen_sep = 0, leading_hyphen = 0, trailing_hyphen = 0; UChar sep = 0; PyObject *ans = NULL, *temp = NULL, *t = NULL; ans = PyList_New(0); if (ans == NULL) return PyErr_NoMemory(); p = ubrk_first(self->break_iterator); while (p != UBRK_DONE) { word_start = p; p = ubrk_next(self->break_iterator); if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word sz = (p == UBRK_DONE) ? self->text_len - word_start : p - word_start; if (sz > 0) { // ICU breaks on words containing hyphens, we do not want that, so we recombine manually is_hyphen_sep = 0; leading_hyphen = 0; trailing_hyphen = 0; if (word_start > 0) { // Look for a leading hyphen sep = *(self->text + word_start - 1); if (IS_HYPHEN_CHAR(sep)) { leading_hyphen = 1; if (last_pos > 0 && word_start - last_pos == 1) is_hyphen_sep = 1; } } if (word_start + sz < self->text_len) { // Look for a trailing hyphen sep = *(self->text + word_start + sz); if (IS_HYPHEN_CHAR(sep)) trailing_hyphen = 1; } last_pos = p; #ifdef Py_UNICODE_WIDE sz = u_countChar32(self->text + word_start, sz); word_start = u_countChar32(self->text, word_start); #endif if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) { sz = last_sz + sz + trailing_hyphen; last_sz = sz; t = PyInt_FromLong((long)sz); if (t == NULL) { Py_DECREF(ans); ans = NULL; break; } temp = PyList_GET_ITEM(ans, PyList_GET_SIZE(ans) - 1); Py_DECREF(PyTuple_GET_ITEM(temp, 1)); PyTuple_SET_ITEM(temp, 1, t); } else { sz += leading_hyphen + trailing_hyphen; last_sz = sz; temp = Py_BuildValue("ll", (long)(word_start - leading_hyphen), (long)sz); if (temp == NULL) { Py_DECREF(ans); ans = NULL; break; } if (PyList_Append(ans, temp) != 0) { 
Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break; } Py_DECREF(temp); } } } return ans; } // }}}