int splitUnicodeString(Splitter *self,PyObject *doc) { PyObject *word ; Py_UNICODE *s; int i, inside_word=0, start=0, len; register int value, next_value; s = PyUnicode_AS_UNICODE(doc); // start of unicode string len = PyUnicode_GET_SIZE(doc); for (i=0; i<len; i++,s++) { register Py_UNICODE c; c = *s; if (self->casefolding) *s = Py_UNICODE_TOLOWER(c); value = inode_get(self, c); if (value == MISS ) { // cache miss value = Py_UNICODE_ISALNUM(c) ? IS_ALNUM : IS_TRASH; inode_set(self, c, value); } if (!inside_word) { if (value != IS_TRASH ) { start = i; inside_word = 1; } } else { if (value == IS_SEPARATOR) { register Py_UNICODE next_c = *(s+1); next_value = inode_get(self, next_c); if (next_value == MISS ) { // cache miss next_value = Py_UNICODE_ISALNUM(next_c) ? IS_ALNUM : IS_TRASH; inode_set(self, next_c, next_value); } if (next_value == IS_TRASH) { if (! (i-start<2 && ! self->single_chars)) { word = Py_BuildValue("u#", s-(i-start), min(i-start, self->max_len)); PyList_Append(self->list, word); Py_XDECREF(word); } start = i; inside_word = 0; } } else if (value==IS_TRASH) { if (! (i-start<2 && ! self->single_chars)) { word = Py_BuildValue("u#", s-(i-start), min(i-start, self->max_len)); PyList_Append(self->list, word); Py_XDECREF(word); } start = i; inside_word = 0; } } } if (inside_word) { if (! (i-start<2 && ! self->single_chars)) { word = Py_BuildValue("u#", s-(i-start), min(i-start, self->max_len)); PyList_Append(self->list, word); Py_XDECREF(word); } } return 1; }
/* BKE_text.h */
/*
 * Return non-zero (1) when the code point `ch` counts as an identifier
 * character, 0 otherwise.
 *
 * Code points below 255 are first narrowed to `char` and offered to
 * text_check_identifier(); whatever that test rejects, and every larger
 * code point, is accepted only if Py_UNICODE_ISALNUM() says so.
 */
int text_check_identifier_unicode(const unsigned int ch)
{
    if (ch < 255 && text_check_identifier((char)ch)) {
        return 1;
    }

    /* Normalise to 0/1, matching the result of a logical-or expression. */
    return Py_UNICODE_ISALNUM(ch) ? 1 : 0;
}
/*
 * Split a unicode document into words and collect them in a fresh list
 * stored in self->list.
 *
 * The document is first passed through prepareString(); the scan then walks
 * the prepared copy. A word starts at an alphanumeric character when
 * self->index_numbers is set, otherwise only at an alphabetic one. It
 * continues while characters are alphanumeric or one of '/', '_', '-'.
 * Each word is taken as a slice of the prepared string, limited to
 * self->max_len characters, and handed to checkSynword(); the result is
 * appended to the list unless it is Py_None.
 *
 * NOTE(review): the scan length is taken from `doc`, not from the prepared
 * copy `doc1` -- this assumes prepareString() preserves the length; confirm.
 * NOTE(review): reference ownership of the value returned by checkSynword()
 * is not visible here and is left exactly as it was -- confirm.
 *
 * Returns 1 on success, -1 on failure (prepareString, list creation,
 * slicing or appending failed). The prepared copy is released on every
 * path after it has been obtained.
 */
static int splitUnicodeString(Splitter *self, PyUnicodeObject *doc)
{
    PyObject *word, *synword;
    PyUnicodeObject *doc1;
    Py_UNICODE *s;
    int len = doc->length;
    int inside_word = 0;
    int i = 0;
    int start = 0;

    doc1 = prepareString(self, doc);
    if (doc1 == NULL)
        return -1;
    s = doc1->str;

    self->list = PyList_New(0);
    if (self->list == NULL)  /* appending to a NULL list would crash */
        goto err;

    for (i = 0; i < len; s++, i++) {
        Py_UNICODE ch = *s;

        if (!inside_word) {
            /* Digits may open a word only when numbers are indexed. */
            if (self->index_numbers) {
                if (Py_UNICODE_ISALNUM(ch)) {
                    inside_word = 1;
                    start = i;
                }
            } else {
                if (Py_UNICODE_ISALPHA(ch)) {
                    inside_word = 1;
                    start = i;
                }
            }
            continue;
        }

        /* Inside a word: these characters keep it going. */
        if (Py_UNICODE_ISALNUM(ch) || ch == '/' || ch == '_' || ch == '-')
            continue;

        /* Word ended just before index i. */
        inside_word = 0;

        word = PySequence_GetSlice((PyObject *)doc1, start,
                                   min(i, start + self->max_len));
        if (word == NULL)
            goto err;

        synword = checkSynword(self, word);
        /* A NULL synword makes PyList_Append fail, which is caught here. */
        if (synword != Py_None && PyList_Append(self->list, synword) < 0) {
            Py_DECREF(word);
            goto err;
        }

#ifdef DEBUG
        PyObject_Print(word, stdout, 0);
        fflush(stdout);
#endif

        Py_DECREF(word);
    }

    /* Flush a word that runs up to the end of the document. */
    if (inside_word) {
        word = PySequence_GetSlice((PyObject *)doc1, start,
                                   min(len, start + self->max_len));
        if (word == NULL)
            goto err;

        synword = checkSynword(self, word);
        if (synword != Py_None && PyList_Append(self->list, synword) < 0) {
            Py_DECREF(word);
            goto err;
        }

        Py_DECREF(word);
    }

#ifdef DEBUG
    PyObject_Print(self->list, stdout, 0);
    fflush(stdout);
#endif

    Py_DECREF(doc1);
    return 1;

err:
    Py_DECREF(doc1);
    return -1;
}