Beispiel #1
0
int splitUnicodeString(Splitter *self,PyObject *doc)
{
    PyObject *word ;
    Py_UNICODE *s;
    int i, inside_word=0, start=0, len;
    register int value, next_value;

    s = PyUnicode_AS_UNICODE(doc);       // start of unicode string
    len = PyUnicode_GET_SIZE(doc);


    for (i=0; i<len; i++,s++) {
        register Py_UNICODE c;

        c = *s;

        if (self->casefolding)
            *s = Py_UNICODE_TOLOWER(c);

        value = inode_get(self, c);

        if (value == MISS ) {
            // cache miss

            value = Py_UNICODE_ISALNUM(c) ? IS_ALNUM : IS_TRASH;
            inode_set(self, c, value);
        }

        if (!inside_word) {
            if (value != IS_TRASH ) {
                start = i;
                inside_word = 1;
            }
        } else {

            if (value == IS_SEPARATOR) {
                register Py_UNICODE next_c = *(s+1);

                next_value = inode_get(self, next_c);

                if (next_value == MISS ) {
                    // cache miss

                    next_value = Py_UNICODE_ISALNUM(next_c) ? IS_ALNUM : IS_TRASH;
                    inode_set(self, next_c, next_value);
                }

                if (next_value == IS_TRASH) {
                    if (! (i-start<2 && ! self->single_chars)) {
                        word = Py_BuildValue("u#", s-(i-start), min(i-start, self->max_len));
                        PyList_Append(self->list, word);
                        Py_XDECREF(word);
                    }
                    start = i;
                    inside_word = 0;
                }

            }

            else if (value==IS_TRASH) {
                if (! (i-start<2 && ! self->single_chars)) {
                    word = Py_BuildValue("u#", s-(i-start), min(i-start, self->max_len));
                    PyList_Append(self->list, word);
                    Py_XDECREF(word);
                }
                start = i;
                inside_word = 0;
            }
        }
    }

    if (inside_word) {
        if (! (i-start<2 && ! self->single_chars)) {
            word = Py_BuildValue("u#", s-(i-start), min(i-start, self->max_len));
            PyList_Append(self->list, word);
            Py_XDECREF(word);
        }
    }

    return 1;
}
Beispiel #2
0
/* BKE_text.h */
/*
 * Unicode-aware identifier-character test.
 *
 * Returns 1 when `ch` is below 255 and text_check_identifier() accepts it
 * (after narrowing to char), or when Py_UNICODE_ISALNUM() reports it as
 * alphanumeric; returns 0 otherwise.
 */
int text_check_identifier_unicode(const unsigned int ch)
{
	/* Narrow code points are first offered to the byte-oriented check. */
	if (ch < 255 && text_check_identifier((char)ch)) {
		return 1;
	}

	return Py_UNICODE_ISALNUM(ch) ? 1 : 0;
}
static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
{

    PyObject *word,*synword;
    PyUnicodeObject * doc1;
    Py_UNICODE *s;

    int len = doc->length;
    int inside_word=0;
    int i=0;
    int start=0;

    doc1 = prepareString(self,doc);
    if (doc1 == NULL)
      return -1;

    s=doc1->str;

    self->list = PyList_New(0);

    for (i = 0; i < len; s++, i++) {
        register Py_UNICODE ch;

        ch = *s;

        if (!inside_word) {
            if (self->index_numbers) {
                if (Py_UNICODE_ISALNUM(ch)) {
                    inside_word=1;
                    start = i;
                }

            } else {
                if (Py_UNICODE_ISALPHA(ch)) {
                    inside_word=1;
                    start = i;
                }
            }
        } else {

            if (!(Py_UNICODE_ISALNUM(ch) || ch=='/' || ch=='_' || ch=='-')) {
                inside_word = 0;

                word = PySequence_GetSlice((PyObject *)doc1,start,
                                           min(i, start + self->max_len));
                if (word==NULL)
                  goto err;

                synword = checkSynword(self,word);
                if (synword != Py_None) {
                  PyList_Append(self->list,synword);
                }

                start =  0;
#ifdef DEBUG
                PyObject_Print(word,stdout,0);
                fflush(stdout);
#endif
                Py_DECREF(word);
            }
        }
    }

    if (inside_word) {
        word = PySequence_GetSlice((PyObject *)doc1,start,
                                   min(len, start + self->max_len));
        if (word==NULL)
          goto err;

        synword = checkSynword(self,word);
        if (synword != Py_None) {
          PyList_Append(self->list,synword);
        }

        Py_DECREF(word);
    }

#ifdef DEBUG
    PyObject_Print(self->list,stdout,0);
    fflush(stdout);
#endif

    Py_DECREF(doc1);
    return 1;

 err:
    Py_DECREF(doc1);
    return -1;
}