Example #1
0
/*
 * Splitter_split -- split a document and return the resulting list.
 *
 * args is the tuple (doc[, encoding]):
 *   doc       a bytes or unicode object; anything else raises TypeError.
 *   encoding  C string, default "iso-8859-15"; only consulted for bytes input.
 *
 * Bytes input with an empty or "ascii" encoding is handed to splitString()
 * as is.  Bytes input with any other encoding is decoded first and the
 * decoded object is handed to splitUnicodeString().  Unicode input is copied
 * and the copy is handed to splitUnicodeString().
 *
 * self->list is replaced by a fresh empty list before splitting.
 *
 * Returns a new reference to self->list, or NULL with an exception set.
 */
static PyObject *
Splitter_split(Splitter *self, PyObject *args)
{
    PyObject *doc;
    PyObject *fresh;
    PyObject *stale;
    char *encoding = "iso-8859-15";

    /* Parse first so that a bad call leaves the previous list untouched. */
    if (!PyArg_ParseTuple(args, "O|s", &doc, &encoding))
        return NULL;

    fresh = PyList_New(0);
    if (fresh == NULL)
        return NULL;

    /* Install the new list before releasing the old one so that self->list
       never points at an object that is being deallocated. */
    stale = self->list;
    self->list = fresh;
    Py_XDECREF(stale);

    if (PyBytes_Check(doc)) {
        if (strlen(encoding) == 0 || !strcmp(encoding, "ascii")) {
            splitString(self, doc);
        } else {
            PyObject *doc1 = PyUnicode_FromEncodedObject(doc, encoding, "strict");

            if (doc1 == NULL) {
                PyErr_SetString(PyExc_UnicodeError,"unicode conversion failed (maybe wrong encoding parameter)");
                return NULL;
            }

            splitUnicodeString(self, doc1);
            Py_DECREF(doc1);
        }
    } else if (PyUnicode_Check(doc)) {
        // create a *real* copy since we need to modify the string
        PyObject *doc1 = PyUnicode_FromUnicode(NULL, PyUnicode_GET_SIZE(doc));

        /* Allocation can fail; copying into a NULL object would crash. */
        if (doc1 == NULL)
            return NULL;

        Py_UNICODE_COPY(PyUnicode_AS_UNICODE(doc1),
                        PyUnicode_AS_UNICODE(doc),
                        PyUnicode_GET_SIZE(doc));
        splitUnicodeString(self, doc1);
        Py_DECREF(doc1);
    } else {
        PyErr_SetString(PyExc_TypeError, "first argument must be  string or unicode");
        return NULL;
    }

    /* NOTE(review): how the split helpers report failure is not visible
       from this function.  If one of them left an exception pending, a
       non-NULL result must not be returned alongside it. */
    if (PyErr_Occurred())
        return NULL;

    Py_XINCREF(self->list);

    return self->list;
}
/*
 * newSplitter -- create a Splitter object and split the given document.
 *
 * Arguments are parsed with the format "O|Osiiii" against the keyword list
 * splitter_args, filling in this order:
 *   doc            bytes-string or unicode object to split (required)
 *   synstop        optional object stored on the splitter (default: none)
 *   encoding       used to decode a non-unicode doc (default "latin1")
 *   index_numbers  0 or 1 (default 0)
 *   single_char    0 or 1 (default 0)
 *   max_len        1..128 (default 64)
 *   casefolding    0 or 1 (default 1)
 *
 * Raises ValueError for out-of-range flags, TypeError when doc is neither a
 * string nor unicode, and UnicodeError when decoding doc fails.
 *
 * Returns a new Splitter reference, or NULL with an exception set.
 */
static PyObject *
newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
{
    Splitter *self=NULL;
    PyObject *doc=NULL, *unicodedoc=NULL,*synstop=NULL;
    char *encoding = "latin1";
    int index_numbers = 0;
    int max_len=64;
    int single_char = 0;
    int casefolding=1;

    if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiiii",splitter_args,&doc,&synstop,&encoding,&index_numbers,&single_char,&max_len,&casefolding))) return NULL;

#ifdef DEBUG
    puts("got text");
    PyObject_Print(doc,stdout,0);
    fflush(stdout);
#endif

    if (index_numbers<0 || index_numbers>1) {
        PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
        return NULL;
    }

    if (casefolding<0 || casefolding>1) {
        PyErr_SetString(PyExc_ValueError,"casefolding must be 0 or 1");
        return NULL;
    }

    if (single_char<0 || single_char>1) {
        PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
        return NULL;
    }

    if (max_len<1 || max_len>128) {
        PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
        return NULL;
    }

    /* From here on unicodedoc is an owned reference (or NULL). */
    if (PyString_Check(doc)) {

        unicodedoc = PyUnicode_FromEncodedObject(doc,encoding,"strict");
        if (unicodedoc ==NULL) {
            PyErr_SetString(PyExc_UnicodeError, "Problem converting encoded string");
            return NULL;
        }

    } else if( PyUnicode_Check(doc)) {
        unicodedoc = doc;
        Py_INCREF(unicodedoc);

    } else {
        PyErr_SetString(PyExc_TypeError, "first argument is neither string nor unicode.");
        return NULL;
    }

    /* Go through the shared cleanup so unicodedoc is released when the
       allocation fails. */
    if (! (self = PyObject_NEW(Splitter, &SplitterType))) goto err;

    /* PyObject_NEW does not zero the object body.
       NOTE(review): splitUnicodeString and the type's deallocator are not
       visible here; list is cleared so that a failure before the list is
       assigned does not leave an indeterminate pointer behind for
       Py_DECREF(self) -- confirm against the deallocator. */
    self->list = NULL;

    Py_XINCREF(synstop);
    self->synstop = synstop;

    self->index_numbers      = index_numbers;
    self->max_len            = max_len;
    self->allow_single_chars = single_char;
    self->casefolding        = casefolding;

    if ((splitUnicodeString(self,(PyUnicodeObject *)unicodedoc)) < 0)
      goto err;

    Py_DECREF(unicodedoc);
    return (PyObject*)self;

err:
    /* self is still NULL when the allocation itself failed. */
    Py_XDECREF(self);
    Py_XDECREF(unicodedoc);

    return NULL;
}