/*
 * Test the "u" and "u#" format codes for PyArg_ParseTuple().
 *
 * Returns a new reference to Py_None on success, or NULL with an
 * exception set on failure.  All owned references are released on
 * every exit path (the original version leaked `tuple` on errors).
 */
static PyObject *
test_u_code(PyObject *self)
{
	PyObject *tuple, *obj;
	Py_UNICODE *value;
	int len;

	/* issue4122: Undefined reference to _Py_ascii_whitespace on Windows */
	/* Just use the macro and check that it compiles */
	int x = Py_UNICODE_ISSPACE(25);
	(void)x;  /* silence "set but not used" warnings */

	tuple = PyTuple_New(1);
	if (tuple == NULL)
		return NULL;

	obj = PyUnicode_Decode("test", strlen("test"), "ascii", NULL);
	if (obj == NULL) {
		Py_DECREF(tuple);
		return NULL;
	}
	/* PyTuple_SET_ITEM steals the reference to obj; the tuple owns it now. */
	PyTuple_SET_ITEM(tuple, 0, obj);

	/*
	 * PyArg_ParseTuple() returns 0 (false) on failure, never a negative
	 * value, so the original "< 0" checks could never detect an error.
	 */
	value = 0;
	if (!PyArg_ParseTuple(tuple, "u:test_u_code", &value)) {
		Py_DECREF(tuple);
		return NULL;
	}
	if (value != PyUnicode_AS_UNICODE(obj)) {
		Py_DECREF(tuple);
		return raiseTestError("test_u_code",
			"u code returned wrong value for u'test'");
	}

	value = 0;
	if (!PyArg_ParseTuple(tuple, "u#:test_u_code", &value, &len)) {
		Py_DECREF(tuple);
		return NULL;
	}
	if (value != PyUnicode_AS_UNICODE(obj) ||
	    len != PyUnicode_GET_SIZE(obj)) {
		Py_DECREF(tuple);
		return raiseTestError("test_u_code",
			"u# code returned wrong values for u'test'");
	}

	Py_DECREF(tuple);
	Py_INCREF(Py_None);
	return Py_None;
}
/*
 * Scan the next token starting at tokenizer->pos.
 *
 * On success, returns (a borrowed reference to) the Python type to which
 * the token should be converted, sets *start to the token's first
 * character and *end to the first character after it, NUL-terminates the
 * token in place, and advances tokenizer->pos and tokenizer->type for the
 * next call.  *start == *end == NULL means an empty token (unquoted
 * whitespace only between delimiters).  Returns NULL with StopIteration
 * set when the buffer is exhausted, or NULL with ValueError set on a
 * parse error.
 *
 * NOTE(review): tokenizer->length is used below as an end-of-buffer
 * POINTER (compared against pos), not a count — confirm against the
 * ligolw_Tokenizer struct definition.
 */
static PyObject *next_token(ligolw_Tokenizer *tokenizer, Py_UNICODE **start, Py_UNICODE **end)
{
	Py_UNICODE *pos = tokenizer->pos;
	Py_UNICODE *bailout = tokenizer->length;
	PyObject *type = *tokenizer->type;
	Py_UNICODE quote_character;

	/*
	 * The following code matches the pattern:
	 *
	 * any amount of white-space + " + non-quote characters + " + any
	 * amount of white-space + delimiter
	 *
	 * or
	 *
	 * any amount of white-space + non-white-space, non-delimiter
	 * characters + any amount of white-space + delimiter
	 *
	 * The middle bit is returned as the token.  '"' and '\' characters
	 * can be escaped by preceding them with a '\' character.
	 */

	/*
	 * start == a white-space to non-white-space transition outside of
	 * a quote, or a non-quoted to quoted transition.
	 *
	 * end == a non-white-space to white-space transition outside of a
	 * quote, or a delimiter outside of a quote, or a quoted to
	 * non-quoted transition.
	 */

	if(pos >= bailout)
		goto stop_iteration;
	/* skip leading white-space */
	while(Py_UNICODE_ISSPACE(*pos))
		if(++pos >= bailout)
			goto stop_iteration;
	if(pyunicode_strchr(tokenizer->quote_characters, *pos)) {
		/*
		 * Found a quoted token.
		 */
		int escaped = 0;
		quote_character = *pos;
		*start = ++pos;
		if(pos >= bailout)
			goto stop_iteration;
		/* advance to the closing (unescaped) quote character */
		while((*pos != quote_character) || escaped) {
			escaped = (*pos == tokenizer->escape_character) && !escaped;
			if(++pos >= bailout)
				goto stop_iteration;
		}
		*end = pos;
		if(++pos >= bailout)
			goto stop_iteration;
	} else {
		/*
		 * Found an unquoted token.
		 */
		quote_character = 0;
		*start = pos;
		while(!Py_UNICODE_ISSPACE(*pos) && (*pos != tokenizer->delimiter))
			if(++pos >= bailout)
				goto stop_iteration;
		*end = pos;
		if(*start == *end)
			/*
			 * Found nothing but unquoted whitespace between
			 * delimiters --> an empty token (not the same as a
			 * zero-length token).
			 */
			*start = *end = NULL;
	}
	/* only white-space may appear between the token and the delimiter */
	while(*pos != tokenizer->delimiter) {
		if(!Py_UNICODE_ISSPACE(*pos)) {
			parse_error(PyExc_ValueError, *start, tokenizer->length - *start - 1, pos, "expected whitespace or delimiter");
			return NULL;
		}
		if(++pos >= bailout)
			goto stop_iteration;
	}

	/*
	 * After this, tokenizer->pos points to the first character after
	 * the delimiter that terminated this current token.
	 */

	tokenizer->pos = ++pos;

	/*
	 * Select the next type
	 */

	if(++tokenizer->type >= tokenizer->types_length)
		tokenizer->type = tokenizer->types;

	/*
	 * NULL terminate the token, and if it was quoted unescape special
	 * characters.  The unescape() function modifies the token in
	 * place, so we call it after advancing tokenizer->pos and
	 * tokenizer->type so that if a failure occurs we don't leave the
	 * tokenizer pointed at a garbled string.
	 */

	if(*end)
		**end = 0;
	if(quote_character) {
		/* FIXME: remove the delimiter */
		Py_UNICODE escapable_characters[] = {quote_character, tokenizer->escape_character, tokenizer->delimiter, '\0'};
		if(unescape(*start, end, escapable_characters, tokenizer->escape_character))
			return NULL;
	}

	/*
	 * Done.  *start points to the first character of the token, *end
	 * points to the first character following the token (or both are
	 * NULL if there was nothing but unquoted whitespace),
	 * tokenizer->pos and tokenizer->type have been advanced in
	 * readiness for the next token, and the return value is the python
	 * type to which the current token is to be converted.
	 */

	return type;

	/*
	 * Errors
	 */

stop_iteration:
	advance_to_pos(tokenizer);
	PyErr_SetNone(PyExc_StopIteration);
	return NULL;
}
/*
 * Compress runs of whitespace in a unicode string: collapse consecutive
 * white-space to a single space (dropped entirely before '<' and after
 * '>').  Compression is suspended between the SC_OFF and SC_ON marker
 * sequences, which are removed from the output.
 *
 * Returns a new unicode object, or NULL with an exception set on error.
 *
 * NOTE(review): the ownership of `com` (returned by unicode_arg()) is not
 * visible here — confirm whether it must be released before returning.
 */
static PyObject *
filters_uspace_compress(PyObject * self, PyObject *args)
{
	PyObject * com;
	PyObject * res;
	Py_ssize_t len;
	Py_UNICODE *input_buffer;
	Py_UNICODE *buffer;
	Py_UNICODE c;
	/* Py_ssize_t (not int) so strings longer than INT_MAX are handled */
	Py_ssize_t ic, ib;
	int gobble = 1;

	com = unicode_arg(args);
	if(!com) {
		return NULL;
	}
	input_buffer = PyUnicode_AS_UNICODE(com);
	len = PyUnicode_GetSize(com);
	/*
	 * len + 1 elements: the loop visits the NUL at index len, and the
	 * '>' branch can emit two characters for one consumed character,
	 * so the output needs one element of slack over the input length.
	 * The original code never checked malloc() for failure.
	 */
	buffer = (Py_UNICODE*)malloc((len + 1) * sizeof(Py_UNICODE));
	if(buffer == NULL) {
		return PyErr_NoMemory();
	}

	/* ic -> input buffer index, ib -> output buffer index */
	for(ic = 0, ib = 0; ic <= len; ic++) {
		c = input_buffer[ic];
		/* gobble -> we are space compressing */
		if(gobble) {
			/* remove spaces if encountered */
			if(Py_UNICODE_ISSPACE(c)) {
				/* after this loop, c will be a non-space
				 * (the terminating NUL stops the loop) */
				while(Py_UNICODE_ISSPACE(c)) {
					c = input_buffer[++ic];
				}
				/* unless next char is a <, add a single
				 * space to account for the multiple spaces
				 * that have been removed */
				if(c != (Py_UNICODE)('<')) {
					buffer[ib++] = (Py_UNICODE)(' ');
				}
			}
			/* gobble all space after '>' */
			if(c == (Py_UNICODE)('>')) {
				buffer[ib++] = c;
				c = input_buffer[++ic];
				while(Py_UNICODE_ISSPACE(c)) {
					c = input_buffer[++ic];
				}
			}
			/* does the next part of the string match the SC_OFF
			 * label */
			if (len - ic >= SC_OFF_LEN &&
			    memcmp(&input_buffer[ic], SC_OFF_U,
				   sizeof(Py_UNICODE)*SC_OFF_LEN) == 0) {
				/* disable gobbling, and bypass that part of
				 * the string */
				gobble = 0;
				ic += SC_OFF_LEN;
				c = input_buffer[ic];
			}
		}
		/* not gobbling, but find the SC_ON tag */
		else if (len - ic >= SC_ON_LEN &&
			 memcmp(&input_buffer[ic], SC_ON_U,
				sizeof(Py_UNICODE)*SC_ON_LEN) == 0) {
			gobble = 1;
			ic += SC_ON_LEN;
			c = input_buffer[ic];
		}
		/* the NUL terminator is never copied to the output */
		if(c) {
			buffer[ib++] = c;
		}
	}
	res = PyUnicode_FromUnicode(buffer, ib);
	free(buffer);
	return res;
}