Example #1
/* Test the u and u# codes for PyArg_ParseTuple. May leak memory in case
   of an error.
*/
static PyObject *
test_u_code(PyObject *self)
{
	PyObject *tuple, *obj;
	Py_UNICODE *value;
	int len;

	/* issue4122: Undefined reference to _Py_ascii_whitespace on Windows */
	/* Just use the macro and check that it compiles */
	int x = Py_UNICODE_ISSPACE(25);

	tuple = PyTuple_New(1);
	if (tuple == NULL)
		return NULL;

	obj = PyUnicode_Decode("test", strlen("test"),
			       "ascii", NULL);
	if (obj == NULL)
		return NULL;

	PyTuple_SET_ITEM(tuple, 0, obj);

	value = 0;
	if (!PyArg_ParseTuple(tuple, "u:test_u_code", &value))
		return NULL;
	if (value != PyUnicode_AS_UNICODE(obj))
		return raiseTestError("test_u_code",
			"u code returned wrong value for u'test'");
	value = 0;
	if (!PyArg_ParseTuple(tuple, "u#:test_u_code", &value, &len))
		return NULL;
	if (value != PyUnicode_AS_UNICODE(obj) ||
	    len != PyUnicode_GET_SIZE(obj))
		return raiseTestError("test_u_code",
			"u# code returned wrong values for u'test'");

	Py_DECREF(tuple);
	Py_INCREF(Py_None);
	return Py_None;
}
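A function with this (PyObject *self) signature takes no Python-level arguments, so it would normally be exposed through a METH_NOARGS entry in the module's method table. A minimal registration sketch, assuming a table name of TestMethods (the table and module names are not part of the excerpt above):

/* Hypothetical method-table entry for the test function above; the
 * (PyCFunction) cast matches the METH_NOARGS calling convention. */
static PyMethodDef TestMethods[] = {
	{"test_u_code", (PyCFunction)test_u_code, METH_NOARGS, NULL},
	{NULL, NULL, 0, NULL}	/* sentinel */
};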
Example #2
static PyObject *next_token(ligolw_Tokenizer *tokenizer, Py_UNICODE **start, Py_UNICODE **end)
{
	Py_UNICODE *pos = tokenizer->pos;
	Py_UNICODE *bailout = tokenizer->length;
	PyObject *type = *tokenizer->type;
	Py_UNICODE quote_character;

	/*
	 * The following code matches the pattern:
	 *
	 * any amount of white-space + " + non-quote characters + " + any
	 * amount of white-space + delimiter
	 *
	 * or
	 *
	 * any amount of white-space + non-white-space, non-delimiter
	 * characters + any amount of white-space + delimiter
	 *
	 * The middle bit is returned as the token.  '"' and '\' characters
	 * can be escaped by preceding them with a '\' character.
	 */

	/*
	 * start == a white-space to non-white-space transition outside of
	 * a quote, or a non-quoted to quoted transition.
	 *
	 * end == a non-white-space to white-space transition outside of a
	 * quote, or a delimiter outside of a quote, or a quoted to
	 * non-quoted transition.
	 */

	if(pos >= bailout)
		goto stop_iteration;
	while(Py_UNICODE_ISSPACE(*pos))
		if(++pos >= bailout)
			goto stop_iteration;
	if(pyunicode_strchr(tokenizer->quote_characters, *pos)) {
		/*
		 * Found a quoted token.
		 */

		int escaped = 0;

		quote_character = *pos;

		*start = ++pos;
		if(pos >= bailout)
			goto stop_iteration;
		while((*pos != quote_character) || escaped) {
			escaped = (*pos == tokenizer->escape_character) && !escaped;
			if(++pos >= bailout)
				goto stop_iteration;
		}
		*end = pos;
		if(++pos >= bailout)
			goto stop_iteration;
	} else {
		/*
		 * Found an unquoted token.
		 */

		quote_character = 0;

		*start = pos;
		while(!Py_UNICODE_ISSPACE(*pos) && (*pos != tokenizer->delimiter))
			if(++pos >= bailout)
				goto stop_iteration;
		*end = pos;
		if(*start == *end)
			/*
			 * Found nothing but unquoted whitespace between
			 * delimiters --> an empty token (not the same as a
			 * zero-length token).
			 */

			*start = *end = NULL;
	}
	while(*pos != tokenizer->delimiter) {
		if(!Py_UNICODE_ISSPACE(*pos)) {
			parse_error(PyExc_ValueError, *start, tokenizer->length - *start - 1, pos, "expected whitespace or delimiter");
			return NULL;
		}
		if(++pos >= bailout)
			goto stop_iteration;
	}

	/*
	 * After this, tokenizer->pos points to the first character after
	 * the delimiter that terminated this current token.
	 */

	tokenizer->pos = ++pos;

	/*
	 * Select the next type
	 */

	if(++tokenizer->type >= tokenizer->types_length)
		tokenizer->type = tokenizer->types;

	/*
	 * NULL terminate the token, and if it was quoted unescape special
	 * characters.  The unescape() function modifies the token in
	 * place, so we call it after advancing tokenizer->pos and
	 * tokenizer->type so that if a failure occurs we don't leave the
	 * tokenizer pointed at a garbled string.
	 */

	if(*end)
		**end = 0;
	if(quote_character) {
		/* FIXME:  remove the delimiter */
		Py_UNICODE escapable_characters[] = {quote_character, tokenizer->escape_character, tokenizer->delimiter, '\0'};
		if(unescape(*start, end, escapable_characters, tokenizer->escape_character))
			return NULL;
	}

	/*
	 * Done.  *start points to the first character of the token, *end
	 * points to the first character following the token (or both are
	 * NULL if there was nothing but unquoted whitespace),
	 * tokenizer->pos and tokenizer->type have been advanced in
	 * readiness for the next token, and the return value is the python
	 * type to which the current token is to be converted.
	 */

	return type;

	/*
	 * Errors
	 */

stop_iteration:
	advance_to_pos(tokenizer);
	PyErr_SetNone(PyExc_StopIteration);
	return NULL;
}
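The block comments above spell out the scanning pattern (optional whitespace, a quoted or unquoted token, optional whitespace, a delimiter). For experimenting with that pattern outside the tokenizer, here is a minimal standalone sketch in plain C, using wchar_t in place of Py_UNICODE and ignoring escape handling, empty-token bookkeeping, and type rotation; all names are illustrative and not part of the ligolw module:

#include <stdio.h>
#include <wchar.h>
#include <wctype.h>

/* Scan one token starting at s.  Writes the token bounds to *start/*end
 * and returns a pointer one past the terminating delimiter, or NULL when
 * no complete token+delimiter remains. */
static const wchar_t *scan_token(const wchar_t *s, wchar_t delim, wchar_t quote,
                                 const wchar_t **start, const wchar_t **end)
{
	while(iswspace(*s))
		s++;
	if(*s == quote) {
		/* quoted token:  everything between the quote characters */
		*start = ++s;
		while(*s && *s != quote)
			s++;
		if(!*s)
			return NULL;	/* unterminated quote */
		*end = s++;
	} else {
		/* unquoted token:  up to the next space or delimiter */
		*start = s;
		while(*s && !iswspace(*s) && *s != delim)
			s++;
		*end = s;
	}
	while(iswspace(*s))
		s++;
	return *s == delim ? s + 1 : NULL;
}

int main(void)
{
	const wchar_t *s = L"  \"a b\" , 12 ,";
	const wchar_t *start, *end;
	while((s = scan_token(s, L',', L'"', &start, &end)))
		wprintf(L"token: [%.*ls]\n", (int)(end - start), start);
	return 0;
}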
Example #3
static PyObject *
filters_uspace_compress(PyObject * self, PyObject *args) {
  PyObject * com;
  PyObject * res;
  Py_ssize_t len;
  Py_UNICODE *input_buffer;
  Py_UNICODE *buffer;
  Py_UNICODE c;
  int ic, ib;
  int gobble = 1;
  com = unicode_arg(args);
  if(!com) {
    return NULL;
  }
  input_buffer = PyUnicode_AS_UNICODE(com);
  len = PyUnicode_GetSize(com);
  buffer = (Py_UNICODE*)malloc(len * sizeof(Py_UNICODE));
  if(!buffer) {
    return PyErr_NoMemory();
  }

  /* ic -> input buffer index, ib -> output buffer */
  for(ic = 0, ib = 0; ic <= len; ic++) {
    c = input_buffer[ic];
    /* gobble -> we are space compressing */
    if(gobble) {
      /* remove spaces if encountered */
      if(Py_UNICODE_ISSPACE(c)) {
        /* after this loop, c will be a non-space */
        while(Py_UNICODE_ISSPACE(c)) { c = input_buffer[++ic]; }
        /* unless next char is a <, add a single space to account for
           the multiple spaces that have been removed */
        if(c != (Py_UNICODE)('<')) {
          buffer[ib++] = (Py_UNICODE)(' ');
        }
      }
      /* gobble all space after '>' */
      if(c == (Py_UNICODE)('>')) {
        buffer[ib++] = c;
        c = input_buffer[++ic];
        while(Py_UNICODE_ISSPACE(c)) { c = input_buffer[++ic]; }
      }
      /* does the next part of the string match the SC_OFF label */
      if (len - ic >= SC_OFF_LEN &&
          memcmp(&input_buffer[ic], SC_OFF_U, 
                 sizeof(Py_UNICODE)*SC_OFF_LEN) == 0) {
        /* disable gobbling, and bypass that part of the string */
        gobble = 0;
        ic += SC_OFF_LEN;
        c = input_buffer[ic];
      }
    }
    /* not gobbling, but find the SC_ON tag */
    else if (len - ic >= SC_ON_LEN &&
          memcmp(&input_buffer[ic], SC_ON_U, 
                 sizeof(Py_UNICODE)*SC_ON_LEN) == 0) {
        gobble = 1;
        ic += SC_ON_LEN;
        c = input_buffer[ic];
    }
    if(c) {
      buffer[ib++] = c;
    }
  }  

  res = PyUnicode_FromUnicode(buffer, ib);
  free(buffer);
  return res;
}
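This example leans on a few definitions that live elsewhere in the module: the SC_OFF_U/SC_ON_U marker strings (with their SC_OFF_LEN/SC_ON_LEN lengths) that switch space compression off and on, and a unicode_arg() helper that pulls the unicode argument out of the argument tuple. The sketch below shows one plausible shape for those definitions, purely as an assumption to make the code above easier to read; the real marker text and argument handling may differ:

/* ASSUMED definitions -- placeholder marker text, not the module's actual tags. */
static const Py_UNICODE SC_OFF_U[] = {'<', 's', 'c', '_', 'o', 'f', 'f', '>'};
static const Py_UNICODE SC_ON_U[]  = {'<', 's', 'c', '_', 'o', 'n', '>'};
#define SC_OFF_LEN (sizeof(SC_OFF_U) / sizeof(SC_OFF_U[0]))
#define SC_ON_LEN  (sizeof(SC_ON_U) / sizeof(SC_ON_U[0]))

/* ASSUMED helper:  return the single unicode argument (borrowed reference),
 * or NULL with an exception set if the argument is not a unicode object. */
static PyObject *unicode_arg(PyObject *args)
{
  PyObject *obj;
  if(!PyArg_ParseTuple(args, "U", &obj))
    return NULL;
  return obj;
}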