static const char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    const char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyString_AsString(utf8);
    }
#endif
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyString_AsString(utf8);
    }
#endif
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
Beispiel #2
0
int is_utf16le(char *to_check)
{
    int *bom = malloc(2 * sizeof(int));
    bom[0] = 0xFF;
    bom[1] = 0xFE;
    
    return check_bom(to_check, bom);
}
Beispiel #3
0
int is_utf8(char *to_check)
{
    int *bom = malloc(3 * sizeof(int));
    bom[0] = 0xEF;
    bom[1] = 0xBB;
    bom[2] = 0xBF;
    
    return check_bom(to_check, bom);
}
Beispiel #4
0
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
	char *line = NULL;
	int badchar = 0;
	for (;;) {
		if (tok->decoding_state < 0) {
			/* We already have a codec associated with
			   this input. */
			line = fp_readl(s, size, tok);
			break;
		} else if (tok->decoding_state > 0) {
			/* We want a 'raw' read. */
			line = Py_UniversalNewlineFgets(s, size,
							tok->fp, NULL);
			break;
		} else {
			/* We have not yet determined the encoding.
			   If an encoding is found, use the file-pointer
			   reader functions from now on. */
			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
				return error_ret(tok);
			assert(tok->decoding_state != 0);
		}
	}
	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
			return error_ret(tok);
		}
	}
#ifndef PGEN
	/* The default encoding is ASCII, so make sure we don't have any
           non-ASCII bytes in it. */
	if (line && !tok->encoding) {
		unsigned char *c;
		for (c = (unsigned char *)line; *c; c++)
			if (*c > 127) {
				badchar = *c;
				break;
			}
	}
	if (badchar) {
		char buf[500];
		/* Need to add 1 to the line number, since this line
		   has not been counted, yet.  */
		sprintf(buf,
			"Non-ASCII character '\\x%.2x' "
			"in file %.200s on line %i, "
			"but no encoding declared; "
			"see http://www.python.org/peps/pep-0263.html for details",
			badchar, tok->filename, tok->lineno + 1);
		PyErr_SetString(PyExc_SyntaxError, buf);
		return error_ret(tok);
	}
#endif
	return line;
}
Beispiel #5
0
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    for (;;) {
        if (tok->decoding_state == STATE_NORMAL) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state == STATE_RAW) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != STATE_INIT);
        }
    }
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        int length;
        for (c = (unsigned char *)line; *c; c += length)
            if (!(length = valid_utf8(c))) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        /* Need to add 1 to the line number, since this line
           has not been counted, yet.  */
        PyErr_Format(PyExc_SyntaxError,
                "Non-UTF-8 code starting with '\\x%.2x' "
                "in file %U on line %i, "
                "but no encoding declared; "
                "see http://python.org/dev/peps/pep-0263/ for details",
                badchar, tok->filename, tok->lineno + 1);
        return error_ret(tok);
    }
#endif
    return line;
}
Beispiel #6
0
int is_utf32le(char *to_check)
{
    int *bom = malloc(4 * sizeof(int));
    bom[0] = 0x00;
    bom[1] = 0x00;
    bom[2] = 0xFF;
    bom[3] = 0xFE;
    
    return check_bom(to_check, bom);
}