static const char * decode_str(const char *input, int single, struct tok_state *tok) { PyObject* utf8 = NULL; const char *str; const char *s; const char *newl[2] = {NULL, NULL}; int lineno = 0; tok->input = str = translate_newlines(input, single, tok); if (str == NULL) return NULL; tok->enc = NULL; tok->str = str; if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) return error_ret(tok); str = tok->str; /* string after BOM if any */ assert(str); #ifdef Py_USING_UNICODE if (tok->enc != NULL) { utf8 = translate_into_utf8(str, tok->enc); if (utf8 == NULL) return error_ret(tok); str = PyString_AsString(utf8); } #endif for (s = str;; s++) { if (*s == '\0') break; else if (*s == '\n') { assert(lineno < 2); newl[lineno] = s; lineno++; if (lineno == 2) break; } } tok->enc = NULL; /* need to check line 1 and 2 separately since check_coding_spec assumes a single line as input */ if (newl[0]) { if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) return error_ret(tok); if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) { if (!check_coding_spec(newl[0]+1, newl[1] - newl[0], tok, buf_setreadl)) return error_ret(tok); } } #ifdef Py_USING_UNICODE if (tok->enc != NULL) { assert(utf8 == NULL); utf8 = translate_into_utf8(str, tok->enc); if (utf8 == NULL) return error_ret(tok); str = PyString_AsString(utf8); } #endif assert(tok->decoding_buffer == NULL); tok->decoding_buffer = utf8; /* CAUTION */ return str; }
int is_utf16le(char *to_check) { int *bom = malloc(2 * sizeof(int)); bom[0] = 0xFF; bom[1] = 0xFE; return check_bom(to_check, bom); }
int is_utf8(char *to_check) { int *bom = malloc(3 * sizeof(int)); bom[0] = 0xEF; bom[1] = 0xBB; bom[2] = 0xBF; return check_bom(to_check, bom); }
static char * decoding_fgets(char *s, int size, struct tok_state *tok) { char *line = NULL; int badchar = 0; for (;;) { if (tok->decoding_state < 0) { /* We already have a codec associated with this input. */ line = fp_readl(s, size, tok); break; } else if (tok->decoding_state > 0) { /* We want a 'raw' read. */ line = Py_UniversalNewlineFgets(s, size, tok->fp, NULL); break; } else { /* We have not yet determined the encoding. If an encoding is found, use the file-pointer reader functions from now on. */ if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) return error_ret(tok); assert(tok->decoding_state != 0); } } if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) { if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) { return error_ret(tok); } } #ifndef PGEN /* The default encoding is ASCII, so make sure we don't have any non-ASCII bytes in it. */ if (line && !tok->encoding) { unsigned char *c; for (c = (unsigned char *)line; *c; c++) if (*c > 127) { badchar = *c; break; } } if (badchar) { char buf[500]; /* Need to add 1 to the line number, since this line has not been counted, yet. */ sprintf(buf, "Non-ASCII character '\\x%.2x' " "in file %.200s on line %i, " "but no encoding declared; " "see http://www.python.org/peps/pep-0263.html for details", badchar, tok->filename, tok->lineno + 1); PyErr_SetString(PyExc_SyntaxError, buf); return error_ret(tok); } #endif return line; }
static char * decoding_fgets(char *s, int size, struct tok_state *tok) { char *line = NULL; int badchar = 0; for (;;) { if (tok->decoding_state == STATE_NORMAL) { /* We already have a codec associated with this input. */ line = fp_readl(s, size, tok); break; } else if (tok->decoding_state == STATE_RAW) { /* We want a 'raw' read. */ line = Py_UniversalNewlineFgets(s, size, tok->fp, NULL); break; } else { /* We have not yet determined the encoding. If an encoding is found, use the file-pointer reader functions from now on. */ if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) return error_ret(tok); assert(tok->decoding_state != STATE_INIT); } } if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) { if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) { return error_ret(tok); } } #ifndef PGEN /* The default encoding is UTF-8, so make sure we don't have any non-UTF-8 sequences in it. */ if (line && !tok->encoding) { unsigned char *c; int length; for (c = (unsigned char *)line; *c; c += length) if (!(length = valid_utf8(c))) { badchar = *c; break; } } if (badchar) { /* Need to add 1 to the line number, since this line has not been counted, yet. */ PyErr_Format(PyExc_SyntaxError, "Non-UTF-8 code starting with '\\x%.2x' " "in file %U on line %i, " "but no encoding declared; " "see http://python.org/dev/peps/pep-0263/ for details", badchar, tok->filename, tok->lineno + 1); return error_ret(tok); } #endif return line; }
int is_utf32le(char *to_check) { int *bom = malloc(4 * sizeof(int)); bom[0] = 0x00; bom[1] = 0x00; bom[2] = 0xFF; bom[3] = 0xFE; return check_bom(to_check, bom); }