/* Read one line of source into S (at most SIZE bytes), decoding it
   according to the encoding state recorded in TOK.

   On the first calls the encoding is still unknown: check_bom() /
   check_coding_spec() are used to detect a BOM or a PEP 263 coding
   comment and, once found, switch TOK over to the file-pointer reader
   functions.  Returns the line, or error_ret(tok) on failure. */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    /* Loop until one of the two reader states produces a line. */
    for (;;) {
        if (tok->decoding_state == STATE_NORMAL) {
            /* We already have a codec associated with this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state == STATE_RAW) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size, tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != STATE_INIT);
        }
    }
    /* A coding spec may only appear on the first two lines of the file;
       stop looking once one has been read. */
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        int length;
        /* valid_utf8() yields the length of the sequence starting at c,
           or 0 if the byte does not begin a valid UTF-8 sequence. */
        for (c = (unsigned char *)line; *c; c += length)
            if (!(length = valid_utf8(c))) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        /* Need to add 1 to the line number, since this line
           has not been counted, yet. */
        PyErr_Format(PyExc_SyntaxError,
                     "Non-UTF-8 code starting with '\\x%.2x' "
                     "in file %U on line %i, "
                     "but no encoding declared; "
                     "see http://python.org/dev/peps/pep-0263/ for details",
                     badchar, tok->filename, tok->lineno + 1);
        return error_ret(tok);
    }
#endif
    return line;
}
inline bool ValidUTF8(const char* begin, const char* end) { #if !OGLPLUS_NO_UTF8_CHECKS UTF8Validator valid_utf8; return valid_utf8(begin, end); #else OGLPLUS_FAKE_USE(begin); OGLPLUS_FAKE_USE(end); return true; #endif }
/// @brief Tells whether the iterator range [begin, end) is valid UTF-8.
///
/// When OGLPLUS_NO_UTF8_CHECKS is set the check is compiled out and the
/// function unconditionally reports success.
inline bool ValidUTF8(Iterator begin, Iterator end)
{
#if !OGLPLUS_NO_UTF8_CHECKS
	// Delegate the actual validation to a throw-away validator instance.
	return UTF8Validator<Iterator>()(begin, end);
#else
	OGLPLUS_FAKE_USE(begin);
	OGLPLUS_FAKE_USE(end);
	return true;
#endif
}
/* * Takes a string of UTF8-encoded data and strips all characters we cannot use. * * - multibyte characters we can display are mapped over * - all other multibyte characters are stripped * - characters we cannot handle are turned into nulls * * Also see: http://en.wikipedia.org/wiki/Utf8 */ char *utf8_strip(const char *dirty) { // optimistic result: we end up with the same string uint8_t length = strlen(dirty), ci = 0, di = 0; unsigned char bytes = 0; unsigned char current = 0x00, next = 0x00; char *cleaned = ALLOC_STR(length); // iterate character by character and replace it for (di = 0, ci = 0; di < length; di++) { current = dirty[di]; if ( ! valid_utf8(current)) // invalid byte { continue; } else if ( ! is_ascii(current)) // multibyte { if (current == 0xC3 && (di + 1) < length) // might be åäöÅÄÖ, they all are 0xC3xx { next = dirty[++di]; // we consume the next character if (is_ascii(next)) // somehow, next byte is ascii (invalid utf8), so abort { // we cannot safely map the next byte in our charmap as it’ll collide // with the ascii characters which might be bad! continue; } else { current = next; } } else // skip all the additional bytes { bytes = (current & 0xF0); // 1111 xxxx while (bytes <<= 1) di += 1; current = '\0'; // let charmap handle it } } cleaned[ci++] = charmap[current]; } return cleaned; }
/*
 * Load the string table for language `lang`.
 *
 * lang/English.txt is always parsed first so every token has a fallback
 * value; the requested language is then parsed on top of it, overwriting
 * the records in s_token_map.  Returns false if either file is missing,
 * is not valid UTF-8, or if some token ends up with no string in either
 * file.  For lang == "English" success additionally requires that the
 * English file defined every known token.
 */
bool LoadStrings(const std::string &lang)
{
	int errline;
	// `seen`: tokens found in the file currently being parsed.
	// `missing`: tokens the English file did NOT provide; entries are erased
	// again if the translation provides them, so what remains at the end is
	// "defined in neither file".
	std::set<std::string> seen, missing;

	ResetStringData();

	std::string filename = "lang/English.txt";
	RefCountedPtr<FileSystem::FileData> english_data = FileSystem::gameDataFiles.ReadFile(filename);
	if (!english_data) {
		fprintf(stderr, "couldn't open string file '%s'\n", filename.c_str());
		return false;
	}

	// valid_utf8 returns the (1-based) line of the first bad sequence, 0 if clean.
	errline = valid_utf8(english_data->AsStringRange());
	if (errline) {
		fprintf(stderr, "invalid UTF-8 code in line %d of '%s'\n", errline, filename.c_str());
		return false;
	}

	// First pass: fill every known token's record from the English file.
	seen.clear();
	for (StringFileParser parser(filename, english_data->AsStringRange()); !parser.Finished(); parser.Next()) {
		const std::string token = parser.GetToken().ToString();
		token_map::iterator it = s_token_map.find(token);
		if (it != s_token_map.end()) {
			seen.insert(token);
			const std::string &text = parser.GetAdjustedText();
			if (text.size() >= size_t(STRING_RECORD_SIZE))
				fprintf(stderr, "WARNING: language text is too long -- it will be cut off!\n");
			// XXX const_cast is ugly, but see note for declaration of tokens map
			char *record = const_cast<char*>(it->second);
			copy_string(record, text.c_str(), text.size(), STRING_RECORD_SIZE);
		} else {
			fprintf(stderr, "unknown language token '%s' at %s:%d\n", token.c_str(), parser.GetFileName().c_str(), parser.GetTokenLineNumber());
		}
	}

	english_data.Reset();

	// Record (and report) every token English failed to provide.
	if (seen.size() != s_token_map.size()) {
		fprintf(stderr, "string file '%s' has missing tokens:\n", filename.c_str());
		for (token_map::iterator it = s_token_map.begin(); it != s_token_map.end(); ++it) {
			if (!seen.count(it->first)) {
				fprintf(stderr, "	%s\n", it->first.c_str());
				missing.insert(it->first);
			}
		}
	}

	// English itself requested: succeed only if it was complete.
	if (lang == "English")
		return (seen.size() == s_token_map.size());

	filename = "lang/" + lang + ".txt";
	RefCountedPtr<FileSystem::FileData> lang_data = FileSystem::gameDataFiles.ReadFile(filename);
	if (!lang_data) {
		fprintf(stderr, "couldn't open string file '%s'\n", filename.c_str());
		return false;
	}

	errline = valid_utf8(lang_data->AsStringRange());
	if (errline) {
		fprintf(stderr, "invalid UTF-8 code in line %d of '%s'\n", errline, filename.c_str());
		return false;
	}

	// Second pass: overwrite records with the requested language's strings.
	seen.clear();
	for (StringFileParser parser(filename, lang_data->AsStringRange()); !parser.Finished(); parser.Next()) {
		const std::string token = parser.GetToken().ToString();
		token_map::iterator it = s_token_map.find(token);
		if (it != s_token_map.end()) {
			seen.insert(token);
			const std::string &text = parser.GetAdjustedText();
			if (text.size() >= size_t(STRING_RECORD_SIZE))
				fprintf(stderr, "WARNING: language text is too long -- it will be cut off!\n");
			// XXX const_cast is ugly, but see note for declaration of tokens map
			char *record = const_cast<char*>(it->second);
			copy_string(record, text.c_str(), text.size(), STRING_RECORD_SIZE);
		} else {
			fprintf(stderr, "unknown language token '%s' at %s:%d\n", token.c_str(), parser.GetFileName().c_str(), parser.GetTokenLineNumber());
		}
	}

	// Report tokens the translation lacks; tokens it DOES provide are removed
	// from `missing` (they were only missing from English, which is now fine).
	if (seen.size() != s_token_map.size()) {
		fprintf(stderr, "string file '%s' has missing tokens:\n", filename.c_str());
		for (token_map::iterator it = s_token_map.begin(); it != s_token_map.end(); ++it) {
			if (!seen.count(it->first)) {
				fprintf(stderr, "	%s\n", it->first.c_str());
			} else {
				missing.erase(it->first);
			}
		}
	}

	// Anything left in `missing` has no string in either file -- hard error.
	if (!missing.empty()) {
		fprintf(stderr, "no strings found for the following tokens:\n");
		for (std::set<std::string>::iterator it = missing.begin(); it != missing.end(); ++it) {
			fprintf(stderr, "	%s\n", it->c_str());
		}
		return false;
	}

	return true;
}
/*
 * Read the literal pool of a compiled file into h->literals / h->n_literals.
 *
 * The pool starts h->size 32-bit words past `oldpos` and is processed in
 * three passes:
 *   1. walk the pool to count the literals (n);
 *   2. materialise each literal into arr[i], leaving 24-bit indices in
 *      lists and stashing the raw key/value area pointer in a dict's ->map;
 *   3. patch list elements and dict entries from indices into real V values.
 * All multi-byte fields are big-endian (ntohl/ntohll).
 *
 * Returns true on success; on malformed input sets an error message and
 * returns false.
 *
 * NOTE(review): termination of pass 1 relies on the global `eofreached`
 * flag, presumably set by whatever mapped/reads the file — confirm; the
 * `size` parameter is not used for bounds checking here.
 */
bool read_literals(char *oldpos, size_t size, Header* h)
{
	int i, j;
	int n = 0;                 /* number of literals found in pass 1 */
	char type;                 /* literal type tag */
	uint32_t str_length;       /* generic length/count field (also reused in pass 3) */
	uint32_t ref;              /* 24-bit big-endian index into arr */
	char *startpos = oldpos + h->size * 4;
	char *curpos = startpos;

	/* Pass 1: count literals, skipping over each one's payload.
	   NOTE(review): an unknown tag falls through the switch without
	   advancing past any payload, which would desynchronize this scan;
	   pass 2 is where unknown tags are actually rejected. */
	while (!eofreached) {
		type = *curpos++;
		if (eofreached) {
			break;
		}
		n++;
		switch (type) {
			case TYPE_NUM:
				/* 64-bit big-endian double */
				curpos += 8;
				break;
			case TYPE_NUM | TYPE_SHORT:
				/* 24-bit integer */
				curpos += 3;
				break;
			case TYPE_STR:
			case TYPE_IDENT:
				/* 32-bit length + that many bytes */
				memcpy(&str_length, curpos, 4);
				curpos += 4 + ntohl(str_length);
				break;
			case TYPE_STR | TYPE_SHORT:
			case TYPE_IDENT | TYPE_SHORT:
				/* 8-bit length + that many bytes */
				str_length = (unsigned char)*curpos++;
				curpos += str_length;
				break;
			case TYPE_PAIR:
				/* two 24-bit refs */
				curpos += 6;
				break;
			case TYPE_FRAC:
				/* 64-bit numerator + 64-bit denominator */
				curpos += 16;
				break;
			case TYPE_FRAC | TYPE_SHORT:
				/* 8-bit numerator + 8-bit denominator */
				curpos += 2;
				break;
			case TYPE_LIST:
				/* 32-bit count + count 24-bit refs */
				memcpy(&str_length, curpos, 4);
				curpos += 4 + 3 * ntohl(str_length);
				break;
			case TYPE_DICT:
				/* 32-bit count + count (24-bit key ref + 24-bit value ref) pairs */
				memcpy(&str_length, curpos, 4);
				curpos += 4 + 6 * ntohl(str_length);
				break;
		}
	}
	/* NOTE(review): calloc result is not checked; n == 0 may also return NULL. */
	V* arr = calloc(n, sizeof(V));
	V t;
	curpos = startpos;
	/* Pass 2: decode each literal into arr[i]. */
	for (i = 0; i < n; i++) {
		type = *curpos++;
		if (type == TYPE_NUM) {
			union double_or_uint64_t d;
			memcpy(&d, curpos, 8);
			curpos += 8;
			d.i = ntohll(d.i);
			t = double_to_value(d.d);
		}
		else if (type == (TYPE_NUM | TYPE_SHORT)) {
			/* 24-bit big-endian int: copy into the top 3 bytes, then swap */
			ref = 0;
			memcpy(((char*)&ref) + 1, curpos, 3);
			ref = ntohl(ref);
			curpos += 3;
			t = int_to_value(ref);
		}
		else if (type == TYPE_STR) {
			memcpy(&str_length, curpos, 4);
			curpos += 4;
			str_length = ntohl(str_length);
			if (!valid_utf8(str_length, curpos)) {
				set_error_msg("wrong encoding for string literal, should be UTF-8");
				return false;
			}
			t = str_to_string(str_length, curpos);
			curpos += str_length;
		}
		else if (type == TYPE_IDENT) {
			memcpy(&str_length, curpos, 4);
			curpos += 4;
			str_length = ntohl(str_length);
			/* VLA: NUL-terminated copy for the identifier lookup */
			char data[str_length + 1];
			memcpy(&data, curpos, str_length);
			data[str_length] = '\0';
			t = lookup_ident(str_length, data);
			curpos += str_length;
		}
		else if (type == (TYPE_STR | TYPE_SHORT)) {
			str_length = (unsigned char)*curpos++;
			if (!valid_utf8(str_length, curpos)) {
				set_error_msg("wrong encoding for string literal, should be UTF-8");
				return false;
			}
			t = str_to_string(str_length, curpos);
			curpos += str_length;
		}
		else if (type == (TYPE_IDENT | TYPE_SHORT)) {
			str_length = *curpos++;
			char data[str_length + 1];
			memcpy(&data, curpos, str_length);
			data[str_length] = '\0';
			t = lookup_ident(str_length, data);
			curpos += str_length;
		}
		else if (type == TYPE_PAIR) {
			/* Pairs may only reference literals that were decoded earlier. */
			ref = 0;
			memcpy(((char*)&ref) + 1, curpos, 3);
			ref = ntohl(ref);
			if (ref >= i) {
				set_error_msg("illegal pair detected");
				return false;
			}
			V v1 = arr[ref];
			ref = 0;
			memcpy(((char*)&ref) + 1, curpos + 3, 3);
			ref = ntohl(ref);
			if (ref >= i) {
				set_error_msg("illegal pair detected");
				return false;
			}
			V v2 = arr[ref];
			t = new_pair(v1, v2);
			curpos += 6;
		}
		else if (type == TYPE_FRAC) {
			int64_t numer;
			int64_t denom;
			memcpy(&numer, curpos, 8);
			numer = ntohll(numer);
			memcpy(&denom, curpos + 8, 8);
			denom = ntohll(denom);
			t = new_frac(numer, denom);
			curpos += 16;
		}
		else if (type == (TYPE_FRAC | TYPE_SHORT)) {
			int8_t numer;
			uint8_t denom;
			numer = *curpos++;
			denom = *curpos++;
			t = new_frac(numer, denom);
		}
		else if (type == TYPE_LIST) {
			memcpy(&str_length, curpos, 4);
			str_length = ntohl(str_length);
			t = new_list();
			curpos += 4;
			if (str_length > 0) {
				/* capacity: next power of two >= element count (min 64);
				   note this `size` shadows the function parameter */
				uint32_t size = 64;
				while (size < str_length) size <<= 1;
				toStack(t)->size = size;
				toStack(t)->used = str_length;
				toStack(t)->nodes = calloc(size, sizeof(V));
				/* Store raw indices for now; pass 3 resolves them to values. */
				for (j = 0; j < str_length; j++) {
					ref = 0;
					memcpy(((char*)&ref) + 1, curpos, 3);
					ref = ntohl(ref);
					toStack(t)->nodes[j] = intToV((uint64_t)ref);
					curpos += 3;
				}
			}
		}
		else if (type == TYPE_DICT) {
			memcpy(&str_length, curpos, 4);
			curpos += 4;
			str_length = ntohl(str_length);
			t = new_dict();
			if (str_length > 0) {
				/* capacity: next power of two >= entry count (min 16) */
				uint32_t size = 16;
				while (size < str_length) size <<= 1;
				toHashMap(t)->size = size;
				toHashMap(t)->used = str_length;
				/* Stash the raw key/value ref area in ->map; pass 3 replaces
				   it with a real hash map built from these refs. */
				toHashMap(t)->map = (Bucket**)curpos;
			}
			curpos += 6 * str_length;
		}
		else {
			set_error_msg("Unknown literal type.");
			return false;
		}
		arr[i] = t;
	}
	/* Pass 3: now that all literals exist, patch indices into values. */
	for (i = 0; i < n; i++) {
		t = arr[i];
		switch(getType(t)) {
			case TYPE_LIST:
				for (j = 0; j < toStack(t)->used; j++) {
					toStack(t)->nodes[j] = arr[toInt(toStack(t)->nodes[j])];
				}
				break;
			case TYPE_DICT:
				if (toHashMap(t)->map) {
					/* Recover the raw ref area stashed by pass 2, then build
					   the real map by inserting each (key, value) pair. */
					curpos = ((char*)toHashMap(t)->map);
					toHashMap(t)->map = NULL;
					str_length = toHashMap(t)->used; //worst abuse of variable name ever Y/Y?
					toHashMap(t)->used = 0;
					for (j = 0; j < str_length; j++) {
						ref = 0;
						memcpy(((char*)&ref) + 1, curpos, 3);
						ref = ntohl(ref);
						V key = arr[ref];
						ref = 0;
						memcpy(((char*)&ref) + 1, curpos + 3, 3);
						ref = ntohl(ref);
						V value = arr[ref];
						set_hashmap(toHashMap(t), key, value);
						curpos += 6;
					}
				}
				break;
		}
	}
	h->n_literals = n;
	h->literals = arr;
	return true;
}