예제 #1
0
/* Read one line of the tokenizer's input file into the buffer s (of the
   given size), transparently handling source-encoding detection:

   - STATE_NORMAL: a codec is already attached; read a decoded line
     through fp_readl().
   - STATE_RAW:    no decoding wanted; read raw bytes with universal
     newline translation.
   - otherwise:    the encoding is still undetermined; check_bom()
     inspects the stream (installing the file-pointer reader functions
     when it finds one) and the loop retries in the new state.

   On the first two lines the line is also scanned for a coding
   declaration via check_coding_spec() (cf. the PEP 263 URL in the error
   message below).

   Outside of PGEN builds, lines read without an explicit encoding are
   verified to be valid UTF-8 (the default source encoding); a bad byte
   raises SyntaxError.

   Returns the line, or the result of error_ret(tok) on failure. */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    for (;;) {
        if (tok->decoding_state == STATE_NORMAL) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state == STATE_RAW) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != STATE_INIT);
        }
    }
    /* Only the first two lines may carry a coding declaration, and it is
       checked at most once (read_coding_spec latches). */
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        int length;
        /* valid_utf8() yields the byte length of the sequence starting at
           c, or 0 for an invalid byte. */
        for (c = (unsigned char *)line; *c; c += length)
            if (!(length = valid_utf8(c))) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        /* Need to add 1 to the line number, since this line
           has not been counted, yet.  */
        PyErr_Format(PyExc_SyntaxError,
                "Non-UTF-8 code starting with '\\x%.2x' "
                "in file %U on line %i, "
                "but no encoding declared; "
                "see http://python.org/dev/peps/pep-0263/ for details",
                badchar, tok->filename, tok->lineno + 1);
        return error_ret(tok);
    }
#endif
    return line;
}
예제 #2
0
파일: utf8.hpp 프로젝트: Extrunder/oglplus
// Returns true when the byte range [begin, end) is well-formed UTF-8.
// When UTF-8 checking is compiled out via OGLPLUS_NO_UTF8_CHECKS, every
// input is accepted without inspection.
inline bool ValidUTF8(const char* begin, const char* end)
{
#if !OGLPLUS_NO_UTF8_CHECKS
	// Delegate to the project's validator functor.
	return UTF8Validator()(begin, end);
#else
	// Checks disabled: silence unused-parameter warnings and accept.
	OGLPLUS_FAKE_USE(begin);
	OGLPLUS_FAKE_USE(end);
	return true;
#endif
}
예제 #3
0
파일: utf8.hpp 프로젝트: detunized/oglplus
// Returns true when the range [begin, end) is a well-formed UTF-8 sequence.
// NOTE(review): the template header declaring `Iterator` precedes this
// chunk and is not visible here -- presumably `template <typename Iterator>`.
inline bool ValidUTF8(Iterator begin, Iterator end)
{
#if !OGLPLUS_NO_UTF8_CHECKS
	// Delegate to the project's validator functor for this iterator type.
	UTF8Validator<Iterator> valid_utf8;
	return valid_utf8(begin, end);
#else
	// Checks compiled out: silence unused-parameter warnings and accept.
	OGLPLUS_FAKE_USE(begin);
	OGLPLUS_FAKE_USE(end);
	return true;
#endif
}
예제 #4
0
	/*
	 * Takes a string of UTF8-encoded data and strips all characters we cannot use.
	 *
	 * - multibyte characters we can display are mapped over
	 * - all other multibyte characters are stripped
	 * - characters we cannot handle are turned into nulls
	 *
	 * Also see: http://en.wikipedia.org/wiki/Utf8
	 */
	char *utf8_strip(const char *dirty)
	{
		// optimistic result: we end up with the same string
		uint8_t length = strlen(dirty), ci = 0, di = 0;
		unsigned char bytes	= 0;
		unsigned char current = 0x00, next = 0x00;
		char *cleaned = ALLOC_STR(length);

		// iterate character by character and replace it
		for (di = 0, ci = 0; di < length; di++)
		{
			current = dirty[di];

			if ( ! valid_utf8(current)) // invalid byte
			{
				continue;
			}
			else if ( ! is_ascii(current)) // multibyte
			{
				if (current == 0xC3 && (di + 1) < length) // might be åäöÅÄÖ, they all are 0xC3xx
				{
					next = dirty[++di]; // we consume the next character

					if (is_ascii(next)) // somehow, next byte is ascii (invalid utf8), so abort
					{
						// we cannot safely map the next byte in our charmap as it’ll collide
						// with the ascii characters which might be bad!
						continue;
					}
					else
					{
						current = next;
					}
				}
				else // skip all the additional bytes
				{
					bytes = (current & 0xF0); // 1111 xxxx
					while (bytes <<= 1) di += 1;
					current = '\0'; // let charmap handle it
				}
			}

			cleaned[ci++] = charmap[current];
		}

		return cleaned;
	}
예제 #5
0
파일: Lang.cpp 프로젝트: GizmoR13/pioneer
bool LoadStrings(const std::string &lang)
{
	int errline;
	std::set<std::string> seen, missing;

	ResetStringData();

	std::string filename = "lang/English.txt";
	RefCountedPtr<FileSystem::FileData> english_data = FileSystem::gameDataFiles.ReadFile(filename);
	if (!english_data) {
		fprintf(stderr, "couldn't open string file '%s'\n", filename.c_str());
		return false;
	}

	errline = valid_utf8(english_data->AsStringRange());
	if (errline) {
		fprintf(stderr, "invalid UTF-8 code in line %d of '%s'\n", errline, filename.c_str());
		return false;
	}

	seen.clear();
	for (StringFileParser parser(filename, english_data->AsStringRange()); !parser.Finished(); parser.Next()) {
		const std::string token = parser.GetToken().ToString();
		token_map::iterator it = s_token_map.find(token);
		if (it != s_token_map.end()) {
			seen.insert(token);
			const std::string &text = parser.GetAdjustedText();
			if (text.size() >= size_t(STRING_RECORD_SIZE))
				fprintf(stderr, "WARNING: language text is too long -- it will be cut off!\n");
			// XXX const_cast is ugly, but see note for declaration of tokens map
			char *record = const_cast<char*>(it->second);
			copy_string(record, text.c_str(), text.size(), STRING_RECORD_SIZE);
		} else {
			fprintf(stderr, "unknown language token '%s' at %s:%d\n", token.c_str(), parser.GetFileName().c_str(), parser.GetTokenLineNumber());
		}
	}

	english_data.Reset();

	if (seen.size() != s_token_map.size()) {
		fprintf(stderr, "string file '%s' has missing tokens:\n", filename.c_str());
		for (token_map::iterator it = s_token_map.begin(); it != s_token_map.end(); ++it) {
			if (!seen.count(it->first)) {
				fprintf(stderr, "  %s\n", it->first.c_str());
				missing.insert(it->first);
			}
		}
	}

	if (lang == "English")
		return (seen.size() == s_token_map.size());

	filename = "lang/" + lang + ".txt";
	RefCountedPtr<FileSystem::FileData> lang_data = FileSystem::gameDataFiles.ReadFile(filename);
	if (!lang_data) {
		fprintf(stderr, "couldn't open string file '%s'\n", filename.c_str());
		return false;
	}

	errline = valid_utf8(lang_data->AsStringRange());
	if (errline) {
		fprintf(stderr, "invalid UTF-8 code in line %d of '%s'\n", errline, filename.c_str());
		return false;
	}

	seen.clear();
	for (StringFileParser parser(filename, lang_data->AsStringRange()); !parser.Finished(); parser.Next()) {
		const std::string token = parser.GetToken().ToString();
		token_map::iterator it = s_token_map.find(token);
		if (it != s_token_map.end()) {
			seen.insert(token);
			const std::string &text = parser.GetAdjustedText();
			if (text.size() >= size_t(STRING_RECORD_SIZE))
				fprintf(stderr, "WARNING: language text is too long -- it will be cut off!\n");
			// XXX const_cast is ugly, but see note for declaration of tokens map
			char *record = const_cast<char*>(it->second);
			copy_string(record, text.c_str(), text.size(), STRING_RECORD_SIZE);
		} else {
			fprintf(stderr, "unknown language token '%s' at %s:%d\n", token.c_str(), parser.GetFileName().c_str(), parser.GetTokenLineNumber());
		}
	}

	if (seen.size() != s_token_map.size()) {
		fprintf(stderr, "string file '%s' has missing tokens:\n", filename.c_str());
		for (token_map::iterator it = s_token_map.begin(); it != s_token_map.end(); ++it) {
			if (!seen.count(it->first)) {
				fprintf(stderr, "  %s\n", it->first.c_str());
			} else {
				missing.erase(it->first);
			}
		}
	}

	if (!missing.empty()) {
		fprintf(stderr, "no strings found for the following tokens:\n");
		for (std::set<std::string>::iterator it = missing.begin(); it != missing.end(); ++it) {
			fprintf(stderr, "  %s\n", it->c_str());
		}
		return false;
	}

	return true;
}
예제 #6
0
파일: literals.c 프로젝트: gvx/deja
/*
 * Decode the literal pool of a module image into h->literals / h->n_literals.
 *
 * The pool starts h->size 32-bit code words past `oldpos` and is processed
 * in three passes:
 *   1. sizing: walk the pool once, only skipping over each record, to count
 *      the literals (n) so the value array can be allocated in one go;
 *   2. decoding: turn each record into a V value.  List elements and dict
 *      entries are 24-bit big-endian indices into the literal array; lists
 *      store them raw for now and dicts keep a pointer to their encoded
 *      entries;
 *   3. patching: replace the stored indices in lists and dicts with the
 *      actual values they refer to.
 *
 * Returns false (with an error message set) on malformed input.
 *
 * BUG FIX: the TYPE_IDENT | TYPE_SHORT branch read its length without the
 * (unsigned char) cast used by every other SHORT case and by pass 1; on
 * platforms where plain char is signed, identifier lengths >= 128 were
 * sign-extended into huge uint32_t values.
 *
 * NOTE(review): the `size` parameter is never used, and the sizing pass
 * terminates on the global `eofreached` flag, which must be maintained by
 * the input layer -- nothing in this function sets it.  On the error paths
 * `arr` is leaked; confirm whether callers care.
 */
bool read_literals(char *oldpos, size_t size, Header* h)
{
	int i, j;
	int n = 0;
	char type;
	uint32_t str_length;
	uint32_t ref;
	char *startpos = oldpos + h->size * 4;
	char *curpos = startpos;
	/* pass 1: count the literals, advancing curpos past each record */
	while (!eofreached)
	{
		type = *curpos++;
		if (eofreached)
		{
			break;
		}
		n++;
		switch (type)
		{
			case TYPE_NUM:
				curpos += 8;
				break;
			case TYPE_NUM | TYPE_SHORT:
				curpos += 3;
				break;
			case TYPE_STR:
			case TYPE_IDENT:
				memcpy(&str_length, curpos, 4);
				curpos += 4 + ntohl(str_length);
				break;
			case TYPE_STR | TYPE_SHORT:
			case TYPE_IDENT | TYPE_SHORT:
				str_length = (unsigned char)*curpos++;
				curpos += str_length;
				break;
			case TYPE_PAIR:
				curpos += 6;
				break;
			case TYPE_FRAC:
				curpos += 16;
				break;
			case TYPE_FRAC | TYPE_SHORT:
				curpos += 2;
				break;
			case TYPE_LIST:
				memcpy(&str_length, curpos, 4);
				curpos += 4 + 3 * ntohl(str_length);
				break;
			case TYPE_DICT:
				memcpy(&str_length, curpos, 4);
				curpos += 4 + 6 * ntohl(str_length);
				break;
		}
	}
	V* arr = calloc(n, sizeof(V));
	V t;
	curpos = startpos;
	/* pass 2: decode each record into arr[i] */
	for (i = 0; i < n; i++)
	{
		type = *curpos++;
		if (type == TYPE_NUM)
		{
			/* 8-byte big-endian IEEE double */
			union double_or_uint64_t d;
			memcpy(&d, curpos, 8);
			curpos += 8;
			d.i = ntohll(d.i);
			t = double_to_value(d.d);
		}
		else if (type == (TYPE_NUM | TYPE_SHORT))
		{
			/* 3-byte big-endian integer */
			ref = 0;
			memcpy(((char*)&ref) + 1, curpos, 3);
			ref = ntohl(ref);
			curpos += 3;
			t = int_to_value(ref);
		}
		else if (type == TYPE_STR)
		{
			/* 4-byte big-endian length followed by UTF-8 payload */
			memcpy(&str_length, curpos, 4);
			curpos += 4;
			str_length = ntohl(str_length);
			if (!valid_utf8(str_length, curpos))
			{
				set_error_msg("wrong encoding for string literal, should be UTF-8");
				return false;
			}
			t = str_to_string(str_length, curpos);
			curpos += str_length;
		}
		else if (type == TYPE_IDENT)
		{
			memcpy(&str_length, curpos, 4);
			curpos += 4;
			str_length = ntohl(str_length);
			char data[str_length + 1];
			memcpy(&data, curpos, str_length);
			data[str_length] = '\0';
			t = lookup_ident(str_length, data);
			curpos += str_length;
		}
		else if (type == (TYPE_STR | TYPE_SHORT))
		{
			/* 1-byte length followed by UTF-8 payload */
			str_length = (unsigned char)*curpos++;
			if (!valid_utf8(str_length, curpos))
			{
				set_error_msg("wrong encoding for string literal, should be UTF-8");
				return false;
			}
			t = str_to_string(str_length, curpos);
			curpos += str_length;
		}
		else if (type == (TYPE_IDENT | TYPE_SHORT))
		{
			/* BUG FIX: cast through unsigned char, matching the other SHORT
			   cases, so lengths >= 128 are not sign-extended */
			str_length = (unsigned char)*curpos++;
			char data[str_length + 1];
			memcpy(&data, curpos, str_length);
			data[str_length] = '\0';
			t = lookup_ident(str_length, data);
			curpos += str_length;
		}
		else if (type == TYPE_PAIR)
		{
			/* two 3-byte backreferences; each must point at an earlier literal */
			ref = 0;
			memcpy(((char*)&ref) + 1, curpos, 3);
			ref = ntohl(ref);
			if (ref >= i)
			{
				set_error_msg("illegal pair detected");
				return false;
			}
			V v1 = arr[ref];

			ref = 0;
			memcpy(((char*)&ref) + 1, curpos + 3, 3);
			ref = ntohl(ref);
			if (ref >= i)
			{
				set_error_msg("illegal pair detected");
				return false;
			}
			V v2 = arr[ref];

			t = new_pair(v1, v2);
			curpos += 6;
		}
		else if (type == TYPE_FRAC)
		{
			/* two 8-byte big-endian signed integers: numerator, denominator */
			int64_t numer;
			int64_t denom;
			memcpy(&numer, curpos, 8);
			numer = ntohll(numer);
			memcpy(&denom, curpos + 8, 8);
			denom = ntohll(denom);
			t = new_frac(numer, denom);
			curpos += 16;
		}
		else if (type == (TYPE_FRAC | TYPE_SHORT))
		{
			/* single-byte numerator (signed) and denominator (unsigned) */
			int8_t numer;
			uint8_t denom;
			numer = *curpos++;
			denom = *curpos++;
			t = new_frac(numer, denom);
		}
		else if (type == TYPE_LIST)
		{
			/* element count, then 3-byte indices; stored raw and patched
			   into real values by pass 3 */
			memcpy(&str_length, curpos, 4);
			str_length = ntohl(str_length);
			t = new_list();
			curpos += 4;
			if (str_length > 0)
			{
				uint32_t size = 64;
				while (size < str_length) size <<= 1;
				toStack(t)->size = size;
				toStack(t)->used = str_length;
				toStack(t)->nodes = calloc(size, sizeof(V));
				for (j = 0; j < str_length; j++)
				{
					ref = 0;
					memcpy(((char*)&ref) + 1, curpos, 3);
					ref = ntohl(ref);
					toStack(t)->nodes[j] = intToV((uint64_t)ref);
					curpos += 3;
				}
			}
		}
		else if (type == TYPE_DICT)
		{
			/* entry count, then 6 bytes per key/value pair; only a pointer
			   to the encoded entries is kept -- pass 3 fills the map */
			memcpy(&str_length, curpos, 4);
			curpos += 4;
			str_length = ntohl(str_length);
			t = new_dict();
			if (str_length > 0)
			{
				uint32_t size = 16;
				while (size < str_length) size <<= 1;
				toHashMap(t)->size = size;
				toHashMap(t)->used = str_length;
				toHashMap(t)->map = (Bucket**)curpos;
			}
			curpos += 6 * str_length;
		}
		else
		{
			set_error_msg("Unknown literal type.");
			return false;
		}
		arr[i] = t;
	}

	/* pass 3: resolve the raw indices stored in lists and dicts */
	for (i = 0; i < n; i++)
	{
		t = arr[i];
		switch(getType(t))
		{
			case TYPE_LIST:
				for (j = 0; j < toStack(t)->used; j++)
				{
					toStack(t)->nodes[j] = arr[toInt(toStack(t)->nodes[j])];
				}
				break;
			case TYPE_DICT:
				if (toHashMap(t)->map)
				{
					/* `map` currently points at the encoded entries (set in
					   pass 2); reset the dict and insert the real pairs */
					curpos = ((char*)toHashMap(t)->map);

					toHashMap(t)->map = NULL;
					str_length = toHashMap(t)->used; /* reused here as the entry count */
					toHashMap(t)->used = 0;
					for (j = 0; j < str_length; j++)
					{
						ref = 0;
						memcpy(((char*)&ref) + 1, curpos, 3);
						ref = ntohl(ref);
						V key = arr[ref];

						ref = 0;
						memcpy(((char*)&ref) + 1, curpos + 3, 3);
						ref = ntohl(ref);
						V value = arr[ref];

						set_hashmap(toHashMap(t), key, value);

						curpos += 6;
					}
				}
				break;
		}
	}

	h->n_literals = n;
	h->literals = arr;
	return true;
}