示例#1
0
///////////////////////////////////////////////////////////////////////////////
// vim:         sw=4 ts=4
// Description: Encoding Detection Cases from Mozilla
// Copyright:   2013-2016  JiaYanwei   <*****@*****.**>
// License:     GPLv3
///////////////////////////////////////////////////////////////////////////////
#include "data_from_mozdet.h"
#include <boost/assign/list_of.hpp>

MozCaseVec moz811363cases = boost::assign::list_of
(
EncAndText("UTF-8", "Two-byte UTF-8 including the first and last characters in the range: "
	"\\u0080\\u0428\\u0435\\u0440\\u043b\\u043e\\u043a\\u07ff\n")
)
(
EncAndText("UTF-8", "Three byte UTF-8, first byte 0xE0, including first and last characters\nin the range: "
	"\\u0800\\u0936\\u0930\\u094d\\u0932\\u0915\\u0fff\n\n")
)
(
EncAndText("UTF-8", "Three byte UTF-8, first byte 0xE1-EC, including first and last characters\nin the range: "
	"\\u1000\\u30b7\\u30e3\\u30fc\\u30ed\\u30c3\\u30af\\ucfff\n\n")
)
(
EncAndText("UTF-8", "Three byte UTF-8, first byte 0xED, including first and last characters\nin the range: "
	"\\ud000\\ud648\\ud558\\ud648\\ud0d0\\ud7ff\n\n")
)
(
EncAndText("UTF-8", "Three byte UTF-8, first byte 0xEE-EF, including first and last characters\nin the range: "
	"\\ue000\\ufd0d\\ufedf\\ufeee\\ufed9\\uffff\n\n")
)
(
示例#2
0
	("\\uA758\\uA759")
	;

std::vector<std::string> utf8_invalid_cases = boost::assign::list_of
	("\xC0\xAF")
	("\xE0\x9F\xBF")
	("\xED\xA0\x80")
	("\xED\xBF\xBF")
	("\xF0\x8F\xBF\xBF")
	("\xF4\x90\x80\x80")
	;

// UTF-32 byte order marks.  Both contain NUL bytes, so they are stored as
// plain char arrays (no terminator) and converted to std::string with an
// explicit length; a C-string conversion would stop at the first NUL.
const char u32le_bom[] = { '\xFF', '\xFE', '\0', '\0' };
const char u32be_bom[] = { '\0', '\0', '\xFE', '\xFF' };

// One entry per encoding: the encoding name followed by the bytes of that
// encoding's byte order mark (U+FEFF) and nothing else.
// NOTE(review): EncAndText is declared elsewhere; the (name, bytes) argument
// order is taken from how it is called here.
std::vector<EncAndText> bom_cases =
	boost::assign::list_of
		(EncAndText("UTF-8",    "\xEF\xBB\xBF"))
		(EncAndText("UTF-16LE", "\xFF\xFE"))
		(EncAndText("UTF-16BE", "\xFE\xFF"))
		(EncAndText("UTF-32LE", std::string(u32le_bom, sizeof(u32le_bom))))
		(EncAndText("UTF-32BE", std::string(u32be_bom, sizeof(u32be_bom))))
		(EncAndText("GB18030",  "\x84\x31\x95\x33"))    // four-byte form of U+FEFF
	;

std::vector<std::string> iso646_cases = boost::assign::list_of
	("an US-ASCII text")
	("$ is RMB yuan in ISO-646-CN")
	("# is pound in ISO-646-GB")
	("\\ is yen in ISO-646-JA")
	(":-)")
	;