/////////////////////////////////////////////////////////////////////////////// // vim: sw=4 ts=4 // Description: Encoding Detection Cases from Mozilla // Copyright: 2013-2016 JiaYanwei <*****@*****.**> // License: GPLv3 /////////////////////////////////////////////////////////////////////////////// #include "data_from_mozdet.h" #include <boost/assign/list_of.hpp> MozCaseVec moz811363cases = boost::assign::list_of ( EncAndText("UTF-8", "Two-byte UTF-8 including the first and last characters in the range: " "\\u0080\\u0428\\u0435\\u0440\\u043b\\u043e\\u043a\\u07ff\n") ) ( EncAndText("UTF-8", "Three byte UTF-8, first byte 0xE0, including first and last characters\nin the range: " "\\u0800\\u0936\\u0930\\u094d\\u0932\\u0915\\u0fff\n\n") ) ( EncAndText("UTF-8", "Three byte UTF-8, first byte 0xE1-EC, including first and last characters\nin the range: " "\\u1000\\u30b7\\u30e3\\u30fc\\u30ed\\u30c3\\u30af\\ucfff\n\n") ) ( EncAndText("UTF-8", "Three byte UTF-8, first byte 0xED, including first and last characters\nin the range: " "\\ud000\\ud648\\ud558\\ud648\\ud0d0\\ud7ff\n\n") ) ( EncAndText("UTF-8", "Three byte UTF-8, first byte 0xEE-EF, including first and last characters\nin the range: " "\\ue000\\ufd0d\\ufedf\\ufeee\\ufed9\\uffff\n\n") ) (
("\\uA758\\uA759") ;

// Byte sequences that are not well-formed UTF-8.
std::vector<std::string> utf8_invalid_cases =
    boost::assign::list_of
        ("\xC0\xAF")            // overlong two-byte form of U+002F
        ("\xE0\x9F\xBF")        // overlong three-byte form of U+07FF
        ("\xED\xA0\x80")        // surrogate code point U+D800
        ("\xED\xBF\xBF")        // surrogate code point U+DFFF
        ("\xF0\x8F\xBF\xBF")    // overlong four-byte form of U+FFFF
        ("\xF4\x90\x80\x80")    // U+110000, one past the last Unicode code point
    ;

// The UTF-32 byte order marks contain NUL bytes, which would cut a C string
// short, so they are stored as char arrays and converted to std::string with
// an explicit length where they are used.
const char u32le_bom[] = { '\xFF', '\xFE', '\x00', '\x00' };
const char u32be_bom[] = { '\x00', '\x00', '\xFE', '\xFF' };

// Texts consisting of nothing but a byte order mark, each paired with an
// encoding name (presumably the encoding the detector should report).
std::vector<EncAndText> bom_cases =
    boost::assign::list_of
        (EncAndText("UTF-8",    "\xEF\xBB\xBF"))
        (EncAndText("UTF-16LE", "\xFF\xFE"))
        (EncAndText("UTF-16BE", "\xFE\xFF"))
        (EncAndText("UTF-32LE", std::string(u32le_bom, sizeof(u32le_bom))))
        (EncAndText("UTF-32BE", std::string(u32be_bom, sizeof(u32be_bom))))
        (EncAndText("GB18030",  "\x84\x31\x95\x33"))
    ;

// Short texts made up solely of ASCII-range bytes.
std::vector<std::string> iso646_cases =
    boost::assign::list_of
        ("an US-ASCII text")
        ("$ is RMB yuan in ISO-646-CN")
        ("# is pound in ISO-646-GB")
        ("\\ is yen in ISO-646-JA")
        (":-)")
    ;