// Registers LZ4 codec configs for two disjoint id ranges (1..20 and 50..64)
// and verifies what the manager's codec map exposes: the test expects the id
// range to start at 50 with 15 entries, ids 50..64 to resolve to valid codecs,
// and every id below 50 to be absent.
TEST(CompressionCodecManager, missingMiddle) {
  std::unordered_map<uint32_t, CodecConfigPtr> codecConfigs;

  // Lower range: ids 1..20, each with a dictionary of id KiB.
  for (uint32_t i = 1; i <= 20; ++i) {
    codecConfigs.emplace(
        i,
        folly::make_unique<CodecConfig>(
            i, CompressionCodecType::LZ4, createBinaryData(i * 1024)));
  }
  // Upper range: ids 50..64, leaving a gap (21..49) in the middle.
  for (uint32_t i = 50; i <= 64; ++i) {
    codecConfigs.emplace(
        i,
        folly::make_unique<CodecConfig>(
            i, CompressionCodecType::LZ4, createBinaryData(i * 1024)));
  }

  CompressionCodecManager codecManager(std::move(codecConfigs));
  auto codecMap = codecManager.getCodecMap();

  // Fatal assertion: every check below dereferences codecMap, so continuing
  // with a null map would crash the test binary instead of reporting a failure.
  ASSERT_TRUE(codecMap);

  EXPECT_EQ(50, codecMap->getIdRange().firstId);
  EXPECT_EQ(15, codecMap->getIdRange().size);

  for (uint32_t i = 1; i <= 64; ++i) {
    if (i >= 50) {
      validateCodec(codecMap->get(i));
    } else {
      EXPECT_FALSE(codecMap->get(i));
    }
  }
}
// Registers three LZ4 codec configs with dictionaries of 10 KiB (id 1),
// 65 KiB (id 2) and 64 KiB (id 3), and verifies that the codec map exposes
// only id 3: the expected id range starts at 3 with a single entry, and ids 1
// and 2 are absent.
// NOTE(review): presumably the 65 KiB dictionary is rejected as too large and
// that invalidates everything up to and including id 2 -- confirm against the
// CompressionCodecManager implementation.
TEST(CompressionCodecManager, invalidDictionary) {
  std::unordered_map<uint32_t, CodecConfigPtr> codecConfigs;
  codecConfigs.emplace(
      1,
      folly::make_unique<CodecConfig>(
          1, CompressionCodecType::LZ4, createBinaryData(10 * 1024)));
  codecConfigs.emplace(
      2,
      folly::make_unique<CodecConfig>(
          2, CompressionCodecType::LZ4, createBinaryData(65 * 1024)));
  codecConfigs.emplace(
      3,
      folly::make_unique<CodecConfig>(
          3, CompressionCodecType::LZ4, createBinaryData(64 * 1024)));

  CompressionCodecManager codecManager(std::move(codecConfigs));
  auto codecMap = codecManager.getCodecMap();

  // Fatal assertion: every check below dereferences codecMap, so continuing
  // with a null map would crash the test binary instead of reporting a failure.
  ASSERT_TRUE(codecMap);

  EXPECT_EQ(3, codecMap->getIdRange().firstId);
  EXPECT_EQ(1, codecMap->getIdRange().size);

  EXPECT_FALSE(codecMap->get(1));
  EXPECT_FALSE(codecMap->get(2));
  validateCodec(codecMap->get(3));
}
/// return the static global prefered codec TextCodec *TextCodecDetector::globalPreferedCodec() { if( !globalPreferedCodecRef_ ) { globalPreferedCodecRef_ = codecManager()->codecForName("UTF-8"); Q_ASSERT(globalPreferedCodecRef_); } return globalPreferedCodecRef_; }
/// Detects the encoding of the provided buffer. /// If Byte Order Markers are encountered at the beginning of the buffer, we immidiately /// return the charset implied by this BOM. Otherwise, the file would not be a human /// readable text file. /// /// If there is no BOM, this method tries to discern whether the file is UTF-8 or not. /// If it is not UTF-8, we assume the encoding is the default system encoding /// (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one) /// /// It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence /// /// @code /// UCS-4 range (hex.) UTF-8 octet sequence (binary) /// 0000 0000-0000 007F 0xxxxxxx /// 0000 0080-0000 07FF 110xxxxx 10xxxxxx /// 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx /// 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx /// 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx /// 0400 0000-7FFF FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx /// @endcode /// /// With UTF-8, 0xFE and 0xFF never appear. 
/// /// @return the QTextCodec that is 'detected' TextCodec* TextCodecDetector::detectCodec() { // if the file has a Byte Order Marker, we can assume the file is in UTF-xx // otherwise, the file would not be human readable if( hasUTF8Bom(bufferRef_,bufferLength_) ) return codecManager()->codecForName("UTF-8 with BOM"); if( hasUTF16LEBom(bufferRef_,bufferLength_) ) return codecManager()->codecForName("UTF-16LE with BOM"); if( hasUTF16BEBom(bufferRef_,bufferLength_) ) return codecManager()->codecForName("UTF-16BE with BOM"); if( hasUTF32LEBom(bufferRef_,bufferLength_) ) return codecManager()->codecForName("UTF-32LE with BOM"); if( hasUTF32BEBom(bufferRef_,bufferLength_) ) return codecManager()->codecForName("UTF-32BE with BOM"); // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding // otherwise, the file is in US-ASCII bool highOrderBit = false; // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid // if it's not the case, we can assume the encoding is the default encoding of the system bool validU8Char = true; // TODO the buffer is not read up to the end, but up to length - 6 int length = bufferLength_; int i = 0; while( i < length - 6 ) { char b0 = bufferRef_[i]; char b1 = bufferRef_[i + 1]; char b2 = bufferRef_[i + 2]; char b3 = bufferRef_[i + 3]; char b4 = bufferRef_[i + 4]; char b5 = bufferRef_[i + 5]; if (b0 < 0) { // a high order bit was encountered, thus the encoding is not US-ASCII // it may be either an 8-bit encoding or UTF-8 highOrderBit = true; // a two-bytes sequence was encoutered if (isTwoBytesSequence(b0)) { // there must be one continuation byte of the form 10xxxxxx, // otherwise the following characteris is not a valid UTF-8 construct if (!isContinuationChar(b1)) validU8Char = false; else i++; } // a three-bytes sequence was encoutered else if (isThreeBytesSequence(b0)) { // there must be two continuation bytes of the form 10xxxxxx, // otherwise the following 
characteris is not a valid UTF-8 construct if (!(isContinuationChar(b1) && isContinuationChar(b2))) validU8Char = false; else i += 2; } // a four-bytes sequence was encoutered else if (isFourBytesSequence(b0)) { // there must be three continuation bytes of the form 10xxxxxx, // otherwise the following characteris is not a valid UTF-8 construct if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3))) validU8Char = false; else i += 3; } // a five-bytes sequence was encoutered else if (isFiveBytesSequence(b0)) { // there must be four continuation bytes of the form 10xxxxxx, // otherwise the following characteris is not a valid UTF-8 construct if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4))) validU8Char = false; else i += 4; } // a six-bytes sequence was encoutered else if (isSixBytesSequence(b0)) { // there must be five continuation bytes of the form 10xxxxxx, // otherwise the following characteris is not a valid UTF-8 construct if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4) && isContinuationChar(b5))) validU8Char = false; else i += 5; } else validU8Char = false; } if (!validU8Char) break; i++; } // if no byte with an high order bit set, the encoding is US-ASCII // (it might have been UTF-7, but this encoding is usually internally used only by mail systems) if (!highOrderBit) { return preferedCodec(); //return fallbackCodec(); /// This could be US-ASCII } // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8, // otherwise the file would not be human readable if( validU8Char ) { // return QTextCodec::codecForName("UTF-8"); return preferedCodec(); // we sort of assume prefered codec is UTF-8 :P } // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding return fallbackCodec(); }