//============================================================================ // NUnicodeParser::RemoveBOM : Remove a BOM prefix. //---------------------------------------------------------------------------- void NUnicodeParser::RemoveBOM(NData &theData, NStringEncoding theEncoding) const { NStringEncoding bomEncoding; NRange theBOM; // Validate our parameters NN_ASSERT(NStringEncoder::IsEncodingUTF(theEncoding)); NN_UNUSED(theEncoding); // Get the state we need bomEncoding = GetBOM(theData, theBOM); if (!theBOM.IsEmpty()) theData.RemoveData(theBOM); // Validate the encoding // // Endian-specific BOMs should match the format we expected. switch (bomEncoding) { case kNStringEncodingInvalid: // No BOM break; case kNStringEncodingUTF8: NN_ASSERT(theEncoding == kNStringEncodingUTF8); break; case kNStringEncodingUTF16BE: NN_ASSERT(theEncoding == kNStringEncodingUTF16 || theEncoding == kNStringEncodingUTF16BE); break; case kNStringEncodingUTF16LE: NN_ASSERT(theEncoding == kNStringEncodingUTF16 || theEncoding == kNStringEncodingUTF16LE); break; case kNStringEncodingUTF32BE: NN_ASSERT(theEncoding == kNStringEncodingUTF32 || theEncoding == kNStringEncodingUTF32BE); break; case kNStringEncodingUTF32LE: NN_ASSERT(theEncoding == kNStringEncodingUTF32 || theEncoding == kNStringEncodingUTF32LE); break; default: NN_LOG("Invalid encoding: %d", theEncoding); break; } }
//============================================================================ // NUnicodeParser::AddBOM : Add a BOM prefix. //---------------------------------------------------------------------------- void NUnicodeParser::AddBOM(NData &theData, NStringEncoding theEncoding) const { NRange theRange; // Validate our parameters NN_ASSERT(NStringEncoder::IsEncodingUTF(theEncoding)); NN_ASSERT(GetBOM(theData, theRange) == kNStringEncodingInvalid); (void) theRange; // Add the BOM switch (theEncoding) { case kNStringEncodingUTF8: AddBOMToUTF8(theData); break; case kNStringEncodingUTF16: AddBOMToUTF16(theData, kNEndianNative); break; case kNStringEncodingUTF16BE: AddBOMToUTF16(theData, kNEndianBig); break; case kNStringEncodingUTF16LE: AddBOMToUTF16(theData, kNEndianLittle); break; case kNStringEncodingUTF32: AddBOMToUTF32(theData, kNEndianNative); break; case kNStringEncodingUTF32BE: AddBOMToUTF32(theData, kNEndianBig); break; case kNStringEncodingUTF32LE: AddBOMToUTF16(theData, kNEndianLittle); break; default: NN_LOG("Invalid encoding: %d", theEncoding); break; } }
//============================================================================ // NUnicodeParser::Parse : Parse some data. //---------------------------------------------------------------------------- void NUnicodeParser::Parse(const NData &theData, NStringEncoding theEncoding) { NRange theRange; // Validate our parameters NN_ASSERT(NStringEncoder::IsEncodingUTF(theEncoding)); // Set the value mData = theData; mEncoding = theEncoding; (void) GetBOM(mData, theRange); // Identify the code points switch (mEncoding) { case kNStringEncodingUTF8: mCodePoints = GetCodePointsUTF8(theRange); break; case kNStringEncodingUTF16: case kNStringEncodingUTF16BE: case kNStringEncodingUTF16LE: mCodePoints = GetCodePointsUTF16(theRange); break; case kNStringEncodingUTF32: case kNStringEncodingUTF32BE: case kNStringEncodingUTF32LE: mCodePoints = GetCodePointsUTF32(theRange); break; default: NN_LOG("Invalid encoding: %d", theEncoding); break; } }
//------------------------------------------------------------------------------ void CBOMRecognizerFilter::RecognizeBOM() { unsigned long ulBOMBytes = sculBOMBytes; byte* pBOMData = GetBOM( ulBOMBytes ); if( ulBOMBytes == sculBOMBytes && pBOMData ) { unsigned int uiSequence = 0; bool Match[ CBOMRecognizerFilter::sculPatterns ]; unsigned long ulPattern = 0; for( ulPattern = 0; ulPattern < CBOMRecognizerFilter::sculPatterns; ulPattern++ ) { Match[ ulPattern ] = true; } while( uiSequence < ulBOMBytes ) { bool bDone = true; for( ulPattern = 0; ulPattern < CBOMRecognizerFilter::sculPatterns && Match[ ulPattern ]; ulPattern++ ) { if( !( ( pBOMData[ uiSequence ] & Masks[ ulPattern ][ uiSequence ] ) == Patterns[ ulPattern ][ uiSequence ] ) ) { Match[ ulPattern ] = false; } else { bDone = false; } } if( bDone ) { break; } uiSequence++; }; unsigned int uiMatchCount = 0; unsigned int uiMatch = 0; for( ulPattern = 0; ulPattern < CBOMRecognizerFilter::sculPatterns; ulPattern++ ) { if( Match[ ulPattern ] ) { uiMatchCount++; uiMatch = uiSequence; } } if( uiMatchCount > 0 ) { AcknowledgeBOM( ulBOMBytes ); if( uiMatchCount == 1 ) { switch ( uiMatch ) { case 0: m_RecognitionEvent.BOMType() = CBOMRecognizedEvent::eUTF16BE; case 1: m_RecognitionEvent.BOMType() = CBOMRecognizedEvent::eUTF16LE; case 2: m_RecognitionEvent.BOMType() = CBOMRecognizedEvent::eUTF8; } m_RecognitionEvent(); //BOM Matched m_bRecognized = true; } } else { RejectBOM( ulBOMBytes ); m_RecognitionEvent.BOMType() = CBOMRecognizedEvent::eNone; m_RecognitionEvent();//No BOM m_bRecognized = true; } } }