XMLSize_t
MacOSTranscoder::transcodeTo(const  XMLCh* const    srcData
                            , const XMLSize_t       srcCount
                            ,       XMLByte* const  toFill
                            , const XMLSize_t       maxBytes
                            ,       XMLSize_t&      charsEaten
                            , const UnRepOpts       options)
{
	//  Reset the tec state (since we don't know that we're part of a
	//  larger run of text).
	TECClearConverterContextInfo(mUnicodeToText);
	
    //  Do the conversion
    ByteCount bytesConsumed = 0;
    ByteCount bytesProduced = 0;
    OSStatus status = TECConvertText(mUnicodeToText,
                (ConstTextPtr) srcData,
                srcCount * sizeof(XMLCh),   // inputBufferLength
                &bytesConsumed,				// actualInputLength
                (TextPtr) toFill,           // outputBuffer
                maxBytes,                   // outputBufferLength
                &bytesProduced);			// actualOutputLength

    //  Ignorable error codes
    if(    status == kTECUsedFallbacksStatus
        || status == kTECOutputBufferFullStatus
        || status == kTECPartialCharErr
		)
        status = noErr;
        
    std::size_t charsConsumed = bytesConsumed / sizeof(XMLCh);
    
    //  Deal with errors
    if (status != noErr)
    {
    	if (status == kTECUnmappableElementErr && options == UnRep_Throw)
    	{
    		XMLCh tmpBuf[17];
            XMLString::binToText(srcData[charsConsumed], tmpBuf, 16, 16);
            ThrowXML2
            (
                TranscodingException
                , XMLExcepts::Trans_Unrepresentable
                , tmpBuf
                , getEncodingName()
            );
    	}
    }
	
    charsEaten = charsConsumed;
    return bytesProduced;
}
Пример #2
0
OSStatus TextCodecMac::createTECConverter() const
{
    bool cachedEncodingEqual = cachedConverterEncoding == m_encoding;
    cachedConverterEncoding = invalidEncoding;

    if (cachedEncodingEqual && cachedConverterTEC) {
        m_converterTEC = cachedConverterTEC;
        cachedConverterTEC = 0;
        TECClearConverterContextInfo(m_converterTEC);
    } else {
        OSStatus status = TECCreateConverter(&m_converterTEC, m_encoding,
            CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat));
        if (status)
            return status;

        TECSetBasicOptions(m_converterTEC, kUnicodeForceASCIIRangeMask);
    }
    
    return noErr;
}
XMLSize_t
MacOSTranscoder::transcodeFrom(  const  XMLByte* const          srcData
                                , const XMLSize_t               srcCount
                                ,       XMLCh* const            toFill
                                , const XMLSize_t               maxChars
                                ,       XMLSize_t&              bytesEaten
                                ,       unsigned char* const    charSizes)
{
	//  Reset the tec state (since we don't know that we're part of a
	//  larger run of text).
	TECClearConverterContextInfo(mTextToUnicode);
	
    //  Do the conversion
    ByteCount bytesConsumed = 0;
    ByteCount bytesProduced = 0;
    OSStatus status = TECConvertText(mTextToUnicode,
                (ConstTextPtr) srcData,
                srcCount,                   // inputBufferLength
                &bytesConsumed,				// actualInputLength
                (TextPtr) toFill,           // outputBuffer
                maxChars * sizeof(XMLCh),	// outputBufferLength
                &bytesProduced);			// actualOutputLength

    //  Ignorable error codes
    if(    status == kTECUsedFallbacksStatus
        || status == kTECOutputBufferFullStatus
        || status == kTECPartialCharErr
		)
        status = noErr;
    	
    if (status != noErr)
        ThrowXML(TranscodingException, XMLExcepts::Trans_BadSrcSeq);
	
	std::size_t charsProduced = bytesProduced / sizeof(XMLCh);
	
    bytesEaten = bytesConsumed;
    return charsProduced;
}
Пример #4
0
String TextCodecMac::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    // Get a converter for the passed-in encoding.
    if (!m_converterTEC && createTECConverter() != noErr)
        return String();
    
    Vector<UChar> result;

    const unsigned char* sourcePointer = reinterpret_cast<const unsigned char*>(bytes);
    int sourceLength = length;
    bool bufferWasFull = false;
    UniChar buffer[ConversionBufferSize];

    while ((sourceLength || bufferWasFull) && !sawError) {
        int bytesRead = 0;
        int bytesWritten = 0;
        OSStatus status = decode(sourcePointer, sourceLength, bytesRead, buffer, sizeof(buffer), bytesWritten);
        ASSERT(bytesRead <= sourceLength);
        sourcePointer += bytesRead;
        sourceLength -= bytesRead;
        
        switch (status) {
            case noErr:
            case kTECOutputBufferFullStatus:
                break;
            case kTextMalformedInputErr:
            case kTextUndefinedElementErr:
                // FIXME: Put FFFD character into the output string in this case?
                TECClearConverterContextInfo(m_converterTEC);
                if (stopOnError) {
                    sawError = true;
                    break;
                }
                if (sourceLength) {
                    sourcePointer += 1;
                    sourceLength -= 1;
                }
                break;
            case kTECPartialCharErr: {
                // Put the partial character into the buffer.
                ASSERT(m_numBufferedBytes == 0);
                const int bufferSize = sizeof(m_numBufferedBytes);
                if (sourceLength < bufferSize) {
                    memcpy(m_bufferedBytes, sourcePointer, sourceLength);
                    m_numBufferedBytes = sourceLength;
                } else {
                    LOG_ERROR("TECConvertText gave a kTECPartialCharErr, but left %u bytes in the buffer", sourceLength);
                }
                sourceLength = 0;
                break;
            }
            default:
                sawError = true;
                return String();
        }

        ASSERT(!(bytesWritten % sizeof(UChar)));
        result.append(buffer, bytesWritten / sizeof(UChar));

        bufferWasFull = status == kTECOutputBufferFullStatus;
    }
    
    if (flush) {
        unsigned long bytesWritten = 0;
        TECFlushText(m_converterTEC, reinterpret_cast<unsigned char*>(buffer), sizeof(buffer), &bytesWritten);
        ASSERT(!(bytesWritten % sizeof(UChar)));
        result.append(buffer, bytesWritten / sizeof(UChar));
    }

    String resultString = String::adopt(result);

    // <rdar://problem/3225472>
    // Simplified Chinese pages use the code A3A0 to mean "full-width space".
    // But GB18030 decodes it to U+E5E5, which is correct in theory but not in practice.
    // To work around, just change all occurences of U+E5E5 to U+3000 (ideographic space).
    if (m_encoding == kCFStringEncodingGB_18030_2000)
        resultString.replace(0xE5E5, ideographicSpace);
    
    return resultString;
}