// Creates a fresh Document for the given source text, installs it in |frame|
// (when a frame is supplied), feeds the text through a TextResourceDecoder
// and returns the fully parsed document.
//
// sourceString   - text to parse into the new document.
// sourceEncoding - encoding name for the decoder; UTF-8 is used when empty.
// sourceMIMEType - selects the document type; "text/plain" input is first
//                  rewritten by transformTextStringToXHTMLDocumentString().
// sourceNode     - node the source came from; dereferenced unconditionally.
// frame          - optional frame that should display the result.
PassRefPtr<Document> XSLTProcessor::createDocumentFromSource(const String& sourceString, const String& sourceEncoding, const String& sourceMIMEType, Node* sourceNode, Frame* frame)
{
    RefPtr<Document> sourceOwner = sourceNode->document();
    const bool nodeIsOwnerDocument = (sourceOwner.get() == sourceNode);
    String markup = sourceString;

    RefPtr<Document> newDocument;
    if (sourceMIMEType == "text/plain") {
        newDocument = Document::create(frame);
        transformTextStringToXHTMLDocumentString(markup);
    } else {
        newDocument = DOMImplementation::createDocument(sourceMIMEType, frame, false);
    }

    // The old document has to be saved and detached, and the new one put in
    // place, before parsing starts. This is only needed when the result
    // document is actually going to be rendered, i.e. when there is a frame.
    if (frame) {
        FrameView* frameView = frame->view();
        if (frameView)
            frameView->clear();
        newDocument->setTransformSourceDocument(frame->document());
        frame->setDocument(newDocument);
    }

    // Only a transform applied to a whole document inherits that document's URL.
    if (nodeIsOwnerDocument)
        newDocument->setURL(sourceOwner->url());

    newDocument->open();

    RefPtr<TextResourceDecoder> textDecoder = TextResourceDecoder::create(sourceMIMEType);
    textDecoder->setEncoding(sourceEncoding.isEmpty() ? UTF8Encoding() : TextEncoding(sourceEncoding), TextResourceDecoder::EncodingFromXMLHeader);
    newDocument->setDecoder(textDecoder.release());

    newDocument->write(markup);
    newDocument->finishParsing();
    newDocument->close();

    return newDocument.release();
}
// Returns the TextEncoding constructed from the encoding name "ASCII".
TextEncoding TextEncoding::ASCII()
{
    static const char encodingName[] = "ASCII";
    return TextEncoding(encodingName);
}
bool detectTextEncoding(const char* data, size_t len, const char* hintEncodingName, TextEncoding* detectedEncoding) { *detectedEncoding = TextEncoding(); int matchesCount = 0; UErrorCode status = U_ZERO_ERROR; UCharsetDetector* detector = ucsdet_open(&status); if (U_FAILURE(status)) return false; ucsdet_enableInputFilter(detector, true); ucsdet_setText(detector, data, static_cast<int32_t>(len), &status); if (U_FAILURE(status)) return false; // FIXME: A few things we can do other than improving // the ICU detector itself. // 1. Use ucsdet_detectAll and pick the most likely one given // "the context" (parent-encoding, referrer encoding, etc). // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g. // Chinese, Japanese, Russian, Korean and Hebrew) by picking the // encoding with a highest confidence among the detector-specific // limited set of candidate encodings. // Below is a partial implementation of the first part of what's outlined // above. const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status); if (U_FAILURE(status)) { ucsdet_close(detector); return false; } const char* encoding = 0; if (hintEncodingName) { TextEncoding hintEncoding(hintEncodingName); // 10 is the minimum confidence value consistent with the codepoint // allocation in a given encoding. The size of a chunk passed to // us varies even for the same html file (apparently depending on // the network load). When we're given a rather short chunk, we // don't have a sufficiently reliable signal other than the fact that // the chunk is consistent with a set of encodings. So, instead of // setting an arbitrary threshold, we have to scan all the encodings // consistent with the data. 
const int32_t kThresold = 10; for (int i = 0; i < matchesCount; ++i) { int32_t confidence = ucsdet_getConfidence(matches[i], &status); if (U_FAILURE(status)) { status = U_ZERO_ERROR; continue; } if (confidence < kThresold) break; const char* matchEncoding = ucsdet_getName(matches[i], &status); if (U_FAILURE(status)) { status = U_ZERO_ERROR; continue; } if (TextEncoding(matchEncoding) == hintEncoding) { encoding = hintEncodingName; break; } } } // If no match is found so far, just pick the top match. // This can happen, say, when a parent frame in EUC-JP refers to // a child frame in Shift_JIS and both frames do NOT specify the encoding // making us resort to auto-detection (when it IS turned on). if (!encoding && matchesCount > 0) encoding = ucsdet_getName(matches[0], &status); if (U_SUCCESS(status)) { *detectedEncoding = TextEncoding(encoding); ucsdet_close(detector); return true; } ucsdet_close(detector); return false; }
// Returns the TextEncoding constructed from the value reported by ::GetACP().
TextEncoding TextEncoding::Application()
{
    TextEncoding activeCodePageEncoding(::GetACP());
    return activeCodePageEncoding;
}
// Returns the TextEncoding identified by the numeric ID 12000
// (presumably a Windows code page number - confirm against the constructor).
TextEncoding TextEncoding::UTF32LE()
{
    const int encodingId = 12000;
    return TextEncoding(encodingId);
}
// Returns the TextEncoding identified by the numeric ID 1201
// (presumably a Windows code page number - confirm against the constructor).
TextEncoding TextEncoding::UTF16BE()
{
    const int encodingId = 1201;
    return TextEncoding(encodingId);
}
// Returns the TextEncoding identified by the numeric ID 65001
// (presumably a Windows code page number - confirm against the constructor).
TextEncoding TextEncoding::UTF8()
{
    const int encodingId = 65001;
    return TextEncoding(encodingId);
}
// Returns the TextEncoding identified by the numeric ID 20127
// (presumably a Windows code page number - confirm against the constructor).
TextEncoding TextEncoding::ASCII()
{
    const int encodingId = 20127;
    return TextEncoding(encodingId);
}
// Returns the TextEncoding constructed from the encoding name "UCS-4LE".
TextEncoding TextEncoding::UTF32LE()
{
    static const char encodingName[] = "UCS-4LE";
    return TextEncoding(encodingName);
}
// Returns the TextEncoding constructed from the encoding name "UCS-4BE".
TextEncoding TextEncoding::UTF32BE()
{
    static const char encodingName[] = "UCS-4BE";
    return TextEncoding(encodingName);
}
// Returns the TextEncoding constructed from the encoding name
// "UCS-4-INTERNAL".
TextEncoding TextEncoding::UTF32()
{
    static const char encodingName[] = "UCS-4-INTERNAL";
    return TextEncoding(encodingName);
}
// Returns the TextEncoding constructed from the encoding name "UTF-16LE".
TextEncoding TextEncoding::UTF16LE()
{
    static const char encodingName[] = "UTF-16LE";
    return TextEncoding(encodingName);
}
// Returns the TextEncoding constructed from the encoding name "UTF-16".
TextEncoding TextEncoding::UTF16()
{
    static const char encodingName[] = "UTF-16";
    return TextEncoding(encodingName);
}
// Returns the TextEncoding constructed from the encoding name "UTF-8".
TextEncoding TextEncoding::UTF8()
{
    static const char encodingName[] = "UTF-8";
    return TextEncoding(encodingName);
}
// Returns the TextEncoding constructed from the encoding name "UTF-7".
TextEncoding TextEncoding::UTF7()
{
    static const char encodingName[] = "UTF-7";
    return TextEncoding(encodingName);
}
// Returns the TextEncoding identified by the numeric ID 932
// (presumably a Windows code page number - confirm against the constructor).
TextEncoding TextEncoding::WindowsShiftJIS()
{
    const int encodingId = 932;
    return TextEncoding(encodingId);
}
// Returns the TextEncoding identified by the numeric ID 932, the same value
// this variant uses for WindowsShiftJIS().
TextEncoding TextEncoding::PalmShiftJIS()
{
    const int encodingId = 932;
    return TextEncoding(encodingId);
}
// Returns the TextEncoding constructed from an empty encoding name.
// NOTE(review): how the constructor interprets "" is not visible here.
TextEncoding TextEncoding::Application()
{
    static const char encodingName[] = "";
    return TextEncoding(encodingName);
}
// Returns the TextEncoding identified by the numeric ID 65000
// (presumably a Windows code page number - confirm against the constructor).
TextEncoding TextEncoding::UTF7()
{
    const int encodingId = 65000;
    return TextEncoding(encodingId);
}
// Returns the TextEncoding constructed from an empty encoding name.
// NOTE(review): how the constructor interprets "" is not visible here.
TextEncoding TextEncoding::System()
{
    static const char encodingName[] = "";
    return TextEncoding(encodingName);
}
// Returns the TextEncoding identified by the numeric ID 1200
// (presumably a Windows code page number - confirm against the constructor).
TextEncoding TextEncoding::UTF16()
{
    const int encodingId = 1200;
    return TextEncoding(encodingId);
}
// Decodes the next chunk of a byte stream and returns the decoded text.
//
// chs   - pointer to the chunk; a null pointer yields an empty string.
// len   - number of bytes in the chunk (asserted to be non-negative).
// flush - forwarded to convert(); when set, the decoder also stops waiting
//         for a byte order mark and converts whatever it has buffered.
//
// While m_atStart is set, the first bytes of the stream (which may be spread
// over several calls and are kept in m_bufferedBytes) are checked for a
// UTF-16 LE, UTF-16 BE or UTF-8 byte order mark. A BOM switches m_encoding
// (releasing the current ICU converter) and is stripped from the output.
// Once the start of the stream has been dealt with, every call goes straight
// to convert().
DeprecatedString StreamingTextDecoderICU::toUnicode(const char* chs, int len, bool flush)
{
    ASSERT_ARG(len, len >= 0);

    if (!chs)
        return DeprecatedString();
    if (len <= 0 && !flush)
        return "";

    // Handle normal case.
    if (!m_atStart)
        return convert(chs, len, flush);

    // Check to see if we found a BOM. The first three bytes of the stream are
    // read from the previously buffered bytes first, then from this chunk;
    // positions that are not available yet read as 0.
    int numBufferedBytes = m_numBufferedBytes;
    int buf1Len = numBufferedBytes;
    int buf2Len = len;
    const unsigned char* buf1 = m_bufferedBytes;
    const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(chs);
    unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;

    int BOMLength = 0;
    if (c1 == 0xFF && c2 == 0xFE) {
        if (m_encoding != TextEncoding(UTF16Encoding, LittleEndian)) {
            releaseICUConverter();
            m_encoding = TextEncoding(UTF16Encoding, LittleEndian);
            m_littleEndian = true;
        }
        BOMLength = 2;
    } else if (c1 == 0xFE && c2 == 0xFF) {
        if (m_encoding != TextEncoding(UTF16Encoding, BigEndian)) {
            releaseICUConverter();
            m_encoding = TextEncoding(UTF16Encoding, BigEndian);
            m_littleEndian = false;
        }
        BOMLength = 2;
    } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
        if (m_encoding != TextEncoding(UTF8Encoding)) {
            releaseICUConverter();
            m_encoding = TextEncoding(UTF8Encoding);
        }
        BOMLength = 3;
    }

    // Handle case where we found a BOM.
    if (BOMLength != 0) {
        ASSERT(numBufferedBytes + len >= BOMLength);
        // Part of the BOM may already sit in the buffer; only the remainder
        // has to be skipped in the current chunk.
        int skip = BOMLength - numBufferedBytes;
        m_numBufferedBytes = 0;
        m_atStart = false;
        return len == skip ? DeprecatedString("") : convert(chs + skip, len - skip, flush);
    }

    // Handle case where we know there is no BOM coming.
    const int bufferSize = sizeof(m_bufferedBytes);
    if (numBufferedBytes + len > bufferSize || flush) {
        m_atStart = false;
        if (numBufferedBytes == 0) {
            return convert(chs, len, flush);
        }
        unsigned char bufferedBytes[sizeof(m_bufferedBytes)];
        memcpy(bufferedBytes, m_bufferedBytes, numBufferedBytes);
        m_numBufferedBytes = 0;
        // The two conversions feed one stream, so the buffered bytes must be
        // decoded before the new chunk. The operands of '+' may be evaluated
        // in either order, hence the first call gets its own statement.
        DeprecatedString bufferedText = convert(bufferedBytes, numBufferedBytes, false);
        return bufferedText + convert(chs, len, flush);
    }

    // Continue to look for the BOM.
    memcpy(&m_bufferedBytes[numBufferedBytes], chs, len);
    m_numBufferedBytes += len;
    return "";
}
// Returns the TextEncoding identified by the numeric ID 12001
// (presumably a Windows code page number - confirm against the constructor).
TextEncoding TextEncoding::UTF32BE()
{
    const int encodingId = 12001;
    return TextEncoding(encodingId);
}
// parseDataUrl() is taken from the CURL http backend. static gboolean parseDataUrl(gpointer callback_data) { ResourceHandle* handle = static_cast<ResourceHandle*>(callback_data); ResourceHandleClient* client = handle->client(); ResourceHandleInternal* d = handle->getInternal(); if (d->m_cancelled) return false; d->m_idleHandler = 0; ASSERT(client); if (!client) return false; String url = handle->request().url().string(); ASSERT(url.startsWith("data:", false)); int index = url.find(','); if (index == -1) { client->cannotShowURL(handle); return false; } String mediaType = url.substring(5, index - 5); String data = url.substring(index + 1); bool isBase64 = mediaType.endsWith(";base64", false); if (isBase64) mediaType = mediaType.left(mediaType.length() - 7); if (mediaType.isEmpty()) mediaType = "text/plain;charset=US-ASCII"; String mimeType = extractMIMETypeFromMediaType(mediaType); String charset = extractCharsetFromMediaType(mediaType); ResourceResponse response; response.setMimeType(mimeType); if (isBase64) { data = decodeURLEscapeSequences(data); response.setTextEncodingName(charset); client->didReceiveResponse(handle, response); if (d->m_cancelled) return false; // Use the GLib Base64, since WebCore's decoder isn't // general-purpose and fails on Acid3 test 97 (whitespace). 
size_t outLength = 0; char* outData = 0; outData = reinterpret_cast<char*>(g_base64_decode(data.utf8().data(), &outLength)); if (outData && outLength > 0) client->didReceiveData(handle, outData, outLength, 0); g_free(outData); } else { // We have to convert to UTF-16 early due to limitations in KURL data = decodeURLEscapeSequences(data, TextEncoding(charset)); response.setTextEncodingName("UTF-16"); client->didReceiveResponse(handle, response); if (d->m_cancelled) return false; if (data.length() > 0) client->didReceiveData(handle, reinterpret_cast<const char*>(data.characters()), data.length() * sizeof(UChar), 0); if (d->m_cancelled) return false; } client->didFinishLoading(handle); return false; }
// Returns the TextEncoding constructed from the given code page number.
TextEncoding TextEncoding::Windows(UINT codePage)
{
    TextEncoding codePageEncoding(codePage);
    return codePageEncoding;
}
// Returns the TextEncoding identified by the numeric ID 1252
// (presumably a Windows code page number - confirm against the constructor).
TextEncoding TextEncoding::WindowsLatin1()
{
    const int encodingId = 1252;
    return TextEncoding(encodingId);
}
// Returns the TextEncoding constructed from the value reported by ::GetACP().
TextEncoding TextEncoding::System()
{
    TextEncoding activeCodePageEncoding(::GetACP());
    return activeCodePageEncoding;
}
// Returns the TextEncoding identified by the numeric ID 1252, the same value
// this variant uses for WindowsLatin1().
TextEncoding TextEncoding::PalmLatin1()
{
    const int encodingId = 1252;
    return TextEncoding(encodingId);
}
// Stores the TextEncoding for the given encoding name in m_encoding.
// An empty name is ignored and leaves the current m_encoding untouched.
void FileReaderLoader::setEncoding(const String& encoding)
{
    if (encoding.isEmpty())
        return;

    m_encoding = TextEncoding(encoding);
}
// Returns the TextEncoding constructed from the encoding name "CP932".
TextEncoding TextEncoding::PalmShiftJIS()
{
    // Palm Shift JIS is derived from Windows CP 932 without being identical
    // to it; CP932 is the name used for it here.
    static const char encodingName[] = "CP932";
    return TextEncoding(encodingName);
}