bool MHTMLParser::parseArchiveWithHeader( MIMEHeader* header, HeapVector<Member<ArchiveResource>>& resources) { if (!header) { DVLOG(1) << "Failed to parse MHTML part: no header."; return false; } if (!header->isMultipart()) { // With IE a page with no resource is not multi-part. bool endOfArchiveReached = false; ArchiveResource* resource = parseNextPart(*header, String(), String(), endOfArchiveReached); if (!resource) return false; resources.append(resource); return true; } // Skip the message content (it's a generic browser specific message). skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary()); bool endOfArchive = false; while (!endOfArchive) { MIMEHeader* resourceHeader = MIMEHeader::parseHeader(&m_lineReader); if (!resourceHeader) { DVLOG(1) << "Failed to parse MHTML, invalid MIME header."; return false; } if (resourceHeader->contentType() == "multipart/alternative") { // Ignore IE nesting which makes little sense (IE seems to nest only some // of the frames). if (!parseArchiveWithHeader(resourceHeader, resources)) { DVLOG(1) << "Failed to parse MHTML subframe."; return false; } skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary()); continue; } ArchiveResource* resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive); if (!resource) { DVLOG(1) << "Failed to parse MHTML part."; return false; } resources.append(resource); } return true; }
// Reads the body of one MHTML part from the line reader, decodes it according
// to the content transfer encoding declared in |mimeHeader| and returns it as
// an ArchiveResource.
//
// |endOfPartBoundary| and |endOfDocumentBoundary| must be both empty or both
// non-empty. When they are empty no boundary check is performed and lines are
// consumed until the reader returns a null line. Otherwise reading stops at
// the first line equal to either boundary, and |endOfArchiveReached| is set to
// whether that line was the end-of-document boundary (it is left untouched if
// no boundary line is seen).
//
// Returns 0 when a boundary was expected but not found, when base64 decoding
// fails, or when the transfer encoding is not one of the cases handled below.
PassRefPtr<ArchiveResource> MHTMLParser::parseNextPart(
    const MIMEHeader& mimeHeader,
    const String& endOfPartBoundary,
    const String& endOfDocumentBoundary,
    bool& endOfArchiveReached) {
  ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty());

  RefPtr<SharedBuffer> content = SharedBuffer::create();
  const bool checkBoundary = !endOfPartBoundary.isEmpty();
  // The encoding does not change while reading the part, so test it once.
  const bool isQuotedPrintable =
      mimeHeader.contentTransferEncoding() == MIMEHeader::QuotedPrintable;
  bool endOfPartReached = false;
  String line;
  while (!(line = m_lineReader.nextLine()).isNull()) {
    if (checkBoundary &&
        (line == endOfPartBoundary || line == endOfDocumentBoundary)) {
      endOfArchiveReached = (line == endOfDocumentBoundary);
      endOfPartReached = true;
      break;
    }
    // Note that we use line.utf8() and not line.ascii() as ascii turns special
    // characters (such as tab, line-feed...) into '?'.
    // The number of bytes appended must be the length of the UTF-8 encoding,
    // not line.length(): the two differ as soon as the line holds a character
    // that encodes to more than one byte, and using line.length() would cut
    // the tail of such lines off.
    const CString utf8Line = line.utf8();
    content->append(utf8Line.data(), static_cast<unsigned>(utf8Line.length()));
    if (isQuotedPrintable) {
      // The line reader removes the \r\n, but we need them for the content in
      // this case as the QuotedPrintable decoder expects CR-LF terminated
      // lines.
      content->append("\r\n", 2);
    }
  }
  if (!endOfPartReached && checkBoundary) {
    LOG_ERROR("No bounday found for MHTML part.");
    return 0;
  }

  // Decode the accumulated text into the raw bytes of the resource.
  Vector<char> data;
  switch (mimeHeader.contentTransferEncoding()) {
  case MIMEHeader::Base64:
    if (!base64Decode(content->data(), content->size(), data)) {
      LOG_ERROR("Invalid base64 content for MHTML part.");
      return 0;
    }
    break;
  case MIMEHeader::QuotedPrintable:
    quotedPrintableDecode(content->data(), content->size(), data);
    break;
  case MIMEHeader::SevenBit:
    // Nothing to decode: the content is used as read.
    data.append(content->data(), content->size());
    break;
  default:
    LOG_ERROR("Invalid encoding for MHTML part.");
    return 0;
  }
  RefPtr<SharedBuffer> contentBuffer = SharedBuffer::adoptVector(data);
  // FIXME: the URL in the MIME header could be relative, we should resolve it
  // if it is. The specs mentions 5 ways to resolve a URL:
  // http://tools.ietf.org/html/rfc2557#section-5
  // IE and Firefox (UNMht) seem to generate only absolute URLs.
  KURL location = KURL(KURL(), mimeHeader.contentLocation());
  return ArchiveResource::create(contentBuffer, location,
                                 mimeHeader.contentType(),
                                 mimeHeader.charset(), String());
}
// Reads the body of one MHTML part from the line reader, decodes it according
// to the content transfer encoding declared in |mimeHeader| (treated as
// binary when the header reports Unknown) and returns it as an
// ArchiveResource.
//
// |endOfPartBoundary| and |endOfDocumentBoundary| must be both empty or both
// non-empty. Binary parts require non-empty boundaries. For the other
// encodings, empty boundaries mean no boundary check is performed and chunks
// are consumed until the reader returns a null string.
//
// |endOfArchiveReached| is an out-parameter: for binary parts it is set from
// the two characters following the part boundary ("--" means end of archive);
// for text parts it is updated for every line read and ends up true only when
// the last line read equals |endOfDocumentBoundary|.
//
// Returns nullptr when the part is malformed (missing boundary, bad
// separator, invalid base64) or uses an encoding not handled below.
PassRefPtrWillBeRawPtr<ArchiveResource> MHTMLParser::parseNextPart(
    const MIMEHeader& mimeHeader,
    const String& endOfPartBoundary,
    const String& endOfDocumentBoundary,
    bool& endOfArchiveReached) {
  ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty());

  // If no content transfer encoding is specified, default to binary encoding.
  MIMEHeader::Encoding contentTransferEncoding =
      mimeHeader.contentTransferEncoding();
  if (contentTransferEncoding == MIMEHeader::Unknown)
    contentTransferEncoding = MIMEHeader::Binary;

  RefPtr<SharedBuffer> content = SharedBuffer::create();
  const bool checkBoundary = !endOfPartBoundary.isEmpty();
  bool endOfPartReached = false;
  if (contentTransferEncoding == MIMEHeader::Binary) {
    if (!checkBoundary) {
      WTF_LOG_ERROR("Binary contents requires end of part");
      return nullptr;
    }
    // Temporarily use the part boundary as the reader's separator so the
    // whole binary payload is fetched with a single nextChunk() call.
    m_lineReader.setSeparator(endOfPartBoundary.utf8().data());
    Vector<char> part;
    const bool gotChunk = m_lineReader.nextChunk(part);
    // Put the CRLF separator back before looking at the result, so that the
    // reader is not left with the boundary as its separator when we bail out.
    m_lineReader.setSeparator("\r\n");
    if (!gotChunk) {
      WTF_LOG_ERROR("Binary contents requires end of part");
      return nullptr;
    }
    content->append(part);

    // The two characters after the boundary tell a part boundary from the
    // end-of-document one, which continues with "--".
    Vector<char> nextChars;
    if (m_lineReader.peek(nextChars, 2) != 2) {
      WTF_LOG_ERROR("Invalid seperator.");
      return nullptr;
    }
    endOfPartReached = true;
    ASSERT(nextChars.size() == 2);
    endOfArchiveReached = (nextChars[0] == '-' && nextChars[1] == '-');
    if (!endOfArchiveReached) {
      // Consume the rest of the boundary line; anything other than an empty
      // chunk here means the boundary was not directly followed by CRLF.
      String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback();
      if (!line.isEmpty()) {
        WTF_LOG_ERROR("No CRLF at end of binary section.");
        return nullptr;
      }
    }
  } else {
    // The encoding does not change while reading the part, so test it once.
    const bool isQuotedPrintable =
        contentTransferEncoding == MIMEHeader::QuotedPrintable;
    String line;
    while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback())
                .isNull()) {
      endOfArchiveReached = (line == endOfDocumentBoundary);
      if (checkBoundary &&
          (line == endOfPartBoundary || endOfArchiveReached)) {
        endOfPartReached = true;
        break;
      }
      // Note that we use line.utf8() and not line.ascii() as ascii turns
      // special characters (such as tab, line-feed...) into '?'.
      // The number of bytes appended must be the length of the UTF-8
      // encoding, not line.length(): the two differ as soon as the line holds
      // a character that encodes to more than one byte, and using
      // line.length() would cut the tail of such lines off.
      const CString utf8Line = line.utf8();
      content->append(utf8Line.data(), utf8Line.length());
      if (isQuotedPrintable) {
        // The line reader removes the \r\n, but we need them for the content
        // in this case as the QuotedPrintable decoder expects CR-LF
        // terminated lines.
        content->append("\r\n", 2u);
      }
    }
  }
  if (!endOfPartReached && checkBoundary) {
    WTF_LOG_ERROR("No bounday found for MHTML part.");
    return nullptr;
  }

  // Decode the accumulated content into the raw bytes of the resource.
  Vector<char> data;
  switch (contentTransferEncoding) {
  case MIMEHeader::Base64:
    if (!base64Decode(content->data(), content->size(), data)) {
      WTF_LOG_ERROR("Invalid base64 content for MHTML part.");
      return nullptr;
    }
    break;
  case MIMEHeader::QuotedPrintable:
    quotedPrintableDecode(content->data(), content->size(), data);
    break;
  case MIMEHeader::EightBit:
  case MIMEHeader::SevenBit:
  case MIMEHeader::Binary:
    // Nothing to decode: the content is used as read.
    data.append(content->data(), content->size());
    break;
  default:
    WTF_LOG_ERROR("Invalid encoding for MHTML part.");
    return nullptr;
  }
  RefPtr<SharedBuffer> contentBuffer = SharedBuffer::adoptVector(data);
  // FIXME: the URL in the MIME header could be relative, we should resolve it
  // if it is. The specs mentions 5 ways to resolve a URL:
  // http://tools.ietf.org/html/rfc2557#section-5
  // IE and Firefox (UNMht) seem to generate only absolute URLs.
  KURL location = KURL(KURL(), mimeHeader.contentLocation());
  return ArchiveResource::create(
      contentBuffer, location, mimeHeader.contentID(),
      AtomicString(mimeHeader.contentType()),
      AtomicString(mimeHeader.charset()));
}