CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, CPDF_Dictionary* pPage) { CFX_WideTextBuf buffer; buffer.EstimateSize(0, 1024); CPDF_Page page; page.Load(pDoc, pPage); CPDF_ParseOptions options; options.m_bTextOnly = TRUE; options.m_bSeparateForm = FALSE; page.ParseContent(&options); CPDF_TextStream textstream(buffer, FALSE, NULL); FX_POSITION pos = page.GetFirstObjectPosition(); while (pos) { CPDF_PageObject* pObject = page.GetNextObject(pos); if (pObject->m_Type != PDFPAGE_TEXT) { continue; } if (textstream.ProcessObject((CPDF_TextObject*)pObject, TRUE)) { break; } } return buffer.GetWideString(); }
CXML_Element* CXML_Parser::ParseElement(CXML_Element* pParent, FX_BOOL bStartTag) { m_nOffset = m_nBufferOffset + (FX_FILESIZE)m_dwIndex; if (IsEOF()) { return NULL; } CFX_ByteString tag_name, tag_space; FX_BOOL bEndTag; GetTagName(tag_space, tag_name, bEndTag, bStartTag); if (tag_name.IsEmpty() || bEndTag) { return NULL; } CXML_Element* pElement = new CXML_Element; pElement->m_pParent = pParent; pElement->SetTag(tag_space, tag_name); do { CFX_ByteString attr_space, attr_name; while (m_dwIndex < m_dwBufferSize) { SkipWhiteSpaces(); if (IsEOF()) { break; } if (!g_FXCRT_XML_IsNameIntro(m_pBuffer[m_dwIndex])) { break; } GetName(attr_space, attr_name); SkipWhiteSpaces(); if (IsEOF()) { break; } if (m_pBuffer[m_dwIndex] != '=') { break; } m_dwIndex ++; SkipWhiteSpaces(); if (IsEOF()) { break; } CFX_WideString attr_value; GetAttrValue(attr_value); pElement->m_AttrMap.SetAt(attr_space, attr_name, attr_value); } m_nOffset = m_nBufferOffset + (FX_FILESIZE)m_dwIndex; if (m_dwIndex < m_dwBufferSize || IsEOF()) { break; } } while (ReadNextBlock()); SkipWhiteSpaces(); if (IsEOF()) { return pElement; } uint8_t ch = m_pBuffer[m_dwIndex ++]; if (ch == '/') { m_dwIndex ++; m_nOffset = m_nBufferOffset + (FX_FILESIZE)m_dwIndex; return pElement; } if (ch != '>') { m_nOffset = m_nBufferOffset + (FX_FILESIZE)m_dwIndex; delete pElement; return NULL; } SkipWhiteSpaces(); if (IsEOF()) { return pElement; } CFX_UTF8Decoder decoder; CFX_WideTextBuf content; FX_BOOL bCDATA = FALSE; int32_t iState = 0; do { while (m_dwIndex < m_dwBufferSize) { ch = m_pBuffer[m_dwIndex ++]; switch (iState) { case 0: if (ch == '<') { iState = 1; } else if (ch == '&') { decoder.ClearStatus(); decoder.AppendChar(GetCharRef()); } else { decoder.Input(ch); } break; case 1: if (ch == '!') { iState = 2; } else if (ch == '?') { SkipLiterals(FX_BSTRC("?>")); SkipWhiteSpaces(); iState = 0; } else if (ch == '/') { CFX_ByteString space, name; GetName(space, name); SkipWhiteSpaces(); m_dwIndex ++; iState = 10; } else { content << decoder.GetResult(); CFX_WideString dataStr = content.GetWideString(); if (!bCDATA && !m_bSaveSpaceChars) { dataStr.TrimRight(L" \t\r\n"); } InsertContentSegment(bCDATA, dataStr, pElement); content.Clear(); decoder.Clear(); bCDATA = FALSE; iState = 0; m_dwIndex --; CXML_Element* pSubElement = ParseElement(pElement, TRUE); if (pSubElement == NULL) { break; } pSubElement->m_pParent = pElement; pElement->m_Children.Add((void*)CXML_Element::Element); pElement->m_Children.Add(pSubElement); SkipWhiteSpaces(); } break; case 2: if (ch == '[') { SkipLiterals(FX_BSTRC("]]>")); } else if (ch == '-') { m_dwIndex ++; SkipLiterals(FX_BSTRC("-->")); } else { SkipLiterals(FX_BSTRC(">")); } decoder.Clear(); SkipWhiteSpaces(); iState = 0; break; } if (iState == 10) { break; } } m_nOffset = m_nBufferOffset + (FX_FILESIZE)m_dwIndex; if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF()) { break; } } while (ReadNextBlock()); content << decoder.GetResult(); CFX_WideString dataStr = content.GetWideString(); if (!m_bSaveSpaceChars) { dataStr.TrimRight(L" \t\r\n"); } InsertContentSegment(bCDATA, dataStr, pElement); content.Clear(); decoder.Clear(); bCDATA = FALSE; return pElement; }