CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, CPDF_Dictionary* pPage)
{
    CFX_WideTextBuf buffer;
    buffer.EstimateSize(0, 1024);
    CPDF_Page page;
    page.Load(pDoc, pPage);
    CPDF_ParseOptions options;
    options.m_bTextOnly = TRUE;
    options.m_bSeparateForm = FALSE;
    page.ParseContent(&options);
    CPDF_TextStream textstream(buffer, FALSE, NULL);
    FX_POSITION pos = page.GetFirstObjectPosition();
    while (pos) {
        CPDF_PageObject* pObject = page.GetNextObject(pos);
        if (pObject->m_Type != PDFPAGE_TEXT) {
            continue;
        }
        if (textstream.ProcessObject((CPDF_TextObject*)pObject, TRUE)) {
            break;
        }
    }
    return buffer.GetWideString();
}
Exemple #2
0
CXML_Element* CXML_Parser::ParseElement(CXML_Element* pParent, FX_BOOL bStartTag)
{
    m_nOffset = m_nBufferOffset + (FX_FILESIZE)m_dwIndex;
    if (IsEOF()) {
        return NULL;
    }
    CFX_ByteString tag_name, tag_space;
    FX_BOOL bEndTag;
    GetTagName(tag_space, tag_name, bEndTag, bStartTag);
    if (tag_name.IsEmpty() || bEndTag) {
        return NULL;
    }
    CXML_Element* pElement = new CXML_Element;
    pElement->m_pParent = pParent;
    pElement->SetTag(tag_space, tag_name);
    do {
        CFX_ByteString attr_space, attr_name;
        while (m_dwIndex < m_dwBufferSize) {
            SkipWhiteSpaces();
            if (IsEOF()) {
                break;
            }
            if (!g_FXCRT_XML_IsNameIntro(m_pBuffer[m_dwIndex])) {
                break;
            }
            GetName(attr_space, attr_name);
            SkipWhiteSpaces();
            if (IsEOF()) {
                break;
            }
            if (m_pBuffer[m_dwIndex] != '=') {
                break;
            }
            m_dwIndex ++;
            SkipWhiteSpaces();
            if (IsEOF()) {
                break;
            }
            CFX_WideString attr_value;
            GetAttrValue(attr_value);
            pElement->m_AttrMap.SetAt(attr_space, attr_name, attr_value);
        }
        m_nOffset = m_nBufferOffset + (FX_FILESIZE)m_dwIndex;
        if (m_dwIndex < m_dwBufferSize || IsEOF()) {
            break;
        }
    } while (ReadNextBlock());
    SkipWhiteSpaces();
    if (IsEOF()) {
        return pElement;
    }
    uint8_t ch = m_pBuffer[m_dwIndex ++];
    if (ch == '/') {
        m_dwIndex ++;
        m_nOffset = m_nBufferOffset + (FX_FILESIZE)m_dwIndex;
        return pElement;
    }
    if (ch != '>') {
        m_nOffset = m_nBufferOffset + (FX_FILESIZE)m_dwIndex;
        delete pElement;
        return NULL;
    }
    SkipWhiteSpaces();
    if (IsEOF()) {
        return pElement;
    }
    CFX_UTF8Decoder decoder;
    CFX_WideTextBuf content;
    FX_BOOL bCDATA = FALSE;
    int32_t iState = 0;
    do {
        while (m_dwIndex < m_dwBufferSize) {
            ch = m_pBuffer[m_dwIndex ++];
            switch (iState) {
                case 0:
                    if (ch == '<') {
                        iState = 1;
                    } else if (ch == '&') {
                        decoder.ClearStatus();
                        decoder.AppendChar(GetCharRef());
                    } else {
                        decoder.Input(ch);
                    }
                    break;
                case 1:
                    if (ch == '!') {
                        iState = 2;
                    } else if (ch == '?') {
                        SkipLiterals(FX_BSTRC("?>"));
                        SkipWhiteSpaces();
                        iState = 0;
                    } else if (ch == '/') {
                        CFX_ByteString space, name;
                        GetName(space, name);
                        SkipWhiteSpaces();
                        m_dwIndex ++;
                        iState = 10;
                    } else {
                        content << decoder.GetResult();
                        CFX_WideString dataStr = content.GetWideString();
                        if (!bCDATA && !m_bSaveSpaceChars) {
                            dataStr.TrimRight(L" \t\r\n");
                        }
                        InsertContentSegment(bCDATA, dataStr, pElement);
                        content.Clear();
                        decoder.Clear();
                        bCDATA = FALSE;
                        iState = 0;
                        m_dwIndex --;
                        CXML_Element* pSubElement = ParseElement(pElement, TRUE);
                        if (pSubElement == NULL) {
                            break;
                        }
                        pSubElement->m_pParent = pElement;
                        pElement->m_Children.Add((void*)CXML_Element::Element);
                        pElement->m_Children.Add(pSubElement);
                        SkipWhiteSpaces();
                    }
                    break;
                case 2:
                    if (ch == '[') {
                        SkipLiterals(FX_BSTRC("]]>"));
                    } else if (ch == '-') {
                        m_dwIndex ++;
                        SkipLiterals(FX_BSTRC("-->"));
                    } else {
                        SkipLiterals(FX_BSTRC(">"));
                    }
                    decoder.Clear();
                    SkipWhiteSpaces();
                    iState = 0;
                    break;
            }
            if (iState == 10) {
                break;
            }
        }
        m_nOffset = m_nBufferOffset + (FX_FILESIZE)m_dwIndex;
        if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF()) {
            break;
        }
    } while (ReadNextBlock());
    content << decoder.GetResult();
    CFX_WideString dataStr = content.GetWideString();
    if (!m_bSaveSpaceChars) {
        dataStr.TrimRight(L" \t\r\n");
    }
    InsertContentSegment(bCDATA, dataStr, pElement);
    content.Clear();
    decoder.Clear();
    bCDATA = FALSE;
    return pElement;
}