Example #1
0
/**
 * Build a highlighted excerpt of a text and append it to sMarkedText.
 *
 * The best fragments are obtained from getBestTextFragments() with
 * contiguous-fragment merging enabled. Every returned fragment that is
 * non-NULL and has a score above zero is rendered into sMarkedText via
 * TextFragment::markString() using m_pFormatter and m_pEncoder.
 *
 * @param sMarkedText output string; rendered fragments and separators
 *                    are appended to whatever it already contains
 * @param pAnalyzer   analyzer forwarded to getBestTextFragments()
 * @param szText      text to highlight
 * @param nTextLen    length of szText
 * @param nMaxFrags   upper bound on the number of fragments requested;
 *                    raised to 1 when smaller
 * @param sSeparator  string appended after each fragment slot when more
 *                    than one fragment was returned, or while the output
 *                    is still shorter than the input text
 */
void Highlighter::getBestFragments(string& sMarkedText,
                                   const Analyzer* pAnalyzer,
                                   const char* szText,
                                   size_t nTextLen,
                                   size_t nMaxFrags,
                                   const string& sSeparator)
{
    // Never ask for fewer than one fragment.
    const size_t nWanted = MAX(1, nMaxFrags);

    TextFragmentArray bestFrags;
    getBestTextFragments(pAnalyzer, szText, nTextLen,
                         (int32_t)nWanted, true, bestFrags);

    const size_t nFragCount = bestFrags.getNumTextFragment();
    const bool bSeveralFrags = (nFragCount > 1);

    for (size_t idx = 0; idx != nFragCount; ++idx)
    {
        TextFragment* const pCandidate = bestFrags.getTextFragment(idx);
        const bool bRenderable = (pCandidate != NULL)
                                 && (pCandidate->getScore() > 0);
        if (bRenderable)
        {
            pCandidate->markString(sMarkedText, szText, nTextLen,
                                   m_pFormatter, m_pEncoder);
        }

        // The separator is evaluated once per slot, whether or not the
        // slot was rendered: it is emitted when several fragments exist
        // or when the output so far is shorter than the source text.
        if (bSeveralFrags || sMarkedText.size() < nTextLen)
        {
            sMarkedText += sSeparator;
        }
    }
}
Example #2
0
/**
 * Tokenize a text, cut the token stream into scored fragments and leave
 * the best ones in fragArray.
 *
 * Steps performed:
 *  1. The text is wrapped in a TokenSource and tokenized by pAnalyzer.
 *  2. Tokens are grouped in a TokenGroup; each group with a positive
 *     total score is recorded as a matched span on the current fragment.
 *  3. m_pTextFragmenter decides where a new fragment begins and
 *     m_pFragmentScorer supplies token and fragment scores.
 *  4. All fragments in fragArray are pushed through a FragmentQueue
 *     constructed with nMaxFrags; fragments the queue rejects are deleted.
 *  5. fragArray is refilled from the queue, in the queue's pop order.
 *  6. If requested, the fragments are passed to mergeContiguousFragments()
 *     and only non-NULL fragments with a score above zero are kept.
 *
 * @param pAnalyzer  analyzer used to tokenize the text
 * @param szText     text to fragment
 * @param nTextLen   length of szText
 * @param nMaxFrags  size handed to the FragmentQueue constructor
 * @param bMergeContiguousFragments when true, run the merge/filter step
 * @param fragArray  in/out: receives the resulting fragments. Fragments
 *                   already present on entry are also run through the
 *                   queue, because the selection loop starts at index 0.
 *
 * A FirteXException raised inside the try block is caught and passed to
 * FIRTEX_RETHROW.
 */
void Highlighter::getBestTextFragments(const Analyzer* pAnalyzer,
                                       const char* szText,
                                       size_t nTextLen,
                                       size_t nMaxFrags,
                                       bool bMergeContiguousFragments,
                                       TextFragmentArray& fragArray)
{
    // First fragment starts at offset 0. The second constructor argument
    // is the current element count of fragArray (presumably the
    // fragment's sequence number -- confirm against TextFragment).
    TextFragment* pCurrentFrag = new TextFragment(0,
            (int32_t)fragArray.getNumTextFragment());
    fragArray.add(pCurrentFrag);
    
    m_pFragmentScorer->startFragment(pCurrentFrag);

    try
    {
        m_pTextFragmenter->start(szText, nTextLen);

        // Hand the whole text to the analyzer as a single token in the
        // token source's original view.
        TokenGroupPtr pTokenGroup(new TokenGroup);
        TokenSourcePtr tokenSource(new FX_NS(analyzer)::TokenSource);
        TokenViewPtr pOrgTokenView(new TokenView());
        pOrgTokenView->addToken(szText, nTextLen);
        tokenSource->setOriginalView(pOrgTokenView);

        TokenViewPtr pTokenView = pAnalyzer->tokenize(tokenSource);

        int32_t lastEndOffset = 0;
        float lastScore = 0.0f;
        TokenView::Iterator iter = pTokenView->iterator();
        while (iter.hasNext())
        {
            const Token& token = iter.next();
            // Only tokens starting before m_nMaxDocBytesToAnalyze are
            // grouped and scored. Later tokens are still iterated and
            // still advance lastEndOffset below.
            if (token.getStartOffset() < (int32_t)m_nMaxDocBytesToAnalyze)
            {
                // The group is non-empty and reports this token as
                // distinct: flush the group before starting a new one.
                if ((pTokenGroup->getNumTokenView() > 0) 
                    && (pTokenGroup->isDistinct(&token)))
                {
                    // Record the group's span only if it scored.
                    if (pTokenGroup->getTotalScore() > 0)
                    {
                        pCurrentFrag->addMatchedToken(pTokenGroup->getMatchedStartOffset(),
                                pTokenGroup->getMatchedEndOffset());
                    }

                    pTokenGroup->clear();

                    // Check whether the current token marks the start of a new fragment.
                    if (m_pTextFragmenter->isNewFragment(&token))
                    {
                        // Close the current fragment with the scorer's
                        // fragment score before the scorer is re-targeted.
                        pCurrentFrag->setScore(m_pFragmentScorer->getFragmentScore());
                        // NOTE(review): the closed fragment's end is set to
                        // the END offset of the token that opens the next
                        // fragment, while the next fragment starts at that
                        // token's START offset -- confirm the overlap is
                        // intended.
                        pCurrentFrag->setTextEndPos(token.getEndOffset());
                        pCurrentFrag = new TextFragment((int32_t)token.getStartOffset(),
                                (int32_t)fragArray.getNumTextFragment());
                        fragArray.add(pCurrentFrag);
                        m_pFragmentScorer->startFragment(pCurrentFrag);
                    }
                }

                // Score the token as a term of the scorer's field and add
                // it, with its score, to the current group.
                TermPtr pTerm(new Term(m_pFragmentScorer->getFieldName(), token.getTextValue()));
                lastScore = m_pFragmentScorer->getTokenScore(pTerm.get());
                pTokenGroup->addToken(&token, lastScore);
            }
            lastEndOffset = token.getEndOffset();
        }//end while

        // Flush the trailing group. Unlike the in-loop flush, this also
        // requires the last scored token itself to have a positive score.
        if ((lastScore > 0.0f) 
            && (pTokenGroup->getNumTokenView() > 0) 
            && (pTokenGroup->getTotalScore() > 0))
        {
            pCurrentFrag->addMatchedToken(pTokenGroup->getMatchedStartOffset(),
                    pTokenGroup->getMatchedEndOffset());
            pTokenGroup->clear();
        }

        // Close the final fragment at the end offset of the last token seen.
        pCurrentFrag->setScore(m_pFragmentScorer->getFragmentScore());
        pCurrentFrag->setTextEndPos(lastEndOffset);

        FragmentQueue fragQueue(nMaxFrags);

        // Offer every fragment to the queue; a fragment the queue refuses
        // is deleted here.
        // NOTE(review): what the queue does with an entry it evicts is
        // not visible in this function -- confirm it is freed.
        for (size_t i = 0; i < fragArray.getNumTextFragment(); i++)
        {
            TextFragment* pFrag = fragArray.getTextFragment(i);
            if (!fragQueue.insert(pFrag))
            {
                delete pFrag;
            }
        }
        // Empty the array; the 'false' argument presumably means the
        // fragment objects themselves are not deleted -- confirm against
        // TextFragmentArray::clear.
        fragArray.clear(false);
		
        // Move the queued fragments back into fragArray, in pop order.
        while (fragQueue.size() > 0)
        {
            fragArray.add(fragQueue.pop());
        }
		
        // Merge any contiguous fragments to improve readability.
        if (bMergeContiguousFragments)
        {
            // Move the fragment pointers into a temporary array and merge there.
            TextFragment* pTextFragment;
            TextFragmentArray tmpFragArray;
            for (size_t i = 0; i < fragArray.getNumTextFragment(); i++)
            {
                pTextFragment = fragArray.getTextFragment(i);
                tmpFragArray.add(pTextFragment);
            }
            fragArray.clear(false);

            mergeContiguousFragments(tmpFragArray);

            // Keep non-NULL fragments with a positive score, delete
            // non-NULL fragments without one, skip NULL entries.
            for (size_t i = 0; i < tmpFragArray.getNumTextFragment(); i++)
            {
                pTextFragment = tmpFragArray.getTextFragment(i);
                if ( pTextFragment != NULL)
                {
                    if (pTextFragment->getScore() > 0)
                    {
                        fragArray.add(pTextFragment);
                    }
                    else
                    {
                        delete pTextFragment;
                    }
                }
            }
            // Empty the temporary array with the same 'false' flag, since
            // every fragment it held was either handed to fragArray or
            // deleted above.
            tmpFragArray.clear(false);
        }
    }
    catch (const FirteXException& e)
    {
        FIRTEX_RETHROW(e);
    }
}