/**
 * Append the highlighted form of the best fragments of a text to a string.
 *
 * The fragments are obtained from getBestTextFragments() with merging of
 * contiguous fragments enabled. Each non-NULL fragment whose score is
 * greater than zero is rendered into @p sMarkedText through markString()
 * using the highlighter's formatter and encoder.
 *
 * @param sMarkedText  output string; rendered fragments and separators are
 *                     appended to it
 * @param pAnalyzer    analyzer forwarded to getBestTextFragments()
 * @param szText       text to highlight
 * @param nTextLen     length of @p szText
 * @param nMaxFrags    requested number of fragments; values below 1 are
 *                     raised to 1
 * @param sSeparator   string appended after a fragment slot (see below)
 */
void Highlighter::getBestFragments(string& sMarkedText, const Analyzer* pAnalyzer,
                                   const char* szText, size_t nTextLen,
                                   size_t nMaxFrags, const string& sSeparator)
{
    // Sanity check: never request fewer than one fragment.
    const size_t nWanted = MAX(1, nMaxFrags);

    TextFragmentArray bestFrags;
    getBestTextFragments(pAnalyzer, szText, nTextLen, (int32_t)nWanted, true, bestFrags);

    const size_t nFragCount = bestFrags.getNumTextFragment();
    for (size_t idx = 0; idx < nFragCount; ++idx)
    {
        TextFragment* const pCur = bestFrags.getTextFragment(idx);
        const bool bRenderable = (pCur != NULL) && (pCur->getScore() > 0);
        if (bRenderable)
        {
            pCur->markString(sMarkedText, szText, nTextLen, m_pFormatter, m_pEncoder);
        }

        // The separator follows every slot (including the last one, and
        // slots that rendered nothing) whenever there are several fragments
        // or the output so far is shorter than the input text.
        const bool bAppendSeparator = (nFragCount > 1) || (nTextLen > sMarkedText.size());
        if (bAppendSeparator)
        {
            sMarkedText += sSeparator;
        }
    }
}
void Highlighter::getBestTextFragments(const Analyzer* pAnalyzer, const char* szText, size_t nTextLen, size_t nMaxFrags, bool bMergeContiguousFragments, TextFragmentArray& fragArray) { TextFragment* pCurrentFrag = new TextFragment(0, (int32_t)fragArray.getNumTextFragment()); fragArray.add(pCurrentFrag); m_pFragmentScorer->startFragment(pCurrentFrag); try { m_pTextFragmenter->start(szText, nTextLen); TokenGroupPtr pTokenGroup(new TokenGroup); TokenSourcePtr tokenSource(new FX_NS(analyzer)::TokenSource); TokenViewPtr pOrgTokenView(new TokenView()); pOrgTokenView->addToken(szText, nTextLen); tokenSource->setOriginalView(pOrgTokenView); TokenViewPtr pTokenView = pAnalyzer->tokenize(tokenSource); int32_t lastEndOffset = 0; float lastScore = 0.0f; TokenView::Iterator iter = pTokenView->iterator(); while (iter.hasNext()) { const Token& token = iter.next(); if (token.getStartOffset() < (int32_t)m_nMaxDocBytesToAnalyze) { if ((pTokenGroup->getNumTokenView() > 0) && (pTokenGroup->isDistinct(&token))) { if (pTokenGroup->getTotalScore() > 0) { pCurrentFrag->addMatchedToken(pTokenGroup->getMatchedStartOffset(), pTokenGroup->getMatchedEndOffset()); } pTokenGroup->clear(); //check if current token marks the start of a new fragment if (m_pTextFragmenter->isNewFragment(&token)) { pCurrentFrag->setScore(m_pFragmentScorer->getFragmentScore()); //record stats for a new fragment pCurrentFrag->setTextEndPos(token.getEndOffset()); pCurrentFrag = new TextFragment((int32_t)token.getStartOffset(), (int32_t)fragArray.getNumTextFragment()); fragArray.add(pCurrentFrag); m_pFragmentScorer->startFragment(pCurrentFrag); } } TermPtr pTerm(new Term(m_pFragmentScorer->getFieldName(), token.getTextValue())); lastScore = m_pFragmentScorer->getTokenScore(pTerm.get()); pTokenGroup->addToken(&token, lastScore); } lastEndOffset = token.getEndOffset(); }//end while if ((lastScore > 0.0f) && (pTokenGroup->getNumTokenView() > 0) && (pTokenGroup->getTotalScore() > 0)) { 
pCurrentFrag->addMatchedToken(pTokenGroup->getMatchedStartOffset(), pTokenGroup->getMatchedEndOffset()); pTokenGroup->clear(); } pCurrentFrag->setScore(m_pFragmentScorer->getFragmentScore()); pCurrentFrag->setTextEndPos(lastEndOffset); FragmentQueue fragQueue(nMaxFrags); //sort the most relevant sections of the text for (size_t i = 0; i < fragArray.getNumTextFragment(); i++) { TextFragment* pFrag = fragArray.getTextFragment(i); if (!fragQueue.insert(pFrag)) { delete pFrag; } } fragArray.clear(false); //return the most relevant fragments while (fragQueue.size() > 0) { fragArray.add(fragQueue.pop()); } //merge any contiguous fragments to improve readability if (bMergeContiguousFragments) { TextFragment* pTextFragment; TextFragmentArray tmpFragArray; for (size_t i = 0; i < fragArray.getNumTextFragment(); i++) { pTextFragment = fragArray.getTextFragment(i); tmpFragArray.add(pTextFragment); } fragArray.clear(false); mergeContiguousFragments(tmpFragArray); for (size_t i = 0; i < tmpFragArray.getNumTextFragment(); i++) { pTextFragment = tmpFragArray.getTextFragment(i); if ( pTextFragment != NULL) { if (pTextFragment->getScore() > 0) { fragArray.add(pTextFragment); } else { delete pTextFragment; } } } tmpFragArray.clear(false); } } catch (const FirteXException& e) { FIRTEX_RETHROW(e); } }