/**
 * Builds the highlighted excerpt for a piece of text.
 *
 * Collects up to nMaxFrags fragments (at least one is always requested) via
 * getBestTextFragments() with contiguous-fragment merging enabled, then
 * appends the marked-up form of every fragment whose score is positive to
 * sMarkedText.
 *
 * @param sMarkedText output string; marked text and separators are appended to it
 * @param pAnalyzer   analyzer forwarded to getBestTextFragments()
 * @param szText      text to highlight
 * @param nTextLen    length of szText
 * @param nMaxFrags   requested number of fragments (values below 1 are raised to 1)
 * @param sSeparator  string appended after a fragment (see the loop below)
 */
void Highlighter::getBestFragments(string& sMarkedText, const Analyzer* pAnalyzer, const char* szText, size_t nTextLen, size_t nMaxFrags, const string& sSeparator)
{
    // Sanity check: never ask for fewer than one fragment.
    const size_t nFragLimit = MAX(1, nMaxFrags);

    TextFragmentArray bestFrags;
    getBestTextFragments(pAnalyzer, szText, nTextLen, (int32_t)nFragLimit, true, bestFrags);

    const size_t nFragCount = bestFrags.getNumTextFragment();
    const bool bSeveralFrags = (nFragCount > 1);

    size_t idx = 0;
    while (idx < nFragCount)
    {
        TextFragment* pCandidate = bestFrags.getTextFragment(idx);
        const bool bWorthShowing = (pCandidate != NULL) && (pCandidate->getScore() > 0);
        if (bWorthShowing)
        {
            pCandidate->markString(sMarkedText, szText, nTextLen, m_pFormatter, m_pEncoder);
        }

        // The separator is added on every pass (including the last one) when
        // there are several fragments, or when the output so far is shorter
        // than the source text.
        if (bSeveralFrags || nTextLen > sMarkedText.size())
        {
            sMarkedText += sSeparator;
        }
        ++idx;
    }
}
/**
 * Returns a new TextFragment holding the code points [start, end) of frag.
 *
 * Positions are counted in code points, not bytes. An empty TextFragment is
 * returned when frag tests false, when start >= end, or when an invalid code
 * point is met inside the requested range.
 *
 * This implementation makes an unnecessary copy, to keep TextFragment very
 * simple for now.
 *
 * NOTE(review): start and end are not checked against the number of code
 * points in frag; callers presumably guarantee they are in range - confirm.
 */
TextFragment subText(const TextFragment& frag, int start, int end)
{
    if (!frag || start >= end)
    {
        return TextFragment();
    }

    // The byte size of the result is unknown until the code points have been
    // walked, so the scratch buffer is sized to the byte length of the source.
    int byteCapacity = frag.lengthInBytes();
    SmallStackBuffer<char> scratch(byteCapacity);
    char* const outBegin = scratch.data();
    char* outCursor = outBegin;

    // Advance to the first requested code point.
    auto cp = codepoint_iterator<const char*>(frag.getText());
    for (int skipped = 0; skipped < start; ++skipped)
    {
        ++cp;
    }

    // Re-encode each requested code point as UTF-8 into the scratch buffer.
    for (int remaining = end - start; remaining > 0; --remaining)
    {
        if (!utf::internal::validate_codepoint(*cp))
        {
            return TextFragment();
        }
        outCursor = utf::internal::utf_traits<utf::utf8>::encode(*cp, outCursor);
        ++cp;
    }

    return TextFragment(outBegin, outCursor - outBegin);
}
int SymbolTable::getSymbolID(const HashedCharArray& hsl) { int r = 0; // get the vector of symbol IDs matching this hash. It probably has one entry but may have more. const std::vector<int>& bin = mHashTable[hsl.hash].mIDVector; { bool found = false; std::unique_lock<std::mutex> lock(mHashTable[hsl.hash].mMutex); for(int ID : bin) { // there should be few collisions, so probably the first ID in the hash bin // will be the symbol we are looking for. Unfortunately to test for equality we may have to // compare the entire string. TextFragment* binFragment = &mSymbolTextsByID[ID]; if(compareSizedCharArrays(binFragment->getText(), binFragment->lengthInBytes(), hsl.pChars, hsl.len)) { r = ID; found = true; break; } } if(!found) { mSymbolTextsByID.emplace_back(TextFragment(hsl.pChars, static_cast<int>(hsl.len))); r = mSize++; mHashTable[hsl.hash].mIDVector.emplace_back(r); } } return r; }
void TextFragmentIterator::revertToEndOfFragment(const TextFragment& fragment) { ASSERT(m_position >= fragment.end()); while (m_currentSegment->start > fragment.end()) --m_currentSegment; // TODO: It reverts to the last fragment on the same position, but that's ok for now as we don't need to // differentiate multiple renderers on the same position. m_position = fragment.end(); m_atEndOfSegment = false; }
static int plainTextPlusSymbolsFragmentSize(const TextFragment& f) { int res = 0; if (f.format.type() == CharFormatType::TEXT) res += f.columns(); else { for (const SymId id : f.ids) res += QString("<sym>%1</sym>").arg(Sym::id2name(id)).size(); } return res; }
// Takes over the contents of b and leaves b as an empty fragment.
//
// When the size is at least kShortFragmentSize the text pointer itself is
// taken from b; otherwise the bytes are copied from b's local storage into
// this object's local storage. Afterwards b points at its own local storage
// with size 0, so it has nothing to dispose of.
//
// NOTE(review): any text previously referenced by this object is not released
// here; presumably the caller has already dealt with it - confirm.
void TextFragment::moveDataFromOther(TextFragment& b)
{
    mSize = b.mSize;

    const bool fitsLocally = !(mSize >= kShortFragmentSize);
    if (fitsLocally)
    {
        // Short text: copy into our own local buffer and terminate it.
        mpText = mLocalText;
        std::copy_n(b.mLocalText, mSize, mLocalText);
        nullTerminate();
    }
    else
    {
        // Long text: adopt b's pointer.
        mpText = b.mpText;
    }

    // Reset b to the empty state.
    b.mpText = b.mLocalText;
    b.mSize = 0;
    b.nullTerminate();
}
// Copy constructor: builds this fragment from the text and byte length of a.
TextFragment::TextFragment(const TextFragment& a) noexcept
{
    auto sourceText = a.getText();
    auto sourceBytes = a.lengthInBytes();
    construct(sourceText, sourceBytes);
}
// Builds this fragment from four source fragments, passing the text and byte
// length of t1, t2, t3 and t4 (in that order) to construct().
TextFragment::TextFragment(const TextFragment& t1, const TextFragment& t2, const TextFragment& t3, const TextFragment& t4) noexcept
{
    auto text1 = t1.getText();
    auto bytes1 = t1.lengthInBytes();
    auto text2 = t2.getText();
    auto bytes2 = t2.lengthInBytes();
    auto text3 = t3.getText();
    auto bytes3 = t3.lengthInBytes();
    auto text4 = t4.getText();
    auto bytes4 = t4.lengthInBytes();

    construct(text1, bytes1,
              text2, bytes2,
              text3, bytes3,
              text4, bytes4);
}
bool MScoreTextToMXML::split(const QList<TextFragment>& in, const int pos, const int len, QList<TextFragment>& left, QList<TextFragment>& mid, QList<TextFragment>& right) { //qDebug("MScoreTextToMXML::split in size %d pos %d len %d", plainTextPlusSymbolsListSize(in), pos, len); //qDebug("-> in"); //dumpText(in); if (pos < 0 || len < 0) return false; // ensure output is empty at start left.clear(); mid.clear(); right.clear(); // set pos to begin of first fragment int fragmentNr = 0; TextFragment fragment; if (fragmentNr < in.size()) fragment = in.at(fragmentNr); QList<TextFragment>* currentDest = &left; int currentMaxSize = pos; // while text left while (fragmentNr < in.size()) { int destSize = plainTextPlusSymbolsListSize(*currentDest); int fragSize = plainTextPlusSymbolsFragmentSize(fragment); // if no room left in current destination (check applies only to left and mid) if ((currentDest != &right && destSize >= currentMaxSize) || currentDest == &right) { // move to next destination if (currentDest == &left) { currentDest = ∣ currentMaxSize = len; } else if (currentDest == &mid) { currentDest = &right; } } // if current fragment fits in current destination (check applies only to left and mid) if ((currentDest != &right && destSize + fragSize <= currentMaxSize) || currentDest == &right) { // add it currentDest->append(fragment); // move to next fragment fragmentNr++; if (fragmentNr < in.size()) fragment = in.at(fragmentNr); } else { // split current fragment TextFragment rightPart = fragment.split(currentMaxSize - plainTextPlusSymbolsListSize(*currentDest)); // add first part to current destination currentDest->append(fragment); fragment = rightPart; } } /* qDebug("-> left"); dumpText(left); qDebug("-> mid"); dumpText(mid); qDebug("-> right"); dumpText(right); */ return true; }
/**
 * Merges fragments of fragArray that directly follow one another.
 *
 * Every pair of non-NULL slots is examined; when one fragment follows the
 * other, the later one is merged into the earlier one. The merged fragment is
 * stored in the slot of whichever of the two had the higher score and the
 * other slot is set to NULL. Passes are repeated until a full pass performs
 * no merge. Slots may therefore be NULL on return.
 *
 * @param fragArray array of fragments, modified in place
 */
void Highlighter::mergeContiguousFragments(TextFragmentArray& fragArray)
{
    if (fragArray.getNumTextFragment() <= 1)
    {
        return; // nothing to merge
    }

    bool bMergingStillBeingDone;
    do
    {
        bMergingStillBeingDone = false; //initialize loop control flag

        //for each fragment, scan other frags looking for contiguous blocks
        for (size_t i = 0; i < fragArray.getNumTextFragment(); i++)
        {
            if (fragArray.getTextFragment(i) == NULL)
            {
                continue;
            }

            //merge any contiguous blocks
            for (size_t x = 0; x < fragArray.getNumTextFragment(); x++)
            {
                // Slot i must be re-read on every iteration: a merge in a
                // previous iteration may have emptied it or replaced its
                // occupant, so a pointer cached before this loop could be
                // stale or refer to a fragment that no longer exists.
                TextFragment* pTextFrag = fragArray.getTextFragment(i);
                if (pTextFrag == NULL)
                {
                    break;
                }

                TextFragment* pTextFrag2 = fragArray.getTextFragment(x);
                if (pTextFrag2 == NULL)
                {
                    continue;
                }

                TextFragment* pFrag1 = NULL; // earlier fragment of a contiguous pair
                TextFragment* pFrag2 = NULL; // later fragment of a contiguous pair
                size_t frag1Num = 0;
                size_t frag2Num = 0;

                //if blocks are contiguous....
                if (pTextFrag->follows(pTextFrag2))
                {
                    pFrag1 = pTextFrag2;
                    frag1Num = x;
                    pFrag2 = pTextFrag;
                    frag2Num = i;
                }
                else if (pTextFrag2->follows(pTextFrag))
                {
                    pFrag1 = pTextFrag;
                    frag1Num = i;
                    pFrag2 = pTextFrag2;
                    frag2Num = x;
                }

                //merging required..
                if (pFrag1 != NULL)
                {
                    size_t bestScoringFragNum;
                    size_t worstScoringFragNum;
                    if (pFrag1->getScore() > pFrag2->getScore())
                    {
                        bestScoringFragNum = frag1Num;
                        worstScoringFragNum = frag2Num;
                    }
                    else
                    {
                        bestScoringFragNum = frag2Num;
                        worstScoringFragNum = frag1Num;
                    }

                    pFrag1->merge(pFrag2);

                    // The third argument is true only for the slot that holds
                    // pFrag2 (presumably "dispose of the current occupant" -
                    // confirm against TextFragmentArray); pFrag1 survives and
                    // ends up in the best-scoring slot.
                    fragArray.setTextFragment(worstScoringFragNum, NULL, worstScoringFragNum != frag1Num);
                    bMergingStillBeingDone = true;
                    fragArray.setTextFragment(bestScoringFragNum, pFrag1, bestScoringFragNum != frag1Num);
                }
            }
        }
    } while (bMergingStillBeingDone);
}
void Highlighter::getBestTextFragments(const Analyzer* pAnalyzer, const char* szText, size_t nTextLen, size_t nMaxFrags, bool bMergeContiguousFragments, TextFragmentArray& fragArray) { TextFragment* pCurrentFrag = new TextFragment(0, (int32_t)fragArray.getNumTextFragment()); fragArray.add(pCurrentFrag); m_pFragmentScorer->startFragment(pCurrentFrag); try { m_pTextFragmenter->start(szText, nTextLen); TokenGroupPtr pTokenGroup(new TokenGroup); TokenSourcePtr tokenSource(new FX_NS(analyzer)::TokenSource); TokenViewPtr pOrgTokenView(new TokenView()); pOrgTokenView->addToken(szText, nTextLen); tokenSource->setOriginalView(pOrgTokenView); TokenViewPtr pTokenView = pAnalyzer->tokenize(tokenSource); int32_t lastEndOffset = 0; float lastScore = 0.0f; TokenView::Iterator iter = pTokenView->iterator(); while (iter.hasNext()) { const Token& token = iter.next(); if (token.getStartOffset() < (int32_t)m_nMaxDocBytesToAnalyze) { if ((pTokenGroup->getNumTokenView() > 0) && (pTokenGroup->isDistinct(&token))) { if (pTokenGroup->getTotalScore() > 0) { pCurrentFrag->addMatchedToken(pTokenGroup->getMatchedStartOffset(), pTokenGroup->getMatchedEndOffset()); } pTokenGroup->clear(); //check if current token marks the start of a new fragment if (m_pTextFragmenter->isNewFragment(&token)) { pCurrentFrag->setScore(m_pFragmentScorer->getFragmentScore()); //record stats for a new fragment pCurrentFrag->setTextEndPos(token.getEndOffset()); pCurrentFrag = new TextFragment((int32_t)token.getStartOffset(), (int32_t)fragArray.getNumTextFragment()); fragArray.add(pCurrentFrag); m_pFragmentScorer->startFragment(pCurrentFrag); } } TermPtr pTerm(new Term(m_pFragmentScorer->getFieldName(), token.getTextValue())); lastScore = m_pFragmentScorer->getTokenScore(pTerm.get()); pTokenGroup->addToken(&token, lastScore); } lastEndOffset = token.getEndOffset(); }//end while if ((lastScore > 0.0f) && (pTokenGroup->getNumTokenView() > 0) && (pTokenGroup->getTotalScore() > 0)) { 
pCurrentFrag->addMatchedToken(pTokenGroup->getMatchedStartOffset(), pTokenGroup->getMatchedEndOffset()); pTokenGroup->clear(); } pCurrentFrag->setScore(m_pFragmentScorer->getFragmentScore()); pCurrentFrag->setTextEndPos(lastEndOffset); FragmentQueue fragQueue(nMaxFrags); //sort the most relevant sections of the text for (size_t i = 0; i < fragArray.getNumTextFragment(); i++) { TextFragment* pFrag = fragArray.getTextFragment(i); if (!fragQueue.insert(pFrag)) { delete pFrag; } } fragArray.clear(false); //return the most relevant fragments while (fragQueue.size() > 0) { fragArray.add(fragQueue.pop()); } //merge any contiguous fragments to improve readability if (bMergeContiguousFragments) { TextFragment* pTextFragment; TextFragmentArray tmpFragArray; for (size_t i = 0; i < fragArray.getNumTextFragment(); i++) { pTextFragment = fragArray.getTextFragment(i); tmpFragArray.add(pTextFragment); } fragArray.clear(false); mergeContiguousFragments(tmpFragArray); for (size_t i = 0; i < tmpFragArray.getNumTextFragment(); i++) { pTextFragment = tmpFragArray.getTextFragment(i); if ( pTextFragment != NULL) { if (pTextFragment->getScore() > 0) { fragArray.add(pTextFragment); } else { delete pTextFragment; } } } tmpFragArray.clear(false); } } catch (const FirteXException& e) { FIRTEX_RETHROW(e); } }