예제 #1
0
/**
 * Append a highlighted excerpt of a text to sMarkedText.
 *
 * Asks getBestTextFragments() for the best fragments (with merging of
 * contiguous fragments enabled) and appends the marked-up form of every
 * returned fragment that has a positive score.
 *
 * @param sMarkedText  output string; marked fragments and separators are
 *                     appended to whatever it already holds
 * @param pAnalyzer    analyzer forwarded to getBestTextFragments()
 * @param szText       text to highlight
 * @param nTextLen     length of szText
 * @param nMaxFrags    requested number of fragments; values below 1 are
 *                     raised to 1
 * @param sSeparator   appended after each returned fragment slot whenever
 *                     more than one fragment came back or the marked text is
 *                     still shorter than the source text
 */
void Highlighter::getBestFragments(string& sMarkedText,
                                   const Analyzer* pAnalyzer,
                                   const char* szText,
                                   size_t nTextLen,
                                   size_t nMaxFrags,
                                   const string& sSeparator)
{
    // Never ask for fewer than one fragment.
    const size_t nFragLimit = MAX(1, nMaxFrags);

    TextFragmentArray bestFrags;
    getBestTextFragments(pAnalyzer, szText, nTextLen,
                         static_cast<int32_t>(nFragLimit), true, bestFrags);

    const size_t nFragCount = bestFrags.getNumTextFragment();
    for (size_t idx = 0; idx != nFragCount; ++idx)
    {
        TextFragment* const pCurrent = bestFrags.getTextFragment(idx);

        // Only fragments that actually scored contribute marked text.
        const bool bHasHit = (pCurrent != NULL) && (pCurrent->getScore() > 0);
        if (bHasHit)
        {
            pCurrent->markString(sMarkedText, szText, nTextLen,
                                 m_pFormatter, m_pEncoder);
        }

        // The separator decision is taken per slot, after any marking above,
        // because it depends on the current length of the output.
        const bool bNeedSeparator =
            (nFragCount > 1) || (sMarkedText.size() < nTextLen);
        if (bNeedSeparator)
        {
            sMarkedText += sSeparator;
        }
    }
}
예제 #2
0
	// Returns a new TextFragment holding the code points of frag with indices in
	// [start, end). An empty fragment is returned when frag tests false, when
	// start >= end, or when an invalid code point is met while copying.
	// NOTE(review): neither loop below is bounded by the number of code points in
	// frag, so callers presumably must pass an end within the text - confirm.
	TextFragment subText(const TextFragment& frag, int start, int end)
	{
		// this impl does an unnecessary copy, to keep TextFragment very simple for now.
		if(!frag) return TextFragment();
		if(start >= end) return TextFragment();

		// The size of the result in bytes is not known until the code points have
		// been walked, so the scratch buffer is sized to the whole input fragment.
		const int inputBytes = frag.lengthInBytes();
		SmallStackBuffer<char> scratch(inputBytes);
		char* const outBegin = scratch.data();
		char* outEnd = outBegin;

		// step over the code points that precede the requested range
		auto cursor = codepoint_iterator<const char*>(frag.getText());
		for(int toSkip = start; toSkip > 0; --toSkip)
		{
			++cursor;
		}

		// re-encode each requested code point as UTF-8 into the scratch buffer
		for(int remaining = end - start; remaining > 0; --remaining)
		{
			if(!utf::internal::validate_codepoint(*cursor)) return TextFragment();
			outEnd = utf::internal::utf_traits<utf::utf8>::encode(*cursor, outEnd);
			++cursor;
		}

		return TextFragment(outBegin, outEnd - outBegin);
	}
예제 #3
0
	int SymbolTable::getSymbolID(const HashedCharArray& hsl)
	{
		int r = 0;
		
		// get the vector of symbol IDs matching this hash. It probably has one entry but may have more. 
		const std::vector<int>& bin = mHashTable[hsl.hash].mIDVector;
		{
			bool found = false;
			
			std::unique_lock<std::mutex> lock(mHashTable[hsl.hash].mMutex);			

			 for(int ID : bin)
			 {
				// there should be few collisions, so probably the first ID in the hash bin
				// will be the symbol we are looking for. Unfortunately to test for equality we may have to 
				// compare the entire string.	
			 
				TextFragment* binFragment = &mSymbolTextsByID[ID];
				if(compareSizedCharArrays(binFragment->getText(), binFragment->lengthInBytes(), hsl.pChars, hsl.len))
				{
					 r = ID;
					 found = true;
					 break;
				}
			 }
			
			if(!found)
			{	
				mSymbolTextsByID.emplace_back(TextFragment(hsl.pChars, static_cast<int>(hsl.len)));
				r = mSize++;
				mHashTable[hsl.hash].mIDVector.emplace_back(r);		
			}
		}
		return r;
	}
예제 #4
0
// Moves the iterator back so that its position equals the end of |fragment|.
// The iterator is expected to be at or past that point already.
void TextFragmentIterator::revertToEndOfFragment(const TextFragment& fragment)
{
    ASSERT(m_position >= fragment.end());

    // Walk backwards until the current segment no longer starts after the fragment's end.
    for (; m_currentSegment->start > fragment.end(); --m_currentSegment) { }

    // TODO: This lands on the last segment at the same position, which is fine for now because
    // multiple renderers on the same position do not need to be told apart.
    m_atEndOfSegment = false;
    m_position = fragment.end();
}
예제 #5
0
static int plainTextPlusSymbolsFragmentSize(const TextFragment& f)
      {
      int res = 0;
      if (f.format.type() == CharFormatType::TEXT)
            res += f.columns();
      else {
            for (const SymId id : f.ids)
                  res += QString("<sym>%1</sym>").arg(Sym::id2name(id)).size();
            }
      return res;
      }
예제 #6
0
	// Takes over the text held by b and leaves b as an empty fragment that
	// points at its own local storage. Whatever this object pointed to before
	// the call is not released here.
	void TextFragment::moveDataFromOther(TextFragment& b)
	{
		mSize = b.mSize;

		const bool fitsInLocalStorage = (mSize < kShortFragmentSize);
		if(fitsInLocalStorage)
		{
			// short text is kept inside the object, so the bytes have to be copied
			std::copy(b.mLocalText, b.mLocalText + mSize, mLocalText);
			mpText = mLocalText;
			nullTerminate();
		}
		else
		{
			// long text: adopt b's pointer instead of copying
			mpText = b.mpText;
		}

		// b no longer refers to the moved data, so it has nothing to dispose
		b.mSize = 0;
		b.mpText = b.mLocalText;
		b.nullTerminate();
	}
예제 #7
0
	// Copy constructor: builds this fragment from the text and byte length of a.
	TextFragment::TextFragment(const TextFragment& a) noexcept
	{
		const auto sourceText = a.getText();
		const auto sourceBytes = a.lengthInBytes();
		construct(sourceText, sourceBytes);
	}
예제 #8
0
	// Builds this fragment from four source fragments, handing construct() the
	// text and byte length of each one in the order t1, t2, t3, t4.
	TextFragment::TextFragment(const TextFragment& t1, const TextFragment& t2, const TextFragment& t3, const TextFragment& t4) noexcept
	{
		const auto text1 = t1.getText();
		const auto bytes1 = t1.lengthInBytes();
		const auto text2 = t2.getText();
		const auto bytes2 = t2.lengthInBytes();
		const auto text3 = t3.getText();
		const auto bytes3 = t3.lengthInBytes();
		const auto text4 = t4.getText();
		const auto bytes4 = t4.lengthInBytes();

		construct(text1, bytes1, text2, bytes2, text3, bytes3, text4, bytes4);
	}
예제 #9
0
// Distributes the fragments of "in" over three output lists: "left" is filled
// until its size reaches pos, "mid" until its size reaches len, and "right"
// receives everything that remains. Sizes are measured with
// plainTextPlusSymbolsListSize() / plainTextPlusSymbolsFragmentSize(). A
// fragment that does not fit in left or mid is cut with TextFragment::split().
// Returns false (leaving the output lists untouched) when pos or len is
// negative, true otherwise.
bool MScoreTextToMXML::split(const QList<TextFragment>& in, const int pos, const int len,
                             QList<TextFragment>& left, QList<TextFragment>& mid, QList<TextFragment>& right)
      {
      //qDebug("MScoreTextToMXML::split in size %d pos %d len %d", plainTextPlusSymbolsListSize(in), pos, len);
      //qDebug("-> in");
      //dumpText(in);

      // reject negative arguments before touching the outputs
      if (pos < 0 || len < 0)
            return false;

      // ensure output is empty at start
      left.clear();
      mid.clear();
      right.clear();

      // set pos to begin of first fragment
      int fragmentNr = 0;
      // working copy of the fragment being placed; it is replaced by the
      // remainder whenever a fragment has to be cut in two
      TextFragment fragment;
      if (fragmentNr < in.size()) fragment = in.at(fragmentNr);
      // destination currently being filled and the size at which it is full
      QList<TextFragment>* currentDest = &left;
      int currentMaxSize = pos;

      // while text left
      while (fragmentNr < in.size()) {
            int destSize = plainTextPlusSymbolsListSize(*currentDest);
            int fragSize = plainTextPlusSymbolsFragmentSize(fragment);
            // if no room left in current destination (check applies only to left and mid)
            if ((currentDest != &right && destSize >= currentMaxSize)
                || currentDest == &right) {
                  // move to next destination
                  if (currentDest == &left) {
                        currentDest = &mid;
                        currentMaxSize = len;
                        }
                  else if (currentDest == &mid) {
                        currentDest = &right;
                        }
                  }
            // NOTE(review): destSize was measured before the possible switch of
            // destination above and is not recomputed, so the check below compares
            // the previous destination's size against the new limit - confirm
            // this is intended.
            // if current fragment fits in current destination (check applies only to left and mid)
            if ((currentDest != &right && destSize + fragSize <= currentMaxSize)
                || currentDest == &right) {
                  // add it
                  currentDest->append(fragment);
                  // move to next fragment
                  fragmentNr++;
                  if (fragmentNr < in.size()) fragment = in.at(fragmentNr);
                  }
            else {
                  // split current fragment
                  // (the cut point is the room still available in the current destination)
                  TextFragment rightPart = fragment.split(currentMaxSize - plainTextPlusSymbolsListSize(*currentDest));
                  // add first part to current destination
                  currentDest->append(fragment);
                  // the remainder is handled by the next loop iteration; fragmentNr
                  // is deliberately left unchanged
                  fragment = rightPart;
                  }
            }

      /*
      qDebug("-> left");
      dumpText(left);
      qDebug("-> mid");
      dumpText(mid);
      qDebug("-> right");
      dumpText(right);
       */

      return true;
      }
예제 #10
0
/**
 * Merge fragments of fragArray that are contiguous with each other.
 *
 * Every pair of non-NULL fragments is checked with TextFragment::follows().
 * When one follows the other, the later fragment is merged into the earlier
 * one; the merged fragment is stored in the slot of the better scoring of the
 * two and the other slot is set to NULL. Passes over the array are repeated
 * until a full pass performs no merge.
 *
 * On return the array has the same number of slots, some of which may be
 * NULL; callers must skip those.
 *
 * @param fragArray  fragments to merge, modified in place
 */
void Highlighter::mergeContiguousFragments(TextFragmentArray& fragArray)
{
    if (fragArray.getNumTextFragment() <= 1)
    {
        return; // nothing to merge
    }

    bool bMergingStillBeingDone;
    do
    {
        bMergingStillBeingDone = false; //initialize loop control flag
        //for each fragment, scan other frags looking for contiguous blocks
        for (size_t i = 0; i < fragArray.getNumTextFragment(); i++)
        {
            if (fragArray.getTextFragment(i) == NULL)
            {
                continue;
            }

            //merge any contiguous blocks
            for (size_t x = 0; x < fragArray.getNumTextFragment(); x++)
            {
                TextFragment* pTextFrag2 = fragArray.getTextFragment(x);
                if (pTextFrag2 == NULL)
                {
                    continue;
                }

                // Slot i must be re-read on every iteration: a merge performed
                // earlier in this inner loop may have set it to NULL or stored a
                // different fragment in it, so a pointer fetched before the
                // loop could be stale.
                TextFragment* pTextFrag = fragArray.getTextFragment(i);
                if (pTextFrag == NULL)
                {
                    break;
                }

                // pFrag1 is the fragment that comes first, pFrag2 the one
                // that follows it; both stay NULL when the pair is not contiguous.
                TextFragment* pFrag1 = NULL;
                TextFragment* pFrag2 = NULL;
                size_t frag1Num = 0;
                size_t frag2Num = 0;
                //if blocks are contiguous....
                if (pTextFrag->follows(pTextFrag2))
                {
                    pFrag1 = pTextFrag2;
                    frag1Num = x;
                    pFrag2 = pTextFrag;
                    frag2Num = i;
                }
                else if (pTextFrag2->follows(pTextFrag))
                {
                    pFrag1 = pTextFrag;
                    frag1Num = i;
                    pFrag2 = pTextFrag2;
                    frag2Num = x;
                }

                //merging required..
                if (pFrag1 != NULL)
                {
                    size_t bestScoringFragNum;
                    size_t worstScoringFragNum;
                    if (pFrag1->getScore() > pFrag2->getScore())
                    {
                        bestScoringFragNum = frag1Num;
                        worstScoringFragNum = frag2Num;
                    }
                    else
                    {
                        bestScoringFragNum = frag2Num;
                        worstScoringFragNum = frag1Num;
                    }
                    pFrag1->merge(pFrag2);
                    // The third argument is true only for the slot that held
                    // pFrag2. NOTE(review): presumably it tells the array to
                    // destroy the slot's previous occupant - confirm against
                    // TextFragmentArray::setTextFragment.
                    fragArray.setTextFragment(worstScoringFragNum, NULL,
                            worstScoringFragNum != frag1Num);
                    bMergingStillBeingDone = true;
                    fragArray.setTextFragment(bestScoringFragNum, pFrag1,
                            bestScoringFragNum != frag1Num);
                }
            }
        }
    } while (bMergingStillBeingDone);
}
예제 #11
0
/**
 * Tokenize a text, cut it into scored fragments and leave the selected
 * fragments in fragArray.
 *
 * @param pAnalyzer  analyzer used to tokenize the text
 * @param szText     text to analyze
 * @param nTextLen   length of szText
 * @param nMaxFrags  size passed to the FragmentQueue that selects fragments
 * @param bMergeContiguousFragments  when true, mergeContiguousFragments() is
 *                   run on the selection and only fragments with a positive
 *                   score are kept
 * @param fragArray  receives the selected fragments; fragments already in it
 *                   on entry go through the same selection as the new ones
 *
 * A FirteXException raised during processing is caught and handed to
 * FIRTEX_RETHROW.
 */
void Highlighter::getBestTextFragments(const Analyzer* pAnalyzer,
                                       const char* szText,
                                       size_t nTextLen,
                                       size_t nMaxFrags,
                                       bool bMergeContiguousFragments,
                                       TextFragmentArray& fragArray)
{
    // The first fragment starts at offset 0; the second constructor argument is
    // the number of fragments currently in the array.
    TextFragment* pCurrentFrag = new TextFragment(0,
            (int32_t)fragArray.getNumTextFragment());
    fragArray.add(pCurrentFrag);
    
    m_pFragmentScorer->startFragment(pCurrentFrag);

    try
    {
        m_pTextFragmenter->start(szText, nTextLen);

        // Wrap the raw text in a token source and let the analyzer tokenize it.
        TokenGroupPtr pTokenGroup(new TokenGroup);
        TokenSourcePtr tokenSource(new FX_NS(analyzer)::TokenSource);
        TokenViewPtr pOrgTokenView(new TokenView());
        pOrgTokenView->addToken(szText, nTextLen);
        tokenSource->setOriginalView(pOrgTokenView);

        TokenViewPtr pTokenView = pAnalyzer->tokenize(tokenSource);

        int32_t lastEndOffset = 0;
        float lastScore = 0.0f;
        TokenView::Iterator iter = pTokenView->iterator();
        while (iter.hasNext())
        {
            const Token& token = iter.next();
            // Tokens starting at or beyond m_nMaxDocBytesToAnalyze are not
            // scored, but they still advance lastEndOffset below.
            if (token.getStartOffset() < (int32_t)m_nMaxDocBytesToAnalyze)
            {
                // A token that is distinct from the current group closes the group.
                if ((pTokenGroup->getNumTokenView() > 0) 
                    && (pTokenGroup->isDistinct(&token)))
                {
                    // Record the group's matched range only if it scored.
                    if (pTokenGroup->getTotalScore() > 0)
                    {
                        pCurrentFrag->addMatchedToken(pTokenGroup->getMatchedStartOffset(),
                                pTokenGroup->getMatchedEndOffset());
                    }

                    pTokenGroup->clear();

                    //check if current token marks the start of a new fragment
                    if (m_pTextFragmenter->isNewFragment(&token))
                    {
                        // Finish the current fragment before opening the next one.
                        pCurrentFrag->setScore(m_pFragmentScorer->getFragmentScore());
                        //record stats for a new fragment
                        pCurrentFrag->setTextEndPos(token.getEndOffset());
                        pCurrentFrag = new TextFragment((int32_t)token.getStartOffset(),
                                (int32_t)fragArray.getNumTextFragment());
                        fragArray.add(pCurrentFrag);
                        m_pFragmentScorer->startFragment(pCurrentFrag);
                    }
                }

                // Score the token as a term of the scorer's field and add it to
                // the current group.
                TermPtr pTerm(new Term(m_pFragmentScorer->getFieldName(), token.getTextValue()));
                lastScore = m_pFragmentScorer->getTokenScore(pTerm.get());
                pTokenGroup->addToken(&token, lastScore);
            }
            lastEndOffset = token.getEndOffset();
        }//end while

        // Flush the group left open after the last token. Unlike the in-loop
        // case, this also requires the last scored token to have a positive score.
        if ((lastScore > 0.0f) 
            && (pTokenGroup->getNumTokenView() > 0) 
            && (pTokenGroup->getTotalScore() > 0))
        {
            pCurrentFrag->addMatchedToken(pTokenGroup->getMatchedStartOffset(),
                    pTokenGroup->getMatchedEndOffset());
            pTokenGroup->clear();
        }

        // Finish the last fragment.
        pCurrentFrag->setScore(m_pFragmentScorer->getFragmentScore());
        pCurrentFrag->setTextEndPos(lastEndOffset);

        FragmentQueue fragQueue(nMaxFrags);

        //sort the most relevant sections of the text
        for (size_t i = 0; i < fragArray.getNumTextFragment(); i++)
        {
            TextFragment* pFrag = fragArray.getTextFragment(i);
            // Fragments the queue does not accept are destroyed here.
            // NOTE(review): what insert() does with a fragment it displaces is
            // not visible in this function - confirm it is not leaked.
            if (!fragQueue.insert(pFrag))
            {
                delete pFrag;
            }
        }
        // Every fragment is now either queued or deleted.
        // NOTE(review): clear(false) presumably empties the array without
        // deleting its fragments - confirm against TextFragmentArray.
        fragArray.clear(false);
		
        //return the most relevant fragments
        while (fragQueue.size() > 0)
        {
            fragArray.add(fragQueue.pop());
        }
		
        //merge any contiguous fragments to improve readability
        if (bMergeContiguousFragments)
        {
            // Move the selection into a temporary array, merge there, then copy
            // back only the surviving fragments.
            TextFragment* pTextFragment;
            TextFragmentArray tmpFragArray;
            for (size_t i = 0; i < fragArray.getNumTextFragment(); i++)
            {
                pTextFragment = fragArray.getTextFragment(i);
                tmpFragArray.add(pTextFragment);
            }
            fragArray.clear(false);

            mergeContiguousFragments(tmpFragArray);

            for (size_t i = 0; i < tmpFragArray.getNumTextFragment(); i++)
            {
                pTextFragment = tmpFragArray.getTextFragment(i);
                // Merging can leave NULL slots; skip them.
                if ( pTextFragment != NULL)
                {
                    // Keep fragments with a positive score, destroy the rest.
                    if (pTextFragment->getScore() > 0)
                    {
                        fragArray.add(pTextFragment);
                    }
                    else
                    {
                        delete pTextFragment;
                    }
                }
            }
            tmpFragArray.clear(false);
        }
    }
    catch (const FirteXException& e)
    {
        FIRTEX_RETHROW(e);
    }
}