Example #1
void StandardStopFilter::filterInternal(TokenSourcePtr& tokenSource) const
{
    TokenViewPtr pTokenView = tokenSource->getLastView();
    if (!pTokenView.isNull())
    {
        TokenView::Iterator it = pTokenView->iterator();
        while (it.hasNext())
        {
            Token& token = const_cast<Token&>(it.next());
            if (m_stopwords.find((termid_t)token.getHintId()) 
                != m_stopwords.end())
            {
                token.clear();
            }
        }
    }
}
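
For orientation, a minimal, hypothetical sketch of how a filter like this is driven end to end, modeled on the TokenSource pattern in Example #7; it assumes StandardStopFilter exposes the same public filter() entry point that SynonymFilter uses there, and that a prior analyzer pass assigns the hint ids filterInternal() matches against:

StandardAnalyzer sa;
sa.init();
StandardStopFilter ssf;
ssf.loadWords(_T("the of and"));   // hypothetical stop-word list (see Example #5)

TokenViewPtr pRaw(new TokenView);
pRaw->addToken("the quick fox", 13);
TokenSourcePtr tokenSource(new FX_NS(analyzer)::TokenSource);
tokenSource->setOriginalView(pRaw);

sa.tokenize(tokenSource);          // tokenize and assign hint ids (as in Example #8)
ssf.filter(tokenSource);           // assumed to dispatch to filterInternal() above
TokenViewPtr pFiltered = tokenSource->getLastView();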
Example #2
void StandardAnalyzer::doTokenize(TokenSourcePtr& tokenSource) const
{
    FIRTEX_ASSERT2(m_pSegmenter != NULL);

    TokenViewPtr pInTokenView = tokenSource->getLastView();
    TokenViewPtr pOutTokenView = tokenSource->acquireCustomView(_T("standard_view"));
    pOutTokenView->reserve(getMaxTokensToAnalyze());
    pOutTokenView->setTokenType(Token::TT_CWORD);

    TokenView::Iterator iter = pInTokenView->iterator();
    while(iter.hasNext())
    {
        const Token& token = iter.next();
        m_pSegmenter->segment(pOutTokenView,
                              token.getTextValue(),
                              token.getTextLength());
    }
}
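
doTokenize() is the internal hook; client code goes through the public tokenize() overloads, as the test cases below do. A minimal sketch:

StandardAnalyzer sa;
sa.init();   // required before tokenizing (see Example #4)
TokenViewPtr pTokens = sa.tokenize("hello world", 11);
TokenView::Iterator it = pTokens->iterator();
while (it.hasNext())
{
    const Token& token = it.next();
    // consume token.getTextValue() / token.getTextLength() as above
}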
Example #3
size_t StandardStopFilter::loadFile(const tstring& filename)
{
    m_stopwords.clear();

    size_t nWords = 0;
    ifstream ifterm(filename.c_str(), ios::in|ios::binary);
    if (!ifterm.is_open()) 
    {
        return 0;
    }

    StandardAnalyzer analyzer;
    analyzer.init(); // match loadWords() in Example #5, which initializes before use
    char term[128];
    // test the stream state via getline() itself; looping on !ifterm.eof()
    // would run one extra iteration on the empty read after the final newline
    while (ifterm.getline(term, sizeof(term)))
    {
        TokenViewPtr pTokens = analyzer.tokenize(term, strlen(term));
        if (pTokens->getNumTokens() > 1)
        {
            FX_STREAM_LOG(WARN) << "Stop word: [" << term 
                                << "] contains more than one term" << FIRTEX_ENDL;
        }
        else if(pTokens->getNumTokens() == 1)
        {
            TokenView::Iterator it = pTokens->iterator();
            const Token& token = it.next();
            assert(token.getHintId() != INVALID_HINTID);
            m_stopwords.insert((termid_t)token.getHintId());
            ++nWords;
        }
        else 
        {
            FX_STREAM_LOG(WARN) << "Ignore stop word: [" << term 
                                << "]." << FIRTEX_ENDL;
        }
    }
    ifterm.close();

    return nWords;
}
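
A typical call site, with a hypothetical file path; given the getline() loop above, the file is expected to hold one stop word per line:

StandardStopFilter ssf;
size_t nLoaded = ssf.loadFile(_T("conf/stopwords.txt"));   // hypothetical path
if (nLoaded == 0)
{
    // unreadable or empty file: loadFile() returns 0 instead of throwing
}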
Example #4
void IndexTestCase::checkDocFreq(IndexReaderPtr& pIndexReader,
                                 const tstring& sField, 
                                 const tstring& sTerm,
                                 df_t expDf)
{
    TermReaderPtr pTermReader = pIndexReader->termReader();
    CPPUNIT_ASSERT(pTermReader);    

    StandardAnalyzer sa;
    sa.init();

    TokenViewPtr pTokens = sa.tokenize(sTerm.c_str(), sTerm.length());
    CPPUNIT_ASSERT(pTokens);
    CPPUNIT_ASSERT(pTokens->getNumTokens() == 1);
    TokenView::Iterator it = pTokens->iterator();
    TermPtr pTerm(new Term(sField, it.next().getTextValue()));

    TermPostingIteratorPtr pPost = pTermReader->seek(pTerm.get());
    CPPUNIT_ASSERT(pPost);    
    const TermMeta& termMeta = pPost->getTermMeta();
    CPPUNIT_ASSERT_EQUAL(expDf, termMeta.getDocFreq());
}
Example #5
size_t StandardStopFilter::loadWords(const tstring& sWords)
{
    m_stopwords.clear();

    size_t nWords = 0;
    StandardAnalyzer analyzer;
    analyzer.init();
    TokenViewPtr pTokens = analyzer.tokenize(sWords.c_str(), sWords.length());
    if (pTokens.isNotNull())
    {
        TokenView::Iterator it = pTokens->iterator();
        while (it.hasNext())
        {
            const Token& token = it.next();
            assert(token.getHintId() != INVALID_HINTID);
            m_stopwords.insert((termid_t)token.getHintId());

            ++nWords;
        }
    }
    return nWords;
}
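
Since loadWords() delegates splitting to StandardAnalyzer, any separator the analyzer treats as a token boundary works. A sketch, assuming the analyzer splits this hypothetical list on whitespace:

StandardStopFilter ssf;
size_t nWords = ssf.loadWords(_T("the of and"));
assert(nWords == 3);   // one stop-word entry per analyzer-produced token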
Example #6
void IndexTestCase::testDocumentDeletion()
{
    DocumentSchema schema;
    schema.addField("URL", "PRIMARY_KEY", true);
    schema.addTextField("BODY");
    schema.addField("MODIFIED", "INT64", true);
    
    stringstream ss1;
    const size_t NUM_DOCS = 1000;
    size_t i = 0;
    for (; i < NUM_DOCS; ++i)
    {
        ss1 << "url" << i << ", body" << i << " hot," 
            << (i * 100) % 1000 << ";";
    }
    buildIndex(schema, ss1.str());

    stringstream ss2;
    for (; i < 2 * NUM_DOCS; ++i)
    {
        ss2 << "url" << i << ", body" << i << " hot," 
            << (i * 100) % 1000 << ";";
    }

    buildIndex(schema, ss2.str(), true);

    StandardAnalyzerPtr sa(new StandardAnalyzer());
    sa->init();

    TokenViewPtr pTokens = sa->tokenize("hot", 3);
    CPPUNIT_ASSERT(pTokens);
    CPPUNIT_ASSERT(pTokens->getNumTokens() == 1);
    TokenView::Iterator it = pTokens->iterator();
    TermPtr pTerm(new Term("BODY", it.next().getTextValue()));
    
    tstring str = getTestPath();
    
    std::set<docid_t> answer;

    {
        Index index;
        index.open(str, Index::RDWR, NULL); 
        IndexWriterPtr pIndexWriter = index.acquireWriter();
        CPPUNIT_ASSERT(pIndexWriter != NULL);

        IndexReaderPtr pIndexReader = index.acquireReader();
        CPPUNIT_ASSERT(pIndexReader != NULL);

        for (size_t i = 0; i < 2 * NUM_DOCS; ++i)
        {
            stringstream ss;
            ss << "url" << i;
            if (i == 1000 || i == 1500 || i == 1505 || i == 1999)
            {
                pIndexWriter->deleteDocument(ss.str());
            }
            else
            {
                TermReaderPtr pTermReader = pIndexReader->termReader();
                TermPtr pTerm(new Term("URL", ss.str()));
                TermPostingIteratorPtr pIt = pTermReader->seek(pTerm.get());
                docid_t docId = pIt->skipTo(0);
                answer.insert(docId);
            }
        }

        TermReaderPtr pTermReader = pIndexReader->termReader();
        TermPostingIteratorPtr pDocFreqs = pTermReader->seek(pTerm.get());
        CPPUNIT_ASSERT(pDocFreqs);

        CPPUNIT_ASSERT_EQUAL((df_t)NUM_DOCS * 2, pDocFreqs->getTermMeta().getDocFreq());

        std::set<docid_t>::const_iterator it = answer.begin();
        for (docid_t i = 0; i < (docid_t)(2 * NUM_DOCS); )
        {        
            docid_t docId = pDocFreqs->skipTo((docid_t)i);
            i = docId + 1;
            if (docId == INVALID_DOCID)
            {
                break;
            }
            CPPUNIT_ASSERT_EQUAL(*it, docId);
            ++it;
        }
        CPPUNIT_ASSERT(it == answer.end());
    }

    {
        Index index;
        index.open(str, Index::READ, NULL); 
        IndexReaderPtr pIndexReader = index.acquireReader();
        CPPUNIT_ASSERT(pIndexReader != NULL);

        TermReaderPtr pTermReader = pIndexReader->termReader();
        TermPostingIteratorPtr pDocFreqs = pTermReader->seek(pTerm.get());
        CPPUNIT_ASSERT(pDocFreqs);

        CPPUNIT_ASSERT_EQUAL((df_t)(2 * NUM_DOCS), pDocFreqs->getTermMeta().getDocFreq());
        std::set<docid_t>::const_iterator it = answer.begin();
        for (docid_t i = 0; i < (docid_t)(2 * NUM_DOCS); )
        {        
            docid_t docId = pDocFreqs->skipTo((docid_t)i);
            i = docId + 1;
            if (docId == INVALID_DOCID)
            {
                break;
            }
            CPPUNIT_ASSERT_EQUAL(*it, docId);
            ++it;
        }
        CPPUNIT_ASSERT(it == answer.end());

        // for (std::set<docid_t>::const_iterator it = answer.begin();
        //      it != answer.end(); ++it)
        // {
        //     docid_t docId = pDocFreqs->skipTo(*it);
        //     CPPUNIT_ASSERT_EQUAL(*it, docId);
        // }

        // docid_t docId = pDocFreqs->skipTo(NUM_DOCS + 0);
        // CPPUNIT_ASSERT_EQUAL((docid_t)NUM_DOCS + 1, docId);
        // docId = pDocFreqs->skipTo(NUM_DOCS + 500);
        // CPPUNIT_ASSERT_EQUAL((docid_t)NUM_DOCS + 501, docId);
        // docId = pDocFreqs->skipTo(NUM_DOCS + 505);
        // CPPUNIT_ASSERT_EQUAL((docid_t)NUM_DOCS + 506, docId);
        // docId = pDocFreqs->skipTo(2 * NUM_DOCS - 1);
        // CPPUNIT_ASSERT_EQUAL((docid_t)INVALID_DOCID, docId);
    }
}
Example #7
void SynonymFilterTestCase::testFilter()
{
    SynonymFilter sf;
    SynonymMap& sm = sf.getSynonymMap();
    sm.addSynonyms("一群獾", "CETE");
    sm.addSynonyms("一瞥", "CAST,CATCH SIGHT OF,COLOR,COMPANY,DEKKO,"
                   "FLING,FORM,GLANCE,GLAUM,GLIMPSE,KIND,LOOK,MOLD,"
                   "NOTICE,PITCH,SEE,SHADE,SHAPE,SKIM,SORT,THROW,"
                   "TINT,TOSS,TROUPE,TYPE,VARIETY,一看,投掷,演员表,眼光,脱落物,铸件");
    TokenViewPtr pTokens(new TokenView);
    pTokens->addToken(_T("北京"));
    pTokens->addToken(_T("一群獾"));
    pTokens->addToken(_T("一瞥"));
    pTokens->addToken(_T("中国"));
    pTokens->addToken(_T("cete"));
    TokenSourcePtr tokenSource(new FX_NS(analyzer)::TokenSource);
    tokenSource->setOriginalView(pTokens);

    sf.filter(tokenSource);
    TokenViewPtr pSynTokens = tokenSource->getLastView();
    CPPUNIT_ASSERT(!pSynTokens.isNull());
    TokenView::Iterator iter = pSynTokens->iterator();
    CPPUNIT_ASSERT(iter.hasNext());
    Token token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("北京"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)1, token.getPosIncrement());

    CPPUNIT_ASSERT(iter.hasNext());
    token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("一群獾"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)1, token.getPosIncrement());

    CPPUNIT_ASSERT(iter.hasNext());
    token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("cete"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)0, token.getPosIncrement());

    CPPUNIT_ASSERT(iter.hasNext());
    token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("一瞥"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)1, token.getPosIncrement());

    CPPUNIT_ASSERT(iter.hasNext());
    token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("cast"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)0, token.getPosIncrement());

    for(size_t i = 0; i < 30; i++)
    {
        CPPUNIT_ASSERT(iter.hasNext());
        iter.next();
    }

    CPPUNIT_ASSERT(iter.hasNext());
    token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("铸件"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)0, token.getPosIncrement());

    CPPUNIT_ASSERT(iter.hasNext());
    token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("中国"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)1, token.getPosIncrement());

    CPPUNIT_ASSERT(iter.hasNext());
    token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("cete"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)1, token.getPosIncrement());

    CPPUNIT_ASSERT(iter.hasNext());
    token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("一群獾"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)0, token.getPosIncrement());
}
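
The assertions above rely on the position-increment convention: an increment of 1 advances to a new position, while 0 stacks a synonym onto the previous token's position. A small sketch recovering absolute positions from the filtered view:

TokenView::Iterator posIter = pSynTokens->iterator();
int32_t pos = -1;
while (posIter.hasNext())
{
    const Token& t = posIter.next();
    pos += t.getPosIncrement();   // 0 keeps the synonym at the same position
    // e.g. "一瞥" and its synonym "cast" end up with the same pos value
}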
Example #8
void Highlighter::getBestTextFragments(const Analyzer* pAnalyzer,
                                       const char* szText,
                                       size_t nTextLen,
                                       size_t nMaxFrags,
                                       bool bMergeContiguousFragments,
                                       TextFragmentArray& fragArray)
{
    TextFragment* pCurrentFrag = new TextFragment(0,
            (int32_t)fragArray.getNumTextFragment());
    fragArray.add(pCurrentFrag);
    
    m_pFragmentScorer->startFragment(pCurrentFrag);

    try
    {
        m_pTextFragmenter->start(szText, nTextLen);

        TokenGroupPtr pTokenGroup(new TokenGroup);
        TokenSourcePtr tokenSource(new FX_NS(analyzer)::TokenSource);
        TokenViewPtr pOrgTokenView(new TokenView());
        pOrgTokenView->addToken(szText, nTextLen);
        tokenSource->setOriginalView(pOrgTokenView);

        TokenViewPtr pTokenView = pAnalyzer->tokenize(tokenSource);

        int32_t lastEndOffset = 0;
        float lastScore = 0.0f;
        TokenView::Iterator iter = pTokenView->iterator();
        while (iter.hasNext())
        {
            const Token& token = iter.next();
            if (token.getStartOffset() < (int32_t)m_nMaxDocBytesToAnalyze)
            {
                if ((pTokenGroup->getNumTokenView() > 0) 
                    && (pTokenGroup->isDistinct(&token)))
                {
                    if (pTokenGroup->getTotalScore() > 0)
                    {
                        pCurrentFrag->addMatchedToken(pTokenGroup->getMatchedStartOffset(),
                                pTokenGroup->getMatchedEndOffset());
                    }

                    pTokenGroup->clear();

                    //check if current token marks the start of a new fragment
                    if (m_pTextFragmenter->isNewFragment(&token))
                    {
                        pCurrentFrag->setScore(m_pFragmentScorer->getFragmentScore());
                        //record stats for a new fragment
                        pCurrentFrag->setTextEndPos(token.getEndOffset());
                        pCurrentFrag = new TextFragment((int32_t)token.getStartOffset(),
                                (int32_t)fragArray.getNumTextFragment());
                        fragArray.add(pCurrentFrag);
                        m_pFragmentScorer->startFragment(pCurrentFrag);
                    }
                }

                TermPtr pTerm(new Term(m_pFragmentScorer->getFieldName(), token.getTextValue()));
                lastScore = m_pFragmentScorer->getTokenScore(pTerm.get());
                pTokenGroup->addToken(&token, lastScore);
            }
            lastEndOffset = token.getEndOffset();
        }//end while

        if ((lastScore > 0.0f) 
            && (pTokenGroup->getNumTokenView() > 0) 
            && (pTokenGroup->getTotalScore() > 0))
        {
            pCurrentFrag->addMatchedToken(pTokenGroup->getMatchedStartOffset(),
                    pTokenGroup->getMatchedEndOffset());
            pTokenGroup->clear();
        }

        pCurrentFrag->setScore(m_pFragmentScorer->getFragmentScore());
        pCurrentFrag->setTextEndPos(lastEndOffset);

        FragmentQueue fragQueue(nMaxFrags);

        //sort the most relevant sections of the text
        for (size_t i = 0; i < fragArray.getNumTextFragment(); i++)
        {
            TextFragment* pFrag = fragArray.getTextFragment(i);
            if (!fragQueue.insert(pFrag))
            {
                delete pFrag;
            }
        }
        fragArray.clear(false);
		
        //return the most relevant fragments
        while (fragQueue.size() > 0)
        {
            fragArray.add(fragQueue.pop());
        }
		
        //merge any contiguous fragments to improve readability
        if (bMergeContiguousFragments)
        {
            TextFragment* pTextFragment;
            TextFragmentArray tmpFragArray;
            for (size_t i = 0; i < fragArray.getNumTextFragment(); i++)
            {
                pTextFragment = fragArray.getTextFragment(i);
                tmpFragArray.add(pTextFragment);
            }
            fragArray.clear(false);

            mergeContiguousFragments(tmpFragArray);

            for (size_t i = 0; i < tmpFragArray.getNumTextFragment(); i++)
            {
                pTextFragment = tmpFragArray.getTextFragment(i);
                if ( pTextFragment != NULL)
                {
                    if (pTextFragment->getScore() > 0)
                    {
                        fragArray.add(pTextFragment);
                    }
                    else
                    {
                        delete pTextFragment;
                    }
                }
            }
            tmpFragArray.clear(false);
        }
    }
    catch (const FirteXException& e)
    {
        FIRTEX_RETHROW(e);
    }
}
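
A hedged call-site sketch; it assumes a Highlighter instance whose fragmenter and scorer (m_pTextFragmenter / m_pFragmentScorer) are already configured, since that setup is not shown in this snippet:

StandardAnalyzer sa;
sa.init();

const char* szText = "...";   // document body to highlight (elided)
TextFragmentArray fragArray;
highlighter.getBestTextFragments(&sa, szText, strlen(szText),
                                 3,     // keep at most 3 fragments
                                 true,  // merge contiguous fragments
                                 fragArray);
// fragArray now holds the fragments retained by the fragQueue loop above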
Example #9
void TextIndexMergerTestCase::MergeEntry::makeData()
{
    for (size_t i = 0; i < nDataSize; ++i)
    {
        int32_t nPosInc = 0;
        TokenViewPtr pTokenView(new TokenView(i + 1, Token::TT_CWORD));
        for (size_t j = 0; j < i + 1; ++j)
        {
            stringstream ss;
            ss << "abc" << (i + j) % 100;
            pTokenView->addToken(ss.str().c_str(), nPosInc, 
                    0, 0, Token::TT_CWORD);
            nPosInc = 1;

            uint64_t hash = Hash::hashString64(ss.str().c_str());
            TermMap::iterator it = answer.find(hash);
            if (it != answer.end())
            {
                DocVector& docVect = it->second;
                if (docVect[docVect.size() - 1].first == (docid_t)i)
                {
                    docVect[docVect.size() - 1].second.push_back(j);
                }
                else 
                {
                    std::vector<loc_t> pos;
                    pos.push_back(j);
                    docVect.push_back(make_pair(i, pos));
                }
            }
            else
            {
                DocVector docVect;
                docVect.resize(1);
                docVect[0].first = i;
                docVect[0].second.push_back(j);
                answer.insert(make_pair(hash, docVect));
            }
        }
        //pFieldInfo->getIndexInfo().totalTerms += pTokenView->getNumTokens();
        AnalyzedFieldPtr pField(new AnalyzedField(NULL, pTokenView));
        pIndexer->addField(pField.get());
        pIndexer->commitDocument((docid_t)i);

        if (pDocFilter)
        {
            if (i % 5 == 0 || i == (nDataSize - 1))
            {
                pDocFilter->set(i);
            }
        }
    }
    //pFieldInfo->getIndexInfo().distinctTerms = nDataSize;
    BarrelDirectory::createBarrel(pFileSystem, pBarrelInfo->getSuffix());
    pIndexer->commit(pFileSystem, pBarrelInfo->getSuffix());

    if (pDocFilter)
    {
        pDocIdRemapper = new DocIdRecycling();
        pDocIdRemapper->init(nDataSize, pDocFilter.get());
    }
}