size_t StandardStopFilter::loadFile(const tstring& filename)
{
    m_stopwords.clear();

    size_t nWords = 0;
    ifstream ifterm(filename.c_str(), ios::in|ios::binary);
    if (!ifterm.is_open()) 
    {
        return 0;
    }

    StandardAnalyzer analyzer;
    analyzer.init(); // initialize before tokenizing, as the other loaders do

    char term[128];
    // Test the stream state returned by getline() instead of checking eof()
    // up front; looping on !eof() runs the body once more after the last
    // successful read.
    while (ifterm.getline(term, 128))
    {
        TokenViewPtr pTokens = analyzer.tokenize(term, strlen(term));
        if (pTokens->getNumTokens() > 1)
        {
            FX_STREAM_LOG(WARN) << "Stop word: [" << term 
                                << "] contains more than two terms" << FIRTEX_ENDL;
        }
        else if (pTokens->getNumTokens() == 1)
        {
            TokenView::Iterator it = pTokens->iterator();
            const Token& token = it.next();
            assert(token.getHintId() != INVALID_HINTID);
            m_stopwords.insert((termid_t)token.getHintId());
            ++nWords;
        }
        else 
        {
            FX_STREAM_LOG(WARN) << "Ignore stop word: [" << term 
                                << "]." << FIRTEX_ENDL;
        }
    }
    ifterm.close();

    return nWords;
}
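
A minimal driver for the loader above might look like the sketch below. It is an assumption-laden fragment, not FirteX code: the default construction of StandardStopFilter, the file name stopwords.txt, and a narrow-character tstring are all hypothetical.

#include <cassert>

void loadStopwordsExample() // hypothetical helper, not part of FirteX
{
    StandardStopFilter filter;                         // assumes default construction
    size_t nLoaded = filter.loadFile("stopwords.txt"); // one stop word per line
    assert(nLoaded > 0); // zero also means the file could not be opened
}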
Example #2
void IndexTestCase::checkDocFreq(IndexReaderPtr& pIndexReader,
                                 const tstring& sField, 
                                 const tstring& sTerm,
                                 df_t expDf)
{
    TermReaderPtr pTermReader = pIndexReader->termReader();
    CPPUNIT_ASSERT(pTermReader);

    // Analyze the raw term so it takes the same form it had at index time
    StandardAnalyzer sa;
    sa.init();

    TokenViewPtr pTokens = sa.tokenize(sTerm.c_str(), sTerm.length());
    CPPUNIT_ASSERT(pTokens);
    CPPUNIT_ASSERT(pTokens->getNumTokens() == 1);
    TokenView::Iterator it = pTokens->iterator();
    TermPtr pTerm(new Term(sField, it.next().getTextValue()));

    // Seek to the term's posting list and compare its document frequency
    TermPostingIteratorPtr pPost = pTermReader->seek(pTerm.get());
    CPPUNIT_ASSERT(pPost);
    const TermMeta& termMeta = pPost->getTermMeta();
    CPPUNIT_ASSERT_EQUAL(expDf, termMeta.getDocFreq());
}
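
A hedged sketch of how this helper might be invoked from a test body; the field name, term, and expected count are illustrative, and pIndexReader is assumed to have been opened on a prepared test index elsewhere in the fixture.

// Hypothetical call: expects exactly two indexed documents whose
// "BODY" field produced the token "hello".
checkDocFreq(pIndexReader, "BODY", "hello", 2);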
Example #3
size_t StandardStopFilter::loadWords(const tstring& sWords)
{
    m_stopwords.clear();

    size_t nWords = 0;
    StandardAnalyzer analyzer;
    analyzer.init();
    // Tokenize the whole word string; every resulting token becomes a stop word
    TokenViewPtr pTokens = analyzer.tokenize(sWords.c_str(), sWords.length());
    if (pTokens.isNotNull())
    {
        TokenView::Iterator it = pTokens->iterator();
        while (it.hasNext())
        {
            const Token& token = it.next();
            assert(token.getHintId() != INVALID_HINTID);
            // Stop words are kept as term ids derived from the token's hint id
            m_stopwords.insert((termid_t)token.getHintId());

            ++nWords;
        }
    }
    return nWords;
}
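
A matching sketch for the in-memory variant, assuming the word list is whitespace-separated text and every word yields a valid hint id; the list itself is illustrative.

StandardStopFilter filter; // assumes default construction
size_t nLoaded = filter.loadWords("a an the of and or");
// nLoaded counts tokens, so repeated words still increment it even though
// m_stopwords is a set.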