// Loads stop words from a file, one word per line; returns the number
// of words registered.
size_t StandardStopFilter::loadFile(const tstring& filename)
{
    m_stopwords.clear();
    size_t nWords = 0;
    ifstream ifterm(filename.c_str(), ios::in | ios::binary);
    if (!ifterm.is_open())
    {
        return 0;
    }

    StandardAnalyzer analyzer;
    analyzer.init(); // initialize before tokenize(), as in loadWords()

    char term[128];
    // Loop on the stream state rather than on eof(): testing eof()
    // before getline() processes the final line twice and never
    // terminates if the stream enters a fail state.
    while (ifterm.getline(term, 128))
    {
        TokenViewPtr pTokens = analyzer.tokenize(term, strlen(term));
        if (pTokens->getNumTokens() > 1)
        {
            FX_STREAM_LOG(WARN) << "Stop word: [" << term
                                << "] contains more than one term" << FIRTEX_ENDL;
        }
        else if (pTokens->getNumTokens() == 1)
        {
            TokenView::Iterator it = pTokens->iterator();
            const Token& token = it.next();
            assert(token.getHintId() != INVALID_HINTID);
            m_stopwords.insert((termid_t)token.getHintId());
            ++nWords;
        }
        else
        {
            FX_STREAM_LOG(WARN) << "Ignore stop word: [" << term
                                << "]." << FIRTEX_ENDL;
        }
    }
    ifterm.close();
    return nWords;
}
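// Usage sketch (hypothetical, not from the FirteX sources): load a
// stop-word file with one word per line and check how many entries were
// registered. The file name "stopwords.txt" is an assumed example, and
// tstring is assumed to be a narrow (std::string-compatible) type here.
void exampleLoadStopFile()
{
    StandardStopFilter stopFilter;
    size_t nLoaded = stopFilter.loadFile("stopwords.txt");
    if (nLoaded == 0)
    {
        FX_STREAM_LOG(WARN) << "No stop words loaded." << FIRTEX_ENDL;
    }
}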
// Test helper: asserts that the term sTerm in field sField has the
// expected document frequency expDf.
void IndexTestCase::checkDocFreq(IndexReaderPtr& pIndexReader,
                                 const tstring& sField,
                                 const tstring& sTerm,
                                 df_t expDf)
{
    TermReaderPtr pTermReader = pIndexReader->termReader();
    CPPUNIT_ASSERT(pTermReader);

    // Tokenize the query term so the lookup key matches the analyzed
    // form stored in the index; it must analyze to exactly one token.
    StandardAnalyzer sa;
    sa.init();
    TokenViewPtr pTokens = sa.tokenize(sTerm.c_str(), sTerm.length());
    CPPUNIT_ASSERT(pTokens);
    CPPUNIT_ASSERT(pTokens->getNumTokens() == 1);

    TokenView::Iterator it = pTokens->iterator();
    TermPtr pTerm(new Term(sField, it.next().getTextValue()));
    TermPostingIteratorPtr pPost = pTermReader->seek(pTerm.get());
    CPPUNIT_ASSERT(pPost);

    const TermMeta& termMeta = pPost->getTermMeta();
    CPPUNIT_ASSERT_EQUAL(expDf, termMeta.getDocFreq());
}
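// Usage sketch (hypothetical): given an already opened IndexReaderPtr,
// assert that the analyzed term "hello" in field "BODY" occurs in
// exactly 10 documents. The field name, term, and expected count are
// illustrative values, not part of the FirteX test suite.
void IndexTestCase::exampleCheckDocFreqUsage(IndexReaderPtr& pIndexReader)
{
    checkDocFreq(pIndexReader, "BODY", "hello", (df_t)10);
}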
// Loads stop words from a single delimiter-separated string; returns
// the number of words registered.
size_t StandardStopFilter::loadWords(const tstring& sWords)
{
    m_stopwords.clear();
    size_t nWords = 0;

    StandardAnalyzer analyzer;
    analyzer.init();
    TokenViewPtr pTokens = analyzer.tokenize(sWords.c_str(), sWords.length());
    if (pTokens.isNotNull())
    {
        // Each token's hint id serves as the term id of a stop word.
        TokenView::Iterator it = pTokens->iterator();
        while (it.hasNext())
        {
            const Token& token = it.next();
            assert(token.getHintId() != INVALID_HINTID);
            m_stopwords.insert((termid_t)token.getHintId());
            ++nWords;
        }
    }
    return nWords;
}
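// Usage sketch (hypothetical): stop words can also be supplied as one
// in-memory string; the StandardAnalyzer is assumed to split it on
// separators such as whitespace, so each word becomes its own token.
// The word list and the expected count are illustrative.
void exampleLoadStopWords()
{
    StandardStopFilter stopFilter;
    size_t nLoaded = stopFilter.loadWords("an the of and but");
    assert(nLoaded == 5); // one increment per tokenized word
}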