void StandardStopFilter::filterInternal(TokenSourcePtr& tokenSource) const
{
    TokenViewPtr pTokenView = tokenSource->getLastView();
    if (!pTokenView.isNull())
    {
        TokenView::Iterator it = pTokenView->iterator();
        while (it.hasNext())
        {
            Token& token = const_cast<Token&>(it.next());
            // Clear any token whose hint id is registered as a stop word.
            if (m_stopwords.find((termid_t)token.getHintId()) != m_stopwords.end())
            {
                token.clear();
            }
        }
    }
}
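// Usage sketch (illustrative): how filterInternal() is reached in a typical
// analysis chain. Assumptions: StandardStopFilter exposes a public
// filter(TokenSourcePtr&) entry point like other token filters (see the
// SynonymFilter test below), the StandardAnalyzer headers are available here,
// and the sample text/stop words are arbitrary.
static void stopFilterUsageSketch()
{
    StandardAnalyzer analyzer;
    analyzer.init();

    StandardStopFilter stopFilter;
    stopFilter.loadWords(_T("的,了,是"));      // arbitrary sample stop words; see loadWords() below

    TokenViewPtr pRawView(new TokenView);
    pRawView->addToken(_T("这是一个测试"));     // arbitrary sample text

    TokenSourcePtr tokenSource(new FX_NS(analyzer)::TokenSource);
    tokenSource->setOriginalView(pRawView);
    analyzer.tokenize(tokenSource);            // segments into the "standard_view"
    stopFilter.filter(tokenSource);            // assumed entry point; clears stop-word tokens in place

    TokenViewPtr pFiltered = tokenSource->getLastView();
    (void)pFiltered;
}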
void StandardAnalyzer::doTokenize(TokenSourcePtr& tokenSource) const
{
    FIRTEX_ASSERT2(m_pSegmenter != NULL);

    TokenViewPtr pInTokenView = tokenSource->getLastView();
    TokenViewPtr pOutTokenView = tokenSource->acquireCustomView(_T("standard_view"));
    pOutTokenView->reserve(getMaxTokensToAnalyze());
    pOutTokenView->setTokenType(Token::TT_CWORD);

    // Segment every incoming token and append the resulting words
    // to the "standard_view" output view.
    TokenView::Iterator iter = pInTokenView->iterator();
    while (iter.hasNext())
    {
        const Token& token = iter.next();
        m_pSegmenter->segment(pOutTokenView, token.getTextValue(),
                              token.getTextLength());
    }
}
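// Usage sketch (illustrative): the raw-text tokenize() overload used by the
// test code in this section; it is assumed to route through doTokenize() once
// init() has loaded the segmenter. Sample text and length are arbitrary.
static void analyzerUsageSketch()
{
    StandardAnalyzer sa;
    sa.init();

    TokenViewPtr pTokens = sa.tokenize("hello world", 11);
    if (pTokens.isNotNull())
    {
        TokenView::Iterator it = pTokens->iterator();
        while (it.hasNext())
        {
            const Token& token = it.next();
            (void)token; // e.g. inspect token.getTextValue() / token.getHintId()
        }
    }
}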
size_t StandardStopFilter::loadFile(const tstring& filename)
{
    m_stopwords.clear();
    size_t nWords = 0;

    ifstream ifterm(filename.c_str(), ios::in | ios::binary);
    if (!ifterm.is_open())
    {
        return 0;
    }

    StandardAnalyzer analyzer;
    analyzer.init();

    char term[128];
    // Read the stop-word file line by line; each line is expected to
    // hold exactly one stop word.
    while (ifterm.getline(term, 128))
    {
        TokenViewPtr pTokens = analyzer.tokenize(term, strlen(term));
        if (pTokens->getNumTokens() > 1)
        {
            FX_STREAM_LOG(WARN) << "Stop word: [" << term
                                << "] contains more than one term" << FIRTEX_ENDL;
        }
        else if (pTokens->getNumTokens() == 1)
        {
            TokenView::Iterator it = pTokens->iterator();
            const Token& token = it.next();
            assert(token.getHintId() != INVALID_HINTID);
            m_stopwords.insert((termid_t)token.getHintId());
            ++nWords;
        }
        else
        {
            FX_STREAM_LOG(WARN) << "Ignore stop word: [" << term << "]."
                                << FIRTEX_ENDL;
        }
    }
    ifterm.close();
    return nWords;
}
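// Usage sketch (illustrative): loadFile() expects a plain-text file with one
// stop word per line, at most 127 bytes per line (the getline() buffer size
// above). The file name below is hypothetical.
static size_t loadStopwordsSketch()
{
    StandardStopFilter stopFilter;
    return stopFilter.loadFile(_T("stopwords.txt")); // hypothetical path
}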
void IndexTestCase::checkDocFreq(IndexReaderPtr& pIndexReader,
                                 const tstring& sField,
                                 const tstring& sTerm,
                                 df_t expDf)
{
    TermReaderPtr pTermReader = pIndexReader->termReader();
    CPPUNIT_ASSERT(pTermReader);

    StandardAnalyzer sa;
    sa.init();
    TokenViewPtr pTokens = sa.tokenize(sTerm.c_str(), sTerm.length());
    CPPUNIT_ASSERT(pTokens);
    CPPUNIT_ASSERT(pTokens->getNumTokens() == 1);

    TokenView::Iterator it = pTokens->iterator();
    TermPtr pTerm(new Term(sField, it.next().getTextValue()));
    TermPostingIteratorPtr pPost = pTermReader->seek(pTerm.get());
    CPPUNIT_ASSERT(pPost);

    const TermMeta& termMeta = pPost->getTermMeta();
    CPPUNIT_ASSERT_EQUAL(expDf, termMeta.getDocFreq());
}
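// Illustrative call (assumes an index built as in testDocumentDeletion() below,
// where every document body contains the term "hot" and 2 * NUM_DOCS documents
// were indexed):
//
//     Index index;
//     index.open(getTestPath(), Index::READ, NULL);
//     IndexReaderPtr pIndexReader = index.acquireReader();
//     checkDocFreq(pIndexReader, "BODY", "hot", (df_t)(2 * NUM_DOCS));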
size_t StandardStopFilter::loadWords(const tstring& sWords)
{
    m_stopwords.clear();
    size_t nWords = 0;

    StandardAnalyzer analyzer;
    analyzer.init();
    TokenViewPtr pTokens = analyzer.tokenize(sWords.c_str(), sWords.length());
    if (pTokens.isNotNull())
    {
        TokenView::Iterator it = pTokens->iterator();
        while (it.hasNext())
        {
            const Token& token = it.next();
            assert(token.getHintId() != INVALID_HINTID);
            m_stopwords.insert((termid_t)token.getHintId());
            ++nWords;
        }
    }
    return nWords;
}
void IndexTestCase::testDocumentDeletion()
{
    DocumentSchema schema;
    schema.addField("URL", "PRIMARY_KEY", true);
    schema.addTextField("BODY");
    schema.addField("MODIFIED", "INT64", true);

    // Build a first barrel with NUM_DOCS documents ...
    stringstream ss1;
    const size_t NUM_DOCS = 1000;
    size_t i = 0;
    for (; i < NUM_DOCS; ++i)
    {
        ss1 << "url" << i << ", body" << i << " hot," << (i * 100) % 1000 << ";";
    }
    buildIndex(schema, ss1.str());

    // ... and a second, incremental barrel with another NUM_DOCS documents.
    stringstream ss2;
    for (; i < 2 * NUM_DOCS; ++i)
    {
        ss2 << "url" << i << ", body" << i << " hot," << (i * 100) % 1000 << ";";
    }
    buildIndex(schema, ss2.str(), true);

    StandardAnalyzerPtr sa(new StandardAnalyzer());
    sa->init();
    TokenViewPtr pTokens = sa->tokenize("hot", 3);
    CPPUNIT_ASSERT(pTokens);
    CPPUNIT_ASSERT(pTokens->getNumTokens() == 1);
    TokenView::Iterator it = pTokens->iterator();
    TermPtr pTerm(new Term("BODY", it.next().getTextValue()));

    tstring str = getTestPath();

    std::set<docid_t> answer;
    {
        Index index;
        index.open(str, Index::RDWR, NULL);
        IndexWriterPtr pIndexWriter = index.acquireWriter();
        CPPUNIT_ASSERT(pIndexWriter != NULL);
        IndexReaderPtr pIndexReader = index.acquireReader();
        CPPUNIT_ASSERT(pIndexReader != NULL);

        // Delete a few documents by primary key and remember the doc ids
        // of all remaining documents.
        for (size_t i = 0; i < 2 * NUM_DOCS; ++i)
        {
            stringstream ss;
            ss << "url" << i;
            if (i == 1000 || i == 1500 || i == 1505 || i == 1999)
            {
                pIndexWriter->deleteDocument(ss.str());
            }
            else
            {
                TermReaderPtr pTermReader = pIndexReader->termReader();
                TermPtr pTerm(new Term("URL", ss.str()));
                TermPostingIteratorPtr pIt = pTermReader->seek(pTerm.get());
                docid_t docId = pIt->skipTo(0);
                answer.insert(docId);
            }
        }

        // The document frequency still counts deleted documents;
        // skipTo() must return only the non-deleted ones.
        TermReaderPtr pTermReader = pIndexReader->termReader();
        TermPostingIteratorPtr pDocFreqs = pTermReader->seek(pTerm.get());
        CPPUNIT_ASSERT(pDocFreqs);
        CPPUNIT_ASSERT_EQUAL((df_t)(2 * NUM_DOCS), pDocFreqs->getTermMeta().getDocFreq());

        std::set<docid_t>::const_iterator it = answer.begin();
        for (docid_t i = 0; i < (docid_t)(2 * NUM_DOCS); )
        {
            docid_t docId = pDocFreqs->skipTo((docid_t)i);
            i = docId + 1;
            if (docId == INVALID_DOCID)
            {
                break;
            }
            CPPUNIT_ASSERT_EQUAL(*it, docId);
            ++it;
        }
        CPPUNIT_ASSERT(it == answer.end());
    }

    {
        // Re-open the index read-only and verify the deletions persisted.
        Index index;
        index.open(str, Index::READ, NULL);
        IndexReaderPtr pIndexReader = index.acquireReader();
        CPPUNIT_ASSERT(pIndexReader != NULL);

        TermReaderPtr pTermReader = pIndexReader->termReader();
        TermPostingIteratorPtr pDocFreqs = pTermReader->seek(pTerm.get());
        CPPUNIT_ASSERT(pDocFreqs);
        CPPUNIT_ASSERT_EQUAL((df_t)(2 * NUM_DOCS), pDocFreqs->getTermMeta().getDocFreq());

        std::set<docid_t>::const_iterator it = answer.begin();
        for (docid_t i = 0; i < (docid_t)(2 * NUM_DOCS); )
        {
            docid_t docId = pDocFreqs->skipTo((docid_t)i);
            i = docId + 1;
            if (docId == INVALID_DOCID)
            {
                break;
            }
            CPPUNIT_ASSERT_EQUAL(*it, docId);
            ++it;
        }
        CPPUNIT_ASSERT(it == answer.end());
    }
}
void SynonymFilterTestCase::testFilter()
{
    SynonymFilter sf;
    SynonymMap& sm = sf.getSynonymMap();
    sm.addSynonyms("一群獾", "CETE");
    sm.addSynonyms("一瞥", "CAST,CATCH SIGHT OF,COLOR,COMPANY,DEKKO,"
                   "FLING,FORM,GLANCE,GLAUM,GLIMPSE,KIND,LOOK,MOLD,"
                   "NOTICE,PITCH,SEE,SHADE,SHAPE,SKIM,SORT,THROW,"
                   "TINT,TOSS,TROUPE,TYPE,VARIETY,一看,投掷,演员表,眼光,脱落物,铸件");

    TokenViewPtr pTokens(new TokenView);
    pTokens->addToken(_T("北京"));
    pTokens->addToken(_T("一群獾"));
    pTokens->addToken(_T("一瞥"));
    pTokens->addToken(_T("中国"));
    pTokens->addToken(_T("cete"));

    TokenSourcePtr tokenSource(new FX_NS(analyzer)::TokenSource);
    tokenSource->setOriginalView(pTokens);
    sf.filter(tokenSource);

    TokenViewPtr pSynTokens = tokenSource->getLastView();
    CPPUNIT_ASSERT(!pSynTokens.isNull());

    // Original tokens keep a position increment of 1; injected synonyms
    // share the position of the token they expand (increment 0).
    TokenView::Iterator iter = pSynTokens->iterator();

    CPPUNIT_ASSERT(iter.hasNext());
    Token token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("北京"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)1, token.getPosIncrement());

    CPPUNIT_ASSERT(iter.hasNext());
    token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("一群獾"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)1, token.getPosIncrement());

    CPPUNIT_ASSERT(iter.hasNext());
    token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("cete"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)0, token.getPosIncrement());

    CPPUNIT_ASSERT(iter.hasNext());
    token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("一瞥"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)1, token.getPosIncrement());

    CPPUNIT_ASSERT(iter.hasNext());
    token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("cast"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)0, token.getPosIncrement());

    // Skip the remaining synonyms of "一瞥" up to the last one.
    for (size_t i = 0; i < 30; i++)
    {
        CPPUNIT_ASSERT(iter.hasNext());
        iter.next();
    }

    CPPUNIT_ASSERT(iter.hasNext());
    token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("铸件"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)0, token.getPosIncrement());

    CPPUNIT_ASSERT(iter.hasNext());
    token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("中国"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)1, token.getPosIncrement());

    CPPUNIT_ASSERT(iter.hasNext());
    token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("cete"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)1, token.getPosIncrement());

    CPPUNIT_ASSERT(iter.hasNext());
    token = iter.next();
    CPPUNIT_ASSERT_EQUAL(tstring("一群獾"), tstring(token.getTextValue()));
    CPPUNIT_ASSERT_EQUAL((int32_t)0, token.getPosIncrement());
}
void Highlighter::getBestTextFragments(const Analyzer* pAnalyzer,
                                       const char* szText,
                                       size_t nTextLen,
                                       size_t nMaxFrags,
                                       bool bMergeContiguousFragments,
                                       TextFragmentArray& fragArray)
{
    TextFragment* pCurrentFrag = new TextFragment(0,
            (int32_t)fragArray.getNumTextFragment());
    fragArray.add(pCurrentFrag);
    m_pFragmentScorer->startFragment(pCurrentFrag);

    try
    {
        m_pTextFragmenter->start(szText, nTextLen);

        TokenGroupPtr pTokenGroup(new TokenGroup);

        // Analyze the raw text into tokens.
        TokenSourcePtr tokenSource(new FX_NS(analyzer)::TokenSource);
        TokenViewPtr pOrgTokenView(new TokenView());
        pOrgTokenView->addToken(szText, nTextLen);
        tokenSource->setOriginalView(pOrgTokenView);
        TokenViewPtr pTokenView = pAnalyzer->tokenize(tokenSource);

        int32_t lastEndOffset = 0;
        float lastScore = 0.0f;
        TokenView::Iterator iter = pTokenView->iterator();
        while (iter.hasNext())
        {
            const Token& token = iter.next();
            if (token.getStartOffset() < (int32_t)m_nMaxDocBytesToAnalyze)
            {
                if ((pTokenGroup->getNumTokenView() > 0)
                    && (pTokenGroup->isDistinct(&token)))
                {
                    // The current token starts a new group; flush the old one.
                    if (pTokenGroup->getTotalScore() > 0)
                    {
                        pCurrentFrag->addMatchedToken(
                                pTokenGroup->getMatchedStartOffset(),
                                pTokenGroup->getMatchedEndOffset());
                    }
                    pTokenGroup->clear();

                    // Check if the current token marks the start of a new fragment.
                    if (m_pTextFragmenter->isNewFragment(&token))
                    {
                        pCurrentFrag->setScore(m_pFragmentScorer->getFragmentScore());
                        // Record stats for a new fragment.
                        pCurrentFrag->setTextEndPos(token.getEndOffset());
                        pCurrentFrag = new TextFragment((int32_t)token.getStartOffset(),
                                (int32_t)fragArray.getNumTextFragment());
                        fragArray.add(pCurrentFrag);
                        m_pFragmentScorer->startFragment(pCurrentFrag);
                    }
                }

                TermPtr pTerm(new Term(m_pFragmentScorer->getFieldName(),
                                       token.getTextValue()));
                lastScore = m_pFragmentScorer->getTokenScore(pTerm.get());
                pTokenGroup->addToken(&token, lastScore);
            }
            lastEndOffset = token.getEndOffset();
        } // end while

        if ((lastScore > 0.0f) && (pTokenGroup->getNumTokenView() > 0)
            && (pTokenGroup->getTotalScore() > 0))
        {
            pCurrentFrag->addMatchedToken(pTokenGroup->getMatchedStartOffset(),
                                          pTokenGroup->getMatchedEndOffset());
            pTokenGroup->clear();
        }

        pCurrentFrag->setScore(m_pFragmentScorer->getFragmentScore());
        pCurrentFrag->setTextEndPos(lastEndOffset);

        // Sort the most relevant sections of the text.
        FragmentQueue fragQueue(nMaxFrags);
        for (size_t i = 0; i < fragArray.getNumTextFragment(); i++)
        {
            TextFragment* pFrag = fragArray.getTextFragment(i);
            if (!fragQueue.insert(pFrag))
            {
                delete pFrag;
            }
        }
        fragArray.clear(false);

        // Return the most relevant fragments.
        while (fragQueue.size() > 0)
        {
            fragArray.add(fragQueue.pop());
        }

        // Merge any contiguous fragments to improve readability.
        if (bMergeContiguousFragments)
        {
            TextFragment* pTextFragment;
            TextFragmentArray tmpFragArray;
            for (size_t i = 0; i < fragArray.getNumTextFragment(); i++)
            {
                pTextFragment = fragArray.getTextFragment(i);
                tmpFragArray.add(pTextFragment);
            }
            fragArray.clear(false);

            mergeContiguousFragments(tmpFragArray);
            for (size_t i = 0; i < tmpFragArray.getNumTextFragment(); i++)
            {
                pTextFragment = tmpFragArray.getTextFragment(i);
                if (pTextFragment != NULL)
                {
                    if (pTextFragment->getScore() > 0)
                    {
                        fragArray.add(pTextFragment);
                    }
                    else
                    {
                        delete pTextFragment;
                    }
                }
            }
            tmpFragArray.clear(false);
        }
    }
    catch (const FirteXException& e)
    {
        FIRTEX_RETHROW(e);
    }
}
void TextIndexMergerTestCase::MergeEntry::makeData()
{
    for (size_t i = 0; i < nDataSize; ++i)
    {
        int32_t nPosInc = 0;
        TokenViewPtr pTokenView = new TokenView(i + 1, Token::TT_CWORD);
        for (size_t j = 0; j < i + 1; ++j)
        {
            stringstream ss;
            ss << "abc" << (i + j) % 100;
            pTokenView->addToken(ss.str().c_str(), nPosInc, 0, 0, Token::TT_CWORD);
            nPosInc = 1;

            // Record the expected posting (doc id and in-document positions)
            // for this term in the answer map.
            uint64_t hash = Hash::hashString64(ss.str().c_str());
            TermMap::iterator it = answer.find(hash);
            if (it != answer.end())
            {
                DocVector& docVect = it->second;
                if (docVect[docVect.size() - 1].first == (docid_t)i)
                {
                    docVect[docVect.size() - 1].second.push_back(j);
                }
                else
                {
                    std::vector<loc_t> pos;
                    pos.push_back(j);
                    docVect.push_back(make_pair(i, pos));
                }
            }
            else
            {
                DocVector docVect;
                docVect.resize(1);
                docVect[0].first = i;
                docVect[0].second.push_back(j);
                answer.insert(make_pair(hash, docVect));
            }
        }

        AnalyzedFieldPtr pField = new AnalyzedField(NULL, pTokenView);
        pIndexer->addField(pField.get());
        pIndexer->commitDocument((docid_t)i);

        if (pDocFilter)
        {
            // Mark every fifth document and the last document as deleted.
            if (i % 5 == 0 || i == (nDataSize - 1))
            {
                pDocFilter->set(i);
            }
        }
    }

    BarrelDirectory::createBarrel(pFileSystem, pBarrelInfo->getSuffix());
    pIndexer->commit(pFileSystem, pBarrelInfo->getSuffix());

    if (pDocFilter)
    {
        pDocIdRemapper = new DocIdRecycling();
        pDocIdRemapper->init(nDataSize, pDocFilter.get());
    }
}