// Applies _CLLDECDELETE to every Term pointer stored in termSet and then
// empties the set, so no stale pointers remain in the container.
// NOTE(review): _CLLDECDELETE presumably drops one reference on the Term
// and frees it when unreferenced - confirm against the macro definition.
void clearTermSet( TermSet& termSet )
{
    TermSet::iterator cursor = termSet.begin();
    while( cursor != termSet.end() )
    {
        // The macro needs a named pointer variable, hence the local copy.
        Term * current = *cursor;
        _CLLDECDELETE( current );
        ++cursor;
    }

    termSet.clear();
}
bool TermSet::is_subset(const TermSet& y) const { TermSet::iterator i, e, j; for(i=y.begin(), e=y.end(); i!=e; i++) { if ( (j=find(*i))==end() ) return false; // term in y isn't here if ( !(*j).is_subset(*i) ) return false; // y term isn't a subset of this term } return true; }
// Folds the contribution of one production `rule` of non-terminal `nt` into
// first[nt].
//
// The rule is scanned left to right:
//   * a terminal is added to first[nt] and ends the scan;
//   * a non-terminal contributes its current first set; the scan continues
//     past it only if nullable[] marks it as nullable;
//   * any other symbol is reported on cout and skipped.
//
// Returns true if first[nt] was modified by this call, false otherwise.
bool Grammar::calculateFirstSet(NonTerminal nt, SententialForm * rule)
{
    bool changed = false;

    for (SententialForm::Node * node = rule->begin(); node; node = node->next) {
        VocabSymbol * sym = node->symbol;

        // Classify the symbol; at most one of these is non-null.
        Terminal * term = dynamic_cast<Terminal *>(sym);
        NonTerminal * nonterm = dynamic_cast<NonTerminal *>(sym);

        if (term) {
            if (first[nt].find(*term) == first[nt].end()) {
                first[nt].insert(*term);
                changed = true;
            }
            // A terminal always ends the scan. Report the accumulated flag
            // rather than a hard-coded `false`: an earlier nullable
            // non-terminal in this rule may already have grown first[nt],
            // and that change must not be hidden from the caller.
            return changed;
        }
        else if (nonterm) {
            // FIXME (from the original author): rebuilding the union into a
            // temporary set on every call is inefficient; left as is.
            unsigned int oldSize = first[nt].size();

            TermSet newSet;
            insert_iterator<TermSet> newSetIterator(newSet, newSet.begin());
            set_union(first[nt].begin(), first[nt].end(),
                      first[*nonterm].begin(), first[*nonterm].end(),
                      newSetIterator);

            // The union can only grow, so a size difference means new members.
            if (oldSize != newSet.size()) {
                changed = true;
                first[nt] = newSet;
            }

            // Symbols after a non-nullable non-terminal cannot contribute.
            if (!nullable[*nonterm]) {
                return changed;
            }
        }
        else {
            cout << "Found a symbol that is neither terminal nor non-terminal!" << endl;
        }
    }

    return changed;
}
// Checks extractTerms() on rewritten WildcardQuery objects, using the index
// returned by setUpIndex():
//   * pattern "aaaa?" must produce exactly 3 terms, each one of
//     "aaaaa", "aaaab" or "aaaac";
//   * pattern "aaa*" must produce exactly 5 terms, each starting with "aaa".
void testExtractFromWildcardQuery( CuTest * tc )
{
    Directory * pIndex = setUpIndex();
    IndexReader * pReader = IndexReader::open( pIndex );
    TermSet termSet;

    // Case 1: single-character wildcard.
    {
        Term * pattern = _CLNEW Term( _T("data"), _T("aaaa?") );
        WildcardQuery * wildcard = _CLNEW WildcardQuery( pattern );
        Query * rewritten = wildcard->rewrite( pReader );
        rewritten->extractTerms( &termSet );
        _CLLDECDELETE( pattern );

        assertEqualsMsg( _T( "wrong number of terms" ), 3, termSet.size() );
        for( TermSet::iterator it = termSet.begin(); it != termSet.end(); ++it )
        {
            Term * pTerm = *it;
            const bool isExpected = 0 == _tcscmp( _T( "aaaaa" ), pTerm->text())
                                 || 0 == _tcscmp( _T( "aaaab" ), pTerm->text())
                                 || 0 == _tcscmp( _T( "aaaac" ), pTerm->text());
            if( !isExpected )
            {
                assertTrueMsg( _T( "wrong term" ), false );
            }
        }

        clearTermSet( termSet );
        // rewrite() may hand back the query itself; avoid deleting it twice.
        if( rewritten != wildcard )
            _CLDELETE( rewritten );
        _CLDELETE( wildcard );
    }

    // Case 2: trailing multi-character wildcard.
    {
        Term * pattern = _CLNEW Term( _T("data"), _T("aaa*") );
        WildcardQuery * wildcard = _CLNEW WildcardQuery( pattern );
        Query * rewritten = wildcard->rewrite( pReader );
        rewritten->extractTerms( &termSet );
        _CLLDECDELETE( pattern );

        assertEqualsMsg( _T( "wrong number of terms" ), 5, termSet.size() );
        for( TermSet::iterator it = termSet.begin(); it != termSet.end(); ++it )
        {
            Term * pTerm = *it;
            assertTrueMsg( _T( "wrong term" ), ( 0 == _tcsncmp( _T( "aaa" ), pTerm->text(), 3 )));
        }

        clearTermSet( termSet );
        if( rewritten != wildcard )
            _CLDELETE( rewritten );
        _CLDELETE( wildcard );
    }

    // Tear down the reader and the index.
    pReader->close();
    _CLDELETE( pReader );
    closeIndex( pIndex );
    pIndex = NULL;
}
// Checks extractTerms() on two rewritten TermQuery objects: each must report
// exactly one term, and that term must compare equal (compareTo() == 0) to
// the Term the query was built from.
void testExtractFromTermQuery( CuTest * tc )
{
    Directory * pIndex = setUpIndex();
    IndexReader * pReader = IndexReader::open( pIndex );
    TermSet termSet;

    Term * termA = _CLNEW Term( _T("data"), _T("aaaaa") );
    Term * termB = _CLNEW Term( _T("data"), _T("bbbbb") );
    Query * queryA = _CLNEW TermQuery( termA );
    Query * queryB = _CLNEW TermQuery( termB );
    Query * rewrittenA = queryA->rewrite( pReader );
    Query * rewrittenB = queryB->rewrite( pReader );

    // First query.
    rewrittenA->extractTerms( &termSet );
    assertEqualsMsg( _T( "wrong number of terms" ), 1, termSet.size() );
    assertEqualsMsg( _T( "wrong term" ), 0, termA->compareTo( *(termSet.begin())) );
    clearTermSet( termSet );

    // Second query, reusing the (now empty) set.
    rewrittenB->extractTerms( &termSet );
    assertEqualsMsg( _T( "wrong number of terms" ), 1, termSet.size() );
    assertEqualsMsg( _T( "wrong term" ), 0, termB->compareTo( *(termSet.begin())) );
    clearTermSet( termSet );

    _CLLDECDELETE( termA );
    _CLLDECDELETE( termB );

    // rewrite() may hand back the query itself; avoid deleting it twice.
    if( rewrittenA != queryA )
        _CLDELETE( rewrittenA );
    _CLDELETE( queryA );

    if( rewrittenB != queryB )
        _CLDELETE( rewrittenB );
    _CLDELETE( queryB );

    // Tear down the reader and the index.
    pReader->close();
    _CLDELETE( pReader );
    closeIndex( pIndex );
    pIndex = NULL;
}
// Checks extractTerms() on a rewritten FuzzyQuery for "aaaab" built with the
// value 0.7f: exactly 4 terms are expected, each one of
// "aaaaa", "aaaab", "aaabb" or "aaaac".
void testExtractFromFuzzyQuery( CuTest * tc )
{
    Directory * pIndex = setUpIndex();
    IndexReader * pReader = IndexReader::open( pIndex );
    TermSet termSet;

    Term * seed = _CLNEW Term( _T("data"), _T("aaaab") );
    FuzzyQuery * fuzzy = _CLNEW FuzzyQuery( seed, 0.7f );
    Query * rewritten = fuzzy->rewrite( pReader );
    rewritten->extractTerms( &termSet );
    _CLLDECDELETE( seed );

    assertEqualsMsg( _T( "wrong number of terms" ), 4, termSet.size() );
    for( TermSet::iterator it = termSet.begin(); it != termSet.end(); ++it )
    {
        Term * pTerm = *it;
        const bool isExpected = 0 == _tcscmp( _T( "aaaaa" ), pTerm->text())
                             || 0 == _tcscmp( _T( "aaaab" ), pTerm->text())
                             || 0 == _tcscmp( _T( "aaabb" ), pTerm->text())
                             || 0 == _tcscmp( _T( "aaaac" ), pTerm->text());
        if( !isExpected )
        {
            assertTrueMsg( _T( "wrong term" ), false );
        }
    }

    clearTermSet( termSet );
    // rewrite() may hand back the query itself; avoid deleting it twice.
    if( rewritten != fuzzy )
        _CLDELETE( rewritten );
    _CLDELETE( fuzzy );

    // Tear down the reader and the index.
    pReader->close();
    _CLDELETE( pReader );
    closeIndex( pIndex );
    pIndex = NULL;
}
void QueryTermExtractor::getTerms(const Query * query, WeightedTermList * terms, bool prohibited, const TCHAR* fieldName) { if (query->instanceOf( BooleanQuery::getClassName() )) { getTermsFromBooleanQuery((BooleanQuery *) query, terms, prohibited, fieldName); } // FilteredQuery not implemented yet // else if (query->instanceOf( FilteredQuery::getClassName() )) // getTermsFromFilteredQuery((FilteredQuery *) query, terms); else { TermSet nonWeightedTerms; query->extractTerms(&nonWeightedTerms); for (TermSet::iterator iter = nonWeightedTerms.begin(); iter != nonWeightedTerms.end(); iter++) { Term * term = (Term *)(*iter); if ( fieldName == NULL || term->field() == fieldName ) terms->insert(_CLNEW WeightedTerm(query->getBoost(), term->text())); _CLLDECDELETE( term ); } } }
// Checks extractTerms() on a rewritten BooleanQuery made of three SHOULD
// TermQuery clauses, two of which use the same field/text ("aaabb"):
// exactly 2 terms are expected, each comparing equal to the first or the
// second source Term.
void testExtractFromBooleanQuery( CuTest * tc )
{
    Directory * pIndex = setUpIndex();
    IndexReader * pReader = IndexReader::open( pIndex );
    TermSet termSet;

    Term * first = _CLNEW Term( _T("data"), _T("aaaab") );
    Term * second = _CLNEW Term( _T("data"), _T("aaabb") );
    Term * duplicate = _CLNEW Term( _T("data"), _T("aaabb") );

    BooleanQuery * bq = _CLNEW BooleanQuery();
    bq->add( _CLNEW TermQuery( first ), true, BooleanClause::SHOULD );
    bq->add( _CLNEW TermQuery( second ), true, BooleanClause::SHOULD );
    bq->add( _CLNEW TermQuery( duplicate ), true, BooleanClause::SHOULD );

    Query * rewritten = bq->rewrite( pReader );
    rewritten->extractTerms( &termSet );

    assertEqualsMsg( _T( "wrong number of terms" ), 2, termSet.size() );
    for( TermSet::iterator it = termSet.begin(); it != termSet.end(); ++it )
    {
        Term * pTerm = *it;
        assertTrueMsg( _T( "wrong term" ), ( 0 == first->compareTo( pTerm ) || 0 == second->compareTo( pTerm )));
    }

    clearTermSet( termSet );
    _CLLDECDELETE( first );
    _CLLDECDELETE( second );
    _CLLDECDELETE( duplicate );

    // rewrite() may hand back the query itself; avoid deleting it twice.
    if( rewritten != bq )
        _CLDELETE( rewritten );
    _CLDELETE( bq );

    // Tear down the reader and the index.
    pReader->close();
    _CLDELETE( pReader );
    closeIndex( pIndex );
    pIndex = NULL;
}
// Orders two term sets by comparing their elements pairwise in iteration
// order (std::lexicographical_compare): the first differing element decides,
// and a set that is a proper prefix of the other sorts first.
bool TermSet::operator<(const TermSet& y) const
{
    const bool precedes = std::lexicographical_compare(this->begin(), this->end(),
                                                       y.begin(), y.end());
    return precedes;
}
// Two term sets are equal when they hold the same number of elements and
// the elements match pairwise in iteration order.
bool TermSet::operator==(const TermSet& y) const
{
    // The size test must come first: std::equal with three iterators would
    // read past the end of y if y were shorter than this set.
    return size() == y.size() && std::equal(begin(), end(), y.begin());
}
bool Grammar::calculateFollowSet(NonTerminal nt, SententialForm * rule) { SententialForm::Node * node = rule->begin(); bool changed = false; while(node) { VocabSymbol * sym = node->symbol; // Test to see what terminal it is. NonTerminal * nonterm = dynamic_cast<NonTerminal *>(sym); if (nonterm) { // FIXME: There are some really ineffecient operations here. // Since this it a one time thing, I'll leave it for now... -Alan SententialForm::Node * canFollow = node->next; while (canFollow) { Terminal * fTerm = dynamic_cast<Terminal *>(canFollow->symbol); NonTerminal * fNonterm = dynamic_cast<NonTerminal *>(canFollow->symbol); // If it is a terminal. That is in the suffix before a non-terminal. if (fTerm) { // All other suffix vocab will not be in the follow. // (At least not from this rule. if (follow[*nonterm].find(*fTerm) == follow[*nonterm].end()) { follow[*nonterm].insert(*fTerm); changed = true; } else { changed = false; } break; } else if (fNonterm) { unsigned int oldSize = follow[*nonterm].size(); // New follow set for comparison. TermSet newSet; insert_iterator<TermSet> newSetIterator(newSet, newSet.begin()); set_union(follow[*nonterm].begin(), follow[*nonterm].end(), first[*fNonterm].begin(), first[*fNonterm].end(), newSetIterator); if (oldSize != newSet.size()) { changed = true; follow[*nonterm] = newSet; } if (nullable[*fNonterm]) { canFollow = canFollow->next; } else { break; } } } // If we ever reach the very end of the production, // We need to add the follow-set of the parent to the follow-set // of the current non-terminal. if (canFollow == 0) { // Add the parent set here. unsigned int oldSize = follow[*nonterm].size(); // New follow set for comparison. TermSet newSet; insert_iterator<TermSet> newSetIterator(newSet, newSet.begin()); set_union(follow[*nonterm].begin(), follow[*nonterm].end(), follow[nt].begin(), follow[nt].end(), newSetIterator); if (oldSize != newSet.size()) { changed = true; follow[*nonterm] = newSet; } } } // Check the next one. 
node = node->next; } return changed; }