bool LexicsCutter::CheckLexics(std::string& Phrase) { std::string lchar; LC_WordMap::iterator i; std::pair<LC_WordMap::iterator, LC_WordMap::iterator> ii; if (Phrase.size() == 0) return false; // first, convert the string, adding spaces and removing invalid characters // also create fast position vector for the new positions std::string str = " "; unsigned int pos = 0; while (ReadUTF8(Phrase, lchar, pos)) if (m_sInvalidChars.find(lchar) == std::string::npos) str.append(lchar); // string prepared, now parse it and scan for all the words unsigned int pos_prev = 0; pos = 0; while (ReadUTF8(str, lchar, pos)) { // got character, now try to find wordmap for it ii = m_WordMap.equal_range(lchar); // iterate over all found words for (i = ii.first; i != ii.second; i++) // compare word at initial position if (CompareWord(str, pos_prev, m_WordList[i->second])) return true; // set initial position to the current position pos_prev = pos; } return false; }
int SuffixArray::Match( const vector< WORD > &phrase, INDEX index ) { INDEX pos = m_index[ index ]; for(INDEX i=0; i<phrase.size() && i+pos<m_size; i++) { int match = CompareWord( m_vcb.GetWordID( phrase[i] ), m_array[ pos+i ] ); // cerr << "{" << index << "+" << i << "," << pos+i << ":" << match << "}" << endl; if (match != 0) return match; } return 0; }
/// Three-way comparison of the two corpus suffixes that start at positions
/// a and b in m_array.
///
/// @param a start position of the first suffix.
/// @param b start position of the second suffix.
/// @return 0 when a and b are the same position; -1 when the first suffix
///         reaches the end of the corpus before a difference is found; 1 when
///         the second one does; otherwise the CompareWord result for the
///         first pair of words that differ.
int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
{
  // A suffix is equal to itself. Without this guard the scan below would run
  // to the end of the corpus and then report -1, breaking the comparator
  // contract that compare(x, x) == 0.
  if (a == b)
    return 0;

  // Skip over the words the two suffixes have in common.
  INDEX offset = 0;
  while (a+offset < m_size &&
         b+offset < m_size &&
         m_array[ a+offset ] == m_array[ b+offset ]) {
    ++offset;
  }

  // A suffix that is exhausted first sorts before the longer one.
  if (a+offset == m_size) return -1;
  if (b+offset == m_size) return 1;

  // Both suffixes still have a word here and those words differ.
  return CompareWord( m_array[ a+offset ], m_array[ b+offset ] );
}