// Produce the next token: each CJK character becomes its own one-char token.
// Returns NULL when the underlying reader is exhausted.
lucene::analysis::Token* next(lucene::analysis::Token* token)
        {
                TCHAR ch = 0;

                // Scan forward until a CJK character is found or input ends.
                for (;;)
                {
                        offset++;

                        // Refill the I/O buffer once the previous fill is consumed.
                        if (bufferIndex >= dataLen)
                        {
                                dataLen = input->read(ioBuffer, 1, LUCENE_IO_BUFFER_SIZE);
                                bufferIndex = 0;
                        }

                        // read() reports end-of-stream with -1: no more tokens.
                        if (dataLen == -1)
                        {
                                return NULL;
                        }

                        ch = ioBuffer[bufferIndex++];

                        // Keep only CJK characters; everything else is skipped.
                        if (_CJK)
                        {
                                break;
                        }
                }

                // Emit the single character as a NUL-terminated one-char token,
                // spanning [offset-1, offset) in the stream.
                TCHAR currentWord[2] = {ch, 0};
                token->set(currentWord, offset - 1, offset);
                return token;
        }
// Beispiel #2 (example separator; non-code scrape artifact)
	// Core of this Lucene tokenizer: emit every CJK character as its own token.
	// Returns NULL once the input reader reaches end-of-stream.
	lucene::analysis::Token* next(lucene::analysis::Token* token)
	{
		TCHAR ch = 0;

		while (true)
		{
			offset++;

			// Exhausted the buffered characters — pull another chunk.
			if (bufferIndex >= dataLen)
			{
				dataLen = input->read(ioBuffer, 1, LUCENE_IO_BUFFER_SIZE);
				bufferIndex = 0;
			}

			// A read result of -1 signals end of input: token stream is done.
			if (dataLen == -1)
			{
				return NULL;
			}

			ch = ioBuffer[bufferIndex++];

			// Only CJK_UNIFIED_IDEOGRAPHS are accepted here, so other
			// characters (e.g. Latin letters and digits) are dropped.
			if (_CJK)
			{
				break;
			}
		}

		// Wrap the accepted character in a NUL-terminated buffer and hand it
		// to the token with its [offset-1, offset) stream span.
		TCHAR currentWord[2] = {ch, 0};
		token->set(currentWord, offset - 1, offset);
		return token;
	}
// Beispiel #3 (example separator; non-code scrape artifact)
	// Core of this Lucene tokenizer: dictionary-driven forward maximum
	// matching — greedily grows a word while each extension is still found
	// in simWords, then emits the longest match as one token.
	// Returns NULL at end of input when no partial word is pending.
	lucene::analysis::Token* next(lucene::analysis::Token* token)
	{
		// Ensure the dictionary (simWords) is loaded before matching.
		loadWords();

		std::wstring currentWord;

		while (true) {
			WCHAR ch;
			// Exhausted the buffered characters — pull another chunk.
			if (bufferIndex >= dataLen) {
				dataLen = input->read(ioBuffer, 1, LUCENE_IO_BUFFER_SIZE);
				bufferIndex = 0;
			}

			if (dataLen == -1) {
				// End of input: flush a pending word if we have one,
				// otherwise the token stream is finished.
				if (currentWord.length() == 0) {
					return NULL;
				} else {
					break;
				}
			} 
			else 
			{
				ch = ioBuffer[bufferIndex++];                
			}
			// Only CJK_UNIFIED_IDEOGRAPHS are handled here, so other
			// characters (e.g. Latin letters and digits) are dropped.
			// This is a known limitation of this tokenizer; improvements
			// are welcome (please contribute them back).
			if (_CJK)
			{
				if (currentWord.length() == 0) {
					// First character of a candidate word.
					currentWord += ch;                    
				}
				else 
				{
					// Forward maximum matching: try to extend the current
					// candidate by one character.
					std::wstring temp = currentWord;
					temp += ch;
					//
					if (simWords->find(temp) != simWords->end())
					{
						// Extended word is still in the dictionary — keep it.
						currentWord = temp;
					}
					else 
					{
						// Extension failed: push the character back for the
						// next call and emit the longest match found so far.
						bufferIndex--;
						break;
					}
				}
			}
		}
		// NOTE(review): offsets here are 0..length(), not stream positions
		// as in the single-character variants — confirm this is intended.
		token->set(currentWord.c_str(), 0, currentWord.length());
		return token;
	}