// This is the most critical part of the CLucene tokenizer implementation.
lucene::analysis::Token* next(lucene::analysis::Token* token) {
    TCHAR ch = 0;
    while (true) {
        offset++;
        // Refill the I/O buffer once it has been consumed.
        if (bufferIndex >= dataLen) {
            dataLen = input->read(ioBuffer, 1, LUCENE_IO_BUFFER_SIZE);
            bufferIndex = 0;
        }
        // read() returns -1 at the end of the input.
        if (dataLen == -1) {
            return NULL;
        } else {
            ch = ioBuffer[bufferIndex++];
        }
        // This condition shows that only CJK_UNIFIED_IDEOGRAPHS are handled,
        // so every other character (Latin letters, digits, etc.) is dropped.
        if (ch >= 0x4E00 && ch <= 0x9FFF) {
            break;
        }
    }
    // Emit the ideograph as a single-character token.
    TCHAR currentWord[2] = {ch, 0};
    token->set(currentWord, offset - 1, offset);
    return token;
}
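For context, the members this next() method relies on (offset, bufferIndex, dataLen, ioBuffer, input) are declared elsewhere. Below is a minimal sketch of what the surrounding class might look like, assuming CLucene's lucene::analysis::Tokenizer base class; the class name and the buffer size constant are placeholders of mine, not part of the original code:

#include "CLucene.h"

#define LUCENE_IO_BUFFER_SIZE 4096  // assumed value; the original does not show it

class SimpleCJKTokenizer : public lucene::analysis::Tokenizer {  // hypothetical name
private:
    int32_t offset;       // absolute character offset in the input stream
    int32_t bufferIndex;  // next unread position within ioBuffer
    int32_t dataLen;      // number of valid characters currently in ioBuffer
    TCHAR ioBuffer[LUCENE_IO_BUFFER_SIZE];

public:
    explicit SimpleCJKTokenizer(lucene::util::Reader* reader)
        : Tokenizer(reader), offset(0), bufferIndex(0), dataLen(0) {}

    lucene::analysis::Token* next(lucene::analysis::Token* token);  // as shown above
};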
// This is the most critical part of the CLucene tokenizer implementation.
lucene::analysis::Token* next(lucene::analysis::Token* token) {
    loadWords();  // make sure the dictionary is loaded into simWords
    std::wstring currentWord;
    while (true) {
        TCHAR ch;
        // Refill the I/O buffer once it has been consumed.
        if (bufferIndex >= dataLen) {
            dataLen = input->read(ioBuffer, 1, LUCENE_IO_BUFFER_SIZE);
            bufferIndex = 0;
        }
        if (dataLen == -1) {
            // End of input: emit the word collected so far, if any.
            if (currentWord.length() == 0) {
                return NULL;
            } else {
                break;
            }
        } else {
            ch = ioBuffer[bufferIndex++];
        }
        // As before, only CJK_UNIFIED_IDEOGRAPHS are handled, so other
        // characters (Latin letters, digits, etc.) are dropped. This is a
        // limitation of this tokenizer; you are welcome to improve it, and
        // I would be glad to hear about your improvements.
        if (ch >= 0x4E00 && ch <= 0x9FFF) {
            if (currentWord.length() == 0) {
                currentWord += ch;
            } else {
                // Forward maximum matching: keep extending the current word
                // while the extended string is still a dictionary entry.
                std::wstring temp = currentWord;
                temp += ch;
                if (simWords->find(temp) != simWords->end()) {
                    currentWord = temp;
                } else {
                    // The extension is not in the dictionary: push the
                    // character back and emit the word matched so far.
                    bufferIndex--;
                    break;
                }
            }
        }
    }
    // Note: unlike the first version, this one does not track start/end offsets.
    token->set(currentWord.c_str(), 0, currentWord.length());
    return token;
}
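The improved version depends on loadWords() and simWords, which are not shown above. A minimal sketch of what they might look like, assuming the dictionary is a plain text file with one word per line and that simWords is a std::set<std::wstring>; the file name and the encoding handling are assumptions:

#include <fstream>
#include <set>
#include <string>

static std::set<std::wstring>* simWords = NULL;

void loadWords() {
    if (simWords != NULL)
        return;  // dictionary already loaded
    simWords = new std::set<std::wstring>();
    // Hypothetical dictionary file, one word per line. A real implementation
    // would imbue a codecvt locale matching the file's encoding (e.g. GBK or UTF-8).
    std::wifstream in("words.txt");
    std::wstring line;
    while (std::getline(in, line)) {
        if (!line.empty()) {
            simWords->insert(line);
        }
    }
}

Note that, as written, the matcher checks the whole prefix against the dictionary at every extension step, so a word such as 中华人民共和国 is only recovered if all of its shorter prefixes (中华, 中华人, and so on) are also dictionary entries. Storing the dictionary in a trie, or inserting every prefix of each word into simWords at load time, would remove this restriction.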