// Splits sz into keywords and stores them in *plistWords.
//
// The text is cut at every character contained in _aszInvKadKeywordChars. A token is kept
// only when KadGetKeywordBytes() reports a length of at least 3 for it. A kept token is
// lower-cased with KadTagStrMakeLower(), any equal entry already in the list is removed, and
// the token is appended at the end (so a repeated word ends up at the back of the list).
// After the scan, if the list holds more than one entry and the last token processed was
// exactly 3 characters / 3 bytes long, the last entry is removed again because such a token
// is almost always a file extension.
//
// sz         - null-terminated text to tokenize; a null pointer is treated as "nothing to do".
// plistWords - list that receives the keywords; a null pointer is treated as "nothing to do".
void CSearchManager::GetWords(LPCTSTR sz, WordList *plistWords)
{
	if (!sz || !plistWords)
		return;

	// Measure the input once. The remaining length at any point is (szEnd - szS), which
	// avoids re-running _tcslen() on the tail in every iteration.
	const LPCTSTR szEnd = sz + _tcslen(sz);
	LPCTSTR szS = sz;
	size_t uChars = 0;
	size_t uBytes = 0;
	while (szS != szEnd)
	{
		// Length of the token that starts at szS (0 if szS points at a separator).
		uChars = _tcscspn(szS, _aszInvKadKeywordChars);

		// Build the token from exactly its own characters instead of copying the whole
		// remaining tail and truncating it afterwards.
		CStringW sWord(szS, static_cast<int>(uChars));

		// TODO: We'd need a safe way to determine if a sequence which contains only 3 chars is a real word.
		// Currently we do this by evaluating the UTF-8 byte count. This will work well for Western locales,
		// AS LONG AS the min. byte count is 3(!). If the byte count is once changed to 2, this will not
		// work properly any longer because there are a lot of Western characters which need 2 bytes in UTF-8.
		// Maybe we need to evaluate the Unicode character values itself whether the characters are located
		// in code ranges where single characters are known to represent words.
		uBytes = KadGetKeywordBytes(sWord).GetLength();
		if (uBytes >= 3)
		{
			KadTagStrMakeLower(sWord);
			// Drop an earlier occurrence first so every keyword is listed only once.
			plistWords->remove(sWord);
			plistWords->push_back(sWord);
		}

		szS += uChars;

		// NOTE(review): this compares the length of the token just read with the length of
		// the remaining text, not "is there a separator left?". When the remainder is not
		// longer than the token, the separator is not skipped here but consumed by an extra
		// pass with uChars == 0 and uBytes == 0. That extra pass changes the values seen by
		// the extension check below (e.g. for input ending in a separator), so the condition
		// is deliberately kept exactly as it was - confirm the intended behaviour before
		// simplifying it.
		if (uChars < static_cast<size_t>(szEnd - szS))
			++szS;
	}

	// if the last word we have added, contains 3 chars (and 3 bytes), it's in almost all cases a file's extension.
	// uChars/uBytes still hold the values of the final loop pass here.
	if (plistWords->size() > 1 && (uChars == 3 && uBytes == 3))
		plistWords->pop_back();
}