Example #1
0
void CSearchManager::GetWords(LPCTSTR sz, WordList *plistWords)
{
	LPCTSTR szS = sz;
	size_t uChars = 0;
	size_t uBytes = 0;
	CStringW sWord;
	while (_tcslen(szS) > 0)
	{
		uChars = _tcscspn(szS, _aszInvKadKeywordChars);
		sWord = szS;
		sWord.Truncate(uChars);
		// TODO: We'd need a safe way to determine if a sequence which contains only 3 chars is a real word.
		// Currently we do this by evaluating the UTF-8 byte count. This will work well for Western locales,
		// AS LONG AS the min. byte count is 3(!). If the byte count is once changed to 2, this will not
		// work properly any longer because there are a lot of Western characters which need 2 bytes in UTF-8.
		// Maybe we need to evaluate the Unicode character values itself whether the characters are located
		// in code ranges where single characters are known to represent words.
		uBytes = KadGetKeywordBytes(sWord).GetLength();
		if (uBytes >= 3)
		{
			KadTagStrMakeLower(sWord);
			plistWords->remove
			(sWord);
			plistWords->push_back(sWord);
		}
		szS += uChars;
		if (uChars < _tcslen(szS))
			szS++;
	}

	// if the last word we have added, contains 3 chars (and 3 bytes), it's in almost all cases a file's extension.
	if (plistWords->size() > 1 && (uChars == 3 && uBytes == 3))
		plistWords->pop_back();
}