///////////////////////////////
// Tokenizer-Html-Unicode
// Reads the input stream line by line, converts each line with
// TUStr::GetStarterLowerCaseStr() and passes the converted line to
// TTokenizerHtml::GetTokens, which receives the caller's TokenV as its output.
//   SIn    - input stream that is consumed one line at a time
//   TokenV - token vector handed on to the HTML tokenizer for every line
// NOTE(review): every line is tokenized independently, so HTML constructs that
// span several lines are presumably not seen as a whole by the HTML tokenizer
// -- confirm this is acceptable for the expected inputs.
void TTokenizerHtmlUnicode::GetTokens(const PSIn& SIn, TStrV& TokenV) const {
	TStr LineStr;
	while (SIn->GetNextLn(LineStr)) {
		// normalize the line first (judging by the method name, this presumably
		// maps characters to lower-case base characters -- TODO confirm)
		TStr SimpleText = TUStr(LineStr).GetStarterLowerCaseStr();
		// tokenize the normalized line through a fresh string input stream
		TTokenizerHtml::GetTokens(TStrIn::New(SimpleText), TokenV);
	}
}
///////////////////////////////
// Tokenizer-Simple
void TTokenizerSimple::GetTokens(const PSIn& SIn, TStrV& TokenV) const {
	TStr LineStr; TStrV WordStrV;
	while (SIn->GetNextLn(LineStr)) {
		WordStrV.Clr(false);
		LineStr.SplitOnAllAnyCh(" .,!?\n\r()+=-{}[]%$#@\\/", WordStrV, true);
		for (int WordStrN = 0; WordStrN < WordStrV.Len(); WordStrN++) {
			const TStr& WordStr = WordStrV[WordStrN];
			const TStr UcStr = WordStr.GetUc();
			if (SwSet.Empty() || (!SwSet->IsIn(UcStr))) {
				TStr TokenStr = ToUcP ? UcStr : WordStr;
				if (!Stemmer.Empty()) {	
					TokenStr = Stemmer->GetStem(TokenStr); }
				TokenV.Add(TokenStr);
			}
		}
	}
}