///////////////////////////////
// Tokenizer-Simple
void TTokenizerSimple::GetTokens(const PSIn& SIn, TStrV& TokenV) const {
	TStr LineStr; TStrV WordStrV;
	while (SIn->GetNextLn(LineStr)) {
		WordStrV.Clr(false);
		LineStr.SplitOnAllAnyCh(" .,!?\n\r()+=-{}[]%$#@\\/", WordStrV, true);
		for (int WordStrN = 0; WordStrN < WordStrV.Len(); WordStrN++) {
			const TStr& WordStr = WordStrV[WordStrN];
			const TStr UcStr = WordStr.GetUc();
			if (SwSet.Empty() || (!SwSet->IsIn(UcStr))) {
				TStr TokenStr = ToUcP ? UcStr : WordStr;
				if (!Stemmer.Empty()) {	
					TokenStr = Stemmer->GetStem(TokenStr); }
				TokenV.Add(TokenStr);
			}
		}
	}
}
예제 #2
0
파일: InOut.cpp 프로젝트: KronGen/KronTest
void GetMtxFromSepLine(const TStr& line, const TStr& separator, TFltV& matrix){
    TStrV strVals;
    line.SplitOnAllAnyCh(separator, strVals);
    for (int i = 0; i < strVals.Len(); i++) 
        matrix.Add(strVals[i].GetFlt());
}