/////////////////////////////// // Tokenizer-Simple void TTokenizerSimple::GetTokens(const PSIn& SIn, TStrV& TokenV) const { TStr LineStr; TStrV WordStrV; while (SIn->GetNextLn(LineStr)) { WordStrV.Clr(false); LineStr.SplitOnAllAnyCh(" .,!?\n\r()+=-{}[]%$#@\\/", WordStrV, true); for (int WordStrN = 0; WordStrN < WordStrV.Len(); WordStrN++) { const TStr& WordStr = WordStrV[WordStrN]; const TStr UcStr = WordStr.GetUc(); if (SwSet.Empty() || (!SwSet->IsIn(UcStr))) { TStr TokenStr = ToUcP ? UcStr : WordStr; if (!Stemmer.Empty()) { TokenStr = Stemmer->GetStem(TokenStr); } TokenV.Add(TokenStr); } } } }
void GetMtxFromSepLine(const TStr& line, const TStr& separator, TFltV& matrix){ TStrV strVals; line.SplitOnAllAnyCh(separator, strVals); for (int i = 0; i < strVals.Len(); i++) matrix.Add(strVals[i].GetFlt()); }