////////////////////////////////////////////////////////////////////////// // String-To-Words void TStrParser::DocStrToWIdV(const TStr& _DocStr, TIntV& WordIdV, const bool& Stemm) { TStr DocStr = _DocStr.GetUc(); // to upper case TStrV WordV; DocStr.SplitOnWs(WordV); int WordN = WordV.Len(); WordIdV.Reserve(WordN, 0); PStemmer Stemmer = TStemmer::New(stmtPorter); TIntH WordsInDoc; for (int WordC = 0; WordC < WordN; WordC++) { TStr WordStr; if (Stemm) { WordStr = Stemmer->GetStem(WordV[WordC]); } else { WordStr = WordV[WordC]; } int WId = GetWId(WordStr); if (WId == -1) { WId = WordToIdH.AddKey(WordStr); WordToIdH[WId] = 0; } WordIdV.Add(WId); // is it first time we see this word in this doc? if (!WordsInDoc.IsKey(WId)) WordsInDoc.AddKey(WId); } //do some statistics for DF DocsParsed++; for (int i = 0, l = WordsInDoc.Len(); i < l; i++) WordToIdH[WordsInDoc.GetKey(i)]++; Assert(WordV.Len() == WordIdV.Len()); }
/// Builds a new TUStr from the code points of this string in the range
/// BChN..EChN, as extracted by UniChV.GetSubValV(BChN, EChN, ...).
///
/// @param BChN  Index of the first code point of the range.
/// @param EChN  Index of the last code point of the range
///              (presumably inclusive, matching the "+ 1" length — confirm
///              against TVec::GetSubValV).
/// @return A TUStr constructed from the extracted code points.
TUStr TUStr::GetSubValV(const int& BChN, const int& EChN){
  TIntV SubChV;
  // Expected number of code points in the range. The previous code computed
  // BChN - EChN + 1, which is zero or negative for every range longer than
  // one element and was then handed to Reserve().
  const int SubLen = EChN - BChN + 1;
  // Pre-size the destination only for a sensible length: positive and
  // smaller than the storage held by the source vector.
  if (SubLen > 0 && SubLen < this->UniChV.Reserved()) {
    SubChV.Reserve(SubLen, 0);
  }
  this->UniChV.GetSubValV(BChN, EChN, SubChV);
  return TUStr(SubChV);
}
/// Collects the positions at which a word boundary is flagged.
///
/// Runs TUnicodeDef::FindWordBoundaries over this string's code points to
/// obtain a vector of boolean flags, then adds the index of every set flag
/// to WordBoundPosV.
///
/// @param WordBoundPosV  Output: indices of the set flags, in increasing
///                       order. NOTE(review): Reserve() is called without an
///                       explicit length here, so any entries already in the
///                       vector are presumably kept and the positions are
///                       appended — confirm against TVec::Reserve.
void TUStr::GetWordBoundPV(TIntV& WordBoundPosV){
  TBoolV WordBoundPV;
  TUnicodeDef::GetDef()->FindWordBoundaries(UniChV, WordBoundPV);

  const int Flags = WordBoundPV.Len();
  WordBoundPosV.Reserve(Flags);
  // Valid indices are 0..Flags-1; the previous "<=" bound read one element
  // past the end of WordBoundPV.
  for (int Pos = 0; Pos < Flags; Pos++) {
    if (WordBoundPV[Pos]) {
      WordBoundPosV.Add(Pos);
    }
  }
}
void TStrParser::DocStrToChIdV(const TStr& _DocStr, TIntV& ChIdV) { TStr DocStr = _DocStr.GetUc(); // to upper case int ChN = DocStr.Len(); ChIdV.Reserve(ChN, 0); for (int ChC = 0; ChC < ChN; ChC++) { TStr ChStr = DocStr.GetSubStr(ChC,ChC); int ChId = GetWId(ChStr); if (ChId != -1) { WordToIdH[ChId]++; } else { ChId = WordToIdH.AddKey(ChStr); WordToIdH[ChId] = 1; } ChIdV.Add(ChId); } }