// Updates document-frequency (DF) statistics — and, in non-hashing mode, the
// vocabulary — with the tokens of one new document.
// @param TokenStrV  raw tokens of the incoming document
// @return true iff the feature-space dimensionality changed (a previously
//         unseen token was added to TokenSet); always false in hashing mode,
//         where the dimensionality is fixed at HashDim.
bool TBagOfWords::Update(const TStrV& TokenStrV) {
    // Generate Ngrams if necessary
    TStrV NgramStrV;
    GenerateNgrams(TokenStrV, NgramStrV);
    // process tokens to update DF counts
    bool UpdateP = false;
    if (IsHashing()) {
        // consolidate tokens and get their hashed IDs
        // (a set, so each dimension is counted at most once per document)
        TIntSet TokenIdH;
        for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
            const TStr& TokenStr = NgramStrV[TokenStrN];
            // hashing trick: map the token string to a fixed dimension in [0, HashDim)
            TInt TokenId = TokenStr.GetHashTrick() % HashDim;
            TokenIdH.AddKey(TokenId);
            if (IsStoreHashWords()) {
                // remember which word strings landed in this hash bucket
                // (assumes HashWordV was pre-sized to HashDim — TODO confirm)
                HashWordV[TokenId].AddKey(TokenStr);
            }
        }
        // update document counts
        int KeyId = TokenIdH.FFirstKeyId();
        while (TokenIdH.FNextKeyId(KeyId)) {
            const int TokenId = TokenIdH.GetKey(KeyId);
            // update DF
            DocFqV[TokenId]++;
        }
    } else {
        // consolidate tokens (dedup: DF counts each token once per document)
        TStrH TokenStrH;
        for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
            const TStr& TokenStr = NgramStrV[TokenStrN];
            TokenStrH.AddKey(TokenStr);
        }
        // update document counts and update vocabulary with new tokens
        int KeyId = TokenStrH.FFirstKeyId();
        while (TokenStrH.FNextKeyId(KeyId)) {
            // get token
            const TStr& TokenStr = TokenStrH.GetKey(KeyId);
            // different processing for hashing
            int TokenId = TokenSet.GetKeyId(TokenStr);
            if (TokenId == -1) {
                // new token, remember the dimensionality change
                UpdateP = true;
                // remember the new token
                TokenId = TokenSet.AddKey(TokenStr);
                // increase document count table
                const int TokenDfId = DocFqV.Add(0);
                // increase also the old count table
                OldDocFqV.Add(0.0);
                // make sure we DF vector and TokenSet still in sync:
                // TokenSet key id must equal the index of its DF slot
                IAssert(TokenId == TokenDfId);
                IAssert(DocFqV.Len() == OldDocFqV.Len());
            }
            // document count update
            DocFqV[TokenId]++;
        }
    }
    // update document count
    Docs++;
    // tell if dimension changed
    return UpdateP;
}
// Serializes a string hash table into a single string:
// entries are joined with DelimiterStr, and within each entry the key and
// value are joined with FieldDelimiterStr. An empty table yields "".
TStr TStrUtil::GetStr(const TStrH& StrH, const TStr& FieldDelimiterStr, const TStr& DelimiterStr) {
    // nothing to serialize
    if (StrH.Empty()) { return TStr(); }
    TChA OutChA;
    for (int KeyId = StrH.FFirstKeyId(); StrH.FNextKeyId(KeyId); ) {
        // separate entries (skipped while nothing has been emitted yet)
        if (!OutChA.Empty()) { OutChA += DelimiterStr; }
        OutChA += StrH.GetKey(KeyId);
        OutChA += FieldDelimiterStr;
        OutChA += StrH[KeyId].GetStr();
    }
    return OutChA;
}
void TFtrGenToken::Update(const TStr& Val) { TStrV TokenStrV; GetTokenV(Val, TokenStrV); TStrH TokenStrH; for (int TokenStrN = 0; TokenStrN < TokenStrV.Len(); TokenStrN++) { const TStr& TokenStr = TokenStrV[TokenStrN]; TokenStrH.AddKey(TokenStr); } int KeyId = TokenStrH.FFirstKeyId(); while (TokenStrH.FNextKeyId(KeyId)) { const TStr& TokenStr = TokenStrH.GetKey(KeyId); TokenH.AddDat(TokenStr)++; } Docs++; }