示例#1
0
bool TBagOfWords::Update(const TStrV& TokenStrV) {    
    // Generate Ngrams if necessary
	TStrV NgramStrV;
    GenerateNgrams(TokenStrV, NgramStrV);

    // process tokens to update DF counts
    bool UpdateP = false;
    if (IsHashing()) {  
        // consolidate tokens and get their hashed IDs
        TIntSet TokenIdH;
        for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
            const TStr& TokenStr = NgramStrV[TokenStrN];
            TInt TokenId = TokenStr.GetHashTrick() % HashDim;
            TokenIdH.AddKey(TokenId);
            if (IsStoreHashWords()) { HashWordV[TokenId].AddKey(TokenStr); }
        }
        // update document counts
        int KeyId = TokenIdH.FFirstKeyId();
        while (TokenIdH.FNextKeyId(KeyId)) {
            const int TokenId = TokenIdH.GetKey(KeyId);
            // update DF
            DocFqV[TokenId]++;
        }
    } else {
        // consolidate tokens
        TStrH TokenStrH;
        for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
            const TStr& TokenStr = NgramStrV[TokenStrN];
            TokenStrH.AddKey(TokenStr);
        }
        // update document counts and update vocabulary with new tokens
        int KeyId = TokenStrH.FFirstKeyId();
        while (TokenStrH.FNextKeyId(KeyId)) {
            // get token
            const TStr& TokenStr = TokenStrH.GetKey(KeyId);
            // different processing for hashing
            int TokenId = TokenSet.GetKeyId(TokenStr);
            if (TokenId == -1) {
                // new token, remember the dimensionality change
                UpdateP = true;
                // remember the new token
                TokenId = TokenSet.AddKey(TokenStr);
                // increase document count table
                const int TokenDfId = DocFqV.Add(0);
                // increase also the old count table
                OldDocFqV.Add(0.0);
                // make sure we DF vector and TokenSet still in sync
                IAssert(TokenId == TokenDfId);
                IAssert(DocFqV.Len() == OldDocFqV.Len());
            }
            // document count update
            DocFqV[TokenId]++;
        }
    }
    // update document count
    Docs++;
    // tell if dimension changed
    return UpdateP;
}
示例#2
0
文件: strut.cpp 项目: amrsobhy/qminer
TStr TStrUtil::GetStr(const TStrH& StrH, const TStr& FieldDelimiterStr, const TStr& DelimiterStr) {
  if (StrH.Empty()) {return TStr();}
  TChA ResChA;
  int KeyId = StrH.FFirstKeyId();
  while (StrH.FNextKeyId(KeyId)) {
	if (!ResChA.Empty()) { ResChA+=DelimiterStr; }
    ResChA+=StrH.GetKey(KeyId);
	ResChA+=FieldDelimiterStr;
	ResChA+=StrH[KeyId].GetStr();
  }
  return ResChA;
}
示例#3
0
void TFtrGenToken::Update(const TStr& Val) {
    TStrV TokenStrV; GetTokenV(Val, TokenStrV); TStrH TokenStrH;
    for (int TokenStrN = 0; TokenStrN < TokenStrV.Len(); TokenStrN++) {
        const TStr& TokenStr = TokenStrV[TokenStrN];
        TokenStrH.AddKey(TokenStr);
    }
    int KeyId = TokenStrH.FFirstKeyId();
    while (TokenStrH.FNextKeyId(KeyId)) {
        const TStr& TokenStr = TokenStrH.GetKey(KeyId);
        TokenH.AddDat(TokenStr)++;
    }
    Docs++;
}