bool TBagOfWords::Update(const TStrV& TokenStrV) { // Generate Ngrams if necessary TStrV NgramStrV; GenerateNgrams(TokenStrV, NgramStrV); // process tokens to update DF counts bool UpdateP = false; if (IsHashing()) { // consolidate tokens and get their hashed IDs TIntSet TokenIdH; for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) { const TStr& TokenStr = NgramStrV[TokenStrN]; TInt TokenId = TokenStr.GetHashTrick() % HashDim; TokenIdH.AddKey(TokenId); if (IsStoreHashWords()) { HashWordV[TokenId].AddKey(TokenStr); } } // update document counts int KeyId = TokenIdH.FFirstKeyId(); while (TokenIdH.FNextKeyId(KeyId)) { const int TokenId = TokenIdH.GetKey(KeyId); // update DF DocFqV[TokenId]++; } } else { // consolidate tokens TStrH TokenStrH; for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) { const TStr& TokenStr = NgramStrV[TokenStrN]; TokenStrH.AddKey(TokenStr); } // update document counts and update vocabulary with new tokens int KeyId = TokenStrH.FFirstKeyId(); while (TokenStrH.FNextKeyId(KeyId)) { // get token const TStr& TokenStr = TokenStrH.GetKey(KeyId); // different processing for hashing int TokenId = TokenSet.GetKeyId(TokenStr); if (TokenId == -1) { // new token, remember the dimensionality change UpdateP = true; // remember the new token TokenId = TokenSet.AddKey(TokenStr); // increase document count table const int TokenDfId = DocFqV.Add(0); // increase also the old count table OldDocFqV.Add(0.0); // make sure we DF vector and TokenSet still in sync IAssert(TokenId == TokenDfId); IAssert(DocFqV.Len() == OldDocFqV.Len()); } // document count update DocFqV[TokenId]++; } } // update document count Docs++; // tell if dimension changed return UpdateP; }
void TAGMFast::GradientForRow(const int UID, TIntFltH& GradU, const TIntSet& CIDSet) { GradU.Gen(CIDSet.Len()); TFltV HOSumFV; //adjust for Fv of v hold out if (HOVIDSV[UID].Len() > 0) { HOSumFV.Gen(SumFV.Len()); for (int e = 0; e < HOVIDSV[UID].Len(); e++) { for (int c = 0; c < SumFV.Len(); c++) { HOSumFV[c] += GetCom(HOVIDSV[UID][e], c); } } } TUNGraph::TNodeI NI = G->GetNI(UID); int Deg = NI.GetDeg(); TFltV PredV(Deg), GradV(CIDSet.Len()); TIntV CIDV(CIDSet.Len()); if (DoParallel && Deg + CIDSet.Len() > 10) { #pragma omp parallel for schedule(static, 1) for (int e = 0; e < Deg; e++) { if (NI.GetNbrNId(e) == UID) { continue; } if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; } PredV[e] = Prediction(UID, NI.GetNbrNId(e)); } #pragma omp parallel for schedule(static, 1) for (int c = 0; c < CIDSet.Len(); c++) { int CID = CIDSet.GetKey(c); double Val = 0.0; for (int e = 0; e < Deg; e++) { int VID = NI.GetNbrNId(e); if (VID == UID) { continue; } if (HOVIDSV[UID].IsKey(VID)) { continue; } Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID); } double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID)); CIDV[c] = CID; GradV[c] = Val; } } else { for (int e = 0; e < Deg; e++) { if (NI.GetNbrNId(e) == UID) { continue; } if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; } PredV[e] = Prediction(UID, NI.GetNbrNId(e)); } for (int c = 0; c < CIDSet.Len(); c++) { int CID = CIDSet.GetKey(c); double Val = 0.0; for (int e = 0; e < Deg; e++) { int VID = NI.GetNbrNId(e); if (VID == UID) { continue; } if (HOVIDSV[UID].IsKey(VID)) { continue; } Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID); } double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID)); CIDV[c] = CID; GradV[c] = Val; } } //add regularization if (RegCoef > 0.0) { //L1 for (int c = 0; c < GradV.Len(); c++) { GradV[c] -= RegCoef; } } if (RegCoef < 0.0) { //L2 for (int c = 0; c < GradV.Len(); c++) { GradV[c] += 2 * RegCoef * GetCom(UID, CIDV[c]); } } for (int c = 0; c < GradV.Len(); c++) { if (GetCom(UID, CIDV[c]) == 0.0 && GradV[c] < 0.0) { continue; } if (fabs(GradV[c]) < 0.0001) { continue; } GradU.AddDat(CIDV[c], GradV[c]); } for (int c = 0; c < GradU.Len(); c++) { if (GradU[c] >= 10) { GradU[c] = 10; } if (GradU[c] <= -10) { GradU[c] = -10; } IAssert(GradU[c] >= -10); } }