Example #1
bool TBagOfWords::Update(const TStrV& TokenStrV) {
    // Generate Ngrams if necessary
    TStrV NgramStrV;
    GenerateNgrams(TokenStrV, NgramStrV);

    // process tokens to update DF counts
    bool UpdateP = false;
    if (IsHashing()) {  
        // consolidate tokens and get their hashed IDs
        TIntSet TokenIdH;
        for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
            const TStr& TokenStr = NgramStrV[TokenStrN];
            TInt TokenId = TokenStr.GetHashTrick() % HashDim;
            TokenIdH.AddKey(TokenId);
            if (IsStoreHashWords()) { HashWordV[TokenId].AddKey(TokenStr); }
        }
        // update document counts
        int KeyId = TokenIdH.FFirstKeyId();
        while (TokenIdH.FNextKeyId(KeyId)) {
            const int TokenId = TokenIdH.GetKey(KeyId);
            // update DF
            DocFqV[TokenId]++;
        }
    } else {
        // consolidate tokens
        TStrH TokenStrH;
        for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
            const TStr& TokenStr = NgramStrV[TokenStrN];
            TokenStrH.AddKey(TokenStr);
        }
        // update document counts and update vocabulary with new tokens
        int KeyId = TokenStrH.FFirstKeyId();
        while (TokenStrH.FNextKeyId(KeyId)) {
            // get token
            const TStr& TokenStr = TokenStrH.GetKey(KeyId);
            // look up the token in the vocabulary (no hashing in this branch)
            int TokenId = TokenSet.GetKeyId(TokenStr);
            if (TokenId == -1) {
                // new token, remember the dimensionality change
                UpdateP = true;
                // remember the new token
                TokenId = TokenSet.AddKey(TokenStr);
                // extend the document frequency table
                const int TokenDfId = DocFqV.Add(0);
                // also extend the old document frequency table
                OldDocFqV.Add(0.0);
                // make sure the DF vector and TokenSet are still in sync
                IAssert(TokenId == TokenDfId);
                IAssert(DocFqV.Len() == OldDocFqV.Len());
            }
            // document count update
            DocFqV[TokenId]++;
        }
    }
    // update document count
    Docs++;
    // tell if dimension changed
    return UpdateP;
}
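The two branches above differ only in how a token is mapped to a dimension: the hashing branch fixes the dimensionality up front via GetHashTrick() % HashDim and counts each hashed ID at most once per document, while the vocabulary branch grows TokenSet and reports that growth through the return value. Below is a minimal standard-library sketch of the hashing path only, so the idea can be tried outside glib; the names (HashDim, DocFqV, Docs) mirror the snippet, and std::hash stands in for TStr::GetHashTrick() as an assumption, not the QMiner API.

#include <string>
#include <unordered_set>
#include <vector>

// Sketch only: feature-hashed document-frequency counts.
struct HashedBagOfWords {
    int HashDim;                  // fixed dimensionality of the hashed space
    std::vector<int> DocFqV;      // document frequency per hashed token ID
    int Docs = 0;                 // number of documents seen so far

    explicit HashedBagOfWords(int Dim) : HashDim(Dim), DocFqV(Dim, 0) {}

    // Count each hashed token ID at most once per document, as Update() does.
    void Update(const std::vector<std::string>& TokenStrV) {
        std::unordered_set<int> TokenIdH;
        for (const std::string& TokenStr : TokenStrV) {
            TokenIdH.insert(static_cast<int>(
                std::hash<std::string>{}(TokenStr) % HashDim));
        }
        for (const int TokenId : TokenIdH) { DocFqV[TokenId]++; }
        Docs++;   // one more document seen
    }
};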
Example #2
void TAGMFast::GradientForRow(const int UID, TIntFltH& GradU, const TIntSet& CIDSet) {
  GradU.Gen(CIDSet.Len());

  TFltV HOSumFV; // per-community sums of F_v over this node's held-out neighbours
  if (HOVIDSV[UID].Len() > 0) {
    HOSumFV.Gen(SumFV.Len());
    
    for (int e = 0; e < HOVIDSV[UID].Len(); e++) {
      for (int c = 0; c < SumFV.Len(); c++) {
        HOSumFV[c] += GetCom(HOVIDSV[UID][e], c);
      }
    }
  }
    
  TUNGraph::TNodeI NI = G->GetNI(UID);
  int Deg = NI.GetDeg();
  TFltV PredV(Deg), GradV(CIDSet.Len());
  TIntV CIDV(CIDSet.Len());
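  // with enough work (neighbours plus communities), compute edge predictions
  // and per-community gradients in parallel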
  if (DoParallel && Deg + CIDSet.Len() > 10) {
#pragma omp parallel for schedule(static, 1)
    for (int e = 0; e < Deg; e++) {
      if (NI.GetNbrNId(e) == UID) { continue; }
      if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; }
      PredV[e] = Prediction(UID, NI.GetNbrNId(e));
    }
  
#pragma omp parallel for schedule(static, 1)
    for (int c = 0; c < CIDSet.Len(); c++) {
      int CID = CIDSet.GetKey(c);
      double Val = 0.0;
      for (int e = 0; e < Deg; e++) {
        int VID = NI.GetNbrNId(e);
        if (VID == UID) { continue; }
        if (HOVIDSV[UID].IsKey(VID)) { continue; }
        Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID);
      }
      double HOSum = HOVIDSV[UID].Len() > 0 ? HOSumFV[CID].Val : 0.0; // subtract hold-out pairs only if they exist
      Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID));
      CIDV[c] = CID;
      GradV[c] = Val;
    }
  } else {
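    // serial path: identical computation without OpenMP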
    for (int e = 0; e < Deg; e++) {
      if (NI.GetNbrNId(e) == UID) { continue; }
      if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; }
      PredV[e] = Prediction(UID, NI.GetNbrNId(e));
    }
  
    for (int c = 0; c < CIDSet.Len(); c++) {
      int CID = CIDSet.GetKey(c);
      double Val = 0.0;
      for (int e = 0; e < Deg; e++) {
        int VID = NI.GetNbrNId(e);
        if (VID == UID) { continue; }
        if (HOVIDSV[UID].IsKey(VID)) { continue; }
        Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID);
      }
      double HOSum = HOVIDSV[UID].Len() > 0 ? HOSumFV[CID].Val : 0.0; // subtract hold-out pairs only if they exist
      Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID));
      CIDV[c] = CID;
      GradV[c] = Val;
    }
  }
  // add regularization
  if (RegCoef > 0.0) { //L1
    for (int c = 0; c < GradV.Len(); c++) {
      GradV[c] -= RegCoef; 
    }
  }
  if (RegCoef < 0.0) { //L2
    for (int c = 0; c < GradV.Len(); c++) {
      GradV[c] += 2 * RegCoef * GetCom(UID, CIDV[c]); 
    }
  }

  for (int c = 0; c < GradV.Len(); c++) {
    if (GetCom(UID, CIDV[c]) == 0.0 && GradV[c] < 0.0) { continue; }
    if (fabs(GradV[c]) < 0.0001) { continue; }
    GradU.AddDat(CIDV[c], GradV[c]);
  }
  for (int c = 0; c < GradU.Len(); c++) {
    if (GradU[c] >= 10) { GradU[c] = 10; }
    if (GradU[c] <= -10) { GradU[c] = -10; }
    IAssert(GradU[c] >= -10);
  }
}
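Stripped of the glib containers and the OpenMP branch, the gradient per row follows the BigClam-style update: a positive term summed over the node's neighbours and a negative term over all non-neighbours that is read off the precomputed per-community column sums instead of iterating over the whole graph. The sketch below restates that computation with standard containers; F, Nbrs, SumF and NegWgt are assumed inputs, and exp(-F_u . F_v) stands in for Prediction(), so treat it as an illustration rather than the SNAP implementation.

#include <cmath>
#include <vector>

// Sketch only: per-node gradient of the community-affiliation likelihood.
// F[u][c] is the affiliation of node u to community c.
std::vector<double> GradientForRow(
        int U,
        const std::vector<std::vector<double>>& F,
        const std::vector<std::vector<int>>& Nbrs,  // adjacency lists
        const std::vector<double>& SumF,            // SumF[c] = sum over all u of F[u][c]
        double NegWgt) {
    const int NumComs = static_cast<int>(SumF.size());
    std::vector<double> Grad(NumComs, 0.0);
    for (int c = 0; c < NumComs; c++) {
        double Val = 0.0;
        for (int V : Nbrs[U]) {
            double Dot = 0.0;                       // F_u . F_v
            for (int k = 0; k < NumComs; k++) { Dot += F[U][k] * F[V][k]; }
            const double Pred = std::exp(-Dot);     // assumed form of Prediction()
            // positive (edge) term, plus cancelling this neighbour out of the
            // global negative sum applied below
            Val += Pred * F[V][c] / (1.0 - Pred) + NegWgt * F[V][c];
        }
        // negative term over all non-neighbours via the precomputed column sum
        Val -= NegWgt * (SumF[c] - F[U][c]);
        Grad[c] = Val;
    }
    return Grad;
}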