示例#1
0
// Simple test for Defrag(): IsKeyIdEqKeyN() is expected to report 1 while the
// set has only seen insertions, 0 after a key has been deleted, and 1 again
// once Defrag() has been called.
TEST(THashSet, Defrag) {
  // Automatic storage instead of new/delete: nothing to release by hand.
  TIntSet TestSet;

  // Insertions alone do not fragment the set.
  TestSet.AddKey(6);
  TestSet.AddKey(4);
  EXPECT_EQ(1, TestSet.IsKeyIdEqKeyN());

  TestSet.AddKey(2);
  EXPECT_EQ(1, TestSet.IsKeyIdEqKeyN());

  // Deleting a key fragments the set (IsKeyIdEqKeyN() becomes false) ...
  TestSet.DelKey(2);
  EXPECT_EQ(0, TestSet.IsKeyIdEqKeyN());

  // ... and Defrag() restores the compact state.
  TestSet.Defrag();
  EXPECT_EQ(1, TestSet.IsKeyIdEqKeyN());

  TestSet.DelKey(4);
  EXPECT_EQ(0, TestSet.IsKeyIdEqKeyN());

  TestSet.Defrag();
  EXPECT_EQ(1, TestSet.IsKeyIdEqKeyN());

  // GetRndKeyId() does not work with a fragmented set (original author's
  // note), so it is only called right after Defrag().
  TestSet.DelKeyId(TestSet.GetRndKeyId(TInt::Rnd));
  EXPECT_EQ(0, TestSet.IsKeyIdEqKeyN());
}
示例#2
0
void TAGMUtil::GetNbhCom(const PUNGraph& Graph, const int NID, TIntSet& NBCmtyS) {
    TUNGraph::TNodeI NI = Graph->GetNI(NID);
    NBCmtyS.Gen(NI.GetDeg());
    NBCmtyS.AddKey(NID);
    for (int e = 0; e < NI.GetDeg(); e++) {
        NBCmtyS.AddKey(NI.GetNbrNId(e));
    }
}
示例#3
0
/// Builds an inverted index from character shingles to the quotes containing them.
///
/// For every quote in QuoteBase, each window of ShingleLen consecutive
/// characters of the parsed content string is hashed to an MD5 signature and
/// the quote id is added to the set stored under that signature.
///
/// @param QuoteBase          source of quote ids and quotes
/// @param ShingleLen         number of characters per shingle
/// @param ShingleToQuoteIds  output: shingle signature -> ids of quotes containing it
void LSH::ElCheapoHashing(TQuoteBase *QuoteBase, TInt ShingleLen,
    THash<TMd5Sig, TIntSet>& ShingleToQuoteIds) {
  fprintf(stderr, "Hashing shingles the el cheapo way...\n");
  TIntV QuoteIds;
  QuoteBase->GetAllQuoteIds(QuoteIds);
  for (int qt = 0; qt < QuoteIds.Len(); qt++) {
    // progress report every 1000 quotes
    if (qt % 1000 == 0) {
      fprintf(stderr, "%d out of %d completed\n", qt, QuoteIds.Len());
    }
    TQuote Q;
    QuoteBase->GetQuote(QuoteIds[qt], Q);

    TStr QContentStr;
    Q.GetParsedContentString(QContentStr);
    TChA QContentChA = TChA(QContentStr);

    // Slide a ShingleLen-character window over the content; contents shorter
    // than ShingleLen yield no shingles.
    const int NumShingles = QContentChA.Len() - ShingleLen + 1;
    for (int i = 0; i < NumShingles; i++) {
      TChA ShingleChA = TChA();
      for (int j = 0; j < ShingleLen; j++) {
        ShingleChA.AddCh(QContentChA.GetCh(i + j));
      }
      TStr Shingle = TStr(ShingleChA);
      const TMd5Sig ShingleMd5(Shingle);

      // AddDat(Key) returns a reference to the (possibly newly created) set,
      // so the quote id is inserted in place instead of copying the whole set
      // out of the hash table and back in for every shingle.
      ShingleToQuoteIds.AddDat(ShingleMd5).AddKey(QuoteIds[qt]);
    }
  }
  Err("Done with el cheapo hashing!\n");
}
示例#4
0
/// Generates a graph with the AGM model.
/// CmtyVV[c] lists the member nodes of community c and CProbV[c] is the
/// probability passed to RndConnectInsideCommunity() for that community.
/// When PNoCom > 0, the union of all member nodes is additionally passed to
/// RndConnectInsideCommunity() with probability PNoCom.
PUNGraph TAGM::GenAGM(TVec<TIntV>& CmtyVV, const TFltV& CProbV, TRnd& Rnd, const double PNoCom) {
    PUNGraph G = TUNGraph::New(100 * CmtyVV.Len(), -1);
    printf("AGM begins\n");
    for (int c = 0; c < CmtyVV.Len(); c++) {
        TIntV& Members = CmtyVV[c];
        // make sure every member exists as a node before wiring the community
        for (int m = 0; m < Members.Len(); m++) {
            if (!G->IsNode(Members[m])) {
                G->AddNode(Members[m]);
            }
        }
        double Prob = CProbV[c];
        RndConnectInsideCommunity(G, Members, Prob, Rnd);
    }
    const bool ConnectAcrossCmtys = PNoCom > 0.0;
    if (ConnectAcrossCmtys) {
        // gather each node once, regardless of how many communities it is in
        TIntSet AllNIdSet;
        for (int c = 0; c < CmtyVV.Len(); c++) {
            const TIntV& Members = CmtyVV[c];
            for (int m = 0; m < Members.Len(); m++) {
                AllNIdSet.AddKey(Members[m]);
            }
        }
        TIntV AllNIdV;
        AllNIdSet.GetKeyV(AllNIdV);
        RndConnectInsideCommunity(G,AllNIdV,PNoCom,Rnd);
    }
    printf("AGM completed (%d nodes %d edges)\n",G->GetNodes(),G->GetEdges());
    G->Defrag();
    return G;
}
示例#5
0
bool TBagOfWords::Update(const TStrV& TokenStrV) {    
    // Generate Ngrams if necessary
	TStrV NgramStrV;
    GenerateNgrams(TokenStrV, NgramStrV);

    // process tokens to update DF counts
    bool UpdateP = false;
    if (IsHashing()) {  
        // consolidate tokens and get their hashed IDs
        TIntSet TokenIdH;
        for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
            const TStr& TokenStr = NgramStrV[TokenStrN];
            TInt TokenId = TokenStr.GetHashTrick() % HashDim;
            TokenIdH.AddKey(TokenId);
            if (IsStoreHashWords()) { HashWordV[TokenId].AddKey(TokenStr); }
        }
        // update document counts
        int KeyId = TokenIdH.FFirstKeyId();
        while (TokenIdH.FNextKeyId(KeyId)) {
            const int TokenId = TokenIdH.GetKey(KeyId);
            // update DF
            DocFqV[TokenId]++;
        }
    } else {
        // consolidate tokens
        TStrH TokenStrH;
        for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
            const TStr& TokenStr = NgramStrV[TokenStrN];
            TokenStrH.AddKey(TokenStr);
        }
        // update document counts and update vocabulary with new tokens
        int KeyId = TokenStrH.FFirstKeyId();
        while (TokenStrH.FNextKeyId(KeyId)) {
            // get token
            const TStr& TokenStr = TokenStrH.GetKey(KeyId);
            // different processing for hashing
            int TokenId = TokenSet.GetKeyId(TokenStr);
            if (TokenId == -1) {
                // new token, remember the dimensionality change
                UpdateP = true;
                // remember the new token
                TokenId = TokenSet.AddKey(TokenStr);
                // increase document count table
                const int TokenDfId = DocFqV.Add(0);
                // increase also the old count table
                OldDocFqV.Add(0.0);
                // make sure we DF vector and TokenSet still in sync
                IAssert(TokenId == TokenDfId);
                IAssert(DocFqV.Len() == OldDocFqV.Len());
            }
            // document count update
            DocFqV[TokenId]++;
        }
    }
    // update document count
    Docs++;
    // tell if dimension changed
    return UpdateP;
}
示例#6
0
// Helper: fill GroupSet with the fixed group members belonging to node NId.
// GroupSet is cleared first; NId outside 0..5 triggers a test assertion
// failure and leaves GroupSet empty.
void GetGroupSet(int NId, TIntSet& GroupSet) {
  // Row NId holds the members in insertion order; -1 marks unused slots.
  static const int MemberTable[6][3] = {
    { 2,  4,  5},   // node 0
    {-1, -1, -1},   // node 1: empty set
    { 0,  3,  5},   // node 2
    { 0, -1, -1},   // node 3
    { 0, -1, -1},   // node 4
    { 0,  1, -1}    // node 5
  };

  GroupSet.Clr();
  if (NId < 0 || NId > 5) {
    ASSERT_FALSE(true); // NId Outside Graph Construction FAIL
    return;
  }
  for (int Slot = 0; Slot < 3 && MemberTable[NId][Slot] != -1; Slot++) {
    GroupSet.AddKey(MemberTable[NId][Slot]);
  }
}
示例#7
0
/// Builds an inverted index from word shingles to the quotes containing them.
///
/// Every word of every quote's parsed content is hashed to an MD5 signature
/// and the quote id is added to the set stored under that signature. As a
/// debugging aid, the words are also indexed by their plain string, sorted
/// with TCmpSetByLen, and up to the first 100 are reported through Err().
///
/// @param QuoteBase          source of quote ids and quotes
/// @param ShingleToQuoteIds  output: word signature -> ids of quotes containing it
void LSH::WordHashing(TQuoteBase *QuoteBase,
    THash<TMd5Sig, TIntSet>& ShingleToQuoteIds) {
  fprintf(stderr, "Hashing shingles using words...\n");
  TIntV QuoteIds;
  QuoteBase->GetAllQuoteIds(QuoteIds);

  // Debug-only mirror of the index, keyed by the word itself.
  THash<TStr, TIntSet> Temp;

  for (int qt = 0; qt < QuoteIds.Len(); qt++) {
    // progress report every 1000 quotes
    if (qt % 1000 == 0) {
      fprintf(stderr, "%d out of %d completed\n", qt, QuoteIds.Len());
    }
    TQuote Q;
    QuoteBase->GetQuote(QuoteIds[qt], Q);

    TStrV Content;
    Q.GetParsedContent(Content);

    int ContentLen = Content.Len();
    for (int i = 0; i < ContentLen; i++) {
      const TMd5Sig ShingleMd5(Content[i]);
      // AddDat(Key) returns a reference to the (possibly new) set, so the id
      // is inserted in place rather than copying the set out and back in.
      ShingleToQuoteIds.AddDat(ShingleMd5).AddKey(QuoteIds[qt]);

      ///// COMMENT OUT LATER
      Temp.AddDat(Content[i]).AddKey(QuoteIds[qt]);
    }
  }

  TVec<TStr> ShingleKeys;
  Temp.GetKeyV(ShingleKeys);
  ShingleKeys.SortCmp(TCmpSetByLen(false, &Temp));
  // Report at most 100 entries; there may be fewer distinct words than that,
  // and indexing past the end of ShingleKeys is out of bounds.
  const int NumToReport = ShingleKeys.Len() < 100 ? ShingleKeys.Len() : 100;
  for (int i = 0; i < NumToReport; i++) {
    const TIntSet& QuoteIdSet = Temp.GetDat(ShingleKeys[i]);
    Err("%d: %s - %d \n", i, ShingleKeys[i].CStr(), QuoteIdSet.Len());
  }

  Err("Done with word hashing!\n");
}
示例#8
0
/// Computes min-hash band buckets from a shingle -> quote-ids index.
///
/// For each of NumBands bands, BandSize random permutations of the shingle
/// keys are drawn. Under permutation j, every quote receives as its j-th
/// signature entry the position k of the first shingle (in permuted order)
/// whose quote set contains it. Quotes with identical band signatures are
/// then grouped into the same bucket, and the band's buckets are appended to
/// SignatureBandBuckets.
///
/// @param ShingleToQuoteIds     shingle signature -> ids of quotes containing it
/// @param SignatureBandBuckets  output: one (band signature -> quote ids) table per band
void LSH::MinHash(THash<TMd5Sig, TIntSet>& ShingleToQuoteIds,
    TVec<THash<TIntV, TIntSet> >& SignatureBandBuckets) {
  TRnd RandomGenerator; // TODO: make this "more random" by incorporating time
  for (int i = 0; i < NumBands; ++i) {
    THash < TInt, TIntV > Inverted; // (QuoteID, QuoteSignatureForBand)
    THash < TIntV, TIntSet > BandBuckets; // (BandSignature, QuoteIDs)
    for (int j = 0; j < BandSize; ++j) {
      // Draw a fresh random permutation of the shingle keys
      TVec < TMd5Sig > Signature;
      ShingleToQuoteIds.GetKeyV(Signature);
      Signature.Shuffle(RandomGenerator);

      // Place in bucket - not very efficient
      int SigLen = Signature.Len();
      for (int k = 0; k < SigLen; ++k) {
        // Reference, not a copy: the set is only read here.
        TIntSet& CurSet = ShingleToQuoteIds.GetDat(Signature[k]);
        for (TIntSet::TIter l = CurSet.BegI(); l < CurSet.EndI(); l++) {
          TInt Key = l.GetKey();
          // AddDat(Key) yields the existing signature or a new empty one.
          // A signature of length <= j has no entry for permutation j yet,
          // so k is the first (minimal) position for this quote; a new empty
          // signature always satisfies the test because j >= 0.
          TIntV& QuoteSignature = Inverted.AddDat(Key);
          if (QuoteSignature.Len() <= j) {
            QuoteSignature.Add(k);
          }
        }
      }
    }

    // Group quotes sharing the same band signature into one bucket.
    TIntV InvertedKeys;
    Inverted.GetKeyV(InvertedKeys);
    TInt InvertedLen = InvertedKeys.Len();
    for (int k = 0; k < InvertedLen; ++k) {
      TIntV& BandSignature = Inverted.GetDat(InvertedKeys[k]);
      BandBuckets.AddDat(BandSignature).AddKey(InvertedKeys[k]);
    }

    SignatureBandBuckets.Add(BandBuckets);
    Err("%d out of %d band signatures computed\n", i + 1, NumBands);
  }
  Err("Minhash step complete!\n");
}
示例#9
0
void TGraphKey::TakeGraph(const PNGraph& Graph, TIntPrV& NodeMap) {
  TIntSet NodeIdH;
  int n = 0;
  NodeMap.Gen(Graph->GetNodes(), 0);
  for (TNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++, n++) {
    NodeIdH.AddKey(NI.GetId());
    NodeMap.Add(TIntPr(NI.GetId(), n));
  }
  Nodes = Graph->GetNodes();
  EdgeV.Gen(Nodes, 0);
  for (TNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) {
    const int NewNId = NodeIdH.GetKeyId(NI.GetId());
    for (int i = 0; i < NI.GetOutDeg(); i++) {
      EdgeV.Add(TIntPr(NewNId, NodeIdH.GetKeyId(NI.GetOutNId(i))));
    }
  }
  EdgeV.Sort(true);
  EdgeV.Pack();
}
示例#10
0
文件: gstat.cpp 项目: Accio/snap
void TGStatVec::SaveTxt(const TStr& FNmPref, const TStr& Desc) const {
  FILE *F = fopen(TStr::Fmt("growth.%s.tab", FNmPref.CStr()).CStr(), "wt");
  fprintf(F, "# %s\n", Desc.CStr());
  fprintf(F, "# %s", TTmInfo::GetTmUnitStr(TmUnit).CStr());
  TIntSet StatValSet;
  for (int i = 0; i < Len(); i++) {
    for (int v = gsvNone; v < gsvMx; v++) {
      if (At(i)->HasVal(TGStatVal(v))) { StatValSet.AddKey(v); }
    }
  }
  TIntV StatValV;  StatValSet.GetKeyV(StatValV);  StatValV.Sort();
  for (int sv = 0; sv < StatValV.Len(); sv++) {
    fprintf(F, "\t%s", TGStat::GetValStr(TGStatVal(StatValV[sv].Val)).CStr()); }
  fprintf(F, "Time\n");
  for (int i = 0; i < Len(); i++) {
    const TGStat& G = *At(i);
    for (int sv = 0; sv < StatValV.Len(); sv++) {
      fprintf(F, "%g\t", G.GetVal(TGStatVal(StatValV[sv].Val))); }
    fprintf(F, "%s\n", G.GetTmStr().CStr());
  }
  fclose(F);
}
示例#11
0
/// Barabasi-Albert model of scale-free graphs.
/// The graph has power-law degree distribution.
/// See: Emergence of scaling in random networks by Barabasi and Albert.
/// URL: http://arxiv.org/abs/cond-mat/9910332
///
/// Starts from the single edge (0,1). Every later node picks up to NodeOutDeg
/// distinct existing nodes, sampled from NIdV, which holds each node id once
/// per incident edge endpoint (so sampling is proportional to degree).
///
/// @param Nodes       number of nodes to generate (nodes 0 and 1 are always created)
/// @param NodeOutDeg  number of edges each new node tries to create
/// @param Rnd         random number generator used for all sampling
/// @return the generated undirected graph
PUNGraph GenPrefAttach(const int& Nodes, const int& NodeOutDeg, TRnd& Rnd) {
  PUNGraph GraphPt = PUNGraph::New();
  TUNGraph& Graph = *GraphPt;
  Graph.Reserve(Nodes, NodeOutDeg*Nodes);
  TIntV NIdV(NodeOutDeg*Nodes, 0);
  // first edge
  Graph.AddNode(0);  Graph.AddNode(1);
  NIdV.Add(0);  NIdV.Add(1);
  Graph.AddEdge(0, 1);
  TIntSet NodeSet;
  for (int node = 2; node < Nodes; node++) {
    NodeSet.Clr(false);
    // "NodeSet.Len() < node" stops the loop when fewer than NodeOutDeg
    // distinct nodes exist yet.
    while (NodeSet.Len() < NodeOutDeg && NodeSet.Len() < node) {
      // Sample with the caller-supplied generator; the global TInt::Rnd
      // would ignore the Rnd argument (and any seed the caller set on it).
      NodeSet.AddKey(NIdV[Rnd.GetUniDevInt(NIdV.Len())]);
    }
    const int N = Graph.AddNode();
    for (int i = 0; i < NodeSet.Len(); i++) {
      Graph.AddEdge(N, NodeSet[i]);
      // record both endpoints so future sampling reflects the new degrees
      NIdV.Add(N);
      NIdV.Add(NodeSet[i]);
    }
  }
  return GraphPt;
}
示例#12
0
/// Newton method: DEPRECATED
/// Repeatedly sweeps over the nodes and, for each (node, candidate community)
/// pair, adjusts the membership value with up to 10 one-dimensional Newton
/// steps, storing the result via AddCom()/DelCom().
/// Thres:   relative likelihood improvement at or below which the sweep loop stops.
/// MaxIter: the outer loop runs while iter < MaxIter (checked between sweeps only).
/// PlotNm:  when non-empty, likelihood samples are collected and plotted and
///          progress is printed.
/// Returns the number of node visits performed (iter).
int TAGMFast::MLENewton(const double& Thres, const int& MaxIter, const TStr PlotNm) {
  TExeTm ExeTm;
  int iter = 0, PrevIter = 0;
  TIntFltPrV IterLV;
  double PrevL = TFlt::Mn, CurL;
  TUNGraph::TNodeI UI;
  TIntV NIdxV;
  G->GetNIdV(NIdxV);
  int CID, UID, NewtonIter;
  double Fuc, PrevFuc, Grad, H;
  while(iter < MaxIter) {
    // visit the nodes in a fresh random order on every sweep
    NIdxV.Shuffle(Rnd);
    // iter counts node visits; it also advances for nodes skipped via 'continue'
    // NOTE(review): the bound is F.Len() while the index goes into NIdxV --
    // presumably both have one entry per node; confirm.
    for (int ui = 0; ui < F.Len(); ui++, iter++) {
      // sample the likelihood once every G->GetNodes() visits when plotting
      if (! PlotNm.Empty() && iter % G->GetNodes() == 0) {
        IterLV.Add(TIntFltPr(iter, Likelihood(false)));
      }
      UID = NIdxV[ui];
      //find set of candidate c (we only need to consider c to which a neighbor of u belongs to)
      TIntSet CIDSet;
      UI = G->GetNI(UID);
      if (UI.GetDeg() == 0) { //if the node is isolated, clear its membership and skip
        if (! F[UID].Empty()) { F[UID].Clr(); }
        continue;
      }
      for (int e = 0; e < UI.GetDeg(); e++) {
        // neighbors listed in HOVIDSV[UID] are left out
        if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
        TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)];
        for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) {
          CIDSet.AddKey(CI.GetKey());
        }
      }
      // NOTE(review): DelCom() is called while iterating F[UID]; if DelCom
      // removes the entry from F[UID], verify the iterator stays valid.
      for (TIntFltH::TIter CI = F[UID].BegI(); CI < F[UID].EndI(); CI++) { //remove the community membership which U does not share with its neighbors
        if (! CIDSet.IsKey(CI.GetKey())) {
          DelCom(UID, CI.GetKey());
        }
      }
      if (CIDSet.Empty()) { continue; }
      for (TIntSet::TIter CI = CIDSet.BegI(); CI < CIDSet.EndI(); CI++) {
        CID = CI.GetKey();
        //optimize for UID, CID
        //compute constants
        // one entry per neighbor; entries of neighbors skipped below keep
        // whatever value TFltV(UI.GetDeg()) constructs them with
        TFltV AlphaKV(UI.GetDeg());
        for (int e = 0; e < UI.GetDeg(); e++) {
          if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
          AlphaKV[e] = (1 - PNoCom) * exp(- DotProduct(UID, UI.GetNbrNId(e)) + GetCom(UI.GetNbrNId(e), CID) * GetCom(UID, CID));
          IAssertR(AlphaKV[e] <= 1.0, TStr::Fmt("AlphaKV=%f, %f, %f", AlphaKV[e].Val, PNoCom.Val, GetCom(UI.GetNbrNId(e), CID)));
        }
        Fuc = GetCom(UID, CID);
        PrevFuc = Fuc;
        // comma operator: assigns Grad, then resets H
        Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0;
        // skip the Newton iteration when the gradient lies within [-0.1, 1e-3]
        if (Grad <= 1e-3 && Grad >= -0.1) { continue; }
        NewtonIter = 0;
        // at most 10 Newton steps per (UID, CID) pair
        while (NewtonIter++ < 10) {
          Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0;
          H = HessianForOneVar(AlphaKV, UID, CID, Fuc);
          // at the lower bound 0 a non-positive gradient cannot move Fuc further
          if (Fuc == 0.0 && Grad <= 0.0) { Grad = 0.0; }
          if (fabs(Grad) < 1e-3) { break; }
          // zero Hessian: no Newton step is defined; drop the membership
          if (H == 0.0) { Fuc = 0.0; break; }
          double NewtonStep = - Grad / H;
          // limit how far a single step may decrease Fuc
          if (NewtonStep < -0.5) { NewtonStep = - 0.5; }
          Fuc += NewtonStep;
          // keep Fuc non-negative
          if (Fuc < 0.0) { Fuc = 0.0; }
        }
        // a zero value removes the membership, anything else stores it
        if (Fuc == 0.0) {
          DelCom(UID, CID);
        }
        else {
          AddCom(UID, CID, Fuc);
        }
      }
    }
    // convergence is tested only after more than 10000 visits and at least
    // 2 * G->GetNodes() visits since the previous test
    if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) {
      PrevIter = iter;
      CurL = Likelihood();
      if (PrevL > TFlt::Mn && ! PlotNm.Empty()) {
        printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL,  CurL - PrevL);
      }
      fflush(stdout);
      // stop once the improvement is no larger than Thres * |PrevL|
      if (CurL - PrevL <= Thres * fabs(PrevL)) { break; }
      else { PrevL = CurL; }
    }
    
  }
  if (! PlotNm.Empty()) {
    printf("\nMLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr());
    TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
  }
  return iter;
}
示例#13
0
/// Fills min-hash band buckets for all quotes shorter than WordWindow words.
///
/// ComputeSignatures() supplies, per shingle, a vector of
/// NumBands * BandSize values. For each quote and each band, the band
/// signature is the '-'-joined list of per-position minima over the quote's
/// words; its MD5 selects the bucket the quote id is added to.
/// Quotes with WordWindow or more words are not bucketed.
///
/// @param QB                    quote base providing the id -> quote table
/// @param Shingles              set of shingles to compute signatures for
/// @param SignatureBandBuckets  output: NumBands tables (band signature -> quote ids) are appended
void LSH::MinHash(TQuoteBase *QB, THashSet<TMd5Sig>& Shingles,
    TVec<THash<TMd5Sig, TIntSet> >& SignatureBandBuckets) {
  Err("Creating buckets...\n");
  THash < TMd5Sig, TIntV > Signatures;
  ComputeSignatures(Shingles, Signatures, NumBands * BandSize);

  // bucket creation
  for (int i = 0; i < NumBands; ++i) {
    SignatureBandBuckets.Add(THash<TMd5Sig, TIntSet>());
  }

  // bucket filling
  int NumShingles = Shingles.Len();
  THash<TInt, TQuote> Quotes;
  QB->GetIdToTQuotes(Quotes);

  THash<TInt, TQuote>::TIter CurI = Quotes.BegI();
  THash<TInt, TQuote>::TIter EndI = Quotes.EndI();

  for (; CurI < EndI; CurI++) {
    // Refer to the quote inside the local table instead of copying it.
    TQuote& Q = CurI.GetDat();

    TStrV Content;
    Q.GetParsedContent(Content);
    TInt Id = Q.GetId();

    // signature for quote: one signature vector per word
    int ContentLen = Content.Len();
    TVec < TIntV > Signature;
    for (int i = 0; i < ContentLen; i++) {
      const TMd5Sig ShingleMd5(Content[i]);
      Signature.Add(Signatures.GetDat(ShingleMd5));
    }

    // only quotes shorter than WordWindow words are placed in buckets
    if (ContentLen >= WordWindow) { continue; }

    // place in bucket
    for (int i = 0; i < NumBands; ++i) {
      TStr Sig;
      for (int j = 0; j < BandSize; ++j) {
        int CurSig = i * BandSize + j;

        // minimum over the quote's words at this signature position,
        // starting from NumShingles
        TInt MinVal = NumShingles;
        for (int k = 0; k < ContentLen; k++) {
          if (Signature[k][CurSig] < MinVal) {
            MinVal = Signature[k][CurSig];
          }
        }
        Sig += MinVal.GetStr() + "-";
      }

      const TMd5Sig SigMd5(Sig);
      // AddDat(Key) returns a reference to the (possibly new) bucket, so the
      // id is inserted in place instead of copying the bucket out and back.
      SignatureBandBuckets[i].AddDat(SigMd5).AddKey(Id);
    }
  }
  Err("Minhash step complete!\n");
}