// Simple test for Defrag()
TEST(THashSet, Defrag) {
  TIntSet* TestSet = new TIntSet();
  // fragment the set (IsKeyIdEqKeyN() will be false)
  TestSet->AddKey(6);
  TestSet->AddKey(4);
  EXPECT_EQ(1, TestSet->IsKeyIdEqKeyN());
  TestSet->AddKey(2);
  EXPECT_EQ(1, TestSet->IsKeyIdEqKeyN());
  TestSet->DelKey(2);
  EXPECT_EQ(0, TestSet->IsKeyIdEqKeyN());
  TestSet->Defrag();
  EXPECT_EQ(1, TestSet->IsKeyIdEqKeyN());
  TestSet->DelKey(4);
  EXPECT_EQ(0, TestSet->IsKeyIdEqKeyN());
  TestSet->Defrag();
  EXPECT_EQ(1, TestSet->IsKeyIdEqKeyN());
  // deleting a random key id below would not work on a fragmented set
  TestSet->DelKeyId(TestSet->GetRndKeyId(TInt::Rnd));
  EXPECT_EQ(0, TestSet->IsKeyIdEqKeyN());
  delete TestSet;
}
// Collect node NID and all of its neighbors into NBCmtyS.
void TAGMUtil::GetNbhCom(const PUNGraph& Graph, const int NID, TIntSet& NBCmtyS) {
  TUNGraph::TNodeI NI = Graph->GetNI(NID);
  NBCmtyS.Gen(NI.GetDeg());
  NBCmtyS.AddKey(NID);
  for (int e = 0; e < NI.GetDeg(); e++) {
    NBCmtyS.AddKey(NI.GetNbrNId(e));
  }
}
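// Hedged usage sketch (not from the original source): builds a small undirected
// graph and prints the neighborhood set of node 0, using only the TUNGraph/TIntSet
// calls shown above. The graph and node IDs are illustrative.
void GetNbhComExample() {
  PUNGraph Graph = TUNGraph::New();
  for (int n = 0; n < 4; n++) { Graph->AddNode(n); }
  Graph->AddEdge(0, 1);
  Graph->AddEdge(0, 2);
  Graph->AddEdge(2, 3);
  TIntSet NBCmtyS;
  TAGMUtil::GetNbhCom(Graph, 0, NBCmtyS);
  // expect the set {0, 1, 2}: the node itself plus its two neighbors
  for (TIntSet::TIter It = NBCmtyS.BegI(); It < NBCmtyS.EndI(); It++) {
    printf("%d ", It.GetKey().Val);
  }
  printf("\n");
}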
void LSH::ElCheapoHashing(TQuoteBase *QuoteBase, TInt ShingleLen,
    THash<TMd5Sig, TIntSet>& ShingleToQuoteIds) {
  fprintf(stderr, "Hashing shingles the el cheapo way...\n");
  TIntV QuoteIds;
  QuoteBase->GetAllQuoteIds(QuoteIds);
  for (int qt = 0; qt < QuoteIds.Len(); qt++) {
    if (qt % 1000 == 0) {
      fprintf(stderr, "%d out of %d completed\n", qt, QuoteIds.Len());
    }
    TQuote Q;
    QuoteBase->GetQuote(QuoteIds[qt], Q);
    // Put x-character (or x-word) shingles into the hash table; x is specified by ShingleLen
    TStr QContentStr;
    Q.GetParsedContentString(QContentStr);
    TChA QContentChA = TChA(QContentStr);
    for (int i = 0; i < QContentChA.Len() - ShingleLen + 1; i++) {
      TChA ShingleChA = TChA();
      for (int j = 0; j < ShingleLen; j++) {
        ShingleChA.AddCh(QContentChA.GetCh(i + j));
      }
      TStr Shingle = TStr(ShingleChA);
      const TMd5Sig ShingleMd5(Shingle);
      TIntSet ShingleQuoteIds;
      if (ShingleToQuoteIds.IsKey(ShingleMd5)) {
        ShingleQuoteIds = ShingleToQuoteIds.GetDat(ShingleMd5);
      }
      ShingleQuoteIds.AddKey(QuoteIds[qt]);
      ShingleToQuoteIds.AddDat(ShingleMd5, ShingleQuoteIds);
    }
  }
  Err("Done with el cheapo hashing!\n");
}
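// Hedged sketch (not from the original source) of the copy-out / add / write-back
// idiom used above and in WordHashing() below. IsKeyGetDat() fills the set when the
// key already exists and leaves it empty otherwise, so one code path covers both
// cases. The key type and names here are illustrative.
void AddPosting(THash<TStr, TIntSet>& Index, const TStr& Word, const int QuoteId) {
  TIntSet Postings;
  Index.IsKeyGetDat(Word, Postings);  // empty set if Word has not been seen yet
  Postings.AddKey(QuoteId);
  Index.AddDat(Word, Postings);       // write the updated posting set back
}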
/// Generate graph using the AGM model. CProbV = vector of Pc.
PUNGraph TAGM::GenAGM(TVec<TIntV>& CmtyVV, const TFltV& CProbV, TRnd& Rnd, const double PNoCom) {
  PUNGraph G = TUNGraph::New(100 * CmtyVV.Len(), -1);
  printf("AGM begins\n");
  for (int i = 0; i < CmtyVV.Len(); i++) {
    TIntV& CmtyV = CmtyVV[i];
    for (int u = 0; u < CmtyV.Len(); u++) {
      if (G->IsNode(CmtyV[u])) { continue; }
      G->AddNode(CmtyV[u]);
    }
    double Prob = CProbV[i];
    RndConnectInsideCommunity(G, CmtyV, Prob, Rnd);
  }
  if (PNoCom > 0.0) { // if we want to connect nodes that do not share any community
    TIntSet NIDS;
    for (int c = 0; c < CmtyVV.Len(); c++) {
      for (int u = 0; u < CmtyVV[c].Len(); u++) {
        NIDS.AddKey(CmtyVV[c][u]);
      }
    }
    TIntV NIDV;
    NIDS.GetKeyV(NIDV);
    RndConnectInsideCommunity(G, NIDV, PNoCom, Rnd);
  }
  printf("AGM completed (%d nodes %d edges)\n", G->GetNodes(), G->GetEdges());
  G->Defrag();
  return G;
}
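// Hedged usage sketch (not from the original source): two overlapping communities
// with different edge probabilities, plus a small probability PNoCom for node pairs
// that share no community. The community contents, probabilities, and seed are
// illustrative only.
PUNGraph GenAGMExample() {
  TVec<TIntV> CmtyVV;
  TIntV Cmty1, Cmty2;
  for (int n = 0; n < 20; n++) { Cmty1.Add(n); }    // community 1: nodes 0..19
  for (int n = 10; n < 30; n++) { Cmty2.Add(n); }   // community 2: nodes 10..29 (overlap 10..19)
  CmtyVV.Add(Cmty1);
  CmtyVV.Add(Cmty2);
  TFltV CProbV;
  CProbV.Add(0.25);                                 // edge probability inside community 1
  CProbV.Add(0.15);                                 // edge probability inside community 2
  TRnd Rnd(1);                                      // fixed seed for reproducibility
  return TAGM::GenAGM(CmtyVV, CProbV, Rnd, 0.001);  // 0.001 = background edge probability
}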
bool TBagOfWords::Update(const TStrV& TokenStrV) {
  // Generate Ngrams if necessary
  TStrV NgramStrV;
  GenerateNgrams(TokenStrV, NgramStrV);
  // process tokens to update DF counts
  bool UpdateP = false;
  if (IsHashing()) {
    // consolidate tokens and get their hashed IDs
    TIntSet TokenIdH;
    for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
      const TStr& TokenStr = NgramStrV[TokenStrN];
      TInt TokenId = TokenStr.GetHashTrick() % HashDim;
      TokenIdH.AddKey(TokenId);
      if (IsStoreHashWords()) { HashWordV[TokenId].AddKey(TokenStr); }
    }
    // update document counts
    int KeyId = TokenIdH.FFirstKeyId();
    while (TokenIdH.FNextKeyId(KeyId)) {
      const int TokenId = TokenIdH.GetKey(KeyId);
      // update DF
      DocFqV[TokenId]++;
    }
  } else {
    // consolidate tokens
    TStrH TokenStrH;
    for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
      const TStr& TokenStr = NgramStrV[TokenStrN];
      TokenStrH.AddKey(TokenStr);
    }
    // update document counts and update the vocabulary with new tokens
    int KeyId = TokenStrH.FFirstKeyId();
    while (TokenStrH.FNextKeyId(KeyId)) {
      // get token
      const TStr& TokenStr = TokenStrH.GetKey(KeyId);
      // look the token up in the vocabulary
      int TokenId = TokenSet.GetKeyId(TokenStr);
      if (TokenId == -1) {
        // new token, remember the dimensionality change
        UpdateP = true;
        // remember the new token
        TokenId = TokenSet.AddKey(TokenStr);
        // increase the document count table
        const int TokenDfId = DocFqV.Add(0);
        // increase also the old count table
        OldDocFqV.Add(0.0);
        // make sure the DF vectors and TokenSet are still in sync
        IAssert(TokenId == TokenDfId);
        IAssert(DocFqV.Len() == OldDocFqV.Len());
      }
      // document count update
      DocFqV[TokenId]++;
    }
  }
  // update the document count
  Docs++;
  // tell if the dimension changed
  return UpdateP;
}
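// Hedged sketch (not from the original source) of the key-id iteration idiom used in
// Update() above: AddKey() is a no-op for a key that is already present, FFirstKeyId()
// returns the sentinel position before the first key, and FNextKeyId() advances KeyId
// and returns false once the keys are exhausted. The values are illustrative.
void IterateKeysExample() {
  TIntSet TokenIdH;
  TokenIdH.AddKey(7);
  TokenIdH.AddKey(42);
  TokenIdH.AddKey(7);   // duplicate; the set still holds exactly two keys
  int KeyId = TokenIdH.FFirstKeyId();
  while (TokenIdH.FNextKeyId(KeyId)) {
    printf("key: %d\n", TokenIdH.GetKey(KeyId).Val);
  }
}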
// Helper: return the GroupSet for a given NodeID
void GetGroupSet(int NId, TIntSet& GroupSet) {
  GroupSet.Clr();
  switch (NId) {
    case 0:
      GroupSet.AddKey(2);
      GroupSet.AddKey(4);
      GroupSet.AddKey(5);
      break;
    case 1:
      // empty set
      break;
    case 2:
      GroupSet.AddKey(0);
      GroupSet.AddKey(3);
      GroupSet.AddKey(5);
      break;
    case 3:
      GroupSet.AddKey(0);
      break;
    case 4:
      GroupSet.AddKey(0);
      break;
    case 5:
      GroupSet.AddKey(0);
      GroupSet.AddKey(1);
      break;
    default:
      ASSERT_FALSE(true); // NId outside the constructed graph
      break;
  }
}
void LSH::WordHashing(TQuoteBase *QuoteBase, THash<TMd5Sig, TIntSet>& ShingleToQuoteIds) {
  fprintf(stderr, "Hashing shingles using words...\n");
  TIntV QuoteIds;
  QuoteBase->GetAllQuoteIds(QuoteIds);
  THash<TStr, TIntSet> Temp;
  for (int qt = 0; qt < QuoteIds.Len(); qt++) {
    if (qt % 1000 == 0) {
      fprintf(stderr, "%d out of %d completed\n", qt, QuoteIds.Len());
    }
    TQuote Q;
    QuoteBase->GetQuote(QuoteIds[qt], Q);
    TStrV Content;
    Q.GetParsedContent(Content);
    int ContentLen = Content.Len();
    for (int i = 0; i < ContentLen; i++) {
      const TMd5Sig ShingleMd5(Content[i]);
      TIntSet ShingleQuoteIds;
      ShingleToQuoteIds.IsKeyGetDat(ShingleMd5, ShingleQuoteIds);
      ShingleQuoteIds.AddKey(QuoteIds[qt]);
      ShingleToQuoteIds.AddDat(ShingleMd5, ShingleQuoteIds);

      ///// COMMENT OUT LATER: word-frequency diagnostics
      TIntSet TempSet;
      Temp.IsKeyGetDat(Content[i], TempSet);
      TempSet.AddKey(QuoteIds[qt]);
      Temp.AddDat(Content[i], TempSet);
    }
  }
  // diagnostics: print the 100 most frequent words and their quote counts
  TVec<TStr> ShingleKeys;
  Temp.GetKeyV(ShingleKeys);
  ShingleKeys.SortCmp(TCmpSetByLen(false, &Temp));
  for (int i = 0; i < 100 && i < ShingleKeys.Len(); i++) {
    TIntSet TempSet = Temp.GetDat(ShingleKeys[i]);
    Err("%d: %s - %d \n", i, ShingleKeys[i].CStr(), TempSet.Len());
  }
  Err("Done with word hashing!\n");
}
// YES I COPIED AND PASTED CODE my section leader would be so ashamed :D
void LSH::MinHash(THash<TMd5Sig, TIntSet>& ShingleToQuoteIds,
    TVec<THash<TIntV, TIntSet> >& SignatureBandBuckets) {
  TRnd RandomGenerator; // TODO: make this "more random" by incorporating time
  for (int i = 0; i < NumBands; ++i) {
    THash<TInt, TIntV> Inverted;       // (QuoteID, QuoteSignatureForBand)
    THash<TIntV, TIntSet> BandBuckets; // (BandSignature, QuoteIDs)
    for (int j = 0; j < BandSize; ++j) {
      // Create a new signature
      TVec<TMd5Sig> Signature;
      ShingleToQuoteIds.GetKeyV(Signature);
      Signature.Shuffle(RandomGenerator);

      // Place in bucket - not very efficient
      int SigLen = Signature.Len();
      for (int k = 0; k < SigLen; ++k) {
        TIntSet CurSet = ShingleToQuoteIds.GetDat(Signature[k]);
        for (TIntSet::TIter l = CurSet.BegI(); l < CurSet.EndI(); l++) {
          TInt Key = l.GetKey();
          if (Inverted.IsKey(Key)) {
            TIntV CurSignature = Inverted.GetDat(Key);
            if (CurSignature.Len() <= j) {
              CurSignature.Add(k);
              Inverted.AddDat(Key, CurSignature);
            }
          } else {
            TIntV NewSignature;
            NewSignature.Add(k);
            Inverted.AddDat(Key, NewSignature);
          }
        }
      }
    }

    TIntV InvertedKeys;
    Inverted.GetKeyV(InvertedKeys);
    TInt InvertedLen = InvertedKeys.Len();
    for (int k = 0; k < InvertedLen; ++k) {
      TIntSet Bucket;
      TIntV Signature = Inverted.GetDat(InvertedKeys[k]);
      if (BandBuckets.IsKey(Signature)) {
        Bucket = BandBuckets.GetDat(Signature);
      }
      Bucket.AddKey(InvertedKeys[k]);
      BandBuckets.AddDat(Signature, Bucket);
    }
    SignatureBandBuckets.Add(BandBuckets);
    Err("%d out of %d band signatures computed\n", i + 1, NumBands);
  }
  Err("Minhash step complete!\n");
}
// Store the graph as a canonical edge list: node IDs are renumbered to 0..Nodes-1
// via their key ids in NodeIdH; NodeMap records the (old id, new id) pairs.
void TGraphKey::TakeGraph(const PNGraph& Graph, TIntPrV& NodeMap) {
  TIntSet NodeIdH;
  int n = 0;
  NodeMap.Gen(Graph->GetNodes(), 0);
  for (TNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++, n++) {
    NodeIdH.AddKey(NI.GetId());
    NodeMap.Add(TIntPr(NI.GetId(), n));
  }
  Nodes = Graph->GetNodes();
  EdgeV.Gen(Nodes, 0);
  for (TNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) {
    const int NewNId = NodeIdH.GetKeyId(NI.GetId());
    for (int i = 0; i < NI.GetOutDeg(); i++) {
      EdgeV.Add(TIntPr(NewNId, NodeIdH.GetKeyId(NI.GetOutNId(i))));
    }
  }
  EdgeV.Sort(true);
  EdgeV.Pack();
}
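// Minimal sketch (not from the original source) of the renumbering idiom above:
// inserting IDs into a THashSet assigns them consecutive key ids (0, 1, 2, ...),
// assuming no deletions, so GetKeyId() maps arbitrary IDs onto a dense range.
// Names are illustrative.
void RenumberIds(const TIntV& RawIdV, TIntV& DenseIdV) {
  TIntSet IdSet;
  for (int i = 0; i < RawIdV.Len(); i++) { IdSet.AddKey(RawIdV[i]); }
  DenseIdV.Gen(RawIdV.Len(), 0);
  for (int i = 0; i < RawIdV.Len(); i++) {
    DenseIdV.Add(IdSet.GetKeyId(RawIdV[i])); // duplicates in RawIdV map to the same dense id
  }
}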
void TGStatVec::SaveTxt(const TStr& FNmPref, const TStr& Desc) const {
  FILE *F = fopen(TStr::Fmt("growth.%s.tab", FNmPref.CStr()).CStr(), "wt");
  fprintf(F, "# %s\n", Desc.CStr());
  fprintf(F, "# %s", TTmInfo::GetTmUnitStr(TmUnit).CStr());
  // collect the set of statistics present in at least one snapshot
  TIntSet StatValSet;
  for (int i = 0; i < Len(); i++) {
    for (int v = gsvNone; v < gsvMx; v++) {
      if (At(i)->HasVal(TGStatVal(v))) { StatValSet.AddKey(v); }
    }
  }
  TIntV StatValV;
  StatValSet.GetKeyV(StatValV);
  StatValV.Sort();
  for (int sv = 0; sv < StatValV.Len(); sv++) {
    fprintf(F, "\t%s", TGStat::GetValStr(TGStatVal(StatValV[sv].Val)).CStr());
  }
  fprintf(F, "\tTime\n");
  for (int i = 0; i < Len(); i++) {
    const TGStat& G = *At(i);
    for (int sv = 0; sv < StatValV.Len(); sv++) {
      fprintf(F, "%g\t", G.GetVal(TGStatVal(StatValV[sv].Val)));
    }
    fprintf(F, "%s\n", G.GetTmStr().CStr());
  }
  fclose(F);
}
/// Barabasi-Albert model of scale-free graphs.
/// The graph has a power-law degree distribution.
/// See: Emergence of scaling in random networks by Barabasi and Albert.
/// URL: http://arxiv.org/abs/cond-mat/9910332
PUNGraph GenPrefAttach(const int& Nodes, const int& NodeOutDeg, TRnd& Rnd) {
  PUNGraph GraphPt = PUNGraph::New();
  TUNGraph& Graph = *GraphPt;
  Graph.Reserve(Nodes, NodeOutDeg*Nodes);
  TIntV NIdV(NodeOutDeg*Nodes, 0);
  // first edge
  Graph.AddNode(0);
  Graph.AddNode(1);
  NIdV.Add(0);
  NIdV.Add(1);
  Graph.AddEdge(0, 1);
  TIntSet NodeSet;
  for (int node = 2; node < Nodes; node++) {
    NodeSet.Clr(false);
    // sample NodeOutDeg distinct targets, preferentially by degree (NIdV holds each
    // node id once per incident edge, so sampling it uniformly is degree-biased)
    while (NodeSet.Len() < NodeOutDeg && NodeSet.Len() < node) {
      NodeSet.AddKey(NIdV[Rnd.GetUniDevInt(NIdV.Len())]); // use the supplied random generator
    }
    const int N = Graph.AddNode();
    for (int i = 0; i < NodeSet.Len(); i++) {
      Graph.AddEdge(N, NodeSet[i]);
      NIdV.Add(N);
      NIdV.Add(NodeSet[i]);
    }
  }
  return GraphPt;
}
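// Hedged usage sketch (not from the original source): generate a 1000-node
// preferential-attachment graph where each new node attaches with 3 edges. Assumes
// GenPrefAttach is visible in the current scope (in SNAP it lives in the TSnap
// namespace); the seed and sizes are illustrative.
void GenPrefAttachExample() {
  TRnd Rnd(1); // fixed seed for reproducibility
  PUNGraph G = GenPrefAttach(1000, 3, Rnd);
  printf("nodes: %d, edges: %d\n", G->GetNodes(), G->GetEdges());
}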
/// Newton method: DEPRECATED
int TAGMFast::MLENewton(const double& Thres, const int& MaxIter, const TStr PlotNm) {
  TExeTm ExeTm;
  int iter = 0, PrevIter = 0;
  TIntFltPrV IterLV;
  double PrevL = TFlt::Mn, CurL;
  TUNGraph::TNodeI UI;
  TIntV NIdxV;
  G->GetNIdV(NIdxV);
  int CID, UID, NewtonIter;
  double Fuc, PrevFuc, Grad, H;
  while (iter < MaxIter) {
    NIdxV.Shuffle(Rnd);
    for (int ui = 0; ui < F.Len(); ui++, iter++) {
      if (! PlotNm.Empty() && iter % G->GetNodes() == 0) {
        IterLV.Add(TIntFltPr(iter, Likelihood(false)));
      }
      UID = NIdxV[ui];
      // find the set of candidate communities c (only c's that a neighbor of u belongs to)
      TIntSet CIDSet;
      UI = G->GetNI(UID);
      if (UI.GetDeg() == 0) { // if the node is isolated, clear its membership and skip it
        if (! F[UID].Empty()) { F[UID].Clr(); }
        continue;
      }
      for (int e = 0; e < UI.GetDeg(); e++) {
        if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
        TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)];
        for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) {
          CIDSet.AddKey(CI.GetKey());
        }
      }
      // remove the community memberships which U does not share with its neighbors
      for (TIntFltH::TIter CI = F[UID].BegI(); CI < F[UID].EndI(); CI++) {
        if (! CIDSet.IsKey(CI.GetKey())) {
          DelCom(UID, CI.GetKey());
        }
      }
      if (CIDSet.Empty()) { continue; }
      for (TIntSet::TIter CI = CIDSet.BegI(); CI < CIDSet.EndI(); CI++) {
        CID = CI.GetKey();
        // optimize for (UID, CID): compute the constants
        TFltV AlphaKV(UI.GetDeg());
        for (int e = 0; e < UI.GetDeg(); e++) {
          if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
          AlphaKV[e] = (1 - PNoCom) * exp(- DotProduct(UID, UI.GetNbrNId(e)) + GetCom(UI.GetNbrNId(e), CID) * GetCom(UID, CID));
          IAssertR(AlphaKV[e] <= 1.0, TStr::Fmt("AlphaKV=%f, %f, %f", AlphaKV[e].Val, PNoCom.Val, GetCom(UI.GetNbrNId(e), CID)));
        }
        Fuc = GetCom(UID, CID);
        PrevFuc = Fuc;
        Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc);
        H = 0.0;
        if (Grad <= 1e-3 && Grad >= -0.1) { continue; }
        NewtonIter = 0;
        while (NewtonIter++ < 10) {
          Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc);
          H = HessianForOneVar(AlphaKV, UID, CID, Fuc);
          if (Fuc == 0.0 && Grad <= 0.0) { Grad = 0.0; }
          if (fabs(Grad) < 1e-3) { break; }
          if (H == 0.0) { Fuc = 0.0; break; }
          double NewtonStep = - Grad / H;
          if (NewtonStep < -0.5) { NewtonStep = -0.5; } // damp large steps
          Fuc += NewtonStep;
          if (Fuc < 0.0) { Fuc = 0.0; } // memberships must stay non-negative
        }
        if (Fuc == 0.0) {
          DelCom(UID, CID);
        } else {
          AddCom(UID, CID, Fuc);
        }
      }
    }
    if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) {
      PrevIter = iter;
      CurL = Likelihood();
      if (PrevL > TFlt::Mn && ! PlotNm.Empty()) {
        printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL, CurL - PrevL);
      }
      fflush(stdout);
      if (CurL - PrevL <= Thres * fabs(PrevL)) {
        break;
      } else {
        PrevL = CurL;
      }
    }
  }
  if (! PlotNm.Empty()) {
    printf("\nMLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr());
    TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
  }
  return iter;
}
void LSH::MinHash(TQuoteBase *QB, THashSet<TMd5Sig>& Shingles,
    TVec<THash<TMd5Sig, TIntSet> >& SignatureBandBuckets) {
  Err("Creating buckets...\n");
  THash<TMd5Sig, TIntV> Signatures;
  ComputeSignatures(Shingles, Signatures, NumBands * BandSize);

  // bucket creation
  for (int i = 0; i < NumBands; ++i) {
    SignatureBandBuckets.Add(THash<TMd5Sig, TIntSet>());
  }

  // bucket filling
  int NumShingles = Shingles.Len();
  THash<TInt, TQuote> Quotes;
  QB->GetIdToTQuotes(Quotes);
  THash<TInt, TQuote>::TIter CurI = Quotes.BegI();
  THash<TInt, TQuote>::TIter EndI = Quotes.EndI();
  TQuote Q; // SKYFALL
  for (; CurI < EndI; CurI++) {
    Q = CurI.GetDat();
    TStrV Content;
    Q.GetParsedContent(Content);
    TInt Id = Q.GetId();

    // signature for the quote
    int ContentLen = Content.Len();
    TVec<TIntV> Signature;
    for (int i = 0; i < ContentLen; i++) {
      const TMd5Sig ShingleMd5(Content[i]);
      Signature.Add(Signatures.GetDat(ShingleMd5));
    }

    // place in bucket
    if (ContentLen < WordWindow) {
      for (int i = 0; i < NumBands; ++i) {
        TStr Sig;
        for (int j = 0; j < BandSize; ++j) {
          int CurSig = i * BandSize + j;
          TInt min = NumShingles;
          for (int k = 0; k < ContentLen; k++) {
            if (Signature[k][CurSig] < min) {
              min = Signature[k][CurSig];
            }
          }
          Sig += min.GetStr() + "-";
        }
        //Err(Sig.CStr());
        const TMd5Sig SigMd5(Sig);
        TIntSet Bucket;
        SignatureBandBuckets[i].IsKeyGetDat(SigMd5, Bucket);
        Bucket.AddKey(Id);
        SignatureBandBuckets[i].AddDat(SigMd5, Bucket);
      }
    } // quotes with ContentLen >= WordWindow are skipped
  }
  Err("Minhash step complete!\n");
}