// I embarassingly don't know how templating works. void QuoteGraph::CompareUsingMinHash(TVec<THash<TMd5Sig, TIntSet> >& BucketsVector) { THashSet<TIntPr> EdgeCache; int Count = 0; int RealCount = 0; Err("Beginning edge creation step...\n"); for (int i = 0; i < BucketsVector.Len(); i++) { Err("Processing band signature %d of %d - %d signatures\n", i+1, BucketsVector.Len(), BucketsVector[i].Len()); TVec<TMd5Sig> Buckets; BucketsVector[i].GetKeyV(Buckets); TVec<TMd5Sig>::TIter BucketEnd = Buckets.EndI(); for (TVec<TMd5Sig>::TIter BucketSig = Buckets.BegI(); BucketSig < BucketEnd; BucketSig++) { TIntSet Bucket = BucketsVector[i].GetDat(*BucketSig); Count += Bucket.Len() * (Bucket.Len() - 1) / 2; for (TIntSet::TIter Quote1 = Bucket.BegI(); Quote1 < Bucket.EndI(); Quote1++) { TIntSet::TIter Quote1Copy = Quote1; Quote1Copy++; for (TIntSet::TIter Quote2 = Quote1Copy; Quote2 < Bucket.EndI(); Quote2++) { if (!EdgeCache.IsKey(TIntPr(Quote1.GetKey(), Quote2.GetKey())) && !EdgeCache.IsKey(TIntPr(Quote2.GetKey(), Quote1.GetKey()))) { EdgeCache.AddKey(TIntPr(Quote1.GetKey(), Quote2.GetKey())); EdgeCache.AddKey(TIntPr(Quote2.GetKey(), Quote1.GetKey())); RealCount++; AddEdgeIfSimilar(Quote1.GetKey(), Quote2.GetKey()); } } } } } fprintf(stderr, "NUMBER OF COMPARES: %d\n", Count); fprintf(stderr, "NUMBER OF REAL COMPARES: %d\n", RealCount); }
void QuoteGraph::CompareUsingShingles(THash<TMd5Sig, TIntSet>& Shingles) { int Count = 0; int RealCount = 0; TVec<TMd5Sig> ShingleKeys; Shingles.GetKeyV(ShingleKeys); THashSet<TIntPr> EdgeCache; for (int i = 0; i < ShingleKeys.Len(); i++) { if (i % 100 == 0) { Err("Processed %d out of %d shingles, count = %d\n", i, ShingleKeys.Len(), Count); } TIntSet Bucket; Shingles.IsKeyGetDat(ShingleKeys[i], Bucket); for (TIntSet::TIter Quote1 = Bucket.BegI(); Quote1 < Bucket.EndI(); Quote1++) { TIntSet::TIter Quote1Copy = Quote1; Quote1Copy++; for (TIntSet::TIter Quote2 = Quote1Copy; Quote2 < Bucket.EndI(); Quote2++) { if (!EdgeCache.IsKey(TIntPr(Quote1.GetKey(), Quote2.GetKey())) && !EdgeCache.IsKey(TIntPr(Quote2.GetKey(), Quote1.GetKey()))) { EdgeCache.AddKey(TIntPr(Quote1.GetKey(), Quote2.GetKey())); EdgeCache.AddKey(TIntPr(Quote2.GetKey(), Quote1.GetKey())); RealCount++; AddEdgeIfSimilar(Quote1.GetKey(), Quote2.GetKey()); } } } int Len = Bucket.Len() * (Bucket.Len() - 1) / 2; Count += Len; } fprintf(stderr, "NUMBER OF COMPARES: %d\n", Count); fprintf(stderr, "NUMBER OF REAL COMPARES: %d\n", RealCount); }
double TAGMFit::SelectLambdaSum(const TFltV& NewLambdaV, const TIntSet& ComK) { double Result = 0.0; for (TIntSet::TIter SI = ComK.BegI(); SI < ComK.EndI(); SI++) { IAssert(NewLambdaV[SI.GetKey()] >= 0); Result += NewLambdaV[SI.GetKey()]; } return Result; }
// YES I COPIED AND PASTED CODE my section leader would be so ashamed :D void LSH::MinHash(THash<TMd5Sig, TIntSet>& ShingleToQuoteIds, TVec<THash<TIntV, TIntSet> >& SignatureBandBuckets) { TRnd RandomGenerator; // TODO: make this "more random" by incorporating time for (int i = 0; i < NumBands; ++i) { THash < TInt, TIntV > Inverted; // (QuoteID, QuoteSignatureForBand) THash < TIntV, TIntSet > BandBuckets; // (BandSignature, QuoteIDs) for (int j = 0; j < BandSize; ++j) { // Create new signature TVec < TMd5Sig > Signature; ShingleToQuoteIds.GetKeyV(Signature); Signature.Shuffle(RandomGenerator); // Place in bucket - not very efficient int SigLen = Signature.Len(); for (int k = 0; k < SigLen; ++k) { TIntSet CurSet = ShingleToQuoteIds.GetDat(Signature[k]); for (TIntSet::TIter l = CurSet.BegI(); l < CurSet.EndI(); l++) { TInt Key = l.GetKey(); if (Inverted.IsKey(Key)) { TIntV CurSignature = Inverted.GetDat(Key); if (CurSignature.Len() <= j) { CurSignature.Add(k); Inverted.AddDat(Key, CurSignature); } } else { TIntV NewSignature; NewSignature.Add(k); Inverted.AddDat(Key, NewSignature); } } } } TIntV InvertedKeys; Inverted.GetKeyV(InvertedKeys); TInt InvertedLen = InvertedKeys.Len(); for (int k = 0; k < InvertedLen; ++k) { TIntSet Bucket; TIntV Signature = Inverted.GetDat(InvertedKeys[k]); if (BandBuckets.IsKey(Signature)) { Bucket = BandBuckets.GetDat(Signature); } Bucket.AddKey(InvertedKeys[k]); BandBuckets.AddDat(Signature, Bucket); } SignatureBandBuckets.Add(BandBuckets); Err("%d out of %d band signatures computed\n", i + 1, NumBands); } Err("Minhash step complete!\n"); }
/// Newton method: DEPRECATED int TAGMFast::MLENewton(const double& Thres, const int& MaxIter, const TStr PlotNm) { TExeTm ExeTm; int iter = 0, PrevIter = 0; TIntFltPrV IterLV; double PrevL = TFlt::Mn, CurL; TUNGraph::TNodeI UI; TIntV NIdxV; G->GetNIdV(NIdxV); int CID, UID, NewtonIter; double Fuc, PrevFuc, Grad, H; while(iter < MaxIter) { NIdxV.Shuffle(Rnd); for (int ui = 0; ui < F.Len(); ui++, iter++) { if (! PlotNm.Empty() && iter % G->GetNodes() == 0) { IterLV.Add(TIntFltPr(iter, Likelihood(false))); } UID = NIdxV[ui]; //find set of candidate c (we only need to consider c to which a neighbor of u belongs to) TIntSet CIDSet; UI = G->GetNI(UID); if (UI.GetDeg() == 0) { //if the node is isolated, clear its membership and skip if (! F[UID].Empty()) { F[UID].Clr(); } continue; } for (int e = 0; e < UI.GetDeg(); e++) { if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; } TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)]; for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) { CIDSet.AddKey(CI.GetKey()); } } for (TIntFltH::TIter CI = F[UID].BegI(); CI < F[UID].EndI(); CI++) { //remove the community membership which U does not share with its neighbors if (! CIDSet.IsKey(CI.GetKey())) { DelCom(UID, CI.GetKey()); } } if (CIDSet.Empty()) { continue; } for (TIntSet::TIter CI = CIDSet.BegI(); CI < CIDSet.EndI(); CI++) { CID = CI.GetKey(); //optimize for UID, CID //compute constants TFltV AlphaKV(UI.GetDeg()); for (int e = 0; e < UI.GetDeg(); e++) { if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; } AlphaKV[e] = (1 - PNoCom) * exp(- DotProduct(UID, UI.GetNbrNId(e)) + GetCom(UI.GetNbrNId(e), CID) * GetCom(UID, CID)); IAssertR(AlphaKV[e] <= 1.0, TStr::Fmt("AlphaKV=%f, %f, %f", AlphaKV[e].Val, PNoCom.Val, GetCom(UI.GetNbrNId(e), CID))); } Fuc = GetCom(UID, CID); PrevFuc = Fuc; Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0; if (Grad <= 1e-3 && Grad >= -0.1) { continue; } NewtonIter = 0; while (NewtonIter++ < 10) { Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0; H = HessianForOneVar(AlphaKV, UID, CID, Fuc); if (Fuc == 0.0 && Grad <= 0.0) { Grad = 0.0; } if (fabs(Grad) < 1e-3) { break; } if (H == 0.0) { Fuc = 0.0; break; } double NewtonStep = - Grad / H; if (NewtonStep < -0.5) { NewtonStep = - 0.5; } Fuc += NewtonStep; if (Fuc < 0.0) { Fuc = 0.0; } } if (Fuc == 0.0) { DelCom(UID, CID); } else { AddCom(UID, CID, Fuc); } } } if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) { PrevIter = iter; CurL = Likelihood(); if (PrevL > TFlt::Mn && ! PlotNm.Empty()) { printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL, CurL - PrevL); } fflush(stdout); if (CurL - PrevL <= Thres * fabs(PrevL)) { break; } else { PrevL = CurL; } } } if (! PlotNm.Empty()) { printf("\nMLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr()); TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q"); } return iter; }