void IncrementalQuoteGraph::AddNodes() { TIntSet::TIter QuoteIdsEnd = NewQuotes.EndI(); for (TIntSet::TIter QuoteId = NewQuotes.BegI(); QuoteId < QuoteIdsEnd; QuoteId++) { if (!QGraph->IsNode(QuoteId.GetKey())) QGraph->AddNode(QuoteId.GetKey()); } }
void TAGMFit::InitNodeData() { TSnap::DelSelfEdges(G); NIDComVH.Gen(G->GetNodes()); for (TUNGraph::TNodeI NI = G->BegNI(); NI < G->EndNI(); NI++) { NIDComVH.AddDat(NI.GetId()); } TAGMUtil::GetNodeMembership(NIDComVH, CIDNSetV); GetEdgeJointCom(); LambdaV.Gen(CIDNSetV.Len()); for (int c = 0; c < CIDNSetV.Len(); c++) { int MaxE = (CIDNSetV[c].Len()) * (CIDNSetV[c].Len() - 1) / 2; if (MaxE < 2) { LambdaV[c] = MaxLambda; } else{ LambdaV[c] = -log((double) (MaxE - ComEdgesV[c]) / MaxE); } if (LambdaV[c] > MaxLambda) { LambdaV[c] = MaxLambda; } if (LambdaV[c] < MinLambda) { LambdaV[c] = MinLambda; } } NIDCIDPrS.Gen(G->GetNodes() * 10); for (int c = 0; c < CIDNSetV.Len(); c++) { for (TIntSet::TIter SI = CIDNSetV[c].BegI(); SI < CIDNSetV[c].EndI(); SI++) { NIDCIDPrS.AddKey(TIntPr(SI.GetKey(), c)); } } }
/// Adds up the entries of NewLambdaV selected by the community ids in ComK.
/// Each selected entry is asserted to be non-negative before it is added.
/// Returns 0.0 when ComK is empty.
double TAGMFit::SelectLambdaSum(const TFltV& NewLambdaV, const TIntSet& ComK) {
  double Total = 0.0;
  TIntSet::TIter SI = ComK.BegI();
  while (SI < ComK.EndI()) {
    IAssert(NewLambdaV[SI.GetKey()] >= 0);
    Total += NewLambdaV[SI.GetKey()];
    SI++;
  }
  return Total;
}
/// Inverts a community -> members listing into a node -> communities map.
/// For every community index c and every node id in CmtyVV[c], c is added to
/// the set stored under that node id in NIDComVH. Existing entries of
/// NIDComVH are extended, not replaced; missing ones are created.
void TAGMUtil::GetNodeMembership(THash<TInt,TIntSet >& NIDComVH, const TVec<TIntSet>& CmtyVV) {
  const int NumCmty = CmtyVV.Len();
  for (int CID = 0; CID < NumCmty; CID++) {
    const TIntSet& Members = CmtyVV[CID];
    for (TIntSet::TIter MI = Members.BegI(); MI < Members.EndI(); MI++) {
      const int NID = MI.GetKey();
      TIntSet& NodeComs = NIDComVH.AddDat(NID);
      NodeComs.AddKey(CID);
    }
  }
}
// YES I COPIED AND PASTED CODE my section leader would be so ashamed :D void LSH::MinHash(THash<TMd5Sig, TIntSet>& ShingleToQuoteIds, TVec<THash<TIntV, TIntSet> >& SignatureBandBuckets) { TRnd RandomGenerator; // TODO: make this "more random" by incorporating time for (int i = 0; i < NumBands; ++i) { THash < TInt, TIntV > Inverted; // (QuoteID, QuoteSignatureForBand) THash < TIntV, TIntSet > BandBuckets; // (BandSignature, QuoteIDs) for (int j = 0; j < BandSize; ++j) { // Create new signature TVec < TMd5Sig > Signature; ShingleToQuoteIds.GetKeyV(Signature); Signature.Shuffle(RandomGenerator); // Place in bucket - not very efficient int SigLen = Signature.Len(); for (int k = 0; k < SigLen; ++k) { TIntSet CurSet = ShingleToQuoteIds.GetDat(Signature[k]); for (TIntSet::TIter l = CurSet.BegI(); l < CurSet.EndI(); l++) { TInt Key = l.GetKey(); if (Inverted.IsKey(Key)) { TIntV CurSignature = Inverted.GetDat(Key); if (CurSignature.Len() <= j) { CurSignature.Add(k); Inverted.AddDat(Key, CurSignature); } } else { TIntV NewSignature; NewSignature.Add(k); Inverted.AddDat(Key, NewSignature); } } } } TIntV InvertedKeys; Inverted.GetKeyV(InvertedKeys); TInt InvertedLen = InvertedKeys.Len(); for (int k = 0; k < InvertedLen; ++k) { TIntSet Bucket; TIntV Signature = Inverted.GetDat(InvertedKeys[k]); if (BandBuckets.IsKey(Signature)) { Bucket = BandBuckets.GetDat(Signature); } Bucket.AddKey(InvertedKeys[k]); BandBuckets.AddDat(Signature, Bucket); } SignatureBandBuckets.Add(BandBuckets); Err("%d out of %d band signatures computed\n", i + 1, NumBands); } Err("Minhash step complete!\n"); }
void TIncrementalClustering::KeepAtMostOneChildPerNode(PNGraph& G, TQuoteBase *QB, TDocBase *DB) { TIntSet::TIter EndNode = AffectedNodes.EndI(); for (TIntSet::TIter NodeId = AffectedNodes.BegI(); NodeId < EndNode; NodeId++) { TNGraph::TNodeI Node = G->GetNI(NodeId.GetKey()); TQuote SourceQuote; if (QB->GetQuote(Node.GetId(), SourceQuote)) { TInt NodeDegree = Node.GetOutDeg(); if (NodeDegree > 1) { TFlt MaxScore = 0; TInt MaxNodeId = 0; TIntV NodeV; // first pass: check to see if we are pointing to any old nodes - if so, they get higher // priority over the new ones for edge selection. bool ContainsOldNode = false; for (int i = 0; i < NodeDegree; ++i) { if (!NewQuotes.IsKey(Node.GetOutNId(i))) { ContainsOldNode = true; } } // modified edge selection: filter out new nodes if old ones exist. for (int i = 0; i < NodeDegree; ++i) { TInt CurNode = Node.GetOutNId(i); NodeV.Add(CurNode); TQuote DestQuote; if (QB->GetQuote(CurNode, DestQuote)) { TFlt EdgeScore = 0; if (!ContainsOldNode || !NewQuotes.IsKey(Node.GetOutNId(i))) { EdgeScore = ComputeEdgeScore(SourceQuote, DestQuote, DB); } if (EdgeScore > MaxScore) { MaxScore = EdgeScore; MaxNodeId = CurNode; } } } // remove all other edges, backwards to prevent indexing fail for (int i = 0; i < NodeV.Len(); i++) { if (NodeV[i] != MaxNodeId) { G->DelEdge(Node.GetId(), NodeV[i]); } } //printf("Out degree: %d out of %d\n", Node.GetOutDeg(), NodeDegree.Val); } } } fprintf(stderr, "finished deleting edges\n"); }
// Gradient of likelihood for P_c. void TAGMFit::GradLogLForLambda(TFltV& GradV) { GradV.Gen(LambdaV.Len()); TFltV SumEdgeProbsV(LambdaV.Len()); for (int e = 0; e < EdgeComVH.Len(); e++) { TIntSet& JointCom = EdgeComVH[e]; double LambdaSum = SelectLambdaSum(JointCom); double Puv = 1 - exp(- LambdaSum); if (JointCom.Len() == 0) { Puv = PNoCom; } for (TIntSet::TIter SI = JointCom.BegI(); SI < JointCom.EndI(); SI++) { SumEdgeProbsV[SI.GetKey()] += (1 - Puv) / Puv; } } for (int k = 0; k < LambdaV.Len(); k++) { int MaxEk = CIDNSetV[k].Len() * (CIDNSetV[k].Len() - 1) / 2; int NotEdgesInCom = MaxEk - ComEdgesV[k]; GradV[k] = SumEdgeProbsV[k] - (double) NotEdgesInCom; if (LambdaV[k] > 0.0 && RegCoef > 0.0) { //if regularization exists GradV[k] -= RegCoef; } } }
/// Newton method: DEPRECATED int TAGMFast::MLENewton(const double& Thres, const int& MaxIter, const TStr PlotNm) { TExeTm ExeTm; int iter = 0, PrevIter = 0; TIntFltPrV IterLV; double PrevL = TFlt::Mn, CurL; TUNGraph::TNodeI UI; TIntV NIdxV; G->GetNIdV(NIdxV); int CID, UID, NewtonIter; double Fuc, PrevFuc, Grad, H; while(iter < MaxIter) { NIdxV.Shuffle(Rnd); for (int ui = 0; ui < F.Len(); ui++, iter++) { if (! PlotNm.Empty() && iter % G->GetNodes() == 0) { IterLV.Add(TIntFltPr(iter, Likelihood(false))); } UID = NIdxV[ui]; //find set of candidate c (we only need to consider c to which a neighbor of u belongs to) TIntSet CIDSet; UI = G->GetNI(UID); if (UI.GetDeg() == 0) { //if the node is isolated, clear its membership and skip if (! F[UID].Empty()) { F[UID].Clr(); } continue; } for (int e = 0; e < UI.GetDeg(); e++) { if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; } TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)]; for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) { CIDSet.AddKey(CI.GetKey()); } } for (TIntFltH::TIter CI = F[UID].BegI(); CI < F[UID].EndI(); CI++) { //remove the community membership which U does not share with its neighbors if (! 
CIDSet.IsKey(CI.GetKey())) { DelCom(UID, CI.GetKey()); } } if (CIDSet.Empty()) { continue; } for (TIntSet::TIter CI = CIDSet.BegI(); CI < CIDSet.EndI(); CI++) { CID = CI.GetKey(); //optimize for UID, CID //compute constants TFltV AlphaKV(UI.GetDeg()); for (int e = 0; e < UI.GetDeg(); e++) { if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; } AlphaKV[e] = (1 - PNoCom) * exp(- DotProduct(UID, UI.GetNbrNId(e)) + GetCom(UI.GetNbrNId(e), CID) * GetCom(UID, CID)); IAssertR(AlphaKV[e] <= 1.0, TStr::Fmt("AlphaKV=%f, %f, %f", AlphaKV[e].Val, PNoCom.Val, GetCom(UI.GetNbrNId(e), CID))); } Fuc = GetCom(UID, CID); PrevFuc = Fuc; Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0; if (Grad <= 1e-3 && Grad >= -0.1) { continue; } NewtonIter = 0; while (NewtonIter++ < 10) { Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0; H = HessianForOneVar(AlphaKV, UID, CID, Fuc); if (Fuc == 0.0 && Grad <= 0.0) { Grad = 0.0; } if (fabs(Grad) < 1e-3) { break; } if (H == 0.0) { Fuc = 0.0; break; } double NewtonStep = - Grad / H; if (NewtonStep < -0.5) { NewtonStep = - 0.5; } Fuc += NewtonStep; if (Fuc < 0.0) { Fuc = 0.0; } } if (Fuc == 0.0) { DelCom(UID, CID); } else { AddCom(UID, CID, Fuc); } } } if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) { PrevIter = iter; CurL = Likelihood(); if (PrevL > TFlt::Mn && ! PlotNm.Empty()) { printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL, CurL - PrevL); } fflush(stdout); if (CurL - PrevL <= Thres * fabs(PrevL)) { break; } else { PrevL = CurL; } } } if (! PlotNm.Empty()) { printf("\nMLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr()); TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q"); } return iter; }