/// estimate number of communities using cross validation int TAGMFast::FindComsByCV(const int NumThreads, const int MaxComs, const int MinComs, const int DivComs, const TStr OutFNm, const double StepAlpha, const double StepBeta) { double ComsGap = exp(TMath::Log((double) MaxComs / (double) MinComs) / (double) DivComs); TIntV ComsV; ComsV.Add(MinComs); while (ComsV.Len() < DivComs) { int NewComs = int(ComsV.Last() * ComsGap); if (NewComs == ComsV.Last().Val) { NewComs++; } ComsV.Add(NewComs); } if (ComsV.Last() < MaxComs) { ComsV.Add(MaxComs); } return FindComsByCV(ComsV, 0.1, NumThreads, OutFNm + ".CV.likelihood", StepAlpha, StepBeta); }
/// rewire bipartite community affiliation graphs void TAGMUtil::RewireCmtyNID(THash<TInt,TIntV >& CmtyVH, TRnd& Rnd) { THash<TInt,TIntV > NewCmtyVH(CmtyVH.Len()); TIntV NDegV; TIntV CDegV; for (int i = 0; i < CmtyVH.Len(); i++) { int CID = CmtyVH.GetKey(i); for (int j = 0; j < CmtyVH[i].Len(); j++) { int NID = CmtyVH[i][j]; NDegV.Add(NID); CDegV.Add(CID); } } TIntPrSet CNIDSet(CDegV.Len()); int c=0; while (c++ < 15 && CDegV.Len() > 1) { for (int i = 0; i < CDegV.Len(); i++) { int u = Rnd.GetUniDevInt(CDegV.Len()); int v = Rnd.GetUniDevInt(NDegV.Len()); if (CNIDSet.IsKey(TIntPr(CDegV[u], NDegV[v]))) { continue; } CNIDSet.AddKey(TIntPr(CDegV[u], NDegV[v])); if (u == CDegV.Len() - 1) { CDegV.DelLast(); } else { CDegV[u] = CDegV.Last(); CDegV.DelLast(); } if ( v == NDegV.Len() - 1) { NDegV.DelLast(); } else { NDegV[v] = NDegV.Last(); NDegV.DelLast(); } } } for (int i = 0; i < CNIDSet.Len(); i++) { TIntPr CNIDPr = CNIDSet[i]; IAssert(CmtyVH.IsKey(CNIDPr.Val1)); NewCmtyVH.AddDat(CNIDPr.Val1); NewCmtyVH.GetDat(CNIDPr.Val1).Add(CNIDPr.Val2); } CmtyVH = NewCmtyVH; }
//Precompute unigram table using alias sampling method void InitUnigramTable(TIntV& Vocab, TIntV& KTable, TFltV& UTable) { double TrainWordsPow = 0; double Pwr = 0.75; TFltV ProbV(Vocab.Len()); for (int64 i = 0; i < Vocab.Len(); i++) { ProbV[i]=TMath::Power(Vocab[i],Pwr); TrainWordsPow += ProbV[i]; KTable[i]=0; UTable[i]=0; } for (int64 i = 0; i < ProbV.Len(); i++) { ProbV[i] /= TrainWordsPow; } TIntV UnderV; TIntV OverV; for (int64 i = 0; i < ProbV.Len(); i++) { UTable[i] = ProbV[i] * ProbV.Len(); if ( UTable[i] < 1 ) { UnderV.Add(i); } else { OverV.Add(i); } } while(UnderV.Len() > 0 && OverV.Len() > 0) { int64 Small = UnderV.Last(); int64 Large = OverV.Last(); UnderV.DelLast(); OverV.DelLast(); KTable[Small] = Large; UTable[Large] = UTable[Large] + UTable[Small] - 1; if (UTable[Large] < 1) { UnderV.Add(Large); } else { OverV.Add(Large); } } }
/// Fills BlockLenV with the sorted list of block lengths: the values TB4Def::GetP2(P2Exp) for
/// P2Exp in [0, TB4Def::MxP2Exp), plus evenly stepped lengths from 10 up to (but excluding)
/// 100000000. Any previous content of BlockLenV is discarded.
void TBlobBs::GenBlockLenV(TIntV& BlockLenV){
  // Each row is {first length, exclusive upper bound, step}.
  static const int LenRangeT[][3]={
    {10, 100, 10},
    {100, 10000, 100},
    {10000, 100000, 1000},
    {100000, 1000000, 25000},
    {1000000, 10000000, 1000000},
    {10000000, 100000000, 10000000}};
  const int LenRanges=int(sizeof(LenRangeT)/sizeof(LenRangeT[0]));

  BlockLenV.Clr();
  for (int P2Exp=0; P2Exp<TB4Def::MxP2Exp; P2Exp++){
    BlockLenV.Add(TInt(TB4Def::GetP2(P2Exp)));
  }
  // checked right after the GetP2 values are added, so Last() is the final GetP2 value
  EAssert(int(BlockLenV.Last())<2000000000);

  for (int RangeN=0; RangeN<LenRanges; RangeN++){
    const int MnLen=LenRangeT[RangeN][0];
    const int MxLen=LenRangeT[RangeN][1];
    const int LenStep=LenRangeT[RangeN][2];
    for (int Len=MnLen; Len<MxLen; Len+=LenStep){
      BlockLenV.Add(Len);
    }
  }
  BlockLenV.Sort();
}
int TAGMFast::FindComsByCV(TIntV& ComsV, const double HOFrac, const int NumThreads, const TStr PlotLFNm, const double StepAlpha, const double StepBeta) { if (ComsV.Len() == 0) { int MaxComs = G->GetNodes() / 5; ComsV.Add(2); while(ComsV.Last() < MaxComs) { ComsV.Add(ComsV.Last() * 2); } } TIntPrV EdgeV(G->GetEdges(), 0); for (TUNGraph::TEdgeI EI = G->BegEI(); EI < G->EndEI(); EI++) { EdgeV.Add(TIntPr(EI.GetSrcNId(), EI.GetDstNId())); } EdgeV.Shuffle(Rnd); int MaxIterCV = 3; TVec<TVec<TIntSet> > HoldOutSets(MaxIterCV); if (EdgeV.Len() > 50) { //if edges are many enough, use CV printf("generating hold out set\n"); TIntV NIdV1, NIdV2; G->GetNIdV(NIdV1); G->GetNIdV(NIdV2); for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) { // generate holdout sets HoldOutSets[IterCV].Gen(G->GetNodes()); const int HOTotal = int(HOFrac * G->GetNodes() * (G->GetNodes() - 1) / 2.0); int HOCnt = 0; int HOEdges = (int) TMath::Round(HOFrac * G->GetEdges()); printf("holding out %d edges...\n", HOEdges); for (int he = 0; he < (int) HOEdges; he++) { HoldOutSets[IterCV][EdgeV[he].Val1].AddKey(EdgeV[he].Val2); HoldOutSets[IterCV][EdgeV[he].Val2].AddKey(EdgeV[he].Val1); HOCnt++; } printf("%d Edges hold out\n", HOCnt); while(HOCnt++ < HOTotal) { int SrcNID = Rnd.GetUniDevInt(G->GetNodes()); int DstNID = Rnd.GetUniDevInt(G->GetNodes()); HoldOutSets[IterCV][SrcNID].AddKey(DstNID); HoldOutSets[IterCV][DstNID].AddKey(SrcNID); } } printf("hold out set generated\n"); } TFltV HOLV(ComsV.Len()); TIntFltPrV ComsLV; for (int c = 0; c < ComsV.Len(); c++) { const int Coms = ComsV[c]; printf("Try number of Coms:%d\n", Coms); NeighborComInit(Coms); printf("Initialized\n"); if (EdgeV.Len() > 50) { //if edges are many enough, use CV for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) { HOVIDSV = HoldOutSets[IterCV]; if (NumThreads == 1) { printf("MLE without parallelization begins\n"); MLEGradAscent(0.05, 10 * G->GetNodes(), "", StepAlpha, StepBeta); } else { printf("MLE with parallelization begins\n"); 
MLEGradAscentParallel(0.05, 100, NumThreads, "", StepAlpha, StepBeta); } double HOL = LikelihoodHoldOut(); HOL = HOL < 0? HOL: TFlt::Mn; HOLV[c] += HOL; } } else { HOVIDSV.Gen(G->GetNodes()); MLEGradAscent(0.0001, 100 * G->GetNodes(), ""); double BIC = 2 * Likelihood() - (double) G->GetNodes() * Coms * 2.0 * log ( (double) G->GetNodes()); HOLV[c] = BIC; } } int EstComs = 2; double MaxL = TFlt::Mn; printf("\n"); for (int c = 0; c < ComsV.Len(); c++) { ComsLV.Add(TIntFltPr(ComsV[c].Val, HOLV[c].Val)); printf("%d(%f)\t", ComsV[c].Val, HOLV[c].Val); if (MaxL < HOLV[c]) { MaxL = HOLV[c]; EstComs = ComsV[c]; } } printf("\n"); RandomInit(EstComs); HOVIDSV.Gen(G->GetNodes()); if (! PlotLFNm.Empty()) { TGnuPlot::PlotValV(ComsLV, PlotLFNm, "hold-out likelihood", "communities", "likelihood"); } return EstComs; }