void PlotSngValDistr(const PNGraph& Graph, const int& SngVals, const TStr& FNmPref, TStr DescStr) { const int NBuckets = 50; TFltV SngValV; for (int f = 1; SngValV.Empty() && f < 4; f++) { TSnap::GetSngVals(Graph, f*SngVals, SngValV); } SngValV.Sort(true); THash<TFlt, TFlt> BucketCntH; double Step = (SngValV.Last()-SngValV[0]) / double(NBuckets-1); for (int i = 0; i < NBuckets; i++) { BucketCntH.AddDat(SngValV[0]+Step*(i+0.5), 0); } for (int i = 0; i < SngValV.Len(); i++) { const int Bucket = (int) floor((SngValV[i]-SngValV[0]) / Step); BucketCntH[Bucket] += 1; } TFltPrV EigCntV; BucketCntH.GetKeyDatPrV(EigCntV); if (DescStr.Empty()) { DescStr = FNmPref; } TGnuPlot::PlotValV(EigCntV, "sngDistr."+FNmPref, TStr::Fmt("%s. G(%d, %d). Largest eig val = %f", DescStr.CStr(), Graph->GetNodes(), Graph->GetEdges(), SngValV.Last().Val), "Singular value", "Count", gpsAuto, false, gpwLinesPoints); }
double TLogRegPredict::GetCfy(const TFltV& AttrV, const TFltV& NewTheta) { int len = AttrV.Len(); double res = 0; if (len < NewTheta.Len()) { res = NewTheta.Last(); //if feature vector is shorter, add an intercept } for (int i = 0; i < len; i++) { if (i < NewTheta.Len()) { res += AttrV[i] * NewTheta[i]; } } double mu = 1 / (1 + exp(-res)); return mu; }
/// estimate number of communities using AGM int TAGMUtil::FindComsByAGM(const PUNGraph& Graph, const int InitComs, const int MaxIter, const int RndSeed, const double RegGap, const double PNoCom, const TStr PltFPrx) { TRnd Rnd(RndSeed); int LambdaIter = 100; if (Graph->GetNodes() < 200) { LambdaIter = 1; } if (Graph->GetNodes() < 200 && Graph->GetEdges() > 2000) { LambdaIter = 100; } //Find coms with large C TAGMFit AGMFitM(Graph, InitComs, RndSeed); if (PNoCom > 0.0) { AGMFitM.SetPNoCom(PNoCom); } AGMFitM.RunMCMC(MaxIter, LambdaIter, ""); int TE = Graph->GetEdges(); TFltV RegV; RegV.Add(0.3 * TE); for (int r = 0; r < 25; r++) { RegV.Add(RegV.Last() * RegGap); } TFltPrV RegComsV, RegLV, RegBICV; TFltV LV, BICV; //record likelihood and number of communities with nonzero P_c for (int r = 0; r < RegV.Len(); r++) { double RegCoef = RegV[r]; AGMFitM.SetRegCoef(RegCoef); AGMFitM.MLEGradAscentGivenCAG(0.01, 1000); AGMFitM.SetRegCoef(0.0); TVec<TIntV> EstCmtyVV; AGMFitM.GetCmtyVV(EstCmtyVV, 0.99); int NumLowQ = EstCmtyVV.Len(); RegComsV.Add(TFltPr(RegCoef, (double) NumLowQ)); if (EstCmtyVV.Len() > 0) { TAGMFit AFTemp(Graph, EstCmtyVV, Rnd); AFTemp.MLEGradAscentGivenCAG(0.001, 1000); double CurL = AFTemp.Likelihood(); LV.Add(CurL); BICV.Add(-2.0 * CurL + (double) EstCmtyVV.Len() * log((double) Graph->GetNodes() * (Graph->GetNodes() - 1) / 2.0)); } else { break; } } // if likelihood does not exist or does not change at all, report the smallest number of communities or 2 if (LV.Len() == 0) { return 2; } else if (LV[0] == LV.Last()) { return (int) TMath::Mx<TFlt>(2.0, RegComsV[LV.Len() - 1].Val2); } //normalize likelihood and BIC to 0~100 int MaxL = 100; { TFltV& ValueV = LV; TFltPrV& RegValueV = RegLV; double MinValue = TFlt::Mx, MaxValue = TFlt::Mn; for (int l = 0; l < ValueV.Len(); l++) { if (ValueV[l] < MinValue) { MinValue = ValueV[l]; } if (ValueV[l] > MaxValue) { MaxValue = ValueV[l]; } } while (ValueV.Len() < RegV.Len()) { ValueV.Add(MinValue); } double RangeVal = MaxValue - MinValue; for (int l = 0; l < ValueV.Len(); l++) { RegValueV.Add(TFltPr(RegV[l], double(MaxL) * (ValueV[l] - MinValue) / RangeVal)); } } { TFltV& ValueV = BICV; TFltPrV& RegValueV = RegBICV; double MinValue = TFlt::Mx, MaxValue = TFlt::Mn; for (int l = 0; l < ValueV.Len(); l++) { if (ValueV[l] < MinValue) { MinValue = ValueV[l]; } if (ValueV[l] > MaxValue) { MaxValue = ValueV[l]; } } while (ValueV.Len() < RegV.Len()) { ValueV.Add(MaxValue); } double RangeVal = MaxValue - MinValue; for (int l = 0; l < ValueV.Len(); l++) { RegValueV.Add(TFltPr(RegV[l], double(MaxL) * (ValueV[l] - MinValue) / RangeVal)); } } //fit logistic regression to normalized likelihood. TVec<TFltV> XV(RegLV.Len()); TFltV YV (RegLV.Len()); for (int l = 0; l < RegLV.Len(); l++) { XV[l] = TFltV::GetV(log(RegLV[l].Val1)); YV[l] = RegLV[l].Val2 / (double) MaxL; } TFltPrV LRVScaled, LRV; TLogRegFit LRFit; PLogRegPredict LRMd = LRFit.CalcLogRegNewton(XV, YV, PltFPrx); for (int l = 0; l < RegLV.Len(); l++) { LRV.Add(TFltPr(RegV[l], LRMd->GetCfy(XV[l]))); LRVScaled.Add(TFltPr(RegV[l], double(MaxL) * LRV.Last().Val2)); } //estimate # communities from fitted logistic regression int NumComs = 0, IdxRegDrop = 0; double LRThres = 1.1, RegDrop; // 1 / (1 + exp(1.1)) = 0.25 double LeftReg = 0.0, RightReg = 0.0; TFltV Theta; LRMd->GetTheta(Theta); RegDrop = (- Theta[1] - LRThres) / Theta[0]; if (RegDrop <= XV[0][0]) { NumComs = (int) RegComsV[0].Val2; } else if (RegDrop >= XV.Last()[0]) { NumComs = (int) RegComsV.Last().Val2; } else { //interpolate for RegDrop for (int i = 0; i < XV.Len(); i++) { if (XV[i][0] > RegDrop) { IdxRegDrop = i; break; } } if (IdxRegDrop == 0) { printf("Error!! RegDrop:%f, Theta[0]:%f, Theta[1]:%f\n", RegDrop, Theta[0].Val, Theta[1].Val); for (int l = 0; l < RegLV.Len(); l++) { printf("X[%d]:%f, Y[%d]:%f\n", l, XV[l][0].Val, l, YV[l].Val); } } IAssert(IdxRegDrop > 0); LeftReg = RegDrop - XV[IdxRegDrop - 1][0]; RightReg = XV[IdxRegDrop][0] - RegDrop; NumComs = (int) TMath::Round( (RightReg * RegComsV[IdxRegDrop - 1].Val2 + LeftReg * RegComsV[IdxRegDrop].Val2) / (LeftReg + RightReg)); } //printf("Interpolation coeff: %f, %f, index at drop:%d (%f), Left-Right Vals: %f, %f\n", LeftReg, RightReg, IdxRegDrop, RegDrop, RegComsV[IdxRegDrop - 1].Val2, RegComsV[IdxRegDrop].Val2); printf("Num Coms:%d\n", NumComs); if (NumComs < 2) { NumComs = 2; } if (PltFPrx.Len() > 0) { TStr PlotTitle = TStr::Fmt("N:%d, E:%d ", Graph->GetNodes(), TE); TGnuPlot GPC(PltFPrx + ".l"); GPC.AddPlot(RegComsV, gpwLinesPoints, "C"); GPC.AddPlot(RegLV, gpwLinesPoints, "likelihood"); GPC.AddPlot(RegBICV, gpwLinesPoints, "BIC"); GPC.AddPlot(LRVScaled, gpwLinesPoints, "Sigmoid (scaled)"); GPC.SetScale(gpsLog10X); GPC.SetTitle(PlotTitle); GPC.SavePng(PltFPrx + ".l.png"); } return NumComs; }