int TGnuPlot::AddPlot(const TIntFltPrV& XYValV, const TGpSeriesTy& SeriesTy, const TStr& Label, const TStr& Style) { TFltKdV XYFltValV(XYValV.Len(), 0); for (int i = 0; i < XYValV.Len(); i++) { XYFltValV.Add(TFltKd(TFlt(XYValV[i].Val1), TFlt(XYValV[i].Val2))); } return AddPlot(XYFltValV, SeriesTy, Label, Style); }
void TSkyGridEnt::GetEntClustV(const TSkyGridBs* SkyGridBs, const uint64& MnTm, const int& MnDocs, const int& MxDocs, const int& Clusts, TVec<TStrFltPrV>& EntNmWgtPrVV) const { EntNmWgtPrVV.Clr(); // create bow PBowDocBs BowDocBs=TBowDocBs::New(); // collect documents TIntV DocIdV; GetDocIdV(SkyGridBs, MnTm, 0, DocIdV); DocIdV.Reverse(); DocIdV.Shuffle(TRnd(1)); DocIdV.Trunc(MxDocs); if (DocIdV.Len()<MnDocs){return;} for (int DocN=0; DocN<DocIdV.Len(); DocN++){ int DocId=DocIdV[DocN]; PSkyGridDoc Doc=SkyGridBs->GetDoc(DocId); // create vector of entity-weights TIntFltPrV WIdWgtPrV; for (int EntN=0; EntN<Doc->GetEnts(); EntN++){ int EntId; int EntFq; Doc->GetEntNmFq(EntN, EntId, EntFq); TStr EntNm=SkyGridBs->GetEntNm(EntId); int EntWId=BowDocBs->AddWordStr(EntNm); WIdWgtPrV.Add(TIntFltPr(EntWId, EntFq)); } // create bow-document int DId=BowDocBs->AddDoc(TInt::GetStr(DocId), TStrV(), WIdWgtPrV); TStr DocDescStr=Doc->GetTitleStr(); BowDocBs->PutDocDescStr(DId, DocDescStr); } // k-means clustering PBowSim BowSim=TBowSim::New(bstCos); // similarity object TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting PBowDocPart BowDocPart=TBowClust::GetKMeansPart( TNotify::StdNotify, // log output BowDocBs, // document data BowSim, // similarity function TRnd(1), // random generator Clusts, // number of clusters 1, // trials per k-means 1, // convergence epsilon for k-means 1, // min. documents per cluster WordWgtType, // word weighting 0, // cut-word-weights percentage 0); // minimal word frequency EntNmWgtPrVV.Clr(); for (int ClustN=0; ClustN<BowDocPart->GetClusts(); ClustN++){ PBowDocPartClust Clust=BowDocPart->GetClust(ClustN); TStrFltPrV WordStrWgtPrV; Clust->GetTopWordStrWgtPrV(BowDocBs, 25, 0.5, WordStrWgtPrV); EntNmWgtPrVV.Add(WordStrWgtPrV); } //BowDocPart->SaveTxt("Clusts.Txt", BowDocBs, true, 25, 0.5, false); }
/// Serializes a vector of (int, float) pairs into one string.
/// Within a pair the two values are joined by FieldDelimiterStr; consecutive
/// pairs are joined by DelimiterStr. The float is rendered with
/// TFlt::GetStr(value, FmtStr).
/// @return the concatenated text (empty when the vector is empty).
TStr TStrUtil::GetStr(const TIntFltPrV& IntFltPrV, const TStr& FieldDelimiterStr, const TStr& DelimiterStr, const TStr& FmtStr) {
  TChA OutChA;
  const int Prs = IntFltPrV.Len();
  for (int PrN = 0; PrN < Prs; ++PrN) {
    // a separator is only needed once something has already been written
    const bool NothingWritten = OutChA.Empty();
    if (!NothingWritten) {
      OutChA += DelimiterStr;
    }
    OutChA += IntFltPrV[PrN].Val1.GetStr();
    OutChA += FieldDelimiterStr;
    OutChA += TFlt::GetStr(IntFltPrV[PrN].Val2, FmtStr);
  }
  return OutChA;
}
int TAGMFast::MLEGradAscentParallel(const double& Thres, const int& MaxIter, const int ChunkNum, const int ChunkSize, const TStr PlotNm, const double StepAlpha, const double StepBeta) { //parallel time_t InitTime = time(NULL); uint64 StartTm = TSecTm::GetCurTm().GetAbsSecs(); TExeTm ExeTm, CheckTm; double PrevL = Likelihood(true); TIntFltPrV IterLV; int PrevIter = 0; int iter = 0; TIntV NIdxV(F.Len(), 0); for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); } TIntV NIDOPTV(F.Len()); //check if a node needs optimization or not 1: does not require optimization NIDOPTV.PutAll(0); TVec<TIntFltH> NewF(ChunkNum * ChunkSize); TIntV NewNIDV(ChunkNum * ChunkSize); for (iter = 0; iter < MaxIter; iter++) { NIdxV.Clr(false); for (int i = 0; i < F.Len(); i++) { if (NIDOPTV[i] == 0) { NIdxV.Add(i); } } IAssert (NIdxV.Len() <= F.Len()); NIdxV.Shuffle(Rnd); // compute gradient for chunk of nodes #pragma omp parallel for schedule(static, 1) for (int TIdx = 0; TIdx < ChunkNum; TIdx++) { TIntFltH GradV; for (int ui = TIdx * ChunkSize; ui < (TIdx + 1) * ChunkSize; ui++) { NewNIDV[ui] = -1; if (ui > NIdxV.Len()) { continue; } int u = NIdxV[ui]; // //find set of candidate c (we only need to consider c to which a neighbor of u belongs to) TUNGraph::TNodeI UI = G->GetNI(u); TIntSet CIDSet(5 * UI.GetDeg()); TIntFltH CurFU = F[u]; for (int e = 0; e < UI.GetDeg(); e++) { if (HOVIDSV[u].IsKey(UI.GetNbrNId(e))) { continue; } TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)]; for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) { CIDSet.AddKey(CI.GetKey()); } } if (CIDSet.Empty()) { CurFU.Clr(); } else { for (TIntFltH::TIter CI = CurFU.BegI(); CI < CurFU.EndI(); CI++) { //remove the community membership which U does not share with its neighbors if (! 
CIDSet.IsKey(CI.GetKey())) { CurFU.DelIfKey(CI.GetKey()); } } GradientForRow(u, GradV, CIDSet); if (Norm2(GradV) < 1e-4) { NIDOPTV[u] = 1; continue; } double LearnRate = GetStepSizeByLineSearch(u, GradV, GradV, StepAlpha, StepBeta, 5); if (LearnRate <= 1e-5) { NewNIDV[ui] = -2; continue; } for (int ci = 0; ci < GradV.Len(); ci++) { int CID = GradV.GetKey(ci); double Change = LearnRate * GradV.GetDat(CID); double NewFuc = CurFU.IsKey(CID)? CurFU.GetDat(CID) + Change : Change; if (NewFuc <= 0.0) { CurFU.DelIfKey(CID); } else { CurFU.AddDat(CID) = NewFuc; } } CurFU.Defrag(); } //store changes NewF[ui] = CurFU; NewNIDV[ui] = u; } } int NumNoChangeGrad = 0; int NumNoChangeStepSize = 0; for (int ui = 0; ui < NewNIDV.Len(); ui++) { int NewNID = NewNIDV[ui]; if (NewNID == -1) { NumNoChangeGrad++; continue; } if (NewNID == -2) { NumNoChangeStepSize++; continue; } for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) { SumFV[CI.GetKey()] -= CI.GetDat(); } } #pragma omp parallel for for (int ui = 0; ui < NewNIDV.Len(); ui++) { int NewNID = NewNIDV[ui]; if (NewNID < 0) { continue; } F[NewNID] = NewF[ui]; } for (int ui = 0; ui < NewNIDV.Len(); ui++) { int NewNID = NewNIDV[ui]; if (NewNID < 0) { continue; } for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) { SumFV[CI.GetKey()] += CI.GetDat(); } } // update the nodes who are optimal for (int ui = 0; ui < NewNIDV.Len(); ui++) { int NewNID = NewNIDV[ui]; if (NewNID < 0) { continue; } TUNGraph::TNodeI UI = G->GetNI(NewNID); NIDOPTV[NewNID] = 0; for (int e = 0; e < UI.GetDeg(); e++) { NIDOPTV[UI.GetNbrNId(e)] = 0; } } int OPTCnt = 0; for (int i = 0; i < NIDOPTV.Len(); i++) { if (NIDOPTV[i] == 1) { OPTCnt++; } } if (! 
PlotNm.Empty()) { printf("\r%d iterations [%s] %d secs", iter * ChunkSize * ChunkNum, ExeTm.GetTmStr(), int(TSecTm::GetCurTm().GetAbsSecs() - StartTm)); if (PrevL > TFlt::Mn) { printf(" (%f) %d g %d s %d OPT", PrevL, NumNoChangeGrad, NumNoChangeStepSize, OPTCnt); } fflush(stdout); } if ((iter - PrevIter) * ChunkSize * ChunkNum >= G->GetNodes()) { PrevIter = iter; double CurL = Likelihood(true); IterLV.Add(TIntFltPr(iter * ChunkSize * ChunkNum, CurL)); printf("\r%d iterations, Likelihood: %f, Diff: %f [%d secs]", iter, CurL, CurL - PrevL, int(time(NULL) - InitTime)); fflush(stdout); if (CurL - PrevL <= Thres * fabs(PrevL)) { break; } else { PrevL = CurL; } } } if (! PlotNm.Empty()) { printf("\nMLE completed with %d iterations(%s secs)\n", iter, int(TSecTm::GetCurTm().GetAbsSecs() - StartTm)); TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");[] } else {
int TAGMFast::MLEGradAscent(const double& Thres, const int& MaxIter, const TStr PlotNm, const double StepAlpha, const double StepBeta) { time_t InitTime = time(NULL); TExeTm ExeTm, CheckTm; int iter = 0, PrevIter = 0; TIntFltPrV IterLV; TUNGraph::TNodeI UI; double PrevL = TFlt::Mn, CurL = 0.0; TIntV NIdxV(F.Len(), 0); for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); } IAssert(NIdxV.Len() == F.Len()); TIntFltH GradV; while(iter < MaxIter) { NIdxV.Shuffle(Rnd); for (int ui = 0; ui < F.Len(); ui++, iter++) { int u = NIdxV[ui]; // //find set of candidate c (we only need to consider c to which a neighbor of u belongs to) UI = G->GetNI(u); TIntSet CIDSet(5 * UI.GetDeg()); for (int e = 0; e < UI.GetDeg(); e++) { if (HOVIDSV[u].IsKey(UI.GetNbrNId(e))) { continue; } TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)]; for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) { CIDSet.AddKey(CI.GetKey()); } } for (TIntFltH::TIter CI = F[u].BegI(); CI < F[u].EndI(); CI++) { //remove the community membership which U does not share with its neighbors if (! CIDSet.IsKey(CI.GetKey())) { DelCom(u, CI.GetKey()); } } if (CIDSet.Empty()) { continue; } GradientForRow(u, GradV, CIDSet); if (Norm2(GradV) < 1e-4) { continue; } double LearnRate = GetStepSizeByLineSearch(u, GradV, GradV, StepAlpha, StepBeta); if (LearnRate == 0.0) { continue; } for (int ci = 0; ci < GradV.Len(); ci++) { int CID = GradV.GetKey(ci); double Change = LearnRate * GradV.GetDat(CID); double NewFuc = GetCom(u, CID) + Change; if (NewFuc <= 0.0) { DelCom(u, CID); } else { AddCom(u, CID, NewFuc); } } if (! PlotNm.Empty() && (iter + 1) % G->GetNodes() == 0) { IterLV.Add(TIntFltPr(iter, Likelihood(false))); } } printf("\r%d iterations (%f) [%lu sec]", iter, CurL, time(NULL) - InitTime); fflush(stdout); if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) { PrevIter = iter; CurL = Likelihood(); if (PrevL > TFlt::Mn && ! 
PlotNm.Empty()) { printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL, CurL - PrevL); } fflush(stdout); if (CurL - PrevL <= Thres * fabs(PrevL)) { break; } else { PrevL = CurL; } } } printf("\n"); printf("MLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr()); if (! PlotNm.Empty()) { TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q"); } return iter; }
int TAGMFast::FindComsByCV(TIntV& ComsV, const double HOFrac, const int NumThreads, const TStr PlotLFNm, const double StepAlpha, const double StepBeta) { if (ComsV.Len() == 0) { int MaxComs = G->GetNodes() / 5; ComsV.Add(2); while(ComsV.Last() < MaxComs) { ComsV.Add(ComsV.Last() * 2); } } TIntPrV EdgeV(G->GetEdges(), 0); for (TUNGraph::TEdgeI EI = G->BegEI(); EI < G->EndEI(); EI++) { EdgeV.Add(TIntPr(EI.GetSrcNId(), EI.GetDstNId())); } EdgeV.Shuffle(Rnd); int MaxIterCV = 3; TVec<TVec<TIntSet> > HoldOutSets(MaxIterCV); if (EdgeV.Len() > 50) { //if edges are many enough, use CV printf("generating hold out set\n"); TIntV NIdV1, NIdV2; G->GetNIdV(NIdV1); G->GetNIdV(NIdV2); for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) { // generate holdout sets HoldOutSets[IterCV].Gen(G->GetNodes()); const int HOTotal = int(HOFrac * G->GetNodes() * (G->GetNodes() - 1) / 2.0); int HOCnt = 0; int HOEdges = (int) TMath::Round(HOFrac * G->GetEdges()); printf("holding out %d edges...\n", HOEdges); for (int he = 0; he < (int) HOEdges; he++) { HoldOutSets[IterCV][EdgeV[he].Val1].AddKey(EdgeV[he].Val2); HoldOutSets[IterCV][EdgeV[he].Val2].AddKey(EdgeV[he].Val1); HOCnt++; } printf("%d Edges hold out\n", HOCnt); while(HOCnt++ < HOTotal) { int SrcNID = Rnd.GetUniDevInt(G->GetNodes()); int DstNID = Rnd.GetUniDevInt(G->GetNodes()); HoldOutSets[IterCV][SrcNID].AddKey(DstNID); HoldOutSets[IterCV][DstNID].AddKey(SrcNID); } } printf("hold out set generated\n"); } TFltV HOLV(ComsV.Len()); TIntFltPrV ComsLV; for (int c = 0; c < ComsV.Len(); c++) { const int Coms = ComsV[c]; printf("Try number of Coms:%d\n", Coms); NeighborComInit(Coms); printf("Initialized\n"); if (EdgeV.Len() > 50) { //if edges are many enough, use CV for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) { HOVIDSV = HoldOutSets[IterCV]; if (NumThreads == 1) { printf("MLE without parallelization begins\n"); MLEGradAscent(0.05, 10 * G->GetNodes(), "", StepAlpha, StepBeta); } else { printf("MLE with parallelization begins\n"); 
MLEGradAscentParallel(0.05, 100, NumThreads, "", StepAlpha, StepBeta); } double HOL = LikelihoodHoldOut(); HOL = HOL < 0? HOL: TFlt::Mn; HOLV[c] += HOL; } } else { HOVIDSV.Gen(G->GetNodes()); MLEGradAscent(0.0001, 100 * G->GetNodes(), ""); double BIC = 2 * Likelihood() - (double) G->GetNodes() * Coms * 2.0 * log ( (double) G->GetNodes()); HOLV[c] = BIC; } } int EstComs = 2; double MaxL = TFlt::Mn; printf("\n"); for (int c = 0; c < ComsV.Len(); c++) { ComsLV.Add(TIntFltPr(ComsV[c].Val, HOLV[c].Val)); printf("%d(%f)\t", ComsV[c].Val, HOLV[c].Val); if (MaxL < HOLV[c]) { MaxL = HOLV[c]; EstComs = ComsV[c]; } } printf("\n"); RandomInit(EstComs); HOVIDSV.Gen(G->GetNodes()); if (! PlotLFNm.Empty()) { TGnuPlot::PlotValV(ComsLV, PlotLFNm, "hold-out likelihood", "communities", "likelihood"); } return EstComs; }
/// Newton method: DEPRECATED int TAGMFast::MLENewton(const double& Thres, const int& MaxIter, const TStr PlotNm) { TExeTm ExeTm; int iter = 0, PrevIter = 0; TIntFltPrV IterLV; double PrevL = TFlt::Mn, CurL; TUNGraph::TNodeI UI; TIntV NIdxV; G->GetNIdV(NIdxV); int CID, UID, NewtonIter; double Fuc, PrevFuc, Grad, H; while(iter < MaxIter) { NIdxV.Shuffle(Rnd); for (int ui = 0; ui < F.Len(); ui++, iter++) { if (! PlotNm.Empty() && iter % G->GetNodes() == 0) { IterLV.Add(TIntFltPr(iter, Likelihood(false))); } UID = NIdxV[ui]; //find set of candidate c (we only need to consider c to which a neighbor of u belongs to) TIntSet CIDSet; UI = G->GetNI(UID); if (UI.GetDeg() == 0) { //if the node is isolated, clear its membership and skip if (! F[UID].Empty()) { F[UID].Clr(); } continue; } for (int e = 0; e < UI.GetDeg(); e++) { if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; } TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)]; for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) { CIDSet.AddKey(CI.GetKey()); } } for (TIntFltH::TIter CI = F[UID].BegI(); CI < F[UID].EndI(); CI++) { //remove the community membership which U does not share with its neighbors if (! 
CIDSet.IsKey(CI.GetKey())) { DelCom(UID, CI.GetKey()); } } if (CIDSet.Empty()) { continue; } for (TIntSet::TIter CI = CIDSet.BegI(); CI < CIDSet.EndI(); CI++) { CID = CI.GetKey(); //optimize for UID, CID //compute constants TFltV AlphaKV(UI.GetDeg()); for (int e = 0; e < UI.GetDeg(); e++) { if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; } AlphaKV[e] = (1 - PNoCom) * exp(- DotProduct(UID, UI.GetNbrNId(e)) + GetCom(UI.GetNbrNId(e), CID) * GetCom(UID, CID)); IAssertR(AlphaKV[e] <= 1.0, TStr::Fmt("AlphaKV=%f, %f, %f", AlphaKV[e].Val, PNoCom.Val, GetCom(UI.GetNbrNId(e), CID))); } Fuc = GetCom(UID, CID); PrevFuc = Fuc; Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0; if (Grad <= 1e-3 && Grad >= -0.1) { continue; } NewtonIter = 0; while (NewtonIter++ < 10) { Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0; H = HessianForOneVar(AlphaKV, UID, CID, Fuc); if (Fuc == 0.0 && Grad <= 0.0) { Grad = 0.0; } if (fabs(Grad) < 1e-3) { break; } if (H == 0.0) { Fuc = 0.0; break; } double NewtonStep = - Grad / H; if (NewtonStep < -0.5) { NewtonStep = - 0.5; } Fuc += NewtonStep; if (Fuc < 0.0) { Fuc = 0.0; } } if (Fuc == 0.0) { DelCom(UID, CID); } else { AddCom(UID, CID, Fuc); } } } if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) { PrevIter = iter; CurL = Likelihood(); if (PrevL > TFlt::Mn && ! PlotNm.Empty()) { printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL, CurL - PrevL); } fflush(stdout); if (CurL - PrevL <= Thres * fabs(PrevL)) { break; } else { PrevL = CurL; } } } if (! PlotNm.Empty()) { printf("\nMLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr()); TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q"); } return iter; }