// Loads a (directed, undirected or multi) graph from a text file InFNm with 1 node and all its edges in a single line. void IOConnListStr() { const int NNodes = 500; const int NEdges = 2000; const char *FName = "demo.graph.dat"; PUNGraph GOut, GIn; GOut = GenRndGnm<PUNGraph>(NNodes, NEdges); // Output nodes as random strings TIntStrH OutNIdStrH; TStrHash<TInt> OutStrNIdH; // Generate unique random strings for graph for (TUNGraph::TNodeI NI = GOut->BegNI(); NI < GOut->EndNI(); NI++) { TStr RandStr = ""; do { TInt RandLen = TInt::Rnd.GetUniDevInt(5, 10); for (int i = 0; i < RandLen; i++) { // TStr RandChar(TInt::Rnd.GetUniDevInt(33, 126)); TStr RandChar(TInt::Rnd.GetUniDevInt(97, 122)); RandStr += RandChar; } } while (OutStrNIdH.IsKey(RandStr) || RandStr[0] == '#'); OutNIdStrH.AddDat(NI.GetId(), RandStr); OutStrNIdH.AddDat(RandStr, NI.GetId()); } // Create graph file FILE *F = fopen(FName, "w"); for (TUNGraph::TNodeI NI = GOut->BegNI(); NI < GOut->EndNI(); NI++) { fprintf(F, "%s", OutNIdStrH[NI.GetId()].CStr()); for (int e = 0; e < NI.GetOutDeg(); e++) { fprintf(F, " %s", OutNIdStrH[NI.GetOutNId(e)].CStr()); } fprintf(F, "\n"); } fclose(F); TStrHash<TInt> InStrToNIdH; GIn = LoadConnListStr<PUNGraph>(FName, InStrToNIdH); PrintGStats("ConnListStr - Out", GOut); PrintGStats("ConnListStr - In", GIn); }
// Save and load directed, undirected and multi-graphs, where node names are strings void IOEdgeListStr() { const int NNodes = 1000; const int NEdges = 5000; const char *FName = "demo.graph.dat"; PNEGraph GOut, GIn; // Can also be PUNGraph or PNGraph GOut = GenRndGnm<PNEGraph>(NNodes, NEdges); // Output nodes as random strings TIntStrH OutNIdStrH; TStrHash<TInt> OutStrNIdH; // Generate unique random strings for graph TStr RandStr; for (TNEGraph::TNodeI NI = GOut->BegNI(); NI < GOut->EndNI(); NI++) { do { RandStr.Clr(); TInt RandLen = TInt::Rnd.GetUniDevInt(5, 30); for (int i = 0; i < RandLen; i++) { TStr RandChar(TInt::Rnd.GetUniDevInt(33, 126)); RandStr += RandChar; } } while (OutStrNIdH.IsKey(RandStr) || RandStr[0] == '#'); // Not unique or starts with comment OutNIdStrH.AddDat(NI.GetId(), RandStr); OutStrNIdH.AddDat(RandStr, NI.GetId()); } // Create graph file FILE *F = fopen(FName, "w"); for (TNEGraph::TEdgeI EI = GOut->BegEI(); EI < GOut->EndEI(); EI++) { TInt Src = EI.GetSrcNId(); TInt Dst = EI.GetDstNId(); fprintf(F, "%s %s\n", OutNIdStrH[Src].CStr(), OutNIdStrH[Dst].CStr()); } fclose(F); // Load edge list of strings TStrHash<TInt> InStrToNIdH; GIn = LoadEdgeListStr<PNEGraph>(FName, 0, 1, InStrToNIdH); PrintGStats<PNEGraph>("EdgeListStr - Out", GOut); PrintGStats<PNEGraph>("EdgeListStr - In", GIn); }
// and words to StrH and get a vector of word ids void TStrUtil::GetAddWIdV(TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) { TChA ChA(CStr); TVec<char *> WrdV; TInt WId; TStrUtil::SplitWords(ChA, WrdV); WIdV.Clr(false); for (int w = 0; w < WrdV.Len(); w++) { WIdV.Add(StrH.AddDatId(WrdV[w])); } }
// Counts the words in ChA that are not keys of StopWordH.
// ChA is split with SplitWords; the result is the number of words found minus
// the number of those words for which StopWordH.IsKey() is true.
int TStrUtil::CountWords(const TChA& ChA, const TStrHash<TInt>& StopWordH) {
  // Work on a copy: SplitWords takes its buffer by non-const reference, while
  // ChA is const. (Previously an empty buffer was split, so the count was always 0.)
  TChA Tmp(ChA);
  TVec<char *> WrdV;
  SplitWords(Tmp, WrdV);
  int SWordCnt = 0;
  for (int w = 0; w < WrdV.Len(); w++) {
    if (StopWordH.IsKey(WrdV[w])) { SWordCnt++; }
  }
  return WrdV.Len() - SWordCnt;
}
void TStrUtil::GetWIdV(const TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) { const int NotWId = -1; TChA ChA(CStr); TVec<char *> WrdV; TInt WId; TStrUtil::SplitWords(ChA, WrdV); WIdV.Clr(false); for (int w = 0; w < WrdV.Len(); w++) { if (StrH.IsKeyGetDat(WrdV[w], WId)) { WIdV.Add(WId); } else { WIdV.Add(NotWId); } } }
int main(int argc, char* argv[]) { Env = TEnv(argc, argv, TNotify::StdNotify); Env.PrepArgs(TStr::Fmt("cesna. build: %s, %s. Time: %s", __TIME__, __DATE__, TExeTm::GetCurTm())); TExeTm ExeTm; Try TStr OutFPrx = Env.GetIfArgPrefixStr("-o:", "", "Output Graph data prefix"); const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "./1912.edges", "Input edgelist file name"); const TStr LabelFNm = Env.GetIfArgPrefixStr("-l:", "", "Input file name for node names (Node ID, Node label) "); const TStr AttrFNm = Env.GetIfArgPrefixStr("-a:", "./1912.nodefeat", "Input node attribute file name"); const TStr ANameFNm = Env.GetIfArgPrefixStr("-n:", "./1912.nodefeatnames", "Input file name for node attribute names"); int OptComs = Env.GetIfArgPrefixInt("-c:", 10, "The number of communities to detect (-1: detect automatically)"); const int MinComs = Env.GetIfArgPrefixInt("-mc:", 3, "Minimum number of communities to try"); const int MaxComs = Env.GetIfArgPrefixInt("-xc:", 20, "Maximum number of communities to try"); const int DivComs = Env.GetIfArgPrefixInt("-nc:", 5, "How many trials for the number of communities"); const int NumThreads = Env.GetIfArgPrefixInt("-nt:", 4, "Number of threads for parallelization"); const double AttrWeight = Env.GetIfArgPrefixFlt("-aw:", 0.5, "We maximize (1 - aw) P(Network) + aw * P(Attributes)"); const double LassoWeight = Env.GetIfArgPrefixFlt("-lw:", 1.0, "Weight for l-1 regularization on learning the logistic model parameters"); const double StepAlpha = Env.GetIfArgPrefixFlt("-sa:", 0.05, "Alpha for backtracking line search"); const double StepBeta = Env.GetIfArgPrefixFlt("-sb:", 0.3, "Beta for backtracking line search"); const double MinFeatFrac = Env.GetIfArgPrefixFlt("-mf:", 0.0, "If the fraction of nodes with positive values for an attribute is smaller than this, we ignore that attribute"); #ifdef USE_OPENMP omp_set_num_threads(NumThreads); #endif PUNGraph G; TIntStrH NIDNameH; TStrHash<TInt> NodeNameH; TVec<TFltV> Wck; TVec<TIntV> EstCmtyVV; if 
(InFNm.IsStrIn(".ungraph")) { TFIn GFIn(InFNm); G = TUNGraph::Load(GFIn); } else { G = TAGMUtil::LoadEdgeListStr<PUNGraph>(InFNm, NodeNameH); NIDNameH.Gen(NodeNameH.Len()); for (int s = 0; s < NodeNameH.Len(); s++) { NIDNameH.AddDat(s, NodeNameH.GetKey(s)); } } if (LabelFNm.Len() > 0) { TSsParser Ss(LabelFNm, ssfTabSep); while (Ss.Next()) { if (Ss.Len() > 0) { NIDNameH.AddDat(Ss.GetInt(0), Ss.GetFld(1)); } } } printf("Graph: %d Nodes %d Edges\n", G->GetNodes(), G->GetEdges()); //load attribute TIntV NIDV; G->GetNIdV(NIDV); THash<TInt, TIntV> RawNIDAttrH, NIDAttrH; TIntStrH RawFeatNameH, FeatNameH; if (ANameFNm.Len() > 0) { TSsParser Ss(ANameFNm, ssfTabSep); while (Ss.Next()) { if (Ss.Len() > 0) { RawFeatNameH.AddDat(Ss.GetInt(0), Ss.GetFld(1)); } } } TCesnaUtil::LoadNIDAttrHFromNIDKH(NIDV, AttrFNm, RawNIDAttrH, NodeNameH); TCesnaUtil::FilterLowEntropy(RawNIDAttrH, NIDAttrH, RawFeatNameH, FeatNameH, MinFeatFrac); TExeTm RunTm; TCesna CS(G, NIDAttrH, 10, 10); if (OptComs == -1) { printf("finding number of communities\n"); OptComs = CS.FindComs(NumThreads, MaxComs, MinComs, DivComs, "", false, 0.1, StepAlpha, StepBeta); } CS.NeighborComInit(OptComs); CS.SetWeightAttr(AttrWeight); CS.SetLassoCoef(LassoWeight); if (NumThreads == 1 || G->GetEdges() < 1000) { CS.MLEGradAscent(0.0001, 1000 * G->GetNodes(), "", StepAlpha, StepBeta); } else { CS.MLEGradAscentParallel(0.0001, 1000, NumThreads, "", StepAlpha, StepBeta); } CS.GetCmtyVV(EstCmtyVV, Wck); TAGMUtil::DumpCmtyVV(OutFPrx + "cmtyvv.txt", EstCmtyVV, NIDNameH); FILE* F = fopen((OutFPrx + "weights.txt").CStr(), "wt"); if (FeatNameH.Len() == Wck[0].Len()) { fprintf(F, "#"); for (int k = 0; k < FeatNameH.Len(); k++) { fprintf(F, "%s", FeatNameH[k].CStr()); if (k < FeatNameH.Len() - 1) { fprintf(F, "\t"); } } fprintf(F, "\n"); } for (int c = 0; c < Wck.Len(); c++) { for (int k = 0; k < Wck[c].Len(); k++) { fprintf(F, "%f", Wck[c][k].Val); if (k < Wck[c].Len() - 1) { fprintf(F, "\t"); } } fprintf(F, "\n"); } fclose(F); Catch 
printf("\nrun time: %s (%s)\n", ExeTm.GetTmStr(), TSecTm::GetCurTm().GetTmStr().CStr()); return 0; }