Example #1
// Demonstrates LoadConnListStr, which loads a (directed, undirected or multi) graph
// from a text file with 1 node and all its edges in a single line.
// Steps: generate a random undirected graph, give every node a unique random
// lowercase name, write one "node neighbor neighbor ..." line per node to FName,
// load the file back, and print statistics of the written and the loaded graph.
void IOConnListStr() {
  const int NNodes = 500;
  const int NEdges = 2000;
  const char *FName = "demo.graph.dat";
  PUNGraph GOut, GIn;
  GOut = GenRndGnm<PUNGraph>(NNodes, NEdges);
  // Output nodes as random strings
  TIntStrH OutNIdStrH;
  TStrHash<TInt> OutStrNIdH;
  // Generate unique random strings for graph
  for (TUNGraph::TNodeI NI = GOut->BegNI(); NI < GOut->EndNI(); NI++) {
    TStr RandStr = "";
    do {
      // Start from scratch on every attempt so a retry does not append to the
      // rejected candidate and the name length stays within the drawn range.
      RandStr.Clr();
      TInt RandLen = TInt::Rnd.GetUniDevInt(5, 10);
      for (int i = 0; i < RandLen; i++) {
        //        TStr RandChar(TInt::Rnd.GetUniDevInt(33, 126));
        TStr RandChar(TInt::Rnd.GetUniDevInt(97, 122));
        RandStr += RandChar;
      }
    } while (OutStrNIdH.IsKey(RandStr) || RandStr[0] == '#'); // Not unique or starts with comment
    OutNIdStrH.AddDat(NI.GetId(), RandStr);
    OutStrNIdH.AddDat(RandStr, NI.GetId());
  }
  // Create graph file
  FILE *F = fopen(FName, "w");
  if (F == NULL) {
    fprintf(stderr, "Cannot open %s for writing\n", FName);
    return;
  }
  for (TUNGraph::TNodeI NI = GOut->BegNI(); NI < GOut->EndNI(); NI++) {
    fprintf(F, "%s", OutNIdStrH[NI.GetId()].CStr());
    for (int e = 0; e < NI.GetOutDeg(); e++) {
      fprintf(F, " %s", OutNIdStrH[NI.GetOutNId(e)].CStr());
    }
    fprintf(F, "\n");
  }
  // Close (and thereby flush) the file before reading it back; otherwise the
  // loader may see a partially written file.
  fclose(F);
  TStrHash<TInt> InStrToNIdH;
  GIn = LoadConnListStr<PUNGraph>(FName, InStrToNIdH);
  PrintGStats("ConnListStr - Out", GOut);
  PrintGStats("ConnListStr - In", GIn);
}
Example #2
// Save and load directed, undirected and multi-graphs, where node names are strings.
// Steps: generate a random multigraph, give every node a unique random printable
// name, write one "src dst" line per edge to FName, load the file back with
// LoadEdgeListStr, and print statistics of the written and the loaded graph.
void IOEdgeListStr() {
  const int NNodes = 1000;
  const int NEdges = 5000;
  const char *FName = "demo.graph.dat";
  PNEGraph GOut, GIn;      // Can also be PUNGraph or PNGraph
  GOut = GenRndGnm<PNEGraph>(NNodes, NEdges);
  // Output nodes as random strings
  TIntStrH OutNIdStrH;
  TStrHash<TInt> OutStrNIdH;
  // Generate unique random strings for graph
  for (TNEGraph::TNodeI NI = GOut->BegNI(); NI < GOut->EndNI(); NI++) {
    TStr RandStr;
    do {
      // Regenerate the candidate from scratch on every attempt. Without the reset
      // the string keeps growing across nodes, and a candidate whose first
      // character is '#' (code 35 lies in the 33..126 range) could never be
      // replaced, so the loop would not terminate.
      RandStr.Clr();
      TInt RandLen = TInt::Rnd.GetUniDevInt(5, 30);
      for (int i = 0; i < RandLen; i++) {
        TStr RandChar(TInt::Rnd.GetUniDevInt(33, 126));
        RandStr += RandChar;
      }
    } while (OutStrNIdH.IsKey(RandStr) || RandStr[0] == '#'); // Not unique or starts with comment
    OutNIdStrH.AddDat(NI.GetId(), RandStr);
    OutStrNIdH.AddDat(RandStr, NI.GetId());
  }
  // Create graph file
  FILE *F = fopen(FName, "w");
  if (F == NULL) {
    fprintf(stderr, "Cannot open %s for writing\n", FName);
    return;
  }
  for (TNEGraph::TEdgeI EI = GOut->BegEI(); EI < GOut->EndEI(); EI++) {
    TInt Src = EI.GetSrcNId();
    TInt Dst = EI.GetDstNId();
    fprintf(F, "%s %s\n", OutNIdStrH[Src].CStr(), OutNIdStrH[Dst].CStr());
  }
  // Close (and thereby flush) the file before reading it back; otherwise the
  // loader may see a partially written file.
  fclose(F);
  // Load edge list of strings
  TStrHash<TInt> InStrToNIdH;
  GIn = LoadEdgeListStr<PNEGraph>(FName, 0, 1, InStrToNIdH);
  PrintGStats<PNEGraph>("EdgeListStr - Out", GOut);
  PrintGStats<PNEGraph>("EdgeListStr - In", GIn);
}

Example #3
// Add words to StrH and get a vector of word ids.
// Copies CStr into a local TChA buffer and splits it into words with
// TStrUtil::SplitWords, which fills WrdV with one char* per word.
// NOTE(review): the body of the loop over WrdV (and the closing braces) is not
// visible in this excerpt, so how StrH and WIdV are updated cannot be confirmed here.
void TStrUtil::GetAddWIdV(TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) {
  TChA ChA(CStr);
  TVec<char *> WrdV;
  TInt WId;
  TStrUtil::SplitWords(ChA, WrdV);
  for (int w = 0; w < WrdV.Len(); w++) {
Example #4
// Counts the words of ChA that are not stop words.
//   ChA       - text to split into words (left unmodified).
//   StopWordH - hash whose keys are the words to exclude from the count.
// Returns the number of words produced by SplitWords minus the number of those
// words that are keys of StopWordH.
int TStrUtil::CountWords(const TChA& ChA, const TStrHash<TInt>& StopWordH) {
  // SplitWords needs a non-const buffer, so split a private copy of the input.
  // (Previously an empty buffer was split and ChA was never looked at.)
  TChA Tmp(ChA);
  TVec<char *> WrdV;
  SplitWords(Tmp, WrdV);
  int SWordCnt = 0;
  for (int w = 0; w < WrdV.Len(); w++) {
    if (StopWordH.IsKey(WrdV[w])) { SWordCnt++; }
  }
  return WrdV.Len() - SWordCnt;
}
Example #5
void TStrUtil::GetWIdV(const TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) {
  const int NotWId = -1;
  TChA ChA(CStr);
  TVec<char *> WrdV;
  TInt WId;
  TStrUtil::SplitWords(ChA, WrdV);
  for (int w = 0; w < WrdV.Len(); w++) {
    if (StrH.IsKeyGetDat(WrdV[w], WId)) { WIdV.Add(WId); }
    else { WIdV.Add(NotWId); }
Example #6
// Entry point of the cesna command-line tool.
// Parses the command-line options, loads the graph (binary ".ungraph" file or a
// string edge list), optional node labels and attribute names, loads and filters
// the node attributes, fits the TCesna model (optionally searching for the number
// of communities first), and writes the detected communities to
// <OutFPrx>cmtyvv.txt and the per-community attribute weights to <OutFPrx>weights.txt.
// Returns 0 on success and 1 when the weights file cannot be opened for writing.
int main(int argc, char* argv[]) {
  Env = TEnv(argc, argv, TNotify::StdNotify);
  Env.PrepArgs(TStr::Fmt("cesna. build: %s, %s. Time: %s", __TIME__, __DATE__, TExeTm::GetCurTm()));
  TExeTm ExeTm;
  TStr OutFPrx = Env.GetIfArgPrefixStr("-o:", "", "Output Graph data prefix");
  const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "./1912.edges", "Input edgelist file name");
  const TStr LabelFNm = Env.GetIfArgPrefixStr("-l:", "", "Input file name for node names (Node ID, Node label) ");
  const TStr AttrFNm = Env.GetIfArgPrefixStr("-a:", "./1912.nodefeat", "Input node attribute file name");
  const TStr ANameFNm = Env.GetIfArgPrefixStr("-n:", "./1912.nodefeatnames", "Input file name for node attribute names");
  int OptComs = Env.GetIfArgPrefixInt("-c:", 10, "The number of communities to detect (-1: detect automatically)");
  const int MinComs = Env.GetIfArgPrefixInt("-mc:", 3, "Minimum number of communities to try");
  const int MaxComs = Env.GetIfArgPrefixInt("-xc:", 20, "Maximum number of communities to try");
  const int DivComs = Env.GetIfArgPrefixInt("-nc:", 5, "How many trials for the number of communities");
  const int NumThreads = Env.GetIfArgPrefixInt("-nt:", 4, "Number of threads for parallelization");
  // NOTE(review): AttrWeight and LassoWeight are parsed but not handed to the model
  // anywhere in this function as shown - confirm whether setter calls are missing.
  const double AttrWeight = Env.GetIfArgPrefixFlt("-aw:", 0.5, "We maximize (1 - aw) P(Network) + aw * P(Attributes)");
  const double LassoWeight = Env.GetIfArgPrefixFlt("-lw:", 1.0, "Weight for l-1 regularization on learning the logistic model parameters");
  const double StepAlpha = Env.GetIfArgPrefixFlt("-sa:", 0.05, "Alpha for backtracking line search");
  const double StepBeta = Env.GetIfArgPrefixFlt("-sb:", 0.3, "Beta for backtracking line search");
  const double MinFeatFrac = Env.GetIfArgPrefixFlt("-mf:", 0.0, "If the fraction of nodes with positive values for an attribute is smaller than this, we ignore that attribute");

  PUNGraph G;
  TIntStrH NIDNameH;
  TStrHash<TInt> NodeNameH;
  TVec<TFltV> Wck;
  TVec<TIntV> EstCmtyVV;
  if (InFNm.IsStrIn(".ungraph")) {
    // Binary graph file.
    TFIn GFIn(InFNm);
    G = TUNGraph::Load(GFIn);
  } else {
    // Text edge list with string node names; remember the id -> name mapping.
    G = TAGMUtil::LoadEdgeListStr<PUNGraph>(InFNm, NodeNameH);
    for (int s = 0; s < NodeNameH.Len(); s++) { NIDNameH.AddDat(s, NodeNameH.GetKey(s)); }
  }

  if (LabelFNm.Len() > 0) {
    TSsParser Ss(LabelFNm, ssfTabSep);
    while (Ss.Next()) {
      // Both field 0 (id) and field 1 (label) are read, so require two fields.
      if (Ss.Len() > 1) { NIDNameH.AddDat(Ss.GetInt(0), Ss.GetFld(1)); }
    }
  }
  printf("Graph: %d Nodes %d Edges\n", G->GetNodes(), G->GetEdges());

  //load attribute
  THash<TInt, TIntV> RawNIDAttrH, NIDAttrH;
  TIntStrH RawFeatNameH, FeatNameH;
  if (ANameFNm.Len() > 0) {
    TSsParser Ss(ANameFNm, ssfTabSep);
    while (Ss.Next()) {
      // Both field 0 (id) and field 1 (name) are read, so require two fields.
      if (Ss.Len() > 1) { RawFeatNameH.AddDat(Ss.GetInt(0), Ss.GetFld(1)); }
    }
  }

  // NOTE(review): NIDV has no declaration inside this function as shown; it is
  // presumably declared elsewhere in the file - confirm.
  TCesnaUtil::LoadNIDAttrHFromNIDKH(NIDV, AttrFNm, RawNIDAttrH, NodeNameH);
  TCesnaUtil::FilterLowEntropy(RawNIDAttrH, NIDAttrH, RawFeatNameH, FeatNameH, MinFeatFrac);

  TExeTm RunTm;
  TCesna CS(G, NIDAttrH, 10, 10);
  if (OptComs == -1) {
    printf("finding number of communities\n");
    OptComs = CS.FindComs(NumThreads, MaxComs, MinComs, DivComs, "", false, 0.1, StepAlpha, StepBeta);
  }

  // Small graphs or a single thread use the sequential optimizer.
  if (NumThreads == 1 || G->GetEdges() < 1000) {
    CS.MLEGradAscent(0.0001, 1000 * G->GetNodes(), "", StepAlpha, StepBeta);
  } else {
    CS.MLEGradAscentParallel(0.0001, 1000, NumThreads, "", StepAlpha, StepBeta);
  }
  CS.GetCmtyVV(EstCmtyVV, Wck);
  TAGMUtil::DumpCmtyVV(OutFPrx + "cmtyvv.txt", EstCmtyVV, NIDNameH);

  const TStr WeightsFNm = OutFPrx + "weights.txt";
  FILE* F = fopen(WeightsFNm.CStr(), "wt");
  if (F == NULL) {
    fprintf(stderr, "Cannot open %s for writing\n", WeightsFNm.CStr());
    return 1;
  }
  // Header line with the attribute names, written only when the names line up
  // with the weight columns. Wck may be empty, so check before indexing Wck[0].
  if (Wck.Len() > 0 && FeatNameH.Len() == Wck[0].Len()) {
    fprintf(F, "#");
    for (int k = 0; k < FeatNameH.Len(); k++) {
      fprintf(F, "%s", FeatNameH[k].CStr());
      if (k < FeatNameH.Len() - 1) { fprintf(F, "\t"); }
    }
    fprintf(F, "\n");
  }
  // One tab-separated line of attribute weights per community.
  for (int c = 0; c < Wck.Len(); c++) {
    for (int k = 0; k < Wck[c].Len(); k++) {
      fprintf(F, "%f", Wck[c][k].Val);
      if (k < Wck[c].Len() - 1) { fprintf(F, "\t"); }
    }
    fprintf(F, "\n");
  }
  fclose(F);

  printf("\nrun time: %s (%s)\n", ExeTm.GetTmStr(), TSecTm::GetCurTm().GetTmStr().CStr());

  return 0;
}