Esempio n. 1
0
// I embarassingly don't know how templating works.
void QuoteGraph::CompareUsingMinHash(TVec<THash<TMd5Sig, TIntSet> >& BucketsVector) {
  THashSet<TIntPr> EdgeCache;
  int Count = 0;
  int RealCount = 0;

  Err("Beginning edge creation step...\n");
  for (int i = 0; i < BucketsVector.Len(); i++) {
    Err("Processing band signature %d of %d - %d signatures\n", i+1, BucketsVector.Len(), BucketsVector[i].Len());
    TVec<TMd5Sig> Buckets;
    BucketsVector[i].GetKeyV(Buckets);
    TVec<TMd5Sig>::TIter BucketEnd = Buckets.EndI();
    for (TVec<TMd5Sig>::TIter BucketSig = Buckets.BegI(); BucketSig < BucketEnd; BucketSig++) {
      TIntSet Bucket  = BucketsVector[i].GetDat(*BucketSig);
      Count += Bucket.Len() * (Bucket.Len() - 1) / 2;
      for (TIntSet::TIter Quote1 = Bucket.BegI(); Quote1 < Bucket.EndI(); Quote1++) {
        TIntSet::TIter Quote1Copy = Quote1;
        Quote1Copy++;
        for (TIntSet::TIter Quote2 = Quote1Copy; Quote2 < Bucket.EndI(); Quote2++) {
          if (!EdgeCache.IsKey(TIntPr(Quote1.GetKey(), Quote2.GetKey())) && !EdgeCache.IsKey(TIntPr(Quote2.GetKey(), Quote1.GetKey()))) {
            EdgeCache.AddKey(TIntPr(Quote1.GetKey(), Quote2.GetKey()));
            EdgeCache.AddKey(TIntPr(Quote2.GetKey(), Quote1.GetKey()));
            RealCount++;
            AddEdgeIfSimilar(Quote1.GetKey(), Quote2.GetKey());
          }
        }
      }
    }
  }
  fprintf(stderr, "NUMBER OF COMPARES: %d\n", Count);
  fprintf(stderr, "NUMBER OF REAL COMPARES: %d\n", RealCount);
}
Esempio n. 2
0
void QuoteGraph::CompareUsingShingles(THash<TMd5Sig, TIntSet>& Shingles) {
  int Count = 0;
  int RealCount = 0;
  TVec<TMd5Sig> ShingleKeys;
  Shingles.GetKeyV(ShingleKeys);
  THashSet<TIntPr> EdgeCache;

  for (int i = 0; i < ShingleKeys.Len(); i++) {
    if (i % 100 == 0) {
      Err("Processed %d out of %d shingles, count = %d\n", i, ShingleKeys.Len(), Count);
    }
    TIntSet Bucket;
    Shingles.IsKeyGetDat(ShingleKeys[i], Bucket);

    for (TIntSet::TIter Quote1 = Bucket.BegI(); Quote1 < Bucket.EndI(); Quote1++) {
      TIntSet::TIter Quote1Copy = Quote1;
      Quote1Copy++;
      for (TIntSet::TIter Quote2 = Quote1Copy; Quote2 < Bucket.EndI(); Quote2++) {
        if (!EdgeCache.IsKey(TIntPr(Quote1.GetKey(), Quote2.GetKey())) && !EdgeCache.IsKey(TIntPr(Quote2.GetKey(), Quote1.GetKey()))) {
          EdgeCache.AddKey(TIntPr(Quote1.GetKey(), Quote2.GetKey()));
          EdgeCache.AddKey(TIntPr(Quote2.GetKey(), Quote1.GetKey()));
          RealCount++;
          AddEdgeIfSimilar(Quote1.GetKey(), Quote2.GetKey());
        }
      }
    }
    int Len = Bucket.Len() * (Bucket.Len() - 1) / 2;
    Count += Len;
  }
  fprintf(stderr, "NUMBER OF COMPARES: %d\n", Count);
  fprintf(stderr, "NUMBER OF REAL COMPARES: %d\n", RealCount);
}
Esempio n. 3
0
// Compute the empirical edge probability between a pair of nodes who share no community (epsilon), based on current community affiliations.
double TAGMFit::CalcPNoComByCmtyVV(const int& SamplePairs) {
  TIntV NIdV;
  G->GetNIdV(NIdV);
  uint64 PairNoCom = 0, EdgesNoCom = 0;
  for (int u = 0; u < NIdV.Len(); u++) {
    for (int v = u + 1; v < NIdV.Len(); v++) {
      int SrcNID = NIdV[u], DstNID = NIdV[v];
      TIntSet JointCom;
      TAGMUtil::GetIntersection(NIDComVH.GetDat(SrcNID),NIDComVH.GetDat(DstNID),JointCom);
      if(JointCom.Len() == 0) {
        PairNoCom++;
        if (G->IsEdge(SrcNID, DstNID)) { EdgesNoCom++; }
        if (SamplePairs > 0 && PairNoCom >= (uint64) SamplePairs) { break; }
      }
    }
    if (SamplePairs > 0 && PairNoCom >= (uint64) SamplePairs) { break; }
  }
  double DefaultVal = 1.0 / (double)G->GetNodes() / (double)G->GetNodes();
  if (EdgesNoCom > 0) {
    PNoCom = (double) EdgesNoCom / (double) PairNoCom;
  } else {
    PNoCom = DefaultVal;
  }
  printf("%s / %s edges without joint com detected (PNoCom = %f)\n", TUInt64::GetStr(EdgesNoCom).CStr(), TUInt64::GetStr(PairNoCom).CStr(), PNoCom.Val);
  return PNoCom;
}
Esempio n. 4
0
double TAGMUtil::GetConductance(const PUNGraph& Graph, const TIntSet& CmtyS, const int Edges) {
    const int Edges2 = Edges >= 0 ? 2*Edges : Graph->GetEdges();
    int Vol = 0,  Cut = 0;
    double Phi = 0.0;
    for (int i = 0; i < CmtyS.Len(); i++) {
        if (! Graph->IsNode(CmtyS[i])) {
            continue;
        }
        TUNGraph::TNodeI NI = Graph->GetNI(CmtyS[i]);
        for (int e = 0; e < NI.GetOutDeg(); e++) {
            if (! CmtyS.IsKey(NI.GetOutNId(e))) {
                Cut += 1;
            }
        }
        Vol += NI.GetOutDeg();
    }
    // get conductance
    if (Vol != Edges2) {
        if (2 * Vol > Edges2) {
            Phi = Cut / double (Edges2 - Vol);
        }
        else if (Vol == 0) {
            Phi = 0.0;
        }
        else {
            Phi = Cut / double(Vol);
        }
    } else {
        if (Vol == Edges2) {
            Phi = 1.0;
        }
    }
    return Phi;
}
Esempio n. 5
0
void TNetInfBs::SavePajek(const TStr& OutFNm) {
    TIntSet NIdSet;
    FILE *F = fopen(OutFNm.CStr(), "wt");
    fprintf(F, "*Vertices %d\r\n", NIdSet.Len());
    for (THash<TInt, TNodeInfo>::TIter NI = NodeNmH.BegI(); NI < NodeNmH.EndI(); NI++) {
      const TNodeInfo& I = NI.GetDat();
      fprintf(F, "%d \"%s\" ic Blue x_fact %f y_fact %f\r\n", NI.GetKey().Val,
        I.Name.CStr(), TMath::Mx<double>(log((double)I.Vol)-5,1), TMath::Mx<double>(log((double)I.Vol)-5,1));
    }
    fprintf(F, "*Arcs\r\n");
    for (TNGraph::TEdgeI EI = Graph->BegEI(); EI < Graph->EndEI(); EI++) {
      fprintf(F, "%d %d 1\r\n", EI.GetSrcNId(), EI.GetDstNId());
    }
    fclose(F);
}
Esempio n. 6
0
/// Barabasi-Albert model of scale-free graphs.
/// The graph has power-law degree distribution.
/// See: Emergence of scaling in random networks by Barabasi and Albert.
/// URL: http://arxiv.org/abs/cond-mat/9910332
PUNGraph GenPrefAttach(const int& Nodes, const int& NodeOutDeg, TRnd& Rnd) {
  PUNGraph GraphPt = PUNGraph::New();
  TUNGraph& Graph = *GraphPt;
  Graph.Reserve(Nodes, NodeOutDeg*Nodes);
  TIntV NIdV(NodeOutDeg*Nodes, 0);
  // first edge
  Graph.AddNode(0);  Graph.AddNode(1);
  NIdV.Add(0);  NIdV.Add(1);
  Graph.AddEdge(0, 1);
  TIntSet NodeSet;
  for (int node = 2; node < Nodes; node++) {
    NodeSet.Clr(false);
    while (NodeSet.Len() < NodeOutDeg && NodeSet.Len() < node) {
      NodeSet.AddKey(NIdV[TInt::Rnd.GetUniDevInt(NIdV.Len())]);
    }
    const int N = Graph.AddNode();
    for (int i = 0; i < NodeSet.Len(); i++) {
      Graph.AddEdge(N, NodeSet[i]);
      NIdV.Add(N);
      NIdV.Add(NodeSet[i]);
    }
  }
  return GraphPt;
}
Esempio n. 7
0
void LSH::WordHashing(TQuoteBase *QuoteBase,
    THash<TMd5Sig, TIntSet>& ShingleToQuoteIds) {
  fprintf(stderr, "Hashing shingles using words...\n");
  TIntV QuoteIds;
  QuoteBase->GetAllQuoteIds(QuoteIds);

  THash<TStr, TIntSet> Temp;

  for (int qt = 0; qt < QuoteIds.Len(); qt++) {
    if (qt % 1000 == 0) {
      fprintf(stderr, "%d out of %d completed\n", qt, QuoteIds.Len());
    }
    TQuote Q;
    QuoteBase->GetQuote(QuoteIds[qt], Q);

    TStrV Content;
    Q.GetParsedContent(Content);

    int ContentLen = Content.Len();
    for (int i = 0; i < ContentLen; i++) {
      const TMd5Sig ShingleMd5(Content[i]);
      TIntSet ShingleQuoteIds;
      ShingleToQuoteIds.IsKeyGetDat(ShingleMd5, ShingleQuoteIds);
      ShingleQuoteIds.AddKey(QuoteIds[qt]);
      ShingleToQuoteIds.AddDat(ShingleMd5, ShingleQuoteIds);

      ///// COMMENT OUT LATER
      TIntSet TempSet;
      Temp.IsKeyGetDat(Content[i], TempSet);
      TempSet.AddKey(QuoteIds[qt]);
      Temp.AddDat(Content[i], TempSet);
    }
  }

  TVec<TStr> ShingleKeys;
  Temp.GetKeyV(ShingleKeys);
  ShingleKeys.SortCmp(TCmpSetByLen(false, &Temp));
  for (int i = 0; i < 100; i++) {
    TIntSet TempSet = Temp.GetDat(ShingleKeys[i]);
    Err("%d: %s - %d \n", i, ShingleKeys[i].CStr(), TempSet.Len());
  }

  Err("Done with word hashing!\n");
}
Esempio n. 8
0
// For each (u, v) in edges, precompute C_uv (the set of communities u and v share).
void TAGMFit::GetEdgeJointCom() {
  ComEdgesV.Gen(CIDNSetV.Len());
  EdgeComVH.Gen(G->GetEdges());
  for (TUNGraph::TNodeI SrcNI = G->BegNI(); SrcNI < G->EndNI(); SrcNI++) {
    int SrcNID = SrcNI.GetId();
    for (int v = 0; v < SrcNI.GetDeg(); v++) {
      int DstNID = SrcNI.GetNbrNId(v);
      if (SrcNID >= DstNID) { continue; }
      TIntSet JointCom;
      IAssert(NIDComVH.IsKey(SrcNID));
      IAssert(NIDComVH.IsKey(DstNID));
      TAGMUtil::GetIntersection(NIDComVH.GetDat(SrcNID), NIDComVH.GetDat(DstNID), JointCom);
      EdgeComVH.AddDat(TIntPr(SrcNID,DstNID),JointCom);
      for (int k = 0; k < JointCom.Len(); k++) {
        ComEdgesV[JointCom[k]]++;
      }
    }
  }
  IAssert(EdgeComVH.Len() == G->GetEdges());
}
Esempio n. 9
0
void TAGMFast::GradientForRow(const int UID, TIntFltH& GradU, const TIntSet& CIDSet) {
  GradU.Gen(CIDSet.Len());

  TFltV HOSumFV; //adjust for Fv of v hold out
  if (HOVIDSV[UID].Len() > 0) {
    HOSumFV.Gen(SumFV.Len());
    
    for (int e = 0; e < HOVIDSV[UID].Len(); e++) {
      for (int c = 0; c < SumFV.Len(); c++) {
        HOSumFV[c] += GetCom(HOVIDSV[UID][e], c);
      }
    }
  }
    
  TUNGraph::TNodeI NI = G->GetNI(UID);
  int Deg = NI.GetDeg();
  TFltV PredV(Deg), GradV(CIDSet.Len());
  TIntV CIDV(CIDSet.Len());
  if (DoParallel && Deg + CIDSet.Len() > 10) {
#pragma omp parallel for schedule(static, 1)
    for (int e = 0; e < Deg; e++) {
      if (NI.GetNbrNId(e) == UID) { continue; }
      if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; }
      PredV[e] = Prediction(UID, NI.GetNbrNId(e));
    }
  
#pragma omp parallel for schedule(static, 1)
    for (int c = 0; c < CIDSet.Len(); c++) {
      int CID = CIDSet.GetKey(c);
      double Val = 0.0;
      for (int e = 0; e < Deg; e++) {
        int VID = NI.GetNbrNId(e);
        if (VID == UID) { continue; }
        if (HOVIDSV[UID].IsKey(VID)) { continue; }
        Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID);
      }
      double HOSum = HOVIDSV[UID].Len() > 0?  HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist
      Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID));
      CIDV[c] = CID;
      GradV[c] = Val;
    }
  } 
  else {
    for (int e = 0; e < Deg; e++) {
      if (NI.GetNbrNId(e) == UID) { continue; }
      if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; }
      PredV[e] = Prediction(UID, NI.GetNbrNId(e));
    }
  
    for (int c = 0; c < CIDSet.Len(); c++) {
      int CID = CIDSet.GetKey(c);
      double Val = 0.0;
      for (int e = 0; e < Deg; e++) {
        int VID = NI.GetNbrNId(e);
        if (VID == UID) { continue; }
        if (HOVIDSV[UID].IsKey(VID)) { continue; }
        Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID);
      }
      double HOSum = HOVIDSV[UID].Len() > 0?  HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist
      Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID));
      CIDV[c] = CID;
      GradV[c] = Val;
    }
  }
  //add regularization
  if (RegCoef > 0.0) { //L1
    for (int c = 0; c < GradV.Len(); c++) {
      GradV[c] -= RegCoef; 
    }
  }
  if (RegCoef < 0.0) { //L2
    for (int c = 0; c < GradV.Len(); c++) {
      GradV[c] += 2 * RegCoef * GetCom(UID, CIDV[c]); 
    }
  }


  for (int c = 0; c < GradV.Len(); c++) {
    if (GetCom(UID, CIDV[c]) == 0.0 && GradV[c] < 0.0) { continue; }
    if (fabs(GradV[c]) < 0.0001) { continue; }
    GradU.AddDat(CIDV[c], GradV[c]);
  }
  for (int c = 0; c < GradU.Len(); c++) {
    if (GradU[c] >= 10) { GradU[c] = 10; }
    if (GradU[c] <= -10) { GradU[c] = -10; }
    IAssert(GradU[c] >= -10);
  }
}