// I embarassingly don't know how templating works. void QuoteGraph::CompareUsingMinHash(TVec<THash<TMd5Sig, TIntSet> >& BucketsVector) { THashSet<TIntPr> EdgeCache; int Count = 0; int RealCount = 0; Err("Beginning edge creation step...\n"); for (int i = 0; i < BucketsVector.Len(); i++) { Err("Processing band signature %d of %d - %d signatures\n", i+1, BucketsVector.Len(), BucketsVector[i].Len()); TVec<TMd5Sig> Buckets; BucketsVector[i].GetKeyV(Buckets); TVec<TMd5Sig>::TIter BucketEnd = Buckets.EndI(); for (TVec<TMd5Sig>::TIter BucketSig = Buckets.BegI(); BucketSig < BucketEnd; BucketSig++) { TIntSet Bucket = BucketsVector[i].GetDat(*BucketSig); Count += Bucket.Len() * (Bucket.Len() - 1) / 2; for (TIntSet::TIter Quote1 = Bucket.BegI(); Quote1 < Bucket.EndI(); Quote1++) { TIntSet::TIter Quote1Copy = Quote1; Quote1Copy++; for (TIntSet::TIter Quote2 = Quote1Copy; Quote2 < Bucket.EndI(); Quote2++) { if (!EdgeCache.IsKey(TIntPr(Quote1.GetKey(), Quote2.GetKey())) && !EdgeCache.IsKey(TIntPr(Quote2.GetKey(), Quote1.GetKey()))) { EdgeCache.AddKey(TIntPr(Quote1.GetKey(), Quote2.GetKey())); EdgeCache.AddKey(TIntPr(Quote2.GetKey(), Quote1.GetKey())); RealCount++; AddEdgeIfSimilar(Quote1.GetKey(), Quote2.GetKey()); } } } } } fprintf(stderr, "NUMBER OF COMPARES: %d\n", Count); fprintf(stderr, "NUMBER OF REAL COMPARES: %d\n", RealCount); }
// Runs AddEdgeIfSimilar on every pair of quote ids that share at least one
// shingle. A pair that shares several shingles is only compared once.
//
// Shingles: maps a shingle signature to the set of quote ids containing it.
//
// Reports progress every 100 shingles and finally prints to stderr the total
// number of candidate pairs ("COMPARES") and the number of distinct pairs
// actually compared ("REAL COMPARES").
void QuoteGraph::CompareUsingShingles(THash<TMd5Sig, TIntSet>& Shingles) {
  // 64-bit: Len * (Len - 1) / 2 overflows a 32-bit int for large buckets.
  long long Count = 0;
  int RealCount = 0;
  TVec<TMd5Sig> ShingleKeys;
  Shingles.GetKeyV(ShingleKeys);
  // Each unordered pair is cached once as (smaller id, larger id).
  THashSet<TIntPr> EdgeCache;
  for (int i = 0; i < ShingleKeys.Len(); i++) {
    if (i % 100 == 0) {
      Err("Processed %d out of %d shingles, count = %lld\n", i, ShingleKeys.Len(), Count);
    }
    // The keys were just read from Shingles, so the lookup always succeeds;
    // bind by reference instead of copying the whole bucket.
    const TIntSet& Bucket = Shingles.GetDat(ShingleKeys[i]);
    const TIntSet::TIter BucketEnd = Bucket.EndI();
    for (TIntSet::TIter Quote1 = Bucket.BegI(); Quote1 < BucketEnd; Quote1++) {
      TIntSet::TIter Quote2 = Quote1;
      Quote2++;
      for (; Quote2 < BucketEnd; Quote2++) {
        const int Id1 = Quote1.GetKey();
        const int Id2 = Quote2.GetKey();
        const TIntPr Edge = Id1 < Id2 ? TIntPr(Id1, Id2) : TIntPr(Id2, Id1);
        if (EdgeCache.IsKey(Edge)) { continue; }
        EdgeCache.AddKey(Edge);
        RealCount++;
        AddEdgeIfSimilar(Id1, Id2);
      }
    }
    const long long BucketLen = Bucket.Len();
    Count += BucketLen * (BucketLen - 1) / 2;
  }
  fprintf(stderr, "NUMBER OF COMPARES: %lld\n", Count);
  fprintf(stderr, "NUMBER OF REAL COMPARES: %d\n", RealCount);
}
// Compute the empirical edge probability between a pair of nodes who share no community (epsilon), based on current community affiliations. double TAGMFit::CalcPNoComByCmtyVV(const int& SamplePairs) { TIntV NIdV; G->GetNIdV(NIdV); uint64 PairNoCom = 0, EdgesNoCom = 0; for (int u = 0; u < NIdV.Len(); u++) { for (int v = u + 1; v < NIdV.Len(); v++) { int SrcNID = NIdV[u], DstNID = NIdV[v]; TIntSet JointCom; TAGMUtil::GetIntersection(NIDComVH.GetDat(SrcNID),NIDComVH.GetDat(DstNID),JointCom); if(JointCom.Len() == 0) { PairNoCom++; if (G->IsEdge(SrcNID, DstNID)) { EdgesNoCom++; } if (SamplePairs > 0 && PairNoCom >= (uint64) SamplePairs) { break; } } } if (SamplePairs > 0 && PairNoCom >= (uint64) SamplePairs) { break; } } double DefaultVal = 1.0 / (double)G->GetNodes() / (double)G->GetNodes(); if (EdgesNoCom > 0) { PNoCom = (double) EdgesNoCom / (double) PairNoCom; } else { PNoCom = DefaultVal; } printf("%s / %s edges without joint com detected (PNoCom = %f)\n", TUInt64::GetStr(EdgesNoCom).CStr(), TUInt64::GetStr(PairNoCom).CStr(), PNoCom.Val); return PNoCom; }
double TAGMUtil::GetConductance(const PUNGraph& Graph, const TIntSet& CmtyS, const int Edges) { const int Edges2 = Edges >= 0 ? 2*Edges : Graph->GetEdges(); int Vol = 0, Cut = 0; double Phi = 0.0; for (int i = 0; i < CmtyS.Len(); i++) { if (! Graph->IsNode(CmtyS[i])) { continue; } TUNGraph::TNodeI NI = Graph->GetNI(CmtyS[i]); for (int e = 0; e < NI.GetOutDeg(); e++) { if (! CmtyS.IsKey(NI.GetOutNId(e))) { Cut += 1; } } Vol += NI.GetOutDeg(); } // get conductance if (Vol != Edges2) { if (2 * Vol > Edges2) { Phi = Cut / double (Edges2 - Vol); } else if (Vol == 0) { Phi = 0.0; } else { Phi = Cut / double(Vol); } } else { if (Vol == Edges2) { Phi = 1.0; } } return Phi; }
void TNetInfBs::SavePajek(const TStr& OutFNm) { TIntSet NIdSet; FILE *F = fopen(OutFNm.CStr(), "wt"); fprintf(F, "*Vertices %d\r\n", NIdSet.Len()); for (THash<TInt, TNodeInfo>::TIter NI = NodeNmH.BegI(); NI < NodeNmH.EndI(); NI++) { const TNodeInfo& I = NI.GetDat(); fprintf(F, "%d \"%s\" ic Blue x_fact %f y_fact %f\r\n", NI.GetKey().Val, I.Name.CStr(), TMath::Mx<double>(log((double)I.Vol)-5,1), TMath::Mx<double>(log((double)I.Vol)-5,1)); } fprintf(F, "*Arcs\r\n"); for (TNGraph::TEdgeI EI = Graph->BegEI(); EI < Graph->EndEI(); EI++) { fprintf(F, "%d %d 1\r\n", EI.GetSrcNId(), EI.GetDstNId()); } fclose(F); }
/// Barabasi-Albert model of scale-free graphs. /// The graph has power-law degree distribution. /// See: Emergence of scaling in random networks by Barabasi and Albert. /// URL: http://arxiv.org/abs/cond-mat/9910332 PUNGraph GenPrefAttach(const int& Nodes, const int& NodeOutDeg, TRnd& Rnd) { PUNGraph GraphPt = PUNGraph::New(); TUNGraph& Graph = *GraphPt; Graph.Reserve(Nodes, NodeOutDeg*Nodes); TIntV NIdV(NodeOutDeg*Nodes, 0); // first edge Graph.AddNode(0); Graph.AddNode(1); NIdV.Add(0); NIdV.Add(1); Graph.AddEdge(0, 1); TIntSet NodeSet; for (int node = 2; node < Nodes; node++) { NodeSet.Clr(false); while (NodeSet.Len() < NodeOutDeg && NodeSet.Len() < node) { NodeSet.AddKey(NIdV[TInt::Rnd.GetUniDevInt(NIdV.Len())]); } const int N = Graph.AddNode(); for (int i = 0; i < NodeSet.Len(); i++) { Graph.AddEdge(N, NodeSet[i]); NIdV.Add(N); NIdV.Add(NodeSet[i]); } } return GraphPt; }
// Builds an inverted index from word shingles to quotes: every token of every
// quote's parsed content is hashed to a TMd5Sig, and the quote id is added to
// the set stored under that signature.
//
// QuoteBase: source of the quote ids and quotes.
// ShingleToQuoteIds: output map, signature -> ids of quotes containing the
//                    word. Existing entries are extended, not cleared.
//
// As a diagnostic, up to 100 words are listed on the error log together with
// the number of quotes containing them, in the order produced by
// TCmpSetByLen(false, ...).
void LSH::WordHashing(TQuoteBase *QuoteBase, THash<TMd5Sig, TIntSet>& ShingleToQuoteIds) {
  fprintf(stderr, "Hashing shingles using words...\n");
  TIntV QuoteIds;
  QuoteBase->GetAllQuoteIds(QuoteIds);
  // Same index keyed by the plain word; only used for the diagnostic listing.
  THash<TStr, TIntSet> Temp;
  for (int qt = 0; qt < QuoteIds.Len(); qt++) {
    if (qt % 1000 == 0) {
      fprintf(stderr, "%d out of %d completed\n", qt, QuoteIds.Len());
    }
    TQuote Q;
    QuoteBase->GetQuote(QuoteIds[qt], Q);
    TStrV Content;
    Q.GetParsedContent(Content);
    const int ContentLen = Content.Len();
    for (int i = 0; i < ContentLen; i++) {
      const TMd5Sig ShingleMd5(Content[i]);
      // AddDat(Key) returns a reference to the (possibly new) set, so the id is
      // inserted in place instead of copying the whole set out and back in.
      ShingleToQuoteIds.AddDat(ShingleMd5).AddKey(QuoteIds[qt]);
      ///// COMMENT OUT LATER
      Temp.AddDat(Content[i]).AddKey(QuoteIds[qt]);
    }
  }

  TVec<TStr> ShingleKeys;
  Temp.GetKeyV(ShingleKeys);
  ShingleKeys.SortCmp(TCmpSetByLen(false, &Temp));
  // There may be fewer than 100 distinct words; never index past the end.
  const int NumToShow = ShingleKeys.Len() < 100 ? ShingleKeys.Len() : 100;
  for (int i = 0; i < NumToShow; i++) {
    const TIntSet& TempSet = Temp.GetDat(ShingleKeys[i]);
    Err("%d: %s - %d \n", i, ShingleKeys[i].CStr(), TempSet.Len());
  }
  Err("Done with word hashing!\n");
}
// For each (u, v) in edges, precompute C_uv (the set of communities u and v share). void TAGMFit::GetEdgeJointCom() { ComEdgesV.Gen(CIDNSetV.Len()); EdgeComVH.Gen(G->GetEdges()); for (TUNGraph::TNodeI SrcNI = G->BegNI(); SrcNI < G->EndNI(); SrcNI++) { int SrcNID = SrcNI.GetId(); for (int v = 0; v < SrcNI.GetDeg(); v++) { int DstNID = SrcNI.GetNbrNId(v); if (SrcNID >= DstNID) { continue; } TIntSet JointCom; IAssert(NIDComVH.IsKey(SrcNID)); IAssert(NIDComVH.IsKey(DstNID)); TAGMUtil::GetIntersection(NIDComVH.GetDat(SrcNID), NIDComVH.GetDat(DstNID), JointCom); EdgeComVH.AddDat(TIntPr(SrcNID,DstNID),JointCom); for (int k = 0; k < JointCom.Len(); k++) { ComEdgesV[JointCom[k]]++; } } } IAssert(EdgeComVH.Len() == G->GetEdges()); }
void TAGMFast::GradientForRow(const int UID, TIntFltH& GradU, const TIntSet& CIDSet) { GradU.Gen(CIDSet.Len()); TFltV HOSumFV; //adjust for Fv of v hold out if (HOVIDSV[UID].Len() > 0) { HOSumFV.Gen(SumFV.Len()); for (int e = 0; e < HOVIDSV[UID].Len(); e++) { for (int c = 0; c < SumFV.Len(); c++) { HOSumFV[c] += GetCom(HOVIDSV[UID][e], c); } } } TUNGraph::TNodeI NI = G->GetNI(UID); int Deg = NI.GetDeg(); TFltV PredV(Deg), GradV(CIDSet.Len()); TIntV CIDV(CIDSet.Len()); if (DoParallel && Deg + CIDSet.Len() > 10) { #pragma omp parallel for schedule(static, 1) for (int e = 0; e < Deg; e++) { if (NI.GetNbrNId(e) == UID) { continue; } if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; } PredV[e] = Prediction(UID, NI.GetNbrNId(e)); } #pragma omp parallel for schedule(static, 1) for (int c = 0; c < CIDSet.Len(); c++) { int CID = CIDSet.GetKey(c); double Val = 0.0; for (int e = 0; e < Deg; e++) { int VID = NI.GetNbrNId(e); if (VID == UID) { continue; } if (HOVIDSV[UID].IsKey(VID)) { continue; } Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID); } double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID)); CIDV[c] = CID; GradV[c] = Val; } } else { for (int e = 0; e < Deg; e++) { if (NI.GetNbrNId(e) == UID) { continue; } if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; } PredV[e] = Prediction(UID, NI.GetNbrNId(e)); } for (int c = 0; c < CIDSet.Len(); c++) { int CID = CIDSet.GetKey(c); double Val = 0.0; for (int e = 0; e < Deg; e++) { int VID = NI.GetNbrNId(e); if (VID == UID) { continue; } if (HOVIDSV[UID].IsKey(VID)) { continue; } Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID); } double HOSum = HOVIDSV[UID].Len() > 0? 
HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID)); CIDV[c] = CID; GradV[c] = Val; } } //add regularization if (RegCoef > 0.0) { //L1 for (int c = 0; c < GradV.Len(); c++) { GradV[c] -= RegCoef; } } if (RegCoef < 0.0) { //L2 for (int c = 0; c < GradV.Len(); c++) { GradV[c] += 2 * RegCoef * GetCom(UID, CIDV[c]); } } for (int c = 0; c < GradV.Len(); c++) { if (GetCom(UID, CIDV[c]) == 0.0 && GradV[c] < 0.0) { continue; } if (fabs(GradV[c]) < 0.0001) { continue; } GradU.AddDat(CIDV[c], GradV[c]); } for (int c = 0; c < GradU.Len(); c++) { if (GradU[c] >= 10) { GradU[c] = 10; } if (GradU[c] <= -10) { GradU[c] = -10; } IAssert(GradU[c] >= -10); } }