void IncrementalQuoteGraph::AddNodes() {
  TIntSet::TIter QuoteIdsEnd = NewQuotes.EndI();
  for (TIntSet::TIter QuoteId = NewQuotes.BegI(); QuoteId < QuoteIdsEnd; QuoteId++) {
    if (!QGraph->IsNode(QuoteId.GetKey()))
      QGraph->AddNode(QuoteId.GetKey());
  }
}
Пример #2
0
// Returns the sum of the entries of NewLambdaV whose indices are the members of ComK.
// Every selected entry is asserted to be non-negative before it is accumulated.
double TAGMFit::SelectLambdaSum(const TFltV& NewLambdaV, const TIntSet& ComK) {
  double Total = 0.0;
  TIntSet::TIter SI = ComK.BegI();
  while (SI < ComK.EndI()) {
    IAssert(NewLambdaV[SI.GetKey()] >= 0);
    Total += NewLambdaV[SI.GetKey()];
    SI++;
  }
  return Total;
}
Пример #3
0
// Initializes the per-node and per-community state used by the fitting procedure:
//  - removes self-edges from G and creates an (empty) entry in NIDComVH for every node,
//  - fills the node -> community membership from CIDNSetV and calls GetEdgeJointCom(),
//  - sets LambdaV[c] to -log(1 - ComEdgesV[c] / MaxE), clamped to [MinLambda, MaxLambda],
//    where MaxE is the number of node pairs in community c,
//  - registers every (node id, community id) pair in NIDCIDPrS.
void TAGMFit::InitNodeData() {
  TSnap::DelSelfEdges(G);
  NIDComVH.Gen(G->GetNodes());
  for (TUNGraph::TNodeI NI = G->BegNI(); NI < G->EndNI(); NI++) {
    NIDComVH.AddDat(NI.GetId());
  }
  TAGMUtil::GetNodeMembership(NIDComVH, CIDNSetV);
  GetEdgeJointCom();
  LambdaV.Gen(CIDNSetV.Len());
  for (int c = 0; c < CIDNSetV.Len(); c++) {
    // The pair count is computed in double: the int product Len * (Len - 1)
    // overflows once a community has more than ~46k members.
    const double ComSize = CIDNSetV[c].Len();
    const double MaxE = ComSize * (ComSize - 1.0) / 2.0;
    if (MaxE < 2) {
      // Communities with fewer than two node pairs get the largest allowed value.
      LambdaV[c] = MaxLambda;
    }
    else{
      LambdaV[c] = -log((MaxE - ComEdgesV[c]) / MaxE);
    }
    // Keep the initial value inside the permitted range.
    if (LambdaV[c] > MaxLambda) {  LambdaV[c] = MaxLambda;  }
    if (LambdaV[c] < MinLambda) {  LambdaV[c] = MinLambda;  }
  }
  NIDCIDPrS.Gen(G->GetNodes() * 10);
  for (int c = 0; c < CIDNSetV.Len(); c++) {
    for (TIntSet::TIter SI = CIDNSetV[c].BegI(); SI < CIDNSetV[c].EndI(); SI++) {
      NIDCIDPrS.AddKey(TIntPr(SI.GetKey(), c));
    }
  }
}
Пример #4
0
// Builds the node -> communities mapping: for each community index in CmtyVV,
// that index is inserted into the set stored in NIDComVH for every member node.
void TAGMUtil::GetNodeMembership(THash<TInt,TIntSet >& NIDComVH, const TVec<TIntSet>& CmtyVV) {
    const int NumCmty = CmtyVV.Len();
    int CID = 0;
    while (CID < NumCmty) {
        const TIntSet& Members = CmtyVV[CID];
        for (TIntSet::TIter MI = Members.BegI(); MI < Members.EndI(); MI++) {
            // AddDat yields the set kept for this node id; the community index goes into it.
            TIntSet& NodeComs = NIDComVH.AddDat(MI.GetKey());
            NodeComs.AddKey(CID);
        }
        CID++;
    }
}
Пример #5
0
// Computes banded min-hash signatures for the quotes and buckets quotes by band signature.
// (The original author notes that this routine was copied and pasted from similar code.)
//
//   ShingleToQuoteIds     maps each shingle to the ids of the quotes that contain it.
//   SignatureBandBuckets  receives one hash per band (NumBands in total); each maps a
//                         band signature to the set of quote ids sharing that signature.
//
// For each of the BandSize rows of a band the shingle keys are shuffled, and the value a
// quote gets for that row is the position of the first shuffled shingle that contains it.
// Progress is reported through Err().
void LSH::MinHash(THash<TMd5Sig, TIntSet>& ShingleToQuoteIds,
    TVec<THash<TIntV, TIntSet> >& SignatureBandBuckets) {
  TRnd RandomGenerator; // TODO: make this "more random" by incorporating time
  for (int i = 0; i < NumBands; ++i) {
    THash < TInt, TIntV > Inverted; // (QuoteID, QuoteSignatureForBand)
    THash < TIntV, TIntSet > BandBuckets; // (BandSignature, QuoteIDs)
    for (int j = 0; j < BandSize; ++j) {
      // Create new signature: a fresh random ordering of all shingles.
      TVec < TMd5Sig > Signature;
      ShingleToQuoteIds.GetKeyV(Signature);
      Signature.Shuffle(RandomGenerator);

      const int SigLen = Signature.Len();
      for (int k = 0; k < SigLen; ++k) {
        // Bound by reference: copying the whole quote-id set for every shingle is wasted work.
        const TIntSet& CurSet = ShingleToQuoteIds.GetDat(Signature[k]);
        for (TIntSet::TIter l = CurSet.BegI(); l < CurSet.EndI(); l++) {
          const TInt& Key = l.GetKey();
          if (Inverted.IsKey(Key)) {
            // Extend the stored signature in place, and only if row j is still missing,
            // so that the smallest position k wins for this row.
            TIntV& CurSignature = Inverted.GetDat(Key);
            if (CurSignature.Len() <= j) {
              CurSignature.Add(k);
            }
          } else {
            // First time this quote is seen in the band: start its signature with k.
            Inverted.AddDat(Key).Add(k);
          }
        }
      }
    }

    // Group quotes whose band signatures are identical.
    TIntV InvertedKeys;
    Inverted.GetKeyV(InvertedKeys);
    const int InvertedLen = InvertedKeys.Len();
    for (int k = 0; k < InvertedLen; ++k) {
      const TIntV& QuoteSignature = Inverted.GetDat(InvertedKeys[k]);
      // AddDat(Key) returns the bucket for this signature, so the quote id is added
      // to it directly instead of copying the bucket out and writing it back.
      BandBuckets.AddDat(QuoteSignature).AddKey(InvertedKeys[k]);
    }

    SignatureBandBuckets.Add(BandBuckets);
    Err("%d out of %d band signatures computed\n", i + 1, NumBands);
  }
  Err("Minhash step complete!\n");
}
// For every node in AffectedNodes that has more than one outgoing edge, keeps the edge
// to the best-scoring child and deletes the other outgoing edges from G.
//
//   G   graph whose edges are pruned.
//   QB  quote base used to look up the quote of a node id; nodes whose quote cannot be
//       fetched are skipped (as source) or never selected (as child).
//   DB  document base forwarded to ComputeEdgeScore.
//
// Children that are not in NewQuotes ("old" nodes) take priority: when at least one old
// child exists, children that are in NewQuotes keep a score of 0.
// NOTE(review): the winner starts out as id 0 with score 0, so when no child scores
// above 0 every outgoing edge except one to node 0 is removed - confirm this is intended.
void TIncrementalClustering::KeepAtMostOneChildPerNode(PNGraph& G, TQuoteBase *QB, TDocBase *DB) {
  TIntSet::TIter LastAffected = AffectedNodes.EndI();
  for (TIntSet::TIter Cur = AffectedNodes.BegI(); Cur < LastAffected; Cur++) {
    TNGraph::TNodeI Parent = G->GetNI(Cur.GetKey());
    TQuote ParentQuote;
    if (!QB->GetQuote(Parent.GetId(), ParentQuote)) {
      continue;
    }
    const int OutDeg = Parent.GetOutDeg();
    if (OutDeg <= 1) {
      continue;  // nothing to prune
    }

    // Pass 1: find out whether any child predates the current batch of quotes.
    bool HasOldChild = false;
    for (int i = 0; i < OutDeg; ++i) {
      if (!NewQuotes.IsKey(Parent.GetOutNId(i))) {
        HasOldChild = true;
      }
    }

    // Pass 2: snapshot the children and pick the highest-scoring eligible one.
    TFlt BestScore = 0;
    TInt BestChild = 0;
    TIntV Children;
    for (int i = 0; i < OutDeg; ++i) {
      TInt Child = Parent.GetOutNId(i);
      Children.Add(Child);
      TQuote ChildQuote;
      if (!QB->GetQuote(Child, ChildQuote)) {
        continue;
      }
      TFlt Score = 0;
      // A child from NewQuotes is only scored when no old child is available.
      const bool Filtered = HasOldChild && NewQuotes.IsKey(Child);
      if (!Filtered) {
        Score = ComputeEdgeScore(ParentQuote, ChildQuote, DB);
      }
      if (Score > BestScore) {
        BestScore = Score;
        BestChild = Child;
      }
    }

    // Delete every edge except the winner; iterating over the snapshot (not the live
    // adjacency list) keeps the indices stable while edges are being removed.
    const int NumChildren = Children.Len();
    for (int i = 0; i < NumChildren; i++) {
      if (Children[i] != BestChild) {
        G->DelEdge(Parent.GetId(), Children[i]);
      }
    }
  }
  fprintf(stderr, "finished deleting edges\n");
}
Пример #7
0
// Gradient of likelihood for P_c.
// Fills GradV (resized to LambdaV.Len()) with one entry per community k:
//   GradV[k] = sum over edges whose joint community set contains k of (1 - Puv) / Puv
//              - (number of node pairs in community k that are not edges),
// and additionally subtracts RegCoef when both LambdaV[k] and RegCoef are positive.
// Puv is 1 - exp(-sum of lambdas of the edge's joint communities), or PNoCom when the
// edge has no joint community.
void TAGMFit::GradLogLForLambda(TFltV& GradV) {
  GradV.Gen(LambdaV.Len());
  TFltV SumEdgeProbsV(LambdaV.Len());
  for (int e = 0; e < EdgeComVH.Len(); e++) {
    TIntSet& JointCom = EdgeComVH[e];
    double LambdaSum = SelectLambdaSum(JointCom);
    double Puv = 1 - exp(- LambdaSum);
    if (JointCom.Len() == 0) {  Puv = PNoCom;  }
    // The contribution is the same for every community of this edge, so compute it once.
    const double EdgeTerm = (1 - Puv) / Puv;
    for (TIntSet::TIter SI = JointCom.BegI(); SI < JointCom.EndI(); SI++) {
      SumEdgeProbsV[SI.GetKey()] += EdgeTerm;
    }
  }
  for (int k = 0; k < LambdaV.Len(); k++) {
    // Pair counts are kept in double: the int product Len * (Len - 1) overflows once a
    // community has more than ~46k members.
    const double ComSize = CIDNSetV[k].Len();
    const double MaxEk = ComSize * (ComSize - 1.0) / 2.0;
    const double NotEdgesInCom = MaxEk - ComEdgesV[k];
    GradV[k] = SumEdgeProbsV[k] - NotEdgesInCom;
    if (LambdaV[k] > 0.0 && RegCoef > 0.0) { //if regularization exists
      GradV[k] -= RegCoef;
    }
  }
}
Пример #8
0
/// Newton method: DEPRECATED
/// Visits the nodes in shuffled order and, for each node and each candidate community,
/// runs up to 10 clamped Newton steps on that single membership value, then stores the
/// result with AddCom (or removes the membership with DelCom when it reaches 0).
/// @param Thres   relative improvement threshold: the loop stops once
///                CurL - PrevL <= Thres * fabs(PrevL).
/// @param MaxIter upper bound on the visit counter; it is only tested between full passes
///                over the nodes, so the returned count can exceed it.
/// @param PlotNm  when non-empty, likelihood samples are recorded, progress is printed, and
///                the samples are plotted to PlotNm + ".likelihood_Q".
/// @return the number of node visits performed (the final value of iter).
int TAGMFast::MLENewton(const double& Thres, const int& MaxIter, const TStr PlotNm) {
  TExeTm ExeTm;
  int iter = 0, PrevIter = 0;
  TIntFltPrV IterLV; // (iteration, likelihood) samples, filled only when PlotNm is non-empty
  double PrevL = TFlt::Mn, CurL;
  TUNGraph::TNodeI UI;
  TIntV NIdxV;
  G->GetNIdV(NIdxV);
  int CID, UID, NewtonIter;
  // NOTE(review): PrevFuc is assigned below but never read in this function.
  double Fuc, PrevFuc, Grad, H;
  while(iter < MaxIter) {
    NIdxV.Shuffle(Rnd);
    // NOTE(review): NIdxV is indexed up to F.Len() and F / HOVIDSV are indexed by node id -
    // this presumably relies on F.Len() == number of nodes and dense ids 0..N-1; confirm.
    for (int ui = 0; ui < F.Len(); ui++, iter++) {
      // Sample the likelihood once every G->GetNodes() visits when plotting is requested.
      if (! PlotNm.Empty() && iter % G->GetNodes() == 0) {
        IterLV.Add(TIntFltPr(iter, Likelihood(false)));
      }
      UID = NIdxV[ui];
      //find set of candidate c (we only need to consider c to which a neighbor of u belongs to)
      TIntSet CIDSet;
      UI = G->GetNI(UID);
      if (UI.GetDeg() == 0) { //if the node is isolated, clear its membership and skip
        if (! F[UID].Empty()) { F[UID].Clr(); }
        continue;
      }
      for (int e = 0; e < UI.GetDeg(); e++) {
        // Neighbors listed in HOVIDSV[UID] are skipped.
        if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
        TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)];
        for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) {
          CIDSet.AddKey(CI.GetKey());
        }
      }
      // NOTE(review): DelCom(UID, ...) is called while F[UID] is being iterated -
      // presumably safe for this container; confirm against DelCom's implementation.
      for (TIntFltH::TIter CI = F[UID].BegI(); CI < F[UID].EndI(); CI++) { //remove the community membership which U does not share with its neighbors
        if (! CIDSet.IsKey(CI.GetKey())) {
          DelCom(UID, CI.GetKey());
        }
      }
      if (CIDSet.Empty()) { continue; }
      for (TIntSet::TIter CI = CIDSet.BegI(); CI < CIDSet.EndI(); CI++) {
        CID = CI.GetKey();
        //optimize for UID, CID
        //compute constants
        // One entry per incident edge; entries of skipped neighbors keep the vector's initial value.
        TFltV AlphaKV(UI.GetDeg());
        for (int e = 0; e < UI.GetDeg(); e++) {
          if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
          AlphaKV[e] = (1 - PNoCom) * exp(- DotProduct(UID, UI.GetNbrNId(e)) + GetCom(UI.GetNbrNId(e), CID) * GetCom(UID, CID));
          IAssertR(AlphaKV[e] <= 1.0, TStr::Fmt("AlphaKV=%f, %f, %f", AlphaKV[e].Val, PNoCom.Val, GetCom(UI.GetNbrNId(e), CID)));
        }
        Fuc = GetCom(UID, CID);
        PrevFuc = Fuc;
        Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0;
        // Skip this community when the gradient lies in [-0.1, 1e-3].
        if (Grad <= 1e-3 && Grad >= -0.1) { continue; }
        NewtonIter = 0;
        // At most 10 Newton steps on the single variable Fuc.
        while (NewtonIter++ < 10) {
          Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0;
          H = HessianForOneVar(AlphaKV, UID, CID, Fuc);
          // At the lower bound Fuc == 0 a non-positive gradient is treated as zero.
          if (Fuc == 0.0 && Grad <= 0.0) { Grad = 0.0; }
          if (fabs(Grad) < 1e-3) { break; }
          // A zero second derivative makes the step undefined; the value is reset to 0.
          if (H == 0.0) { Fuc = 0.0; break; }
          double NewtonStep = - Grad / H;
          // Limit how far a single step may decrease the value, and keep Fuc non-negative.
          if (NewtonStep < -0.5) { NewtonStep = - 0.5; }
          Fuc += NewtonStep;
          if (Fuc < 0.0) { Fuc = 0.0; }
        }
        // A value of exactly 0 means "not a member": drop the entry instead of storing it.
        if (Fuc == 0.0) {
          DelCom(UID, CID);
        }
        else {
          AddCom(UID, CID, Fuc);
        }
      }
    }
    // Convergence is tested only after at least 2 * G->GetNodes() further visits
    // and only once more than 10000 visits have been made in total.
    if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) {
      PrevIter = iter;
      CurL = Likelihood();
      if (PrevL > TFlt::Mn && ! PlotNm.Empty()) {
        printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL,  CurL - PrevL);
      }
      fflush(stdout);
      if (CurL - PrevL <= Thres * fabs(PrevL)) { break; }
      else { PrevL = CurL; }
    }
    
  }
  if (! PlotNm.Empty()) {
    printf("\nMLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr());
    TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
  }
  return iter;
}