// I embarassingly don't know how templating works.
void QuoteGraph::CompareUsingMinHash(TVec<THash<TMd5Sig, TIntSet> >& BucketsVector) {
  THashSet<TIntPr> EdgeCache;
  int Count = 0;
  int RealCount = 0;

  Err("Beginning edge creation step...\n");
  for (int i = 0; i < BucketsVector.Len(); i++) {
    Err("Processing band signature %d of %d - %d signatures\n", i+1, BucketsVector.Len(), BucketsVector[i].Len());
    TVec<TMd5Sig> Buckets;
    BucketsVector[i].GetKeyV(Buckets);
    TVec<TMd5Sig>::TIter BucketEnd = Buckets.EndI();
    for (TVec<TMd5Sig>::TIter BucketSig = Buckets.BegI(); BucketSig < BucketEnd; BucketSig++) {
      TIntSet Bucket  = BucketsVector[i].GetDat(*BucketSig);
      Count += Bucket.Len() * (Bucket.Len() - 1) / 2;
      for (TIntSet::TIter Quote1 = Bucket.BegI(); Quote1 < Bucket.EndI(); Quote1++) {
        TIntSet::TIter Quote1Copy = Quote1;
        Quote1Copy++;
        for (TIntSet::TIter Quote2 = Quote1Copy; Quote2 < Bucket.EndI(); Quote2++) {
          if (!EdgeCache.IsKey(TIntPr(Quote1.GetKey(), Quote2.GetKey())) && !EdgeCache.IsKey(TIntPr(Quote2.GetKey(), Quote1.GetKey()))) {
            EdgeCache.AddKey(TIntPr(Quote1.GetKey(), Quote2.GetKey()));
            EdgeCache.AddKey(TIntPr(Quote2.GetKey(), Quote1.GetKey()));
            RealCount++;
            AddEdgeIfSimilar(Quote1.GetKey(), Quote2.GetKey());
          }
        }
      }
    }
  }
  fprintf(stderr, "NUMBER OF COMPARES: %d\n", Count);
  fprintf(stderr, "NUMBER OF REAL COMPARES: %d\n", RealCount);
}
void QuoteGraph::CompareUsingShingles(THash<TMd5Sig, TIntSet>& Shingles) {
  int Count = 0;
  int RealCount = 0;
  TVec<TMd5Sig> ShingleKeys;
  Shingles.GetKeyV(ShingleKeys);
  THashSet<TIntPr> EdgeCache;

  for (int i = 0; i < ShingleKeys.Len(); i++) {
    if (i % 100 == 0) {
      Err("Processed %d out of %d shingles, count = %d\n", i, ShingleKeys.Len(), Count);
    }
    TIntSet Bucket;
    Shingles.IsKeyGetDat(ShingleKeys[i], Bucket);

    for (TIntSet::TIter Quote1 = Bucket.BegI(); Quote1 < Bucket.EndI(); Quote1++) {
      TIntSet::TIter Quote1Copy = Quote1;
      Quote1Copy++;
      for (TIntSet::TIter Quote2 = Quote1Copy; Quote2 < Bucket.EndI(); Quote2++) {
        if (!EdgeCache.IsKey(TIntPr(Quote1.GetKey(), Quote2.GetKey())) && !EdgeCache.IsKey(TIntPr(Quote2.GetKey(), Quote1.GetKey()))) {
          EdgeCache.AddKey(TIntPr(Quote1.GetKey(), Quote2.GetKey()));
          EdgeCache.AddKey(TIntPr(Quote2.GetKey(), Quote1.GetKey()));
          RealCount++;
          AddEdgeIfSimilar(Quote1.GetKey(), Quote2.GetKey());
        }
      }
    }
    int Len = Bucket.Len() * (Bucket.Len() - 1) / 2;
    Count += Len;
  }
  fprintf(stderr, "NUMBER OF COMPARES: %d\n", Count);
  fprintf(stderr, "NUMBER OF REAL COMPARES: %d\n", RealCount);
}
double TAGMFit::SelectLambdaSum(const TFltV& NewLambdaV, const TIntSet& ComK) {
  double Result = 0.0;
  for (TIntSet::TIter SI = ComK.BegI(); SI < ComK.EndI(); SI++) {
    IAssert(NewLambdaV[SI.GetKey()] >= 0);
    Result += NewLambdaV[SI.GetKey()];
  }
  return Result;
}
Exemple #4
0
// YES I COPIED AND PASTED CODE my section leader would be so ashamed :D
void LSH::MinHash(THash<TMd5Sig, TIntSet>& ShingleToQuoteIds,
    TVec<THash<TIntV, TIntSet> >& SignatureBandBuckets) {
  TRnd RandomGenerator; // TODO: make this "more random" by incorporating time
  for (int i = 0; i < NumBands; ++i) {
    THash < TInt, TIntV > Inverted; // (QuoteID, QuoteSignatureForBand)
    THash < TIntV, TIntSet > BandBuckets; // (BandSignature, QuoteIDs)
    for (int j = 0; j < BandSize; ++j) {
      // Create new signature
      TVec < TMd5Sig > Signature;
      ShingleToQuoteIds.GetKeyV(Signature);
      Signature.Shuffle(RandomGenerator);

      // Place in bucket - not very efficient
      int SigLen = Signature.Len();
      for (int k = 0; k < SigLen; ++k) {
        TIntSet CurSet = ShingleToQuoteIds.GetDat(Signature[k]);
        for (TIntSet::TIter l = CurSet.BegI(); l < CurSet.EndI(); l++) {
          TInt Key = l.GetKey();
          if (Inverted.IsKey(Key)) {
            TIntV CurSignature = Inverted.GetDat(Key);
            if (CurSignature.Len() <= j) {
              CurSignature.Add(k);
              Inverted.AddDat(Key, CurSignature);
            }
          } else {
            TIntV NewSignature;
            NewSignature.Add(k);
            Inverted.AddDat(Key, NewSignature);
          }
        }
      }
    }

    TIntV InvertedKeys;
    Inverted.GetKeyV(InvertedKeys);
    TInt InvertedLen = InvertedKeys.Len();
    for (int k = 0; k < InvertedLen; ++k) {
      TIntSet Bucket;
      TIntV Signature = Inverted.GetDat(InvertedKeys[k]);
      if (BandBuckets.IsKey(Signature)) {
        Bucket = BandBuckets.GetDat(Signature);
      }
      Bucket.AddKey(InvertedKeys[k]);
      BandBuckets.AddDat(Signature, Bucket);
    }

    SignatureBandBuckets.Add(BandBuckets);
    Err("%d out of %d band signatures computed\n", i + 1, NumBands);
  }
  Err("Minhash step complete!\n");
}
Exemple #5
0
/// Newton method: DEPRECATED
int TAGMFast::MLENewton(const double& Thres, const int& MaxIter, const TStr PlotNm) {
  TExeTm ExeTm;
  int iter = 0, PrevIter = 0;
  TIntFltPrV IterLV;
  double PrevL = TFlt::Mn, CurL;
  TUNGraph::TNodeI UI;
  TIntV NIdxV;
  G->GetNIdV(NIdxV);
  int CID, UID, NewtonIter;
  double Fuc, PrevFuc, Grad, H;
  while(iter < MaxIter) {
    NIdxV.Shuffle(Rnd);
    for (int ui = 0; ui < F.Len(); ui++, iter++) {
      if (! PlotNm.Empty() && iter % G->GetNodes() == 0) {
        IterLV.Add(TIntFltPr(iter, Likelihood(false)));
      }
      UID = NIdxV[ui];
      //find set of candidate c (we only need to consider c to which a neighbor of u belongs to)
      TIntSet CIDSet;
      UI = G->GetNI(UID);
      if (UI.GetDeg() == 0) { //if the node is isolated, clear its membership and skip
        if (! F[UID].Empty()) { F[UID].Clr(); }
        continue;
      }
      for (int e = 0; e < UI.GetDeg(); e++) {
        if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
        TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)];
        for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) {
          CIDSet.AddKey(CI.GetKey());
        }
      }
      for (TIntFltH::TIter CI = F[UID].BegI(); CI < F[UID].EndI(); CI++) { //remove the community membership which U does not share with its neighbors
        if (! CIDSet.IsKey(CI.GetKey())) {
          DelCom(UID, CI.GetKey());
        }
      }
      if (CIDSet.Empty()) { continue; }
      for (TIntSet::TIter CI = CIDSet.BegI(); CI < CIDSet.EndI(); CI++) {
        CID = CI.GetKey();
        //optimize for UID, CID
        //compute constants
        TFltV AlphaKV(UI.GetDeg());
        for (int e = 0; e < UI.GetDeg(); e++) {
          if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
          AlphaKV[e] = (1 - PNoCom) * exp(- DotProduct(UID, UI.GetNbrNId(e)) + GetCom(UI.GetNbrNId(e), CID) * GetCom(UID, CID));
          IAssertR(AlphaKV[e] <= 1.0, TStr::Fmt("AlphaKV=%f, %f, %f", AlphaKV[e].Val, PNoCom.Val, GetCom(UI.GetNbrNId(e), CID)));
        }
        Fuc = GetCom(UID, CID);
        PrevFuc = Fuc;
        Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0;
        if (Grad <= 1e-3 && Grad >= -0.1) { continue; }
        NewtonIter = 0;
        while (NewtonIter++ < 10) {
          Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0;
          H = HessianForOneVar(AlphaKV, UID, CID, Fuc);
          if (Fuc == 0.0 && Grad <= 0.0) { Grad = 0.0; }
          if (fabs(Grad) < 1e-3) { break; }
          if (H == 0.0) { Fuc = 0.0; break; }
          double NewtonStep = - Grad / H;
          if (NewtonStep < -0.5) { NewtonStep = - 0.5; }
          Fuc += NewtonStep;
          if (Fuc < 0.0) { Fuc = 0.0; }
        }
        if (Fuc == 0.0) {
          DelCom(UID, CID);
        }
        else {
          AddCom(UID, CID, Fuc);
        }
      }
    }
    if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) {
      PrevIter = iter;
      CurL = Likelihood();
      if (PrevL > TFlt::Mn && ! PlotNm.Empty()) {
        printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL,  CurL - PrevL);
      }
      fflush(stdout);
      if (CurL - PrevL <= Thres * fabs(PrevL)) { break; }
      else { PrevL = CurL; }
    }
    
  }
  if (! PlotNm.Empty()) {
    printf("\nMLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr());
    TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
  }
  return iter;
}