Ejemplo n.º 1
0
// YES I COPIED AND PASTED CODE my section leader would be so ashamed :D
void LSH::MinHash(THash<TMd5Sig, TIntSet>& ShingleToQuoteIds,
    TVec<THash<TIntV, TIntSet> >& SignatureBandBuckets) {
  TRnd RandomGenerator; // TODO: make this "more random" by incorporating time
  for (int i = 0; i < NumBands; ++i) {
    THash < TInt, TIntV > Inverted; // (QuoteID, QuoteSignatureForBand)
    THash < TIntV, TIntSet > BandBuckets; // (BandSignature, QuoteIDs)
    for (int j = 0; j < BandSize; ++j) {
      // Create new signature
      TVec < TMd5Sig > Signature;
      ShingleToQuoteIds.GetKeyV(Signature);
      Signature.Shuffle(RandomGenerator);

      // Place in bucket - not very efficient
      int SigLen = Signature.Len();
      for (int k = 0; k < SigLen; ++k) {
        TIntSet CurSet = ShingleToQuoteIds.GetDat(Signature[k]);
        for (TIntSet::TIter l = CurSet.BegI(); l < CurSet.EndI(); l++) {
          TInt Key = l.GetKey();
          if (Inverted.IsKey(Key)) {
            TIntV CurSignature = Inverted.GetDat(Key);
            if (CurSignature.Len() <= j) {
              CurSignature.Add(k);
              Inverted.AddDat(Key, CurSignature);
            }
          } else {
            TIntV NewSignature;
            NewSignature.Add(k);
            Inverted.AddDat(Key, NewSignature);
          }
        }
      }
    }

    TIntV InvertedKeys;
    Inverted.GetKeyV(InvertedKeys);
    TInt InvertedLen = InvertedKeys.Len();
    for (int k = 0; k < InvertedLen; ++k) {
      TIntSet Bucket;
      TIntV Signature = Inverted.GetDat(InvertedKeys[k]);
      if (BandBuckets.IsKey(Signature)) {
        Bucket = BandBuckets.GetDat(Signature);
      }
      Bucket.AddKey(InvertedKeys[k]);
      BandBuckets.AddDat(Signature, Bucket);
    }

    SignatureBandBuckets.Add(BandBuckets);
    Err("%d out of %d band signatures computed\n", i + 1, NumBands);
  }
  Err("Minhash step complete!\n");
}
Ejemplo n.º 2
0
void QuoteGraph::CompareUsingShingles(THash<TMd5Sig, TIntSet>& Shingles) {
  int Count = 0;
  int RealCount = 0;
  TVec<TMd5Sig> ShingleKeys;
  Shingles.GetKeyV(ShingleKeys);
  THashSet<TIntPr> EdgeCache;

  for (int i = 0; i < ShingleKeys.Len(); i++) {
    if (i % 100 == 0) {
      Err("Processed %d out of %d shingles, count = %d\n", i, ShingleKeys.Len(), Count);
    }
    TIntSet Bucket;
    Shingles.IsKeyGetDat(ShingleKeys[i], Bucket);

    for (TIntSet::TIter Quote1 = Bucket.BegI(); Quote1 < Bucket.EndI(); Quote1++) {
      TIntSet::TIter Quote1Copy = Quote1;
      Quote1Copy++;
      for (TIntSet::TIter Quote2 = Quote1Copy; Quote2 < Bucket.EndI(); Quote2++) {
        if (!EdgeCache.IsKey(TIntPr(Quote1.GetKey(), Quote2.GetKey())) && !EdgeCache.IsKey(TIntPr(Quote2.GetKey(), Quote1.GetKey()))) {
          EdgeCache.AddKey(TIntPr(Quote1.GetKey(), Quote2.GetKey()));
          EdgeCache.AddKey(TIntPr(Quote2.GetKey(), Quote1.GetKey()));
          RealCount++;
          AddEdgeIfSimilar(Quote1.GetKey(), Quote2.GetKey());
        }
      }
    }
    int Len = Bucket.Len() * (Bucket.Len() - 1) / 2;
    Count += Len;
  }
  fprintf(stderr, "NUMBER OF COMPARES: %d\n", Count);
  fprintf(stderr, "NUMBER OF REAL COMPARES: %d\n", RealCount);
}
Ejemplo n.º 3
0
int main(int argc, char *argv[]) {
  TStr BaseString = "/lfs/1/tmp/curis/week/QBDB.bin";
  TFIn BaseFile(BaseString);
  TQuoteBase *QB = new TQuoteBase;
  TDocBase *DB = new TDocBase;
  QB->Load(BaseFile);
  DB->Load(BaseFile);

  TIntV QuoteIds;
  QB->GetAllQuoteIds(QuoteIds);

  int NumQuotes = QuoteIds.Len();
  THash<TInt, TStrSet> PeakCounts;
  for (int i = 0; i < NumQuotes; i++) {
    TQuote CurQuote;
    if (QB->GetQuote(QuoteIds[i], CurQuote)) {
      TVec<TSecTm> Peaks;
      CurQuote.GetPeaks(DB, Peaks);
      TStr QuoteString;
      CurQuote.GetParsedContentString(QuoteString);
      TStrSet StringSet;
      if (PeakCounts.IsKey(Peaks.Len())) {
        StringSet = PeakCounts.GetDat(Peaks.Len());
      }
      StringSet.AddKey(QuoteString);
      PeakCounts.AddDat(Peaks.Len(), StringSet);
    }
  }

  TIntV PeakCountKeys;
  PeakCounts.GetKeyV(PeakCountKeys);
  PeakCountKeys.Sort(true);
  for (int i = 0; i < PeakCountKeys.Len(); i++) {
    TStrSet CurSet = PeakCounts.GetDat(PeakCountKeys[i]);
    if (CurSet.Len() > 0) {
      printf("QUOTES WITH %d PEAKS\n", PeakCountKeys[i].Val);
      printf("#########################################\n");
      THashSet<TStr> StringSet = PeakCounts.GetDat(PeakCountKeys[i]);
      for (THashSet<TStr>::TIter l = StringSet.BegI(); l < StringSet.EndI(); l++) {
        printf("%s\n", l.GetKey().CStr());
      }
      printf("\n");
    }
  }
  delete QB;
  delete DB;
  return 0;
}
Ejemplo n.º 4
0
void LSH::WordHashing(TQuoteBase *QuoteBase,
    THash<TMd5Sig, TIntSet>& ShingleToQuoteIds) {
  fprintf(stderr, "Hashing shingles using words...\n");
  TIntV QuoteIds;
  QuoteBase->GetAllQuoteIds(QuoteIds);

  THash<TStr, TIntSet> Temp;

  for (int qt = 0; qt < QuoteIds.Len(); qt++) {
    if (qt % 1000 == 0) {
      fprintf(stderr, "%d out of %d completed\n", qt, QuoteIds.Len());
    }
    TQuote Q;
    QuoteBase->GetQuote(QuoteIds[qt], Q);

    TStrV Content;
    Q.GetParsedContent(Content);

    int ContentLen = Content.Len();
    for (int i = 0; i < ContentLen; i++) {
      const TMd5Sig ShingleMd5(Content[i]);
      TIntSet ShingleQuoteIds;
      ShingleToQuoteIds.IsKeyGetDat(ShingleMd5, ShingleQuoteIds);
      ShingleQuoteIds.AddKey(QuoteIds[qt]);
      ShingleToQuoteIds.AddDat(ShingleMd5, ShingleQuoteIds);

      ///// COMMENT OUT LATER
      TIntSet TempSet;
      Temp.IsKeyGetDat(Content[i], TempSet);
      TempSet.AddKey(QuoteIds[qt]);
      Temp.AddDat(Content[i], TempSet);
    }
  }

  TVec<TStr> ShingleKeys;
  Temp.GetKeyV(ShingleKeys);
  ShingleKeys.SortCmp(TCmpSetByLen(false, &Temp));
  for (int i = 0; i < 100; i++) {
    TIntSet TempSet = Temp.GetDat(ShingleKeys[i]);
    Err("%d: %s - %d \n", i, ShingleKeys[i].CStr(), TempSet.Len());
  }

  Err("Done with word hashing!\n");
}