// YES I COPIED AND PASTED CODE my section leader would be so ashamed :D void LSH::MinHash(THash<TMd5Sig, TIntSet>& ShingleToQuoteIds, TVec<THash<TIntV, TIntSet> >& SignatureBandBuckets) { TRnd RandomGenerator; // TODO: make this "more random" by incorporating time for (int i = 0; i < NumBands; ++i) { THash < TInt, TIntV > Inverted; // (QuoteID, QuoteSignatureForBand) THash < TIntV, TIntSet > BandBuckets; // (BandSignature, QuoteIDs) for (int j = 0; j < BandSize; ++j) { // Create new signature TVec < TMd5Sig > Signature; ShingleToQuoteIds.GetKeyV(Signature); Signature.Shuffle(RandomGenerator); // Place in bucket - not very efficient int SigLen = Signature.Len(); for (int k = 0; k < SigLen; ++k) { TIntSet CurSet = ShingleToQuoteIds.GetDat(Signature[k]); for (TIntSet::TIter l = CurSet.BegI(); l < CurSet.EndI(); l++) { TInt Key = l.GetKey(); if (Inverted.IsKey(Key)) { TIntV CurSignature = Inverted.GetDat(Key); if (CurSignature.Len() <= j) { CurSignature.Add(k); Inverted.AddDat(Key, CurSignature); } } else { TIntV NewSignature; NewSignature.Add(k); Inverted.AddDat(Key, NewSignature); } } } } TIntV InvertedKeys; Inverted.GetKeyV(InvertedKeys); TInt InvertedLen = InvertedKeys.Len(); for (int k = 0; k < InvertedLen; ++k) { TIntSet Bucket; TIntV Signature = Inverted.GetDat(InvertedKeys[k]); if (BandBuckets.IsKey(Signature)) { Bucket = BandBuckets.GetDat(Signature); } Bucket.AddKey(InvertedKeys[k]); BandBuckets.AddDat(Signature, Bucket); } SignatureBandBuckets.Add(BandBuckets); Err("%d out of %d band signatures computed\n", i + 1, NumBands); } Err("Minhash step complete!\n"); }
/// Runs AddEdgeIfSimilar once for every unordered pair of quotes that share at
/// least one shingle bucket, and reports comparison counts on stderr.
///
/// @param Shingles shingle hash -> ids of the quotes containing that shingle.
///
/// "NUMBER OF COMPARES" is the sum of n*(n-1)/2 over all buckets (pairs counted
/// once per shared bucket); "NUMBER OF REAL COMPARES" is the number of distinct
/// pairs actually handed to AddEdgeIfSimilar.
void QuoteGraph::CompareUsingShingles(THash<TMd5Sig, TIntSet>& Shingles) {
  // 64-bit: n*(n-1)/2 exceeds a 32-bit int once a bucket holds ~65k quotes.
  long long Count = 0;
  int RealCount = 0;

  TVec<TMd5Sig> ShingleKeys;
  Shingles.GetKeyV(ShingleKeys);
  const int NumShingles = ShingleKeys.Len();

  // Pairs already compared, stored once as (smaller id, larger id).
  THashSet<TIntPr> EdgeCache;

  for (int i = 0; i < NumShingles; i++) {
    if (i % 100 == 0) {
      Err("Processed %d out of %d shingles, count = %lld\n", i, NumShingles, Count);
    }

    // Deliberately a copy: AddEdgeIfSimilar is defined elsewhere, so iterate
    // over a snapshot rather than over storage owned by Shingles.
    TIntSet Bucket;
    Shingles.IsKeyGetDat(ShingleKeys[i], Bucket);

    for (TIntSet::TIter Quote1 = Bucket.BegI(); Quote1 < Bucket.EndI(); Quote1++) {
      TIntSet::TIter Quote2 = Quote1;
      Quote2++;
      for (; Quote2 < Bucket.EndI(); Quote2++) {
        const TInt& Id1 = Quote1.GetKey();
        const TInt& Id2 = Quote2.GetKey();
        // Normalise the pair so one cache entry covers both orientations.
        const TIntPr Edge = (Id1 < Id2) ? TIntPr(Id1, Id2) : TIntPr(Id2, Id1);
        if (!EdgeCache.IsKey(Edge)) {
          EdgeCache.AddKey(Edge);
          RealCount++;
          AddEdgeIfSimilar(Id1, Id2);
        }
      }
    }

    const long long BucketLen = Bucket.Len();
    Count += BucketLen * (BucketLen - 1) / 2;
  }

  fprintf(stderr, "NUMBER OF COMPARES: %lld\n", Count);
  fprintf(stderr, "NUMBER OF REAL COMPARES: %d\n", RealCount);
}
int main(int argc, char *argv[]) { TStr BaseString = "/lfs/1/tmp/curis/week/QBDB.bin"; TFIn BaseFile(BaseString); TQuoteBase *QB = new TQuoteBase; TDocBase *DB = new TDocBase; QB->Load(BaseFile); DB->Load(BaseFile); TIntV QuoteIds; QB->GetAllQuoteIds(QuoteIds); int NumQuotes = QuoteIds.Len(); THash<TInt, TStrSet> PeakCounts; for (int i = 0; i < NumQuotes; i++) { TQuote CurQuote; if (QB->GetQuote(QuoteIds[i], CurQuote)) { TVec<TSecTm> Peaks; CurQuote.GetPeaks(DB, Peaks); TStr QuoteString; CurQuote.GetParsedContentString(QuoteString); TStrSet StringSet; if (PeakCounts.IsKey(Peaks.Len())) { StringSet = PeakCounts.GetDat(Peaks.Len()); } StringSet.AddKey(QuoteString); PeakCounts.AddDat(Peaks.Len(), StringSet); } } TIntV PeakCountKeys; PeakCounts.GetKeyV(PeakCountKeys); PeakCountKeys.Sort(true); for (int i = 0; i < PeakCountKeys.Len(); i++) { TStrSet CurSet = PeakCounts.GetDat(PeakCountKeys[i]); if (CurSet.Len() > 0) { printf("QUOTES WITH %d PEAKS\n", PeakCountKeys[i].Val); printf("#########################################\n"); THashSet<TStr> StringSet = PeakCounts.GetDat(PeakCountKeys[i]); for (THashSet<TStr>::TIter l = StringSet.BegI(); l < StringSet.EndI(); l++) { printf("%s\n", l.GetKey().CStr()); } printf("\n"); } } delete QB; delete DB; return 0; }
/// Builds the shingle -> quote-id table, using each element of a quote's parsed
/// content as one shingle (keyed by its MD5 signature).
///
/// @param QuoteBase          source of the quote ids and quotes.
/// @param ShingleToQuoteIds  updated in place: for every content element of
///                           every quote, the quote id is added to the set
///                           stored under that element's MD5 signature.
///
/// Progress goes to stderr. As a debugging aid the first (at most 100) shingles
/// in TCmpSetByLen order are reported with the size of their posting set.
void LSH::WordHashing(TQuoteBase *QuoteBase, THash<TMd5Sig, TIntSet>& ShingleToQuoteIds) {
  fprintf(stderr, "Hashing shingles using words...\n");

  TIntV QuoteIds;
  QuoteBase->GetAllQuoteIds(QuoteIds);
  const int NumQuotes = QuoteIds.Len();

  // Same postings keyed by the plain string, kept only for the report below.
  THash<TStr, TIntSet> Temp;

  for (int qt = 0; qt < NumQuotes; qt++) {
    if (qt % 1000 == 0) {
      fprintf(stderr, "%d out of %d completed\n", qt, NumQuotes);
    }

    TQuote Q;
    QuoteBase->GetQuote(QuoteIds[qt], Q);
    TStrV Content;
    Q.GetParsedContent(Content);

    const int ContentLen = Content.Len();
    for (int i = 0; i < ContentLen; i++) {
      const TMd5Sig ShingleMd5(Content[i]);
      // AddDat(Key) yields the stored set (creating an empty one on first
      // use), so the id is inserted in place rather than copying the whole
      // posting set out and back for every occurrence.
      ShingleToQuoteIds.AddDat(ShingleMd5).AddKey(QuoteIds[qt]);

      ///// COMMENT OUT LATER
      Temp.AddDat(Content[i]).AddKey(QuoteIds[qt]);
    }
  }

  TVec<TStr> ShingleKeys;
  Temp.GetKeyV(ShingleKeys);
  // NOTE(review): TCmpSetByLen is defined elsewhere; presumably orders keys by
  // posting-set size, largest first - confirm.
  ShingleKeys.SortCmp(TCmpSetByLen(false, &Temp));

  // Never index past the end when fewer than 100 distinct shingles exist.
  const int NumToReport = ShingleKeys.Len() < 100 ? ShingleKeys.Len() : 100;
  for (int i = 0; i < NumToReport; i++) {
    const TIntSet& TempSet = Temp.GetDat(ShingleKeys[i]);
    Err("%d: %s - %d \n", i, ShingleKeys[i].CStr(), TempSet.Len());
  }
  Err("Done with word hashing!\n");
}