/// Runs AddEdgeIfSimilar() on every pair of quotes that share a shingle bucket,
/// visiting each unordered pair at most once across all buckets.
///
/// Progress is logged through Err() every 100 shingles; the total number of
/// bucket pairs and the number of pairs actually compared are written to stderr
/// at the end.
///
/// @param Shingles  Map from shingle signature to the set of quote ids in that bucket.
void QuoteGraph::CompareUsingShingles(THash<TMd5Sig, TIntSet>& Shingles) {
  // 64-bit on purpose: a bucket of n quotes contributes n*(n-1)/2 pairs, which
  // no longer fits in a 32-bit int once n exceeds roughly 46k.
  long long Count = 0;
  int RealCount = 0;

  TVec<TMd5Sig> ShingleKeys;
  Shingles.GetKeyV(ShingleKeys);

  // Pairs already handed to AddEdgeIfSimilar(), stored as (smaller id, larger id)
  // so that a single entry covers both orientations of the pair.
  THashSet<TIntPr> EdgeCache;

  for (int i = 0; i < ShingleKeys.Len(); i++) {
    if (i % 100 == 0) {
      Err("Processed %d out of %d shingles, count = %lld\n", i, ShingleKeys.Len(), Count);
    }

    // The key was just obtained from Shingles, so GetDat() is safe; binding by
    // reference avoids copying the whole bucket.
    const TIntSet& Bucket = Shingles.GetDat(ShingleKeys[i]);

    for (TIntSet::TIter Quote1 = Bucket.BegI(); Quote1 < Bucket.EndI(); Quote1++) {
      TIntSet::TIter Quote2 = Quote1;
      Quote2++;
      for (; Quote2 < Bucket.EndI(); Quote2++) {
        const TInt& Id1 = Quote1.GetKey();
        const TInt& Id2 = Quote2.GetKey();
        const TIntPr Edge = (Id1 < Id2) ? TIntPr(Id1, Id2) : TIntPr(Id2, Id1);
        if (EdgeCache.IsKey(Edge)) {
          continue;  // this pair was already compared via another shingle
        }
        EdgeCache.AddKey(Edge);
        RealCount++;
        AddEdgeIfSimilar(Quote1.GetKey(), Quote2.GetKey());
      }
    }

    const long long BucketLen = Bucket.Len();
    Count += BucketLen * (BucketLen - 1) / 2;
  }

  fprintf(stderr, "NUMBER OF COMPARES: %lld\n", Count);
  fprintf(stderr, "NUMBER OF REAL COMPARES: %d\n", RealCount);
}
/// Buckets quotes by the individual entries of their parsed content.
///
/// For every quote in QuoteBase, each entry returned by GetParsedContent() is
/// hashed to a TMd5Sig and the quote id is added to that signature's set in
/// ShingleToQuoteIds. Existing entries in ShingleToQuoteIds are extended, not
/// replaced. Progress is written to stderr every 1000 quotes.
///
/// @param QuoteBase          Source of quote ids and quotes; dereferenced unconditionally.
/// @param ShingleToQuoteIds  Output map from shingle signature to the ids of quotes containing it.
void LSH::WordHashing(TQuoteBase *QuoteBase, THash<TMd5Sig, TIntSet>& ShingleToQuoteIds) {
  fprintf(stderr, "Hashing shingles using words...\n");

  TIntV QuoteIds;
  QuoteBase->GetAllQuoteIds(QuoteIds);

  // Debug-only mirror of the buckets keyed by the raw string, used for the
  // "largest buckets" dump below. (Original note: COMMENT OUT LATER.)
  THash<TStr, TIntSet> Temp;

  for (int qt = 0; qt < QuoteIds.Len(); qt++) {
    if (qt % 1000 == 0) {
      fprintf(stderr, "%d out of %d completed\n", qt, QuoteIds.Len());
    }

    TQuote Q;
    QuoteBase->GetQuote(QuoteIds[qt], Q);
    TStrV Content;
    Q.GetParsedContent(Content);

    const int ContentLen = Content.Len();
    for (int i = 0; i < ContentLen; i++) {
      const TMd5Sig ShingleMd5(Content[i]);
      // AddDat(Key) returns a reference to the stored set (creating an empty one
      // if needed), so the id is inserted in place instead of copying the set
      // out of the hash and back in.
      ShingleToQuoteIds.AddDat(ShingleMd5).AddKey(QuoteIds[qt]);
      Temp.AddDat(Content[i]).AddKey(QuoteIds[qt]);
    }
  }

  TVec<TStr> ShingleKeys;
  Temp.GetKeyV(ShingleKeys);
  ShingleKeys.SortCmp(TCmpSetByLen(false, &Temp));

  // Dump at most the first 100 keys after sorting; clamp so that inputs with
  // fewer than 100 distinct entries do not index past the end of ShingleKeys.
  const int NumToPrint = ShingleKeys.Len() < 100 ? ShingleKeys.Len() : 100;
  for (int i = 0; i < NumToPrint; i++) {
    const TIntSet& TempSet = Temp.GetDat(ShingleKeys[i]);
    Err("%d: %s - %d \n", i, ShingleKeys[i].CStr(), TempSet.Len());
  }
  Err("Done with word hashing!\n");
}
/// Returns the TSwSet for the given type, constructing it on first request.
///
/// Instances are kept in a function-local static table keyed by the integer
/// value of SwSetType, so repeated calls with the same type return the same
/// PSwSet that was stored on the first call.
///
/// @param SwSetType  Which set to fetch.
/// @return The cached (or newly created and cached) set.
PSwSet TSwSet::GetSwSet(const TSwSetType& SwSetType) {
  static THash<TInt, PSwSet> SwSetTypeToSwSetH;

  const TInt TypeKey(static_cast<int>(SwSetType));
  PSwSet Cached;
  if (!SwSetTypeToSwSetH.IsKeyGetDat(TypeKey, Cached)) {
    // First request for this type: build the set and remember it.
    Cached = TSwSet::New(SwSetType);
    SwSetTypeToSwSetH.AddDat(TypeKey, Cached);
  }
  return Cached;
}
/// Builds the HTML snippet describing how a cluster's rank changed.
///
/// If ClusterId has no entry in OldRankings the snippet is "new!". Otherwise
/// the change is (old rank - CurRank): a positive value is rendered in green
/// with a leading '+', a negative value in red, and zero as a plain bold 0.
///
/// @param OldRankings  Map from cluster id to its previous rank.
/// @param ClusterId    Cluster to look up.
/// @param CurRank      The cluster's current rank.
/// @param OldRankStr   Output: receives the generated snippet.
void LogOutput::ComputeOldRankString(THash<TInt, TInt>& OldRankings, TInt& ClusterId, TInt CurRank, TStr& OldRankStr) {
  TInt PrevRank;
  if (!OldRankings.IsKeyGetDat(ClusterId, PrevRank)) {
    // No previous rank recorded for this cluster.
    OldRankStr = "<center>new!</center>";
    return;
  }

  TInt RankDelta = PrevRank - CurRank;
  if (RankDelta > 0) {
    OldRankStr = "<b><center><font color=\\\"green\\\">+" + RankDelta.GetStr() + "</font></center></b>";
  } else if (RankDelta < 0) {
    OldRankStr = "<b><center><font color=\\\"red\\\">" + RankDelta.GetStr() + "</font></center></b>";
  } else {
    OldRankStr = "<b><center>0</center></b>";
  }
}
/// Appends NumSignatures permutation-based values to each shingle's signature.
///
/// For each of NumSignatures rounds, the shingle keys are listed afresh and
/// shuffled with a default-constructed TRnd; each shingle then gets its
/// position in that round's shuffled order appended to its vector in
/// Signatures. Vectors already present in Signatures are extended, not reset.
/// Does nothing when NumSignatures < 1.
///
/// @param Shingles       Set of shingle signatures to process.
/// @param Signatures     Output map from shingle to its list of per-round positions.
/// @param NumSignatures  Number of rounds, i.e. values appended per shingle.
void LSH::ComputeSignatures(THashSet<TMd5Sig>& Shingles, THash<TMd5Sig, TIntV>& Signatures, int NumSignatures) {
  if (NumSignatures < 1) return;

  TRnd RandomGenerator; // TODO: make this "more random" by incorporating time
  const int NumShingles = Shingles.Len();

  for (int i = 0; i < NumSignatures; ++i) {
    // Create new signature: start from the unshuffled key order every round so
    // the sequence of permutations is the same as before this rewrite.
    TVec<TMd5Sig> Shuffle;
    Shingles.GetKeyV(Shuffle);
    Shuffle.Shuffle(RandomGenerator);

    for (int j = 0; j < NumShingles; j++) {
      // AddDat(Key) returns a reference to the stored vector (creating an empty
      // one if needed); appending in place avoids copying the whole vector out
      // of the hash and back for every value.
      Signatures.AddDat(Shuffle[j]).Add(j);
    }
  }
  Err("Computed %d signatures!\n", NumSignatures);
}
int main(int argc, char *argv[]) { // #### SETUP: Parse Arguments LogOutput Log; THash<TStr, TStr> Arguments; ArgumentParser::ParseArguments(argc, argv, Arguments, Log); TStr OutputDirectory; TStr StartString = ArgumentParser::GetArgument(Arguments, "start", "2009-02-01"); TStr QBDBDirectory = ArgumentParser::GetArgument(Arguments, "qbdb", QBDB_DIR_DEFAULT); TStr OutDirectory = ArgumentParser::GetArgument(Arguments, "out", "/lfs/1/tmp/curis/"); TInt WindowSize = ArgumentParser::GetArgument(Arguments, "window", "14").GetInt(); if (ArgumentParser::GetArgument(Arguments, "nolog", "") == "") { Log.DisableLogging(); } else if (!Arguments.IsKeyGetDat("directory", OutputDirectory)) { Log.SetupNewOutputDirectory(""); } else { Log.SetDirectory(OutputDirectory); } // #### DATA LOADING: Load ALL the things! TQuoteBase QB; TDocBase DB; fprintf(stderr, "Loading QB and DB from file for %d days, starting from %s...\n", WindowSize.Val, StartString.CStr()); Err("%s\n", QBDBDirectory.CStr()); TSecTm PresentTime = TDataLoader::LoadQBDBByWindow(QBDBDirectory, StartString, WindowSize, QB, DB); fprintf(stderr, "QBDB successfully loaded!\n"); TVec<TSecTm> PubTmV; TVec<TStr> PostUrlV; TVec<TStr> QuoteV; fprintf(stderr, "Dumping quotes to file...\n"); TIntV QuoteIds; QB.GetAllQuoteIds(QuoteIds); for (int i = 0; i < QuoteIds.Len(); i++) { TQuote Q; QB.GetQuote(QuoteIds[i], Q); TStr QContentString; Q.GetContentString(QContentString); TVec<TUInt> Sources; Q.GetSources(Sources); for (int j = 0; j < Sources.Len(); j++) { TDoc D; DB.GetDoc(Sources[j], D); TStr PostUrl; D.GetUrl(PostUrl); TSecTm PostTime = D.GetDate(); QuoteV.Add(QContentString); PubTmV.Add(PostTime); PostUrlV.Add(PostUrl); } } TFOut FOut(OutDirectory + "QuoteList" + ".bin"); PubTmV.Save(FOut); PostUrlV.Save(FOut); QuoteV.Save(FOut); fprintf(stderr, "Done!\n"); return 0; }