/// For every quote, add it to corresponding bucket for each hashed x-character shingle of the quote // (Shingles by characters) void LSH::HashShingles(TQuoteBase *QuoteBase, TClusterBase *CB, TInt ShingleLen, THash<TMd5Sig, TShingleIdSet>& ShingleToQuoteIds) { Err("Hashing shingles...\n"); TIntV QuoteIds; QuoteBase->GetAllQuoteIds(QuoteIds); for (int qt = 0; qt < QuoteIds.Len(); qt++) { if (qt % 1000 == 0) { fprintf(stderr, "%d out of %d completed\n", qt, QuoteIds.Len()); } if (CB->IsQuoteInArchivedCluster(QuoteIds[qt])) continue; TQuote Q; QuoteBase->GetQuote(QuoteIds[qt], Q); // Put x-character (or x-word) shingles into hash table; x is specified by ShingleLen parameter TStr QContentStr; Q.GetParsedContentString(QContentStr); TChA QContentChA = TChA(QContentStr); int CurWord = 0; for (int i = 0; i < QContentChA.Len() - ShingleLen + 1; i++) { TChA ShingleChA = TChA(); for (int j = 0; j < ShingleLen; j++) { ShingleChA.AddCh(QContentChA.GetCh(i + j)); } TStr Shingle = TStr(ShingleChA); const TMd5Sig ShingleMd5(Shingle); TShingleIdSet ShingleQuoteIds; if (ShingleToQuoteIds.IsKey(ShingleMd5)) { ShingleQuoteIds = ShingleToQuoteIds.GetDat(ShingleMd5); } for (int j = CurWord; j > CurWord - WordWindow && j >= 0; j--) { ShingleQuoteIds.AddKey(TShingleId(QuoteIds[qt], j)); } ShingleToQuoteIds.AddDat(ShingleMd5, ShingleQuoteIds); // up the current word index if we see a space if (QContentChA.GetCh(i + ShingleLen - 1) == ' ') { CurWord++; } } } Err("Done hashing!\n"); }
void LSH::ElCheapoHashing(TQuoteBase *QuoteBase, TInt ShingleLen, THash<TMd5Sig, TIntSet>& ShingleToQuoteIds) { fprintf(stderr, "Hashing shingles the el cheapo way...\n"); TIntV QuoteIds; QuoteBase->GetAllQuoteIds(QuoteIds); for (int qt = 0; qt < QuoteIds.Len(); qt++) { if (qt % 1000 == 0) { fprintf(stderr, "%d out of %d completed\n", qt, QuoteIds.Len()); } TQuote Q; QuoteBase->GetQuote(QuoteIds[qt], Q); // Put x-character (or x-word) shingles into hash table; x is specified by ShingleLen parameter TStr QContentStr; Q.GetParsedContentString(QContentStr); TChA QContentChA = TChA(QContentStr); for (int i = 0; i < QContentChA.Len() - ShingleLen + 1; i++) { TChA ShingleChA = TChA(); for (int j = 0; j < ShingleLen; j++) { ShingleChA.AddCh(QContentChA.GetCh(i + j)); } TStr Shingle = TStr(ShingleChA); const TMd5Sig ShingleMd5(Shingle); TIntSet ShingleQuoteIds; if (ShingleToQuoteIds.IsKey(ShingleMd5)) { ShingleQuoteIds = ShingleToQuoteIds.GetDat(ShingleMd5); } ShingleQuoteIds.AddKey(QuoteIds[qt]); ShingleToQuoteIds.AddDat(ShingleMd5, ShingleQuoteIds); } } Err("Done with el cheapo hashing!\n"); }