void LSH::ElCheapoHashing(TQuoteBase *QuoteBase, TInt ShingleLen, THash<TMd5Sig, TIntSet>& ShingleToQuoteIds) { fprintf(stderr, "Hashing shingles the el cheapo way...\n"); TIntV QuoteIds; QuoteBase->GetAllQuoteIds(QuoteIds); for (int qt = 0; qt < QuoteIds.Len(); qt++) { if (qt % 1000 == 0) { fprintf(stderr, "%d out of %d completed\n", qt, QuoteIds.Len()); } TQuote Q; QuoteBase->GetQuote(QuoteIds[qt], Q); // Put x-character (or x-word) shingles into hash table; x is specified by ShingleLen parameter TStr QContentStr; Q.GetParsedContentString(QContentStr); TChA QContentChA = TChA(QContentStr); for (int i = 0; i < QContentChA.Len() - ShingleLen + 1; i++) { TChA ShingleChA = TChA(); for (int j = 0; j < ShingleLen; j++) { ShingleChA.AddCh(QContentChA.GetCh(i + j)); } TStr Shingle = TStr(ShingleChA); const TMd5Sig ShingleMd5(Shingle); TIntSet ShingleQuoteIds; if (ShingleToQuoteIds.IsKey(ShingleMd5)) { ShingleQuoteIds = ShingleToQuoteIds.GetDat(ShingleMd5); } ShingleQuoteIds.AddKey(QuoteIds[qt]); ShingleToQuoteIds.AddDat(ShingleMd5, ShingleQuoteIds); } } Err("Done with el cheapo hashing!\n"); }
/// For every quote, add it to corresponding bucket for each hashed x-character shingle of the quote // (Shingles by characters) void LSH::HashShingles(TQuoteBase *QuoteBase, TClusterBase *CB, TInt ShingleLen, THash<TMd5Sig, TShingleIdSet>& ShingleToQuoteIds) { Err("Hashing shingles...\n"); TIntV QuoteIds; QuoteBase->GetAllQuoteIds(QuoteIds); for (int qt = 0; qt < QuoteIds.Len(); qt++) { if (qt % 1000 == 0) { fprintf(stderr, "%d out of %d completed\n", qt, QuoteIds.Len()); } if (CB->IsQuoteInArchivedCluster(QuoteIds[qt])) continue; TQuote Q; QuoteBase->GetQuote(QuoteIds[qt], Q); // Put x-character (or x-word) shingles into hash table; x is specified by ShingleLen parameter TStr QContentStr; Q.GetParsedContentString(QContentStr); TChA QContentChA = TChA(QContentStr); int CurWord = 0; for (int i = 0; i < QContentChA.Len() - ShingleLen + 1; i++) { TChA ShingleChA = TChA(); for (int j = 0; j < ShingleLen; j++) { ShingleChA.AddCh(QContentChA.GetCh(i + j)); } TStr Shingle = TStr(ShingleChA); const TMd5Sig ShingleMd5(Shingle); TShingleIdSet ShingleQuoteIds; if (ShingleToQuoteIds.IsKey(ShingleMd5)) { ShingleQuoteIds = ShingleToQuoteIds.GetDat(ShingleMd5); } for (int j = CurWord; j > CurWord - WordWindow && j >= 0; j--) { ShingleQuoteIds.AddKey(TShingleId(QuoteIds[qt], j)); } ShingleToQuoteIds.AddDat(ShingleMd5, ShingleQuoteIds); // up the current word index if we see a space if (QContentChA.GetCh(i + ShingleLen - 1) == ' ') { CurWord++; } } } Err("Done hashing!\n"); }
int main(int argc, char *argv[]) { TDocBase *DocBase = new TDocBase; TChA Url = TChA("http://www.newyorktimes.com/news_story"); TSecTm Date = TSecTm::GetCurTm(); TChA Content = TChA("foo bar foo foo"); TVec<TChA> Links; Links.Add(TChA("http://www.google.com")); Links.Add(TChA("http://www.yahoo.com")); DocBase->AddDoc(Url, Date, Content, Links); printf("Number of documents: %d\n", DocBase->Len()); TDoc t; DocBase->GetDoc(0, t); TStr tUrl; t.GetUrl(tUrl); printf("URL: %s\n", tUrl.CStr()); TStrV l; t.GetLinks(l); printf("Link1: %s\n", l[0].CStr()); printf("Link2: %s\n", l[1].CStr()); { TFOut FOut("tmp.bin"); DocBase->Save(FOut); } printf("Save data successfully\n"); delete DocBase; TFIn FIn("tmp.bin"); printf("Load data successfully\n"); TDocBase *DocBase2 = new TDocBase; DocBase2->Load(FIn); printf("Number of documents: %d\n", DocBase2->Len()); TDoc t2; DocBase2->GetDoc(0, t2); TStr t2Url; t2.GetUrl(t2Url); printf("URL: %s\n", t2Url.CStr()); t2.GetLinks(l); printf("Link1: %s\n", l[0].CStr()); printf("Link2: %s\n", l[1].CStr()); delete DocBase2; return 0; }
// space seprated sequence of words (includes all non-blank characters, i.e., punctuations) TChA TStrUtil::GetCleanStr(const TChA& ChA) { char *b = (char *) ChA.CStr(); while (*b && ! TCh::IsAlNum(*b)) { b++; } if (*b == 0) { return TChA(); } TChA OutChA(ChA.Len()); char *e = b; bool ws=false; while (*e) { while (*e && TCh::IsWs(*e)) { e++; ws=true; } if (! *e) { break; } if (ws) { OutChA.AddCh(' '); ws=false; } OutChA.AddCh(*e); e++; } //OutChA.ToLc(); return OutChA; }
// space separated sequence of words, remove all punctuations, etc. TChA TStrUtil::GetCleanWrdStr(const TChA& ChA) { char *b = (char *) ChA.CStr(); while (*b && ! TCh::IsAlNum(*b)) { b++; } if (*b == 0) { return TChA(); } TChA OutChA(ChA.Len()); char *e = b, tmp; while (*e) { b = e; while (*e && (TCh::IsAlNum(*e) || ((*e=='\'' || *e=='-') && TCh::IsAlNum(*(e+1))))) { e++; } if (b < e) { tmp = *e; *e=0; OutChA += b; OutChA.AddCh(' '); *e = tmp; } while (*e && ! TCh::IsAlNum(*e)) { e++; } if (! *e) { break; } } OutChA.DelLastCh(); OutChA.ToLc(); return OutChA; }
// remove ending /, /index.html, etc. and strip starting www. bool TStrUtil::GetNormalizedUrl(const TChA& UrlIn, const TChA& BaseUrl, TChA& UrlOut) { UrlOut = UrlIn; if (StripEnd(UrlIn, "/", UrlOut)) {} else if (StripEnd(UrlIn, "/index.html", UrlOut)) {} else if (StripEnd(UrlIn, "/index.htm", UrlOut)) {} else if (StripEnd(UrlIn, "/index.php", UrlOut)) {} if (! (UrlOut.IsPrefix("http://") || UrlOut.IsPrefix("ftp://"))) { // if UrlIn is relative url, try combine it with BaseUrl if (UrlIn.Empty() || ! (BaseUrl.IsPrefix("http://") || BaseUrl.IsPrefix("ftp://"))) { //printf("** Bad URL: base:'%s' url:'%s'\n", BaseUrl.CStr(), UrlIn.CStr()); return false; } TChA Out; if (! GetNormalizedUrl(BaseUrl, TChA(), Out)) { return false; } if (UrlIn[0] != '/') { Out.AddCh('/'); } Out += UrlOut; UrlOut = Out; } // http://www. --> http:// if (UrlOut.IsPrefix("http://www.")) { TStr prefix("http://"); UrlOut = prefix + UrlOut.GetSubStr(11, TInt::Mx); } UrlOut.ToLc(); return true; }
// http://www.ijs.si/fdfd/blah.html --> www.ijs.si TChA TStrUtil::GetDomNm(const TChA& UrlChA) { int EndSlash = UrlChA.SearchCh('/', 7)-1; // skip starting http:// if (EndSlash > 0) { const int BegSlash = UrlChA.SearchChBack('/', EndSlash); if (BegSlash > 0) { return UrlChA.GetSubStr(BegSlash+1, EndSlash).ToLc(); } else { return UrlChA.GetSubStr(0, UrlChA.SearchCh('/', 0)-1).ToLc(); } } else { if (UrlChA.IsPrefix("http://")) { return UrlChA.GetSubStr(7, UrlChA.Len()-1).ToLc(); } EndSlash = UrlChA.SearchCh('/', 0); if (EndSlash > 0) { return UrlChA.GetSubStr(0, EndSlash-1).ToLc(); } else { return TChA(UrlChA).ToLc(); } } }
// Checks that every TStr constructor exercised here yields the expected text.
// Each construction is placed directly above the expectation that checks it.
TEST(TStr, Constructors) {
  TStr Empty;
  TStr FromCStr("abc");
  TStr FromChar('a');
  TStr FromCopy(FromCStr);
  TStr FromTemp(TStr("abc"));
  TStr FromChA(TChA("abc"));
  TStr FromSStr(TSStr("abc"));

  // Default construction gives the empty string.
  EXPECT_EQ(Empty, "");
  // From a C string, a single character and a copy of another TStr.
  EXPECT_EQ(FromCStr, "abc");
  EXPECT_EQ(FromChar, "a");
  EXPECT_EQ(FromCopy, "abc");
  // From a temporary TStr.
  EXPECT_EQ(FromTemp, "abc");
  // From the other string types.
  EXPECT_EQ(FromChA, "abc");
  EXPECT_EQ(FromSStr, "abc");
  // A null pointer is expected to behave like the empty string.
  EXPECT_EQ(TStr(nullptr), "");
}