예제 #1
0
/// Buckets every quote under the signature of each character shingle it contains.
///
/// For each quote id returned by QuoteBase->GetAllQuoteIds(), the parsed content
/// string is scanned with a sliding window of ShingleLen characters, and the quote
/// id is added to the set stored under the TMd5Sig of each window.
///
/// @param QuoteBase          Source of the quote ids and quote contents.
/// @param ShingleLen         Number of characters per shingle.
/// @param ShingleToQuoteIds  In/out map from shingle signature to the ids of the
///                           quotes containing that shingle; existing entries are
///                           extended, missing ones are created.
void LSH::ElCheapoHashing(TQuoteBase *QuoteBase, TInt ShingleLen,
    THash<TMd5Sig, TIntSet>& ShingleToQuoteIds) {
  fprintf(stderr, "Hashing shingles the el cheapo way...\n");
  TIntV QuoteIds;
  QuoteBase->GetAllQuoteIds(QuoteIds);
  for (int qt = 0; qt < QuoteIds.Len(); qt++) {
    if (qt % 1000 == 0) {
      fprintf(stderr, "%d out of %d completed\n", qt, QuoteIds.Len());
    }
    TQuote Q;
    QuoteBase->GetQuote(QuoteIds[qt], Q);

    // Put x-character shingles into the hash table; x is specified by the ShingleLen parameter
    TStr QContentStr;
    Q.GetParsedContentString(QContentStr);
    TChA QContentChA = TChA(QContentStr);

    // Number of window start positions; zero or negative when the content is
    // shorter than one shingle, in which case nothing is hashed for this quote.
    const int NumShingles = QContentChA.Len() - ShingleLen + 1;
    for (int i = 0; i < NumShingles; i++) {
      TChA ShingleChA;
      for (int j = 0; j < ShingleLen; j++) {
        ShingleChA.AddCh(QContentChA.GetCh(i + j));
      }
      const TStr Shingle = TStr(ShingleChA);
      const TMd5Sig ShingleMd5(Shingle);

      // AddDat(Key) creates an empty set on first sight and returns a reference
      // to the stored set, so the id is inserted in place instead of copying the
      // whole set out of the table and back in for every shingle.
      ShingleToQuoteIds.AddDat(ShingleMd5).AddKey(QuoteIds[qt]);
    }
  }
  Err("Done with el cheapo hashing!\n");
}
예제 #2
0
/// For every quote, add it to the corresponding bucket for each hashed x-character shingle of the quote
/// (shingles by characters).
///
/// Quotes for which CB->IsQuoteInArchivedCluster() is true are skipped. For the
/// remaining quotes the parsed content string is scanned with a sliding window of
/// ShingleLen characters. Under the TMd5Sig of each window the function records
/// TShingleId(quote id, word index) for the current word index and for the
/// preceding indices inside the WordWindow range (never below 0).
///
/// @param QuoteBase          Source of the quote ids and quote contents.
/// @param CB                 Cluster base consulted to skip archived quotes.
/// @param ShingleLen         Number of characters per shingle.
/// @param ShingleToQuoteIds  In/out map from shingle signature to shingle ids;
///                           existing entries are extended, missing ones created.
void LSH::HashShingles(TQuoteBase *QuoteBase, TClusterBase *CB, TInt ShingleLen,
    THash<TMd5Sig, TShingleIdSet>& ShingleToQuoteIds) {
  Err("Hashing shingles...\n");
  TIntV QuoteIds;
  QuoteBase->GetAllQuoteIds(QuoteIds);
  for (int qt = 0; qt < QuoteIds.Len(); qt++) {
    if (qt % 1000 == 0) {
      fprintf(stderr, "%d out of %d completed\n", qt, QuoteIds.Len());
    }

    // Quotes that already sit in an archived cluster are not hashed.
    if (CB->IsQuoteInArchivedCluster(QuoteIds[qt]))
      continue;
    TQuote Q;
    QuoteBase->GetQuote(QuoteIds[qt], Q);

    // Put x-character shingles into the hash table; x is specified by the ShingleLen parameter
    TStr QContentStr;
    Q.GetParsedContentString(QContentStr);
    TChA QContentChA = TChA(QContentStr);

    // Index of the word the end of the current window has reached.
    int CurWord = 0;

    // Number of window start positions; zero or negative when the content is
    // shorter than one shingle, in which case nothing is hashed for this quote.
    const int NumShingles = QContentChA.Len() - ShingleLen + 1;
    for (int i = 0; i < NumShingles; i++) {
      TChA ShingleChA;
      for (int j = 0; j < ShingleLen; j++) {
        ShingleChA.AddCh(QContentChA.GetCh(i + j));
      }
      const TStr Shingle = TStr(ShingleChA);
      const TMd5Sig ShingleMd5(Shingle);

      // AddDat(Key) creates an empty set on first sight and returns a reference
      // to the stored set, so ids are inserted in place instead of copying the
      // whole set out of the table and back in for every shingle. The reference
      // stays valid here because nothing else is inserted into the table while
      // it is in use.
      TShingleIdSet& ShingleQuoteIds = ShingleToQuoteIds.AddDat(ShingleMd5);
      for (int j = CurWord; j > CurWord - WordWindow && j >= 0; j--) {
        ShingleQuoteIds.AddKey(TShingleId(QuoteIds[qt], j));
      }

      // up the current word index if we see a space
      if (QContentChA.GetCh(i + ShingleLen - 1) == ' ') {
        CurWord++;
      }
    }
  }
  Err("Done hashing!\n");
}
예제 #3
0
int main(int argc, char *argv[]) {
  TDocBase *DocBase = new TDocBase;
  TChA Url = TChA("http://www.newyorktimes.com/news_story");
  TSecTm Date = TSecTm::GetCurTm();
  TChA Content = TChA("foo bar foo foo");
  TVec<TChA> Links;
  Links.Add(TChA("http://www.google.com"));
  Links.Add(TChA("http://www.yahoo.com"));
  DocBase->AddDoc(Url, Date, Content, Links);

  printf("Number of documents: %d\n", DocBase->Len());
  
  TDoc t;
  DocBase->GetDoc(0, t);
  TStr tUrl;
  t.GetUrl(tUrl);
  printf("URL: %s\n", tUrl.CStr());
  
  TStrV l;
  t.GetLinks(l);
  printf("Link1: %s\n", l[0].CStr());
  printf("Link2: %s\n", l[1].CStr());

  { TFOut FOut("tmp.bin"); DocBase->Save(FOut); }
  printf("Save data successfully\n");

  delete DocBase;

  TFIn FIn("tmp.bin");
  printf("Load data successfully\n");
  TDocBase *DocBase2 = new TDocBase;
  DocBase2->Load(FIn);

  printf("Number of documents: %d\n", DocBase2->Len());

  TDoc t2;
  DocBase2->GetDoc(0, t2);
  TStr t2Url;
  t2.GetUrl(t2Url);
  printf("URL: %s\n", t2Url.CStr());

  t2.GetLinks(l);
  printf("Link1: %s\n", l[0].CStr());
  printf("Link2: %s\n", l[1].CStr());

  delete DocBase2;
  return 0;
}
예제 #4
0
파일: util.cpp 프로젝트: pikma/Snap
// Space separated sequence of words (keeps all non-blank characters, i.e., punctuation).
// Everything before the first alphanumeric character is dropped, each run of
// whitespace becomes a single space and trailing whitespace is removed.
// Character case is left unchanged. Returns an empty TChA when the input
// contains no alphanumeric character.
TChA TStrUtil::GetCleanStr(const TChA& ChA) {
    const char *Cur = ChA.CStr();
    // Advance to the first alphanumeric character.
    for (; *Cur != 0; ++Cur) {
        if (TCh::IsAlNum(*Cur)) {
            break;
        }
    }
    if (*Cur == 0) {
        return TChA();
    }
    TChA OutChA(ChA.Len());
    // A separator is owed for the whitespace run just skipped; it is only
    // written once another non-blank character follows.
    bool PendingSpace = false;
    for (; *Cur != 0; ++Cur) {
        if (TCh::IsWs(*Cur)) {
            PendingSpace = true;
            continue;
        }
        if (PendingSpace) {
            OutChA.AddCh(' ');
            PendingSpace = false;
        }
        OutChA.AddCh(*Cur);
    }
    return OutChA;
}
예제 #5
0
파일: util.cpp 프로젝트: pikma/Snap
// Space separated sequence of words, remove all punctuation, etc.
// A word is a run of alphanumeric characters; an apostrophe or hyphen is kept
// inside a word only when an alphanumeric character directly follows it.
// Everything between words is replaced by a single space and the result is
// lower-cased. Returns an empty TChA when the input contains no alphanumeric
// character. The input string is only read, never modified.
TChA TStrUtil::GetCleanWrdStr(const TChA& ChA) {
    const char *b = ChA.CStr();
    while (*b && ! TCh::IsAlNum(*b)) {
        b++;
    }
    if (*b == 0) {
        return TChA();
    }
    TChA OutChA(ChA.Len());
    const char *e = b;
    while (*e) {
        // [b, e) becomes the next word.
        b = e;
        while (*e && (TCh::IsAlNum(*e) || ((*e=='\'' || *e=='-') && TCh::IsAlNum(*(e+1))))) {
            e++;
        }
        if (b < e) {
            // Copy the word character by character rather than temporarily
            // NUL-terminating it inside the caller's (const) buffer.
            for (const char *c = b; c < e; c++) {
                OutChA.AddCh(*c);
            }
            OutChA.AddCh(' ');
        }
        // Skip the separator characters up to the next word (or the end).
        while (*e && ! TCh::IsAlNum(*e)) {
            e++;
        }
    }
    // At least one word was written (the scan started on an alphanumeric
    // character), so the last character is the trailing separator.
    OutChA.DelLastCh();
    OutChA.ToLc();
    return OutChA;
}
예제 #6
0
파일: util.cpp 프로젝트: pikma/Snap
// remove ending /, /index.html, etc. and strip starting www.
bool TStrUtil::GetNormalizedUrl(const TChA& UrlIn, const TChA& BaseUrl, TChA& UrlOut) {
    UrlOut = UrlIn;
    if (StripEnd(UrlIn, "/", UrlOut)) {}
    else if (StripEnd(UrlIn, "/index.html", UrlOut)) {}
    else if (StripEnd(UrlIn, "/index.htm", UrlOut)) {}
    else if (StripEnd(UrlIn, "/index.php", UrlOut)) {}
    if (! (UrlOut.IsPrefix("http://") || UrlOut.IsPrefix("ftp://"))) {
        // if UrlIn is relative url, try combine it with BaseUrl
        if (UrlIn.Empty() || ! (BaseUrl.IsPrefix("http://") || BaseUrl.IsPrefix("ftp://"))) {
            //printf("** Bad URL: base:'%s' url:'%s'\n", BaseUrl.CStr(), UrlIn.CStr());
            return false;
        }
        TChA Out;
        if (! GetNormalizedUrl(BaseUrl, TChA(), Out)) {
            return false;
        }
        if (UrlIn[0] != '/') {
            Out.AddCh('/');
        }
        Out += UrlOut;
        UrlOut = Out;
    }
    // http://www. --> http://
    if (UrlOut.IsPrefix("http://www.")) {
        TStr prefix("http://");
        UrlOut = prefix + UrlOut.GetSubStr(11, TInt::Mx);
    }
    UrlOut.ToLc();
    return true;
}
예제 #7
0
// http://www.ijs.si/fdfd/blah.html --> www.ijs.si
// Returns the lower-cased host part of a URL: the text between an optional
// leading "<scheme>://" and the next '/' (or the end of the string). The scheme
// is located by its "://" separator instead of assuming the 7-character
// "http://" prefix, so https://, ftp:// and scheme-less URLs are handled alike.
// A scheme-less input that starts with '/' (or is empty) has no host and is
// returned whole, lower-cased; a scheme with nothing after it gives an empty
// string.
TChA TStrUtil::GetDomNm(const TChA& UrlChA) {
  const int UrlLen = UrlChA.Len();
  const int FirstSlash = UrlChA.SearchCh('/', 0);
  // A scheme is present only when the very first '/' is part of "://"; a later
  // "://" (e.g. inside a query string) must not be mistaken for one.
  int HostBeg = 0;
  if (FirstSlash > 0 && FirstSlash+1 < UrlLen &&
      UrlChA[FirstSlash-1] == ':' && UrlChA[FirstSlash+1] == '/') {
    HostBeg = FirstSlash+2;
  }
  // One past the last host character.
  int HostEnd = UrlChA.SearchCh('/', HostBeg);
  if (HostEnd < 0) { HostEnd = UrlLen; }
  if (HostEnd <= HostBeg) {
    if (HostBeg > 0) { return TChA(); }
    else { return TChA(UrlChA).ToLc(); }
  }
  return UrlChA.GetSubStr(HostBeg, HostEnd-1).ToLc();
}
예제 #8
0
// Each TStr constructor form must produce the expected string content.
TEST(TStr, Constructors) {
	// Default construction.
	TStr FromNothing;
	// C string literal.
	TStr FromLiteral("abc");
	// Single character.
	TStr FromChar('a');
	// Copy of an existing TStr.
	TStr FromCopy(FromLiteral);
	// Temporary TStr.
	TStr FromTemporary(TStr("abc"));
	// TChA source.
	TStr FromChA(TChA("abc"));
	// TSStr source.
	TStr FromSStr(TSStr("abc"));

	EXPECT_EQ(FromNothing, "");
	EXPECT_EQ(FromLiteral, "abc");
	EXPECT_EQ(FromChar, "a");
	EXPECT_EQ(FromCopy, "abc");
	EXPECT_EQ(FromTemporary, "abc");
	EXPECT_EQ(FromChA, "abc");
	EXPECT_EQ(FromSStr, "abc");

	// A null pointer argument is expected to compare equal to the empty string.
	EXPECT_EQ(TStr(nullptr), "");
}