Exemple #1
0
TEST(TStr, GetLc) {
	TStr Mixedcase = "AbCd";
	TStr Lowercase = "abcd";	
	TStr Empty = "";
	EXPECT_EQ(Lowercase, Mixedcase.GetLc());
	EXPECT_EQ(Empty, Empty.GetLc());
}
Exemple #2
0
int TTmInfo::GetDayOfWeekN(const TStr& DayOfWeekNm, const TLoc& Loc){
  EnsureInit();
  int DayOfWeekN=-1;
  switch (Loc){
    case lUs: DayOfWeekN=UsDayOfWeekNmV.SearchForw(DayOfWeekNm.GetLc()); break;
    case lSi: DayOfWeekN=SiDayOfWeekNmV.SearchForw(DayOfWeekNm.GetLc()); break;
    default: Fail;
  }
  if (DayOfWeekN==-1){return -1;} else {return DayOfWeekN+1;}
}
Exemple #3
0
int TTmInfo::GetMonthN(const TStr& MonthNm, const TLoc& Loc){
  EnsureInit();
  int MonthN=-1;
  switch (Loc){
    case lUs: MonthN=UsMonthNmV.SearchForw(MonthNm.GetLc()); break;
    case lSi: MonthN=SiMonthNmV.SearchForw(MonthNm.GetLc()); break;
    default: Fail;
  }
  if (MonthN==-1){return -1;} else {return MonthN+1;}
}
bool GetBoolArg(const PXmlTok& QueryXml, const TStr& ArgNm, const bool DfVal)
{
	if (QueryXml.Empty()) return DfVal;
	TStr val = QueryXml->GetArgVal(ArgNm, "");
	if (val == "") return DfVal;
	val = val.GetLc();
	if (val == "true" || val == "1")
		return true;
	return false;
}
void TTokenizerHtml::GetTokens(const PSIn& SIn, TStrV& TokenV) const {
	THtmlLx HtmlLx(SIn, false);
    // traverse html string symbols
	while (HtmlLx.Sym!=hsyEof){
		if (HtmlLx.Sym==hsyStr){
			TStr UcStr = HtmlLx.UcChA;
			// check if stop word
			if ((SwSet.Empty()) || (!SwSet->IsIn(UcStr))) {
				TStr TokenStr = ToUcP ? UcStr : TStr(HtmlLx.ChA);
				if (!Stemmer.Empty()) { 
					TokenStr = Stemmer->GetStem(TokenStr); }
				TokenV.Add(TokenStr.GetLc());
			}
		}
		// get next symbol
		HtmlLx.GetSym();
	}

 //   // tokenize
 //   TStrV TokenStrV; Tokenizer->GetTokens(TextStr, TokenStrV);
 //   // transform words to IDs
 //   const int Tokens = TokenStrV.Len();
 //   TIntV TokenIdV(Tokens, 0);
 //   for (int TokenN = 0; TokenN < Tokens; TokenN++) {
 //       // add token to the hashtable of all tokens
 //       const int TokenId = WordH.AddKey(TokenStrV[TokenN].GetUc());
 //       // keep track of it's count
 //       WordH[TokenId]++;
 //       // and prepare a token vector for ngram base
 //       TokenIdV.Add(TokenId);
 //   }
	//// extract the n-grams
 //   TNGramDescV NGramDescV;
	//NGramBs->AddDocTokIdV(TokenIdV, StoreThreshold, NGramDescV);
 //   // get string representations of n-grams above threshold
 //   TStrH NGramH;
 //   for (int NGramDescN = 0; NGramDescN < NGramDescV.Len(); NGramDescN++) {
 //       const TNGramDesc& NGramDesc = NGramDescV[NGramDescN];
 //       // make it into a string
 //       const TIntV& NGramTokenIdV = NGramDesc.TokIdV;
 //       TChA NGramChA = WordH.GetKey(NGramTokenIdV[0]);
 //       for (int NGramTokenIdN = 1; NGramTokenIdN < NGramTokenIdV.Len(); NGramTokenIdN++) {
 //           NGramChA += ' '; NGramChA += WordH.GetKey(NGramTokenIdV[NGramTokenIdN]);
 //       }
 //       // remember the ngram, if not stopword
 //       if (!SwSet->IsIn(NGramChA)) { NGramH.AddDat(NGramChA); }
 //   }
 //   // remember n-grams above threshold
 //   int NGramKeyId = NGramH.FFirstKeyId();
 //   while (NGramH.FNextKeyId(NGramKeyId)) {
 //       const TStr& NGramStr = NGramH.GetKey(NGramKeyId);
 //       // add to the result list
 //       ConceptV.Add(TOgNewsConcept(NGramStr, EmtpyStr));
 //   }
}
Exemple #6
0
TStemmerType TStemmer::GetStemmerType(const TStr& StemmerTypeNm) {
    TStr LcStemmerTypeNm=StemmerTypeNm.GetLc();
    if (LcStemmerTypeNm=="none") {
        return stmtNone;
    }
    else if (LcStemmerTypeNm=="porter") {
        return stmtPorter;
    }
    else {
        return stmtNone;
    }
}
Exemple #7
0
int TRSet::GetHitN(const TStr& UrlStr, const bool& LcP) const {
  int Hits=GetHits();
  TStr LcUrlStr=UrlStr.GetLc();
  for (int HitN=0; HitN<Hits; HitN++){
    if (LcP){
      if (GetHitUrlStr(HitN).GetLc()==LcUrlStr){return HitN;}
    } else {
      if (GetHitUrlStr(HitN)==UrlStr){return HitN;}
    }
  }
  return -1;
}
Exemple #8
0
TSwSetType TSwSet::GetSwSetType(const TStr& SwSetTypeNm){
  TStr LcSwSetTypeNm=SwSetTypeNm.GetLc();
  if (LcSwSetTypeNm=="none"){return swstNone;}
  else if (LcSwSetTypeNm=="en8"){return swstEn8;}
  else if (LcSwSetTypeNm=="en425"){return swstEn425;}
  else if (LcSwSetTypeNm=="en523"){return swstEn523;}
  else if (LcSwSetTypeNm=="enMsdn"){return swstEnMsdn;}
  else if (LcSwSetTypeNm=="es"){return swstEs;}
  else if (LcSwSetTypeNm=="german"){return swstGe;}
  else if (LcSwSetTypeNm=="siyuascii"){return swstSiYuAscii;}
  else if (LcSwSetTypeNm=="siisoce"){return swstSiIsoCe;}
  else {return swstNone;}
}
Exemple #9
0
void THtml::GetTokens(const PSIn& SIn, TStrV& TokenV) const {
	THtmlLx HtmlLx(SIn, false);
    // traverse html string symbols
	while (HtmlLx.Sym!=hsyEof){
		if (HtmlLx.Sym==hsyStr){
			TStr UcStr = HtmlLx.UcChA;
			// check if stop word
			if ((SwSet.Empty()) || (!SwSet->IsIn(UcStr))) {
				TStr TokenStr = ToUcP ? UcStr : TStr(HtmlLx.ChA);
				if (!Stemmer.Empty()) { 
					TokenStr = Stemmer->GetStem(TokenStr); }
				TokenV.Add(TokenStr.GetLc());
			}
		}
		// get next symbol
		HtmlLx.GetSym();
	}
}
Exemple #10
0
/////////////////////////////////////////////////
// BagOfWords-Files
void TBowFl::LoadHtmlTxt(
 PBowDocBs BowDocBs, const TStr& FPath, TIntV& NewDIdV,
 const bool& RecurseDirP, const int& MxDocs,
 const bool& SaveDocP, const PNotify& Notify) {
  // prepare file-directory traversal
  TStr LcNrFPath=TStr::GetNrFPath(FPath).GetLc();
  Notify->OnStatus("Creating Bow from file-path " + FPath + " ...");
  TFFile FFile(FPath, "", RecurseDirP);
  // traverse files
  TStr FNm; int Docs=0; NewDIdV.Clr();
  while (FFile.Next(FNm)){
    Docs++; if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
    Notify->OnStatus(TStr::Fmt("%d\r", Docs));
    // prepare document-name
    if (TFile::Exists(FNm)) { //B:
        TStr DocNm=FNm.GetLc();
        if (DocNm.IsPrefix(LcNrFPath)){
          DocNm=DocNm.GetSubStr(LcNrFPath.Len(), DocNm.Len()-1);}
        // categories
        TStrV CatNmV; TStr CatNm;
        if (DocNm.IsChIn('/')){
          TStr Str; DocNm.SplitOnCh(CatNm, '/', Str);
        } else if (DocNm.IsChIn('\\')){
          TStr Str; DocNm.SplitOnCh(CatNm, '\\', Str);
        }
        if (!CatNm.Empty()){
          CatNmV.Add(CatNm);}
        // load document-content
        TStr DocStr=TStr::LoadTxt(FNm);
        // add document to bow
        NewDIdV.Add(BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocStr, SaveDocP));
    }
  }
  Notify->OnStatus(TStr::Fmt("%d", Docs));
  // return results
  Notify->OnStatus("Done.");
  BowDocBs->AssertOk();
}
Exemple #11
0
void TGgSchBs::SetAuthCrawled(const TStr& AuthNm){
  AuthNmToCrawlPH.AddDat(AuthNm.GetLc())=true;
}
void BigMain(int argc, char* argv[]) {
  TExeTm ExeTm;
  Env = TEnv(argc, argv, TNotify::StdNotify);
  Env.PrepArgs("QuotesApp");
  const TStr ToDo = Env.GetIfArgPrefixStr("-do:", "", "To do").GetLc();
  if (Env.IsEndOfRun()) {
    printf("To do:\n");
    printf("    MkDataset         : Make memes dataset (extract quotes and save txt)\n");
    printf("    ExtractSubset     : Extract a subset of memes containing particular words\n");
    printf("    MemesToQtBs       : Load memes dataset and create quote base\n");
    printf("    MkClustNet        : Build cluster network from the quote base\n");
    return;
  }	
#pragma region mkdataset
  // extract quotes and links and make them into a single file
  if (ToDo == "mkdataset") {
    const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "files.txt", "Spinn3r input files (one file per line)");
    const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "Spinn3r-dataset.txt", "Output file");
    const int MinQtWrdLen = Env.GetIfArgPrefixInt("-w:", 3, "Minimum quote word length");
    const TStr UrlFNm = Env.GetIfArgPrefixStr("-u:", "", "Seen url set (THashSet<TMd5Sig>) file name");
    const bool UrlOnlyOnce = Env.GetIfArgPrefixBool("-q:", true, "Only keep unique Urls");
    //// parse directly from Spinn3r
    TStr Spinn3rFNm;
    THashSet<TMd5Sig> SeenUrlSet;
    if (UrlOnlyOnce && ! UrlFNm.Empty()) {  // keep track of already seen urls (so that there are no duplicate urls)
      TFIn FIn(UrlFNm);  SeenUrlSet.Load(FIn);
    }
    FILE *F = fopen(OutFNm.CStr(), "wt");
    TFIn FIn(InFNm);
    int Items=0;
    for (int f=0; FIn.GetNextLn(Spinn3rFNm); f++) {
      TQuoteExtractor QE(Spinn3rFNm.ToTrunc());
      printf("Processing %02d: %s [%s]\n", f+1, Spinn3rFNm.CStr(), TExeTm::GetCurTm());
      fflush(stdout);
      for (int item = 0; QE.Next(); item++) {
        const TMd5Sig PostMd5(QE.PostUrlStr);
        if (QE.QuoteV.Empty() && QE.LinkV.Empty()) { continue; } // no quotes, no links
        if (UrlOnlyOnce) {
          if (SeenUrlSet.IsKey(PostMd5)) { continue; }
          SeenUrlSet.AddKey(PostMd5);
        }
        fprintf(F, "P\t%s\n", QE.PostUrlStr.CStr());
        //if (QE.PubTm > TSecTm(2008,8,30) || QE.PubTm < TSecTm(2008,7,25)) { printf("%s\n", QE.PubTm.GetStr().CStr()); }
        fprintf(F, "T\t%s\n", QE.PubTm.GetYmdTmStr().CStr());
        for (int q = 0; q < QE.QuoteV.Len(); q++) {
          if (TStrUtil::CountWords(QE.QuoteV[q]) >= MinQtWrdLen) {
            fprintf(F, "Q\t%s\n", QE.QuoteV[q].CStr()); }
        }
        for (int l = 0; l < QE.LinkV.Len(); l++) {
          fprintf(F, "L\t%s\n", QE.LinkV[l].CStr()); }
        fprintf(F, "\n");
        if (item>0 && item % Kilo(100) == 0) {
          QE.DumpStat();  QE.ExeTm.Tick(); }
        Items++;
      }
      printf("file done. Total %d all posts, %d all items\n", SeenUrlSet.Len(), Items);
      fflush(stdout);
    }
    printf("all done. Saving %d post urls\n", SeenUrlSet.Len());  fflush(stdout);
    if (! SeenUrlSet.Empty()) {
      TFOut FOut(OutFNm.GetFMid()+".SeenUrlSet");
      SeenUrlSet.Save(FOut);
    }
    fclose(F);
  }
#pragma endregion mkdataset

#pragma region extractsubset
  // save posts with memes containing particular words
  else if (ToDo == "extractsubset") {
    const TStr InFNmWc = Env.GetIfArgPrefixStr("-i:", "memes_*.rar", "Input file prefix");
    const bool IsInFNmWc = Env.GetIfArgPrefixBool("-w:", true, "Input is wildcard (else a file with list of input files)");
    const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "memes-subset.txt", "Output memes file");
    const TStr WordsFNm = Env.GetIfArgPrefixStr("-p:", "phrases-in.txt", "Phrases that memes have to contain");

    TChAV CatchMemeV;// = TStr::GetV("great depression", "economic meltdown", "recession had bottomed out", "green shoots", "slow recovery", "gradual recovery");
    printf("Loading %s\n", WordsFNm.CStr());
    { TFIn FIn(WordsFNm);
    for (TStr Ln; FIn.GetNextLn(Ln); ) {
      printf("  %s\n", Ln.GetLc().CStr());
      CatchMemeV.Add(Ln.GetLc()); }
    }
    printf("%d strings loaded\n", CatchMemeV.Len());
    TFOut FOut(OutFNm);
    TMemesDataLoader Memes(InFNmWc, IsInFNmWc);
    for (int posts = 0, nsave=0; Memes.LoadNext(); posts++) {
      bool DoSave = false;
      for (int m = 0; m < Memes.MemeV.Len(); m++) {
        for (int i = 0; i < CatchMemeV.Len(); i++) {
          if (Memes.MemeV[m].SearchStr(CatchMemeV[i]) != -1) {
            DoSave=true; break; }
        }
        if (DoSave) { break; }
      }
      if (DoSave) { Memes.SaveTxt(FOut); nsave++; }
      if (posts % Mega(1) == 0) {
        printf("%dm posts, %d saved\n", posts/Mega(1), nsave);
        FOut.Flush();
      }
    }
  }
#pragma endregion extractsubset

#pragma region memestoqtbs
  // load memes dataset (MkDataset) and create quote base
  else if (ToDo == "memestoqtbs") {
    const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "201007_201107.txt", "Input Memes dataset files");
    const TStr MediaUrlFNm = Env.GetIfArgPrefixStr("-u:", "news_media.txt", "Fule with news media urls");
    const TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output file name prefix");
    const int MinWrdLen = Env.GetIfArgPrefixInt("-l:", 4, "Min quote word length");
    const int MinMemeFq = Env.GetIfArgPrefixInt("-f:", 5, "Min meme frequency");
		const TStr MinTmStr = Env.GetIfArgPrefixStr("-mint:", "20100714", "Min time of quotes, format = YYYYMMDD");
		const TStr MaxTmStr = Env.GetIfArgPrefixStr("-maxt:", "20110728", "Max time of quotes, format = YYYYMMDD");
		TSecTm MinTm(atoi(MinTmStr.GetSubStr(0,3).CStr()),atoi(MinTmStr.GetSubStr(4,5).CStr()),atoi(MinTmStr.GetSubStr(6,7).CStr()));
		TSecTm MaxTm(atoi(MaxTmStr.GetSubStr(0,3).CStr()),atoi(MaxTmStr.GetSubStr(4,5).CStr()),atoi(MaxTmStr.GetSubStr(6,7).CStr()));

		PQuoteBs QtBs = TQuoteBs::New();
		int HashTableSize = 100; // 100 for each quarter, for one year data, use 400
		int UrlSetSize = 4 * HashTableSize;
		QtBs->ConstructQtBsZarya(InFNm, Pref, MediaUrlFNm, MinTm, MaxTm, MinWrdLen, MinMemeFq, HashTableSize, UrlSetSize);
		}
#pragma endregion memestoqtbs

#pragma region mkclustnet
  // make cluster network
  else if (ToDo == "mkclustnet") {
    TStr InQtBsNm = Env.GetIfArgPrefixStr("-i:", "", "Input quote base file name");
    TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output network/updated QtBs filename");
		TStr BlackListFNm = Env.GetIfArgPrefixStr("-b:", "quote_blacklist.txt", "Blacklist file name");
		bool IsShglReady = Env.GetIfArgPrefixBool("-shglready:", false, "Indicate whether shingle hash table is ready");
		bool IsNetReady = Env.GetIfArgPrefixBool("-netready:", false, "Indicate whether cluster net is ready");
		double BktThresh = Env.GetIfArgPrefixFlt("-bktthresh:", 0.4, "Threshold for bad shingle bucket elimination");
		double MxTmDelay = Env.GetIfArgPrefixFlt("-delaythresh:", 5, "Max time delay between two quotes in the same cluster");
		double MxTmDev = Env.GetIfArgPrefixFlt("-devthresh:", 3, "Max time deviation for a quote to be specific rather than general");
		double RefineThresh = Env.GetIfArgPrefixFlt("-refinethresh:", 0.2, "Threshold for merging quote cluster in refining process");
    const int MinWrdLen = Env.GetIfArgPrefixInt("-minl:", 4, "Min quote word length");
		const int MaxWrdLen = Env.GetIfArgPrefixInt("-maxl:", 200, "Max quote word length");
    const int MinMemeFq = Env.GetIfArgPrefixInt("-mf:", 5, "Min meme frequency");
		const int MinClustFq = Env.GetIfArgPrefixInt("-cf:", 50, "Min quote cluster frequency");

		// Load quote base
    PQuoteBs QtBs;
    if (TZipIn::IsZipFNm(InQtBsNm)) { TZipIn ZipIn(InQtBsNm);  QtBs = TQuoteBs::Load(ZipIn); }
    else { TFIn FIn(InQtBsNm);  QtBs = TQuoteBs::Load(FIn); }

		// Cluster the quotes
    QtBs->ClusterQuotes(MinMemeFq, MinWrdLen, MaxWrdLen, BlackListFNm, Pref, IsShglReady, IsNetReady, BktThresh, MxTmDelay, MxTmDev, RefineThresh);

		// Dump the clusters
		bool SkipUrl = true, FlashDisp = true;
		QtBs->DumpQuoteClusters(MinWrdLen, MinClustFq, SkipUrl, FlashDisp, Pref);
  }
#pragma endregion mkclustnet

#pragma region memeclust
	else if (ToDo.SearchStr(TStr("memeclust")) >= 0) {
    const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "201101.txt", "Input Memes dataset files");
    const TStr MediaUrlFNm = Env.GetIfArgPrefixStr("-u:", "news_media.txt", "Fule with news media urls");
    TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output file name prefix");
    const int MinWrdLen = Env.GetIfArgPrefixInt("-minl:", 4, "Min quote word length");
		const int MaxWrdLen = Env.GetIfArgPrefixInt("-maxl:", 200, "Max quote word length");
		const int MinMemeFq = Env.GetIfArgPrefixInt("-f:", 5, "Min meme frequency");

		const int MinClustFq = Env.GetIfArgPrefixInt("-cf:", 50, "Min quote cluster frequency");
		TStr BlackListFNm = Env.GetIfArgPrefixStr("-b:", "quote_blacklist.txt", "Blacklist file name");
		bool IsQtBsReady = Env.GetIfArgPrefixBool("-qtbsready:", false, "Indicate whether quote base is ready and can be loaded readily");
		bool IsShglReady = Env.GetIfArgPrefixBool("-shglready:", false, "Indicate whether shingle hash table is ready");
		bool IsNetReady = Env.GetIfArgPrefixBool("-netready:", false, "Indicate whether cluster net is ready");
		double BktThresh = Env.GetIfArgPrefixFlt("-bktthresh:", 0.4, "Threshold for bad shingle bucket elimination");
		double MxTmDelay = Env.GetIfArgPrefixFlt("-delaythresh:", 5, "Max time delay between two quotes in the same cluster");
		double MxTmDev = Env.GetIfArgPrefixFlt("-devthresh:", 3, "Max time deviation for a quote to be specific rather than general");
		double RefineThresh = Env.GetIfArgPrefixFlt("-refinethresh:", 0.2, "Threshold for merging quote cluster in refining process");

		const TStr MinTmStr = Env.GetIfArgPrefixStr("-mint:", "20010101", "Min time of quotes, format = YYYYMMDD");
		const TStr MaxTmStr = Env.GetIfArgPrefixStr("-maxt:", "20200101", "Max time of quotes, format = YYYYMMDD");
		TSecTm MinTm(atoi(MinTmStr.GetSubStr(0,3).CStr()),atoi(MinTmStr.GetSubStr(4,5).CStr()),atoi(MinTmStr.GetSubStr(6,7).CStr()));
		TSecTm MaxTm(atoi(MaxTmStr.GetSubStr(0,3).CStr()),atoi(MaxTmStr.GetSubStr(4,5).CStr()),atoi(MaxTmStr.GetSubStr(6,7).CStr()));

		// Construct the quote base from Zarya data
		PQuoteBs QtBs = TQuoteBs::New();

		if (!IsQtBsReady) {
			int HashTableSize = 100; // 100 for each quarter, for one year data, use 400
			if (ToDo == "memeclustzarya") {
				int UrlSetSize = 4 * HashTableSize;
				QtBs->ConstructQtBsZarya(InFNm, Pref, MediaUrlFNm, MinTm, MaxTm, MinWrdLen, MinMemeFq, HashTableSize, UrlSetSize);
			}	else if (ToDo == "memeclustqtonly") {
				QtBs->ConstructQtBsQtOnly(InFNm, Pref, MediaUrlFNm, MinWrdLen, MinMemeFq, HashTableSize);
			} else if (ToDo == "memeclustqttime") {
				QtBs->ConstructQtBsQtTime(InFNm, Pref, MediaUrlFNm, MinWrdLen, MinMemeFq, HashTableSize);
			} else {
				printf("Please specify one of the three options for -do : memeclustzarya, memeclustqtonly, memeclustqttime!\n");
				return;
			}
		} else {
			TStr InQtBsNm = TStr::Fmt("%s-w%dmfq%d.QtBs", Pref.CStr(), MinWrdLen, MinMemeFq);
			if (TZipIn::IsZipFNm(InQtBsNm)) { TZipIn ZipIn(InQtBsNm);  QtBs = TQuoteBs::Load(ZipIn); }
			else { TFIn FIn(InQtBsNm);  QtBs = TQuoteBs::Load(FIn); }
		}

		// Cluster the quotes
    QtBs->ClusterQuotes(MinMemeFq, MinWrdLen, MaxWrdLen, BlackListFNm, Pref, IsShglReady, IsNetReady, BktThresh, MxTmDelay, MxTmDev, RefineThresh);

		// Dump the clusters
		bool SkipUrl = true, FlashDisp = true;
		QtBs->DumpQuoteClusters(MinWrdLen, MinClustFq, SkipUrl, FlashDisp, Pref);
	}
#pragma endregion memeclust
}