Пример #1
0
void TGgSchRef::GetAuthNmVPubStr(
 const TStr& AuthNmVPubStr, TStrV& AuthNmV, TStr& PubNm, TStr& PubYearStr){
  // split input string into two parts
  TStr AuthNmVStr; TStr PubStr;
  AuthNmVPubStr.SplitOnStr(AuthNmVStr, " - ", PubStr);
  // author-names string
  AuthNmVStr.SplitOnAllCh(',', AuthNmV, true);
  for (int AuthN=0; AuthN<AuthNmV.Len(); AuthN++){
    AuthNmV[AuthN].ToTrunc();
  }
  if ((!AuthNmV.Empty())&&
   ((AuthNmV.Last().IsStrIn("..."))||(AuthNmV.Last().Len()<=2))){
    AuthNmV.DelLast();
  }
  // publication-name & publication-year string
  TStr OriginStr; TStr LinkStr;
  PubStr.SplitOnStr(OriginStr, " - ", LinkStr);
  OriginStr.SplitOnLastCh(PubNm, ',', PubYearStr);
  PubNm.ToTrunc(); PubYearStr.ToTrunc();
  if ((PubYearStr.Len()>=4)&&(PubYearStr.GetSubStr(0, 3).IsInt())){
    PubYearStr=PubYearStr.GetSubStr(0, 3);
  } else
  if ((PubNm.Len()>=4)&&(PubNm.GetSubStr(0, 3).IsInt())){
    PubYearStr=PubNm.GetSubStr(0, 3); PubNm="";
  } else {
    PubYearStr="";
  }
}
Пример #2
0
TEST(TStr, GetSubStr) {
	TStr Str = "abcda";
	TStr Empty = "";

	EXPECT_EQ(Str.GetSubStr(3), "da");
	EXPECT_EQ(Str.GetSubStr(3, 3), "d");

	EXPECT_ANY_THROW(Str.GetSubStr(-1, -1));
	EXPECT_ANY_THROW(Str.GetSubStr(2, 1));
	EXPECT_ANY_THROW(Str.GetSubStr(-1, 100));
}
Пример #3
0
// <last_name>_<first name innitial>
TStr TStrUtil::GetStdName(TStr AuthorName) {
    TStr StdName;
    AuthorName.ToLc();
    AuthorName.ChangeChAll('\n', ' ');
    AuthorName.ChangeChAll('.', ' ');
    // if there is a number in the name, remove it and everything after it
    int i, pos = 0;
    while (pos<AuthorName.Len() && (AuthorName[pos]!='#' && !TCh::IsNum(AuthorName[pos]))) {
        pos++;
    }
    if (pos < AuthorName.Len()) {
        AuthorName = AuthorName.GetSubStr(0, pos-1).ToTrunc();
    }
    if (AuthorName.Empty()) {
        return TStr::GetNullStr();
    }

    // replace everything after '('
    int b = AuthorName.SearchCh('(');
    if (b != -1) {
        AuthorName = AuthorName.GetSubStr(0, b-1).ToTrunc();
    }
    // skip if contains ')'
    if (AuthorName .SearchCh(')')!=-1) {
        return TStr::GetNullStr();
    }
    // skip if it is not a name
    if (AuthorName .SearchStr("figures")!=-1 || AuthorName .SearchStr("macros")!=-1
            || AuthorName .SearchStr("univ")!=-1 || AuthorName .SearchStr("institute")!=-1) {
        return TStr::GetNullStr();
    }
    // remove all non-letters (latex tags, ...)
    TChA NewName;
    for (i = 0; i < AuthorName.Len(); i++) {
        const char Ch = AuthorName[i];
        if (TCh::IsAlpha(Ch) || TCh::IsWs(Ch) || Ch=='-') {
            NewName += Ch;
        }
    }
    StdName = NewName;
    StdName.ToTrunc();
    TStrV AuthNmV;
    StdName.SplitOnWs(AuthNmV);
    // too short -- not a name
    if (! AuthNmV.Empty() && AuthNmV.Last() == "jr") AuthNmV.DelLast();
    if (AuthNmV.Len() < 2) return TStr::GetNullStr();

    const TStr LastNm = AuthNmV.Last();
    if (! TCh::IsAlpha(LastNm[0]) || LastNm.Len() == 1) return TStr::GetNullStr();

    IAssert(isalpha(AuthNmV[0][0]));
    return TStr::Fmt("%s_%c", LastNm.CStr(), AuthNmV[0][0]);
}
Пример #4
0
TStrV TEnv::GetIfArgPrefixStrV(const TStr& PrefixStr,
                               TStrV& DfValV,
                               const TStr& DNm) const {
    TStrV ArgValV;
    if (Env.GetArgs() <= MnArgs) {
        // 'usage' argument message
        if (!SilentP) {
            printf("   %s%s (default:", PrefixStr.CStr(),
                   DNm.CStr());
            for (int DfValN = 0; DfValN < DfValV.Len();
                 DfValN++) {
                if (DfValN > 0) printf(", ");
                printf("%s", DfValV[DfValN].CStr());
            }
            printf(")\n");
        }
        return ArgValV;
    } else {
        // argument & value message
        TStrV Items;
        for (int ArgN = 0; ArgN < GetArgs(); ArgN++) {
            // get argument string
            TStr ArgStr = GetArg(ArgN);
            if (ArgStr.GetSubStr(0, PrefixStr.Len() - 1) ==
                PrefixStr) {
                TStr ArgVals = ArgStr.GetSubStr(
                    PrefixStr.Len(), ArgStr.Len());
                ArgVals.SplitOnAllCh(',', Items);
                for (int i = 0; i < Items.Len(); i++)
                    ArgValV.Add(Items[i]);
            }
        }
        if (ArgValV.Empty()) ArgValV = DfValV;
        // output argument values
        TChA MsgChA;
        MsgChA += DNm;
        MsgChA += " (";
        MsgChA += PrefixStr;
        MsgChA += ")=";
        for (int ArgValN = 0; ArgValN < ArgValV.Len();
             ArgValN++) {
            if (ArgValN > 0) MsgChA += ", ";
            MsgChA += ArgValV[ArgValN];
        }
        if (!SilentP) TNotify::OnStatus(Notify, MsgChA);
        return ArgValV;
    }
}
Пример #5
0
TStr TEnv::GetIfArgPrefixStr(
    const TStr& PrefixStr, const TStr& DfVal, const TStr& DNm) const {
    if (Env.GetArgs()<=MnArgs) {
        // 'usage' argument message
        if (!SilentP) {
            printf("   %s%s (default:'%s')\n", PrefixStr.CStr(), DNm.CStr(), DfVal.CStr());
        }
        return DfVal;
    } else {
        // argument & value message
        TStr Val;
        if (Env.IsArgPrefix(PrefixStr)) {
            Val=Env.GetArgPostfix(PrefixStr);
            if (Val.Len()>1) {
                if ((Val[0]=='\"')&&(Val.LastCh()=='\"')) {
                    Val=Val.GetSubStr(1, Val.Len()-2);
                }
            }
        } else {
            Val=DfVal;
        }
        TStr MsgStr="  "+DNm+" ("+PrefixStr+")="+Val;
        if (!SilentP) {
            TNotify::OnStatus(Notify, MsgStr);
        }
        return Val;
    }
}
Пример #6
0
TStr TEnv::GetExeFNm() const {
    TStr ExeFNm = GetArg(0);
    if (ExeFNm.IsPrefix("//?")) {  // observed on Win64 CGI
        ExeFNm = ExeFNm.GetSubStr(3, ExeFNm.Len());
    }
    return ExeFNm;
}
Пример #7
0
int TNntpSockEvent::GetRespCd(const TStr& RespCrLfLn){
  if (RespCrLfLn.Len()>=3){
    TStr RespCdStr=RespCrLfLn.GetSubStr(0, 2);
    int RespCd=0;
    if (RespCdStr.IsInt(RespCd)){return RespCd;} else {return -1;}
  } else {
    return -1;
  }
}
Пример #8
0
void TStrParser::DocStrToChIdV(const TStr& _DocStr, TIntV& ChIdV) {
    TStr DocStr = _DocStr.GetUc();  // to upper case
    int ChN = DocStr.Len();
    ChIdV.Reserve(ChN, 0);
    for (int ChC = 0; ChC < ChN; ChC++) {
        TStr ChStr = DocStr.GetSubStr(ChC,ChC);
        int ChId = GetWId(ChStr);
        if (ChId != -1) {
            WordToIdH[ChId]++;
        } else {
            ChId = WordToIdH.AddKey(ChStr);
            WordToIdH[ChId] = 1;
        }
        ChIdV.Add(ChId);
    }
}
Пример #9
0
//////////////////////////////////////
// File-Download-Function
void TSASFunFile::LoadFunFileV(const TStr& FPath, TSAppSrvFunV& SrvFunV) {
	TFFile File(FPath, "", false); TStr FNm;
	while (File.Next(FNm)) {
		TStr FExt = FNm.GetFExt();
		TStr FUrl = FNm.GetSubStr(FPath.Len());
		FUrl.ChangeChAll('\\', '/');
		printf("%s %s %s\n", FNm.CStr(), FExt.CStr(), FUrl.CStr());
		if (FExt == ".htm") { SrvFunV.Add(TSASFunFile::New(FUrl, FNm, THttp::TextHtmlFldVal)); }
		else if (FExt == ".html") { SrvFunV.Add(TSASFunFile::New(FUrl, FNm, THttp::TextHtmlFldVal)); }
		else if (FExt == ".js") { SrvFunV.Add(TSASFunFile::New(FUrl, FNm, THttp::TextJavaScriptFldVal)); }
		else if (FExt == ".css") { SrvFunV.Add(TSASFunFile::New(FUrl, FNm, THttp::TextCssFldVal)); }
		else if (FExt == ".jpg") { SrvFunV.Add(TSASFunFile::New(FUrl, FNm, THttp::ImageJpgFldVal)); }
		else if (FExt == ".jpeg") { SrvFunV.Add(TSASFunFile::New(FUrl, FNm, THttp::ImageJpgFldVal)); }
		else if (FExt == ".gif") { SrvFunV.Add(TSASFunFile::New(FUrl, FNm, THttp::ImageGifFldVal)); }
		else {
			printf("Unknown MIME type for extension '%s' for file '%s'", FExt.CStr(), FNm.CStr());
			SrvFunV.Add(TSASFunFile::New(FUrl, FNm, THttp::AppOctetFldVal)); 
		}
	}	
}
Пример #10
0
TStr TEnv::GetArgPostfix(const TStr& PrefixStr) const {
    int ArgN = GetPrefixArgN(PrefixStr);
    IAssert(ArgN != -1);
    TStr ArgStr = GetArg(ArgN);
    return ArgStr.GetSubStr(PrefixStr.Len(), ArgStr.Len());
}
Пример #11
0
// load from allactors.zip that was prepared by Brad Malin in 2005
PImdbNet TImdbNet::LoadTxt(const TStr& ActorFNm) {
  PImdbNet Net = TImdbNet::New();
  TStrV ColV;
  char line [2024];
  int NLines=0, DupEdge=0, Year, Position, ActorNId, MovieNId;
  TIntH ActorNIdH;
  THash<TIntPr, TInt> MovieNIdH;
  FILE *F = fopen(ActorFNm.CStr(), "rt");  fgets(line, 2024, F);
  while (! feof(F)) {
    memset(line, 0, 2024);
    fgets(line, 2024, F);
    if (strlen(line) == 0) break;
    TStr(line).SplitOnAllCh('|', ColV, false);  IAssert(ColV.Len() == 7);
    const int NameStrId = Net->AddStr(ColV[0].GetTrunc().GetLc()+" "+ColV[1].GetTrunc().GetLc());
    const int MovieStrId = Net->AddStr(ColV[2].GetTrunc().GetLc());
    TStr YearStr = ColV[3].GetTrunc();
    if (YearStr.Len() > 4) YearStr = YearStr.GetSubStr(0, 3);
    Year = 1;  YearStr.IsInt(Year);
    const TMovieTy MovieTy = TImdbNet::GetMovieTy(ColV[4]);
    Position = TInt::Mx;  ColV[5].GetTrunc().IsInt(Position);
    IAssert(ColV[6].GetTrunc()[0] == 'M' || ColV[6].GetTrunc()[0]=='F');
    const bool IsMale = ColV[6].GetTrunc()[0] == 'M';
    // create nodes  
    if (ActorNIdH.IsKey(NameStrId)) { 
      ActorNId = ActorNIdH.GetDat(NameStrId); }
    else { 
      ActorNId = Net->AddNode(-1, TImdbNode(NameStrId, Year, Position, IsMale));
      ActorNIdH.AddDat(NameStrId, ActorNId);
    }
    if (MovieNIdH.IsKey(TIntPr(MovieStrId, Year))) {
      MovieNId = MovieNIdH.GetDat(TIntPr(MovieStrId, Year)); }
    else {
      MovieNId = Net->AddNode(-1, TImdbNode(NameStrId, Year, MovieTy)); 
      MovieNIdH.AddDat(TIntPr(MovieStrId, Year), MovieNId); 
    }
    if (! Net->IsEdge(ActorNId, MovieNId)) { 
      Net->AddEdge(ActorNId, MovieNId); }
    else { DupEdge++; }
    if (++NLines % 100000 == 0) printf("\r  %dk  ", NLines/1000);
  }
  fclose(F);
  printf("duplicate edges: %d\n", DupEdge);
  printf("nodes:  %d\n", Net->GetNodes());
  printf("edges:  %d\n", Net->GetEdges());
  printf("actors: %d\n", ActorNIdH.Len());
  printf("movies: %d\n", MovieNIdH.Len());
  // set the actor year to the year of his first movie
  int NUpdates=0;
  for (TNet::TNodeI NI = Net->BegNI(); NI < Net->EndNI(); NI++) {
    if (NI().IsActor()) {
      int MinYear = NI().GetYear();
      for (int e = 0; e < NI.GetOutDeg(); e++) {
        const TImdbNode& NodeDat = Net->GetNDat(NI.GetOutNId(e));
        if (NodeDat.IsMovie()) MinYear = TMath::Mn(MinYear, NodeDat.GetYear());
      }
      if (NI().Year != MinYear) NUpdates++;
      NI().Year = MinYear;
    }
  }
  printf("updated actor times: %d\n", NUpdates);
  return Net;
}
Пример #12
0
void BigMain(int argc, char* argv[]) {
  TExeTm ExeTm;
  Env = TEnv(argc, argv, TNotify::StdNotify);
  Env.PrepArgs("QuotesApp");
  const TStr ToDo = Env.GetIfArgPrefixStr("-do:", "", "To do").GetLc();
  if (Env.IsEndOfRun()) {
    printf("To do:\n");
    printf("    MkDataset         : Make memes dataset (extract quotes and save txt)\n");
    printf("    ExtractSubset     : Extract a subset of memes containing particular words\n");
    printf("    MemesToQtBs       : Load memes dataset and create quote base\n");
    printf("    MkClustNet        : Build cluster network from the quote base\n");
    return;
  }	
#pragma region mkdataset
  // extract quotes and links and make them into a single file
  if (ToDo == "mkdataset") {
    const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "files.txt", "Spinn3r input files (one file per line)");
    const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "Spinn3r-dataset.txt", "Output file");
    const int MinQtWrdLen = Env.GetIfArgPrefixInt("-w:", 3, "Minimum quote word length");
    const TStr UrlFNm = Env.GetIfArgPrefixStr("-u:", "", "Seen url set (THashSet<TMd5Sig>) file name");
    const bool UrlOnlyOnce = Env.GetIfArgPrefixBool("-q:", true, "Only keep unique Urls");
    //// parse directly from Spinn3r
    TStr Spinn3rFNm;
    THashSet<TMd5Sig> SeenUrlSet;
    if (UrlOnlyOnce && ! UrlFNm.Empty()) {  // keep track of already seen urls (so that there are no duplicate urls)
      TFIn FIn(UrlFNm);  SeenUrlSet.Load(FIn);
    }
    FILE *F = fopen(OutFNm.CStr(), "wt");
    TFIn FIn(InFNm);
    int Items=0;
    for (int f=0; FIn.GetNextLn(Spinn3rFNm); f++) {
      TQuoteExtractor QE(Spinn3rFNm.ToTrunc());
      printf("Processing %02d: %s [%s]\n", f+1, Spinn3rFNm.CStr(), TExeTm::GetCurTm());
      fflush(stdout);
      for (int item = 0; QE.Next(); item++) {
        const TMd5Sig PostMd5(QE.PostUrlStr);
        if (QE.QuoteV.Empty() && QE.LinkV.Empty()) { continue; } // no quotes, no links
        if (UrlOnlyOnce) {
          if (SeenUrlSet.IsKey(PostMd5)) { continue; }
          SeenUrlSet.AddKey(PostMd5);
        }
        fprintf(F, "P\t%s\n", QE.PostUrlStr.CStr());
        //if (QE.PubTm > TSecTm(2008,8,30) || QE.PubTm < TSecTm(2008,7,25)) { printf("%s\n", QE.PubTm.GetStr().CStr()); }
        fprintf(F, "T\t%s\n", QE.PubTm.GetYmdTmStr().CStr());
        for (int q = 0; q < QE.QuoteV.Len(); q++) {
          if (TStrUtil::CountWords(QE.QuoteV[q]) >= MinQtWrdLen) {
            fprintf(F, "Q\t%s\n", QE.QuoteV[q].CStr()); }
        }
        for (int l = 0; l < QE.LinkV.Len(); l++) {
          fprintf(F, "L\t%s\n", QE.LinkV[l].CStr()); }
        fprintf(F, "\n");
        if (item>0 && item % Kilo(100) == 0) {
          QE.DumpStat();  QE.ExeTm.Tick(); }
        Items++;
      }
      printf("file done. Total %d all posts, %d all items\n", SeenUrlSet.Len(), Items);
      fflush(stdout);
    }
    printf("all done. Saving %d post urls\n", SeenUrlSet.Len());  fflush(stdout);
    if (! SeenUrlSet.Empty()) {
      TFOut FOut(OutFNm.GetFMid()+".SeenUrlSet");
      SeenUrlSet.Save(FOut);
    }
    fclose(F);
  }
#pragma endregion mkdataset

#pragma region extractsubset
  // save posts with memes containing particular words
  else if (ToDo == "extractsubset") {
    const TStr InFNmWc = Env.GetIfArgPrefixStr("-i:", "memes_*.rar", "Input file prefix");
    const bool IsInFNmWc = Env.GetIfArgPrefixBool("-w:", true, "Input is wildcard (else a file with list of input files)");
    const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "memes-subset.txt", "Output memes file");
    const TStr WordsFNm = Env.GetIfArgPrefixStr("-p:", "phrases-in.txt", "Phrases that memes have to contain");

    TChAV CatchMemeV;// = TStr::GetV("great depression", "economic meltdown", "recession had bottomed out", "green shoots", "slow recovery", "gradual recovery");
    printf("Loading %s\n", WordsFNm.CStr());
    { TFIn FIn(WordsFNm);
    for (TStr Ln; FIn.GetNextLn(Ln); ) {
      printf("  %s\n", Ln.GetLc().CStr());
      CatchMemeV.Add(Ln.GetLc()); }
    }
    printf("%d strings loaded\n", CatchMemeV.Len());
    TFOut FOut(OutFNm);
    TMemesDataLoader Memes(InFNmWc, IsInFNmWc);
    for (int posts = 0, nsave=0; Memes.LoadNext(); posts++) {
      bool DoSave = false;
      for (int m = 0; m < Memes.MemeV.Len(); m++) {
        for (int i = 0; i < CatchMemeV.Len(); i++) {
          if (Memes.MemeV[m].SearchStr(CatchMemeV[i]) != -1) {
            DoSave=true; break; }
        }
        if (DoSave) { break; }
      }
      if (DoSave) { Memes.SaveTxt(FOut); nsave++; }
      if (posts % Mega(1) == 0) {
        printf("%dm posts, %d saved\n", posts/Mega(1), nsave);
        FOut.Flush();
      }
    }
  }
#pragma endregion extractsubset

#pragma region memestoqtbs
  // load memes dataset (MkDataset) and create quote base
  else if (ToDo == "memestoqtbs") {
    const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "201007_201107.txt", "Input Memes dataset files");
    const TStr MediaUrlFNm = Env.GetIfArgPrefixStr("-u:", "news_media.txt", "Fule with news media urls");
    const TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output file name prefix");
    const int MinWrdLen = Env.GetIfArgPrefixInt("-l:", 4, "Min quote word length");
    const int MinMemeFq = Env.GetIfArgPrefixInt("-f:", 5, "Min meme frequency");
		const TStr MinTmStr = Env.GetIfArgPrefixStr("-mint:", "20100714", "Min time of quotes, format = YYYYMMDD");
		const TStr MaxTmStr = Env.GetIfArgPrefixStr("-maxt:", "20110728", "Max time of quotes, format = YYYYMMDD");
		TSecTm MinTm(atoi(MinTmStr.GetSubStr(0,3).CStr()),atoi(MinTmStr.GetSubStr(4,5).CStr()),atoi(MinTmStr.GetSubStr(6,7).CStr()));
		TSecTm MaxTm(atoi(MaxTmStr.GetSubStr(0,3).CStr()),atoi(MaxTmStr.GetSubStr(4,5).CStr()),atoi(MaxTmStr.GetSubStr(6,7).CStr()));

		PQuoteBs QtBs = TQuoteBs::New();
		int HashTableSize = 100; // 100 for each quarter, for one year data, use 400
		int UrlSetSize = 4 * HashTableSize;
		QtBs->ConstructQtBsZarya(InFNm, Pref, MediaUrlFNm, MinTm, MaxTm, MinWrdLen, MinMemeFq, HashTableSize, UrlSetSize);
		}
#pragma endregion memestoqtbs

#pragma region mkclustnet
  // make cluster network
  else if (ToDo == "mkclustnet") {
    TStr InQtBsNm = Env.GetIfArgPrefixStr("-i:", "", "Input quote base file name");
    TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output network/updated QtBs filename");
		TStr BlackListFNm = Env.GetIfArgPrefixStr("-b:", "quote_blacklist.txt", "Blacklist file name");
		bool IsShglReady = Env.GetIfArgPrefixBool("-shglready:", false, "Indicate whether shingle hash table is ready");
		bool IsNetReady = Env.GetIfArgPrefixBool("-netready:", false, "Indicate whether cluster net is ready");
		double BktThresh = Env.GetIfArgPrefixFlt("-bktthresh:", 0.4, "Threshold for bad shingle bucket elimination");
		double MxTmDelay = Env.GetIfArgPrefixFlt("-delaythresh:", 5, "Max time delay between two quotes in the same cluster");
		double MxTmDev = Env.GetIfArgPrefixFlt("-devthresh:", 3, "Max time deviation for a quote to be specific rather than general");
		double RefineThresh = Env.GetIfArgPrefixFlt("-refinethresh:", 0.2, "Threshold for merging quote cluster in refining process");
    const int MinWrdLen = Env.GetIfArgPrefixInt("-minl:", 4, "Min quote word length");
		const int MaxWrdLen = Env.GetIfArgPrefixInt("-maxl:", 200, "Max quote word length");
    const int MinMemeFq = Env.GetIfArgPrefixInt("-mf:", 5, "Min meme frequency");
		const int MinClustFq = Env.GetIfArgPrefixInt("-cf:", 50, "Min quote cluster frequency");

		// Load quote base
    PQuoteBs QtBs;
    if (TZipIn::IsZipFNm(InQtBsNm)) { TZipIn ZipIn(InQtBsNm);  QtBs = TQuoteBs::Load(ZipIn); }
    else { TFIn FIn(InQtBsNm);  QtBs = TQuoteBs::Load(FIn); }

		// Cluster the quotes
    QtBs->ClusterQuotes(MinMemeFq, MinWrdLen, MaxWrdLen, BlackListFNm, Pref, IsShglReady, IsNetReady, BktThresh, MxTmDelay, MxTmDev, RefineThresh);

		// Dump the clusters
		bool SkipUrl = true, FlashDisp = true;
		QtBs->DumpQuoteClusters(MinWrdLen, MinClustFq, SkipUrl, FlashDisp, Pref);
  }
#pragma endregion mkclustnet

#pragma region memeclust
	else if (ToDo.SearchStr(TStr("memeclust")) >= 0) {
    const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "201101.txt", "Input Memes dataset files");
    const TStr MediaUrlFNm = Env.GetIfArgPrefixStr("-u:", "news_media.txt", "Fule with news media urls");
    TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output file name prefix");
    const int MinWrdLen = Env.GetIfArgPrefixInt("-minl:", 4, "Min quote word length");
		const int MaxWrdLen = Env.GetIfArgPrefixInt("-maxl:", 200, "Max quote word length");
		const int MinMemeFq = Env.GetIfArgPrefixInt("-f:", 5, "Min meme frequency");

		const int MinClustFq = Env.GetIfArgPrefixInt("-cf:", 50, "Min quote cluster frequency");
		TStr BlackListFNm = Env.GetIfArgPrefixStr("-b:", "quote_blacklist.txt", "Blacklist file name");
		bool IsQtBsReady = Env.GetIfArgPrefixBool("-qtbsready:", false, "Indicate whether quote base is ready and can be loaded readily");
		bool IsShglReady = Env.GetIfArgPrefixBool("-shglready:", false, "Indicate whether shingle hash table is ready");
		bool IsNetReady = Env.GetIfArgPrefixBool("-netready:", false, "Indicate whether cluster net is ready");
		double BktThresh = Env.GetIfArgPrefixFlt("-bktthresh:", 0.4, "Threshold for bad shingle bucket elimination");
		double MxTmDelay = Env.GetIfArgPrefixFlt("-delaythresh:", 5, "Max time delay between two quotes in the same cluster");
		double MxTmDev = Env.GetIfArgPrefixFlt("-devthresh:", 3, "Max time deviation for a quote to be specific rather than general");
		double RefineThresh = Env.GetIfArgPrefixFlt("-refinethresh:", 0.2, "Threshold for merging quote cluster in refining process");

		const TStr MinTmStr = Env.GetIfArgPrefixStr("-mint:", "20010101", "Min time of quotes, format = YYYYMMDD");
		const TStr MaxTmStr = Env.GetIfArgPrefixStr("-maxt:", "20200101", "Max time of quotes, format = YYYYMMDD");
		TSecTm MinTm(atoi(MinTmStr.GetSubStr(0,3).CStr()),atoi(MinTmStr.GetSubStr(4,5).CStr()),atoi(MinTmStr.GetSubStr(6,7).CStr()));
		TSecTm MaxTm(atoi(MaxTmStr.GetSubStr(0,3).CStr()),atoi(MaxTmStr.GetSubStr(4,5).CStr()),atoi(MaxTmStr.GetSubStr(6,7).CStr()));

		// Construct the quote base from Zarya data
		PQuoteBs QtBs = TQuoteBs::New();

		if (!IsQtBsReady) {
			int HashTableSize = 100; // 100 for each quarter, for one year data, use 400
			if (ToDo == "memeclustzarya") {
				int UrlSetSize = 4 * HashTableSize;
				QtBs->ConstructQtBsZarya(InFNm, Pref, MediaUrlFNm, MinTm, MaxTm, MinWrdLen, MinMemeFq, HashTableSize, UrlSetSize);
			}	else if (ToDo == "memeclustqtonly") {
				QtBs->ConstructQtBsQtOnly(InFNm, Pref, MediaUrlFNm, MinWrdLen, MinMemeFq, HashTableSize);
			} else if (ToDo == "memeclustqttime") {
				QtBs->ConstructQtBsQtTime(InFNm, Pref, MediaUrlFNm, MinWrdLen, MinMemeFq, HashTableSize);
			} else {
				printf("Please specify one of the three options for -do : memeclustzarya, memeclustqtonly, memeclustqttime!\n");
				return;
			}
		} else {
			TStr InQtBsNm = TStr::Fmt("%s-w%dmfq%d.QtBs", Pref.CStr(), MinWrdLen, MinMemeFq);
			if (TZipIn::IsZipFNm(InQtBsNm)) { TZipIn ZipIn(InQtBsNm);  QtBs = TQuoteBs::Load(ZipIn); }
			else { TFIn FIn(InQtBsNm);  QtBs = TQuoteBs::Load(FIn); }
		}

		// Cluster the quotes
    QtBs->ClusterQuotes(MinMemeFq, MinWrdLen, MaxWrdLen, BlackListFNm, Pref, IsShglReady, IsNetReady, BktThresh, MxTmDelay, MxTmDev, RefineThresh);

		// Dump the clusters
		bool SkipUrl = true, FlashDisp = true;
		QtBs->DumpQuoteClusters(MinWrdLen, MinClustFq, SkipUrl, FlashDisp, Pref);
	}
#pragma endregion memeclust
}