void TGgSchRef::GetAuthNmVPubStr( const TStr& AuthNmVPubStr, TStrV& AuthNmV, TStr& PubNm, TStr& PubYearStr){ // split input string into two parts TStr AuthNmVStr; TStr PubStr; AuthNmVPubStr.SplitOnStr(AuthNmVStr, " - ", PubStr); // author-names string AuthNmVStr.SplitOnAllCh(',', AuthNmV, true); for (int AuthN=0; AuthN<AuthNmV.Len(); AuthN++){ AuthNmV[AuthN].ToTrunc(); } if ((!AuthNmV.Empty())&& ((AuthNmV.Last().IsStrIn("..."))||(AuthNmV.Last().Len()<=2))){ AuthNmV.DelLast(); } // publication-name & publication-year string TStr OriginStr; TStr LinkStr; PubStr.SplitOnStr(OriginStr, " - ", LinkStr); OriginStr.SplitOnLastCh(PubNm, ',', PubYearStr); PubNm.ToTrunc(); PubYearStr.ToTrunc(); if ((PubYearStr.Len()>=4)&&(PubYearStr.GetSubStr(0, 3).IsInt())){ PubYearStr=PubYearStr.GetSubStr(0, 3); } else if ((PubNm.Len()>=4)&&(PubNm.GetSubStr(0, 3).IsInt())){ PubYearStr=PubNm.GetSubStr(0, 3); PubNm=""; } else { PubYearStr=""; } }
// Checks TStr::GetSubStr with a begin index only, with an inclusive
// [begin, end] index pair, and with index pairs that are expected to throw.
TEST(TStr, GetSubStr) {
  TStr Str = "abcda";

  // begin index only: everything from position 3 to the end
  EXPECT_EQ(Str.GetSubStr(3), "da");
  // begin == end: the end index is inclusive, so exactly one character
  EXPECT_EQ(Str.GetSubStr(3, 3), "d");

  // invalid index pairs must throw
  EXPECT_ANY_THROW(Str.GetSubStr(-1, -1));   // negative indices
  EXPECT_ANY_THROW(Str.GetSubStr(2, 1));     // end before begin
  EXPECT_ANY_THROW(Str.GetSubStr(-1, 100));  // both indices outside the string
}
// <last_name>_<first name innitial> TStr TStrUtil::GetStdName(TStr AuthorName) { TStr StdName; AuthorName.ToLc(); AuthorName.ChangeChAll('\n', ' '); AuthorName.ChangeChAll('.', ' '); // if there is a number in the name, remove it and everything after it int i, pos = 0; while (pos<AuthorName.Len() && (AuthorName[pos]!='#' && !TCh::IsNum(AuthorName[pos]))) { pos++; } if (pos < AuthorName.Len()) { AuthorName = AuthorName.GetSubStr(0, pos-1).ToTrunc(); } if (AuthorName.Empty()) { return TStr::GetNullStr(); } // replace everything after '(' int b = AuthorName.SearchCh('('); if (b != -1) { AuthorName = AuthorName.GetSubStr(0, b-1).ToTrunc(); } // skip if contains ')' if (AuthorName .SearchCh(')')!=-1) { return TStr::GetNullStr(); } // skip if it is not a name if (AuthorName .SearchStr("figures")!=-1 || AuthorName .SearchStr("macros")!=-1 || AuthorName .SearchStr("univ")!=-1 || AuthorName .SearchStr("institute")!=-1) { return TStr::GetNullStr(); } // remove all non-letters (latex tags, ...) TChA NewName; for (i = 0; i < AuthorName.Len(); i++) { const char Ch = AuthorName[i]; if (TCh::IsAlpha(Ch) || TCh::IsWs(Ch) || Ch=='-') { NewName += Ch; } } StdName = NewName; StdName.ToTrunc(); TStrV AuthNmV; StdName.SplitOnWs(AuthNmV); // too short -- not a name if (! AuthNmV.Empty() && AuthNmV.Last() == "jr") AuthNmV.DelLast(); if (AuthNmV.Len() < 2) return TStr::GetNullStr(); const TStr LastNm = AuthNmV.Last(); if (! TCh::IsAlpha(LastNm[0]) || LastNm.Len() == 1) return TStr::GetNullStr(); IAssert(isalpha(AuthNmV[0][0])); return TStr::Fmt("%s_%c", LastNm.CStr(), AuthNmV[0][0]); }
// Collects the values of every command-line argument that starts with PrefixStr.
// The text after the prefix is split on ',' and all pieces from all matching
// arguments are gathered, in argument order.
//   PrefixStr: argument prefix to look for
//   DfValV:    values returned when no matching argument supplies a value
//   DNm:       human-readable description used in the printed messages
// When Env.GetArgs() <= MnArgs only a usage line is printed (unless SilentP)
// and an empty vector is returned.
TStrV TEnv::GetIfArgPrefixStrV(const TStr& PrefixStr, TStrV& DfValV, const TStr& DNm) const {
  TStrV ArgValV;
  if (Env.GetArgs() <= MnArgs) {
    // 'usage' argument message
    if (!SilentP) {
      printf(" %s%s (default:", PrefixStr.CStr(), DNm.CStr());
      for (int DfValN = 0; DfValN < DfValV.Len(); DfValN++) {
        if (DfValN > 0) printf(", ");
        printf("%s", DfValV[DfValN].CStr());
      }
      printf(")\n");
    }
    return ArgValV;
  }
  // argument & value message
  TStrV Items;
  const int PrefixLen = PrefixStr.Len();
  for (int ArgN = 0; ArgN < GetArgs(); ArgN++) {
    // get argument string
    TStr ArgStr = GetArg(ArgN);
    // IsPrefix avoids slicing with an end index of -1 (empty prefix) or an end
    // index beyond a short argument
    if (!ArgStr.IsPrefix(PrefixStr.CStr())) { continue; }
    // prefix given without any value: nothing to split, nothing to add
    if (ArgStr.Len() <= PrefixLen) { continue; }
    // everything after the prefix, up to the last character
    TStr ArgVals = ArgStr.GetSubStr(PrefixLen);
    ArgVals.SplitOnAllCh(',', Items);
    for (int i = 0; i < Items.Len(); i++) { ArgValV.Add(Items[i]); }
  }
  // fall back to the defaults when nothing was supplied
  if (ArgValV.Empty()) { ArgValV = DfValV; }
  // output argument values
  TChA MsgChA;
  MsgChA += DNm;
  MsgChA += " (";
  MsgChA += PrefixStr;
  MsgChA += ")=";
  for (int ArgValN = 0; ArgValN < ArgValV.Len(); ArgValN++) {
    if (ArgValN > 0) MsgChA += ", ";
    MsgChA += ArgValV[ArgValN];
  }
  if (!SilentP) TNotify::OnStatus(Notify, MsgChA);
  return ArgValV;
}
// Returns the string value of the command-line argument that starts with PrefixStr,
// or DfVal when there is no such argument. A value wrapped in double quotes is
// returned without the quotes.
// When Env.GetArgs() <= MnArgs only a usage line is printed (unless SilentP)
// and DfVal is returned.
TStr TEnv::GetIfArgPrefixStr(
 const TStr& PrefixStr, const TStr& DfVal, const TStr& DNm) const {
  // 'usage' argument message
  if (Env.GetArgs()<=MnArgs) {
    if (!SilentP) {
      printf(" %s%s (default:'%s')\n", PrefixStr.CStr(), DNm.CStr(), DfVal.CStr());
    }
    return DfVal;
  }
  // argument & value message
  TStr Val;
  if (!Env.IsArgPrefix(PrefixStr)) {
    Val=DfVal;
  } else {
    Val=Env.GetArgPostfix(PrefixStr);
    // strip one pair of enclosing double quotes
    const bool IsQuoted=(Val.Len()>1)&&(Val[0]=='\"')&&(Val.LastCh()=='\"');
    if (IsQuoted) {
      Val=Val.GetSubStr(1, Val.Len()-2);
    }
  }
  TStr MsgStr=" "+DNm+" ("+PrefixStr+")="+Val;
  if (!SilentP) {
    TNotify::OnStatus(Notify, MsgStr);
  }
  return Val;
}
// Returns the executable file name (argument 0). A leading "//?" prefix
// (observed on Win64 CGI) is removed.
TStr TEnv::GetExeFNm() const {
  TStr ExeFNm = GetArg(0);
  if (ExeFNm.IsPrefix("//?")) {
    // Keep everything after the 3-character prefix. The one-argument GetSubStr
    // stops at the last character, so no index past the end of the string is
    // passed; a name consisting of the prefix alone becomes empty.
    if (ExeFNm.Len() > 3) {
      ExeFNm = ExeFNm.GetSubStr(3);
    } else {
      ExeFNm = TStr();
    }
  }
  return ExeFNm;
}
// Extracts the numeric response code from the first three characters of a
// response line. Returns -1 when the line is shorter than three characters or
// when those characters are not an integer.
int TNntpSockEvent::GetRespCd(const TStr& RespCrLfLn){
  if (RespCrLfLn.Len()<3){return -1;}
  TStr RespCdStr=RespCrLfLn.GetSubStr(0, 2);
  int RespCd=0;
  // IsInt stores the parsed value in RespCd when the text is an integer
  const bool IsRespCd=RespCdStr.IsInt(RespCd);
  return IsRespCd ? RespCd : -1;
}
void TStrParser::DocStrToChIdV(const TStr& _DocStr, TIntV& ChIdV) { TStr DocStr = _DocStr.GetUc(); // to upper case int ChN = DocStr.Len(); ChIdV.Reserve(ChN, 0); for (int ChC = 0; ChC < ChN; ChC++) { TStr ChStr = DocStr.GetSubStr(ChC,ChC); int ChId = GetWId(ChStr); if (ChId != -1) { WordToIdH[ChId]++; } else { ChId = WordToIdH.AddKey(ChStr); WordToIdH[ChId] = 1; } ChIdV.Add(ChId); } }
////////////////////////////////////// // File-Download-Function void TSASFunFile::LoadFunFileV(const TStr& FPath, TSAppSrvFunV& SrvFunV) { TFFile File(FPath, "", false); TStr FNm; while (File.Next(FNm)) { TStr FExt = FNm.GetFExt(); TStr FUrl = FNm.GetSubStr(FPath.Len()); FUrl.ChangeChAll('\\', '/'); printf("%s %s %s\n", FNm.CStr(), FExt.CStr(), FUrl.CStr()); if (FExt == ".htm") { SrvFunV.Add(TSASFunFile::New(FUrl, FNm, THttp::TextHtmlFldVal)); } else if (FExt == ".html") { SrvFunV.Add(TSASFunFile::New(FUrl, FNm, THttp::TextHtmlFldVal)); } else if (FExt == ".js") { SrvFunV.Add(TSASFunFile::New(FUrl, FNm, THttp::TextJavaScriptFldVal)); } else if (FExt == ".css") { SrvFunV.Add(TSASFunFile::New(FUrl, FNm, THttp::TextCssFldVal)); } else if (FExt == ".jpg") { SrvFunV.Add(TSASFunFile::New(FUrl, FNm, THttp::ImageJpgFldVal)); } else if (FExt == ".jpeg") { SrvFunV.Add(TSASFunFile::New(FUrl, FNm, THttp::ImageJpgFldVal)); } else if (FExt == ".gif") { SrvFunV.Add(TSASFunFile::New(FUrl, FNm, THttp::ImageGifFldVal)); } else { printf("Unknown MIME type for extension '%s' for file '%s'", FExt.CStr(), FNm.CStr()); SrvFunV.Add(TSASFunFile::New(FUrl, FNm, THttp::AppOctetFldVal)); } } }
// Returns the part of the matching command-line argument that follows PrefixStr.
// The argument is located with GetPrefixArgN; it is an assertion failure when
// no argument matches. An argument with nothing after the prefix yields the
// empty string.
TStr TEnv::GetArgPostfix(const TStr& PrefixStr) const {
  const int ArgN = GetPrefixArgN(PrefixStr);
  IAssert(ArgN != -1);
  TStr ArgStr = GetArg(ArgN);
  // nothing after the prefix
  if (ArgStr.Len() <= PrefixStr.Len()) { return TStr(); }
  // the one-argument GetSubStr stops at the last character, so no index past
  // the end of the string is passed
  return ArgStr.GetSubStr(PrefixStr.Len());
}
// load from allactors.zip that was prepared by Brad Malin in 2005 PImdbNet TImdbNet::LoadTxt(const TStr& ActorFNm) { PImdbNet Net = TImdbNet::New(); TStrV ColV; char line [2024]; int NLines=0, DupEdge=0, Year, Position, ActorNId, MovieNId; TIntH ActorNIdH; THash<TIntPr, TInt> MovieNIdH; FILE *F = fopen(ActorFNm.CStr(), "rt"); fgets(line, 2024, F); while (! feof(F)) { memset(line, 0, 2024); fgets(line, 2024, F); if (strlen(line) == 0) break; TStr(line).SplitOnAllCh('|', ColV, false); IAssert(ColV.Len() == 7); const int NameStrId = Net->AddStr(ColV[0].GetTrunc().GetLc()+" "+ColV[1].GetTrunc().GetLc()); const int MovieStrId = Net->AddStr(ColV[2].GetTrunc().GetLc()); TStr YearStr = ColV[3].GetTrunc(); if (YearStr.Len() > 4) YearStr = YearStr.GetSubStr(0, 3); Year = 1; YearStr.IsInt(Year); const TMovieTy MovieTy = TImdbNet::GetMovieTy(ColV[4]); Position = TInt::Mx; ColV[5].GetTrunc().IsInt(Position); IAssert(ColV[6].GetTrunc()[0] == 'M' || ColV[6].GetTrunc()[0]=='F'); const bool IsMale = ColV[6].GetTrunc()[0] == 'M'; // create nodes if (ActorNIdH.IsKey(NameStrId)) { ActorNId = ActorNIdH.GetDat(NameStrId); } else { ActorNId = Net->AddNode(-1, TImdbNode(NameStrId, Year, Position, IsMale)); ActorNIdH.AddDat(NameStrId, ActorNId); } if (MovieNIdH.IsKey(TIntPr(MovieStrId, Year))) { MovieNId = MovieNIdH.GetDat(TIntPr(MovieStrId, Year)); } else { MovieNId = Net->AddNode(-1, TImdbNode(NameStrId, Year, MovieTy)); MovieNIdH.AddDat(TIntPr(MovieStrId, Year), MovieNId); } if (! 
Net->IsEdge(ActorNId, MovieNId)) { Net->AddEdge(ActorNId, MovieNId); } else { DupEdge++; } if (++NLines % 100000 == 0) printf("\r %dk ", NLines/1000); } fclose(F); printf("duplicate edges: %d\n", DupEdge); printf("nodes: %d\n", Net->GetNodes()); printf("edges: %d\n", Net->GetEdges()); printf("actors: %d\n", ActorNIdH.Len()); printf("movies: %d\n", MovieNIdH.Len()); // set the actor year to the year of his first movie int NUpdates=0; for (TNet::TNodeI NI = Net->BegNI(); NI < Net->EndNI(); NI++) { if (NI().IsActor()) { int MinYear = NI().GetYear(); for (int e = 0; e < NI.GetOutDeg(); e++) { const TImdbNode& NodeDat = Net->GetNDat(NI.GetOutNId(e)); if (NodeDat.IsMovie()) MinYear = TMath::Mn(MinYear, NodeDat.GetYear()); } if (NI().Year != MinYear) NUpdates++; NI().Year = MinYear; } } printf("updated actor times: %d\n", NUpdates); return Net; }
void BigMain(int argc, char* argv[]) { TExeTm ExeTm; Env = TEnv(argc, argv, TNotify::StdNotify); Env.PrepArgs("QuotesApp"); const TStr ToDo = Env.GetIfArgPrefixStr("-do:", "", "To do").GetLc(); if (Env.IsEndOfRun()) { printf("To do:\n"); printf(" MkDataset : Make memes dataset (extract quotes and save txt)\n"); printf(" ExtractSubset : Extract a subset of memes containing particular words\n"); printf(" MemesToQtBs : Load memes dataset and create quote base\n"); printf(" MkClustNet : Build cluster network from the quote base\n"); return; } #pragma region mkdataset // extract quotes and links and make them into a single file if (ToDo == "mkdataset") { const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "files.txt", "Spinn3r input files (one file per line)"); const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "Spinn3r-dataset.txt", "Output file"); const int MinQtWrdLen = Env.GetIfArgPrefixInt("-w:", 3, "Minimum quote word length"); const TStr UrlFNm = Env.GetIfArgPrefixStr("-u:", "", "Seen url set (THashSet<TMd5Sig>) file name"); const bool UrlOnlyOnce = Env.GetIfArgPrefixBool("-q:", true, "Only keep unique Urls"); //// parse directly from Spinn3r TStr Spinn3rFNm; THashSet<TMd5Sig> SeenUrlSet; if (UrlOnlyOnce && ! 
UrlFNm.Empty()) { // keep track of already seen urls (so that there are no duplicate urls) TFIn FIn(UrlFNm); SeenUrlSet.Load(FIn); } FILE *F = fopen(OutFNm.CStr(), "wt"); TFIn FIn(InFNm); int Items=0; for (int f=0; FIn.GetNextLn(Spinn3rFNm); f++) { TQuoteExtractor QE(Spinn3rFNm.ToTrunc()); printf("Processing %02d: %s [%s]\n", f+1, Spinn3rFNm.CStr(), TExeTm::GetCurTm()); fflush(stdout); for (int item = 0; QE.Next(); item++) { const TMd5Sig PostMd5(QE.PostUrlStr); if (QE.QuoteV.Empty() && QE.LinkV.Empty()) { continue; } // no quotes, no links if (UrlOnlyOnce) { if (SeenUrlSet.IsKey(PostMd5)) { continue; } SeenUrlSet.AddKey(PostMd5); } fprintf(F, "P\t%s\n", QE.PostUrlStr.CStr()); //if (QE.PubTm > TSecTm(2008,8,30) || QE.PubTm < TSecTm(2008,7,25)) { printf("%s\n", QE.PubTm.GetStr().CStr()); } fprintf(F, "T\t%s\n", QE.PubTm.GetYmdTmStr().CStr()); for (int q = 0; q < QE.QuoteV.Len(); q++) { if (TStrUtil::CountWords(QE.QuoteV[q]) >= MinQtWrdLen) { fprintf(F, "Q\t%s\n", QE.QuoteV[q].CStr()); } } for (int l = 0; l < QE.LinkV.Len(); l++) { fprintf(F, "L\t%s\n", QE.LinkV[l].CStr()); } fprintf(F, "\n"); if (item>0 && item % Kilo(100) == 0) { QE.DumpStat(); QE.ExeTm.Tick(); } Items++; } printf("file done. Total %d all posts, %d all items\n", SeenUrlSet.Len(), Items); fflush(stdout); } printf("all done. Saving %d post urls\n", SeenUrlSet.Len()); fflush(stdout); if (! 
SeenUrlSet.Empty()) { TFOut FOut(OutFNm.GetFMid()+".SeenUrlSet"); SeenUrlSet.Save(FOut); } fclose(F); } #pragma endregion mkdataset #pragma region extractsubset // save posts with memes containing particular words else if (ToDo == "extractsubset") { const TStr InFNmWc = Env.GetIfArgPrefixStr("-i:", "memes_*.rar", "Input file prefix"); const bool IsInFNmWc = Env.GetIfArgPrefixBool("-w:", true, "Input is wildcard (else a file with list of input files)"); const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "memes-subset.txt", "Output memes file"); const TStr WordsFNm = Env.GetIfArgPrefixStr("-p:", "phrases-in.txt", "Phrases that memes have to contain"); TChAV CatchMemeV;// = TStr::GetV("great depression", "economic meltdown", "recession had bottomed out", "green shoots", "slow recovery", "gradual recovery"); printf("Loading %s\n", WordsFNm.CStr()); { TFIn FIn(WordsFNm); for (TStr Ln; FIn.GetNextLn(Ln); ) { printf(" %s\n", Ln.GetLc().CStr()); CatchMemeV.Add(Ln.GetLc()); } } printf("%d strings loaded\n", CatchMemeV.Len()); TFOut FOut(OutFNm); TMemesDataLoader Memes(InFNmWc, IsInFNmWc); for (int posts = 0, nsave=0; Memes.LoadNext(); posts++) { bool DoSave = false; for (int m = 0; m < Memes.MemeV.Len(); m++) { for (int i = 0; i < CatchMemeV.Len(); i++) { if (Memes.MemeV[m].SearchStr(CatchMemeV[i]) != -1) { DoSave=true; break; } } if (DoSave) { break; } } if (DoSave) { Memes.SaveTxt(FOut); nsave++; } if (posts % Mega(1) == 0) { printf("%dm posts, %d saved\n", posts/Mega(1), nsave); FOut.Flush(); } } } #pragma endregion extractsubset #pragma region memestoqtbs // load memes dataset (MkDataset) and create quote base else if (ToDo == "memestoqtbs") { const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "201007_201107.txt", "Input Memes dataset files"); const TStr MediaUrlFNm = Env.GetIfArgPrefixStr("-u:", "news_media.txt", "Fule with news media urls"); const TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output file name prefix"); const int MinWrdLen = Env.GetIfArgPrefixInt("-l:", 
4, "Min quote word length"); const int MinMemeFq = Env.GetIfArgPrefixInt("-f:", 5, "Min meme frequency"); const TStr MinTmStr = Env.GetIfArgPrefixStr("-mint:", "20100714", "Min time of quotes, format = YYYYMMDD"); const TStr MaxTmStr = Env.GetIfArgPrefixStr("-maxt:", "20110728", "Max time of quotes, format = YYYYMMDD"); TSecTm MinTm(atoi(MinTmStr.GetSubStr(0,3).CStr()),atoi(MinTmStr.GetSubStr(4,5).CStr()),atoi(MinTmStr.GetSubStr(6,7).CStr())); TSecTm MaxTm(atoi(MaxTmStr.GetSubStr(0,3).CStr()),atoi(MaxTmStr.GetSubStr(4,5).CStr()),atoi(MaxTmStr.GetSubStr(6,7).CStr())); PQuoteBs QtBs = TQuoteBs::New(); int HashTableSize = 100; // 100 for each quarter, for one year data, use 400 int UrlSetSize = 4 * HashTableSize; QtBs->ConstructQtBsZarya(InFNm, Pref, MediaUrlFNm, MinTm, MaxTm, MinWrdLen, MinMemeFq, HashTableSize, UrlSetSize); } #pragma endregion memestoqtbs #pragma region mkclustnet // make cluster network else if (ToDo == "mkclustnet") { TStr InQtBsNm = Env.GetIfArgPrefixStr("-i:", "", "Input quote base file name"); TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output network/updated QtBs filename"); TStr BlackListFNm = Env.GetIfArgPrefixStr("-b:", "quote_blacklist.txt", "Blacklist file name"); bool IsShglReady = Env.GetIfArgPrefixBool("-shglready:", false, "Indicate whether shingle hash table is ready"); bool IsNetReady = Env.GetIfArgPrefixBool("-netready:", false, "Indicate whether cluster net is ready"); double BktThresh = Env.GetIfArgPrefixFlt("-bktthresh:", 0.4, "Threshold for bad shingle bucket elimination"); double MxTmDelay = Env.GetIfArgPrefixFlt("-delaythresh:", 5, "Max time delay between two quotes in the same cluster"); double MxTmDev = Env.GetIfArgPrefixFlt("-devthresh:", 3, "Max time deviation for a quote to be specific rather than general"); double RefineThresh = Env.GetIfArgPrefixFlt("-refinethresh:", 0.2, "Threshold for merging quote cluster in refining process"); const int MinWrdLen = Env.GetIfArgPrefixInt("-minl:", 4, "Min quote word length"); 
const int MaxWrdLen = Env.GetIfArgPrefixInt("-maxl:", 200, "Max quote word length"); const int MinMemeFq = Env.GetIfArgPrefixInt("-mf:", 5, "Min meme frequency"); const int MinClustFq = Env.GetIfArgPrefixInt("-cf:", 50, "Min quote cluster frequency"); // Load quote base PQuoteBs QtBs; if (TZipIn::IsZipFNm(InQtBsNm)) { TZipIn ZipIn(InQtBsNm); QtBs = TQuoteBs::Load(ZipIn); } else { TFIn FIn(InQtBsNm); QtBs = TQuoteBs::Load(FIn); } // Cluster the quotes QtBs->ClusterQuotes(MinMemeFq, MinWrdLen, MaxWrdLen, BlackListFNm, Pref, IsShglReady, IsNetReady, BktThresh, MxTmDelay, MxTmDev, RefineThresh); // Dump the clusters bool SkipUrl = true, FlashDisp = true; QtBs->DumpQuoteClusters(MinWrdLen, MinClustFq, SkipUrl, FlashDisp, Pref); } #pragma endregion mkclustnet #pragma region memeclust else if (ToDo.SearchStr(TStr("memeclust")) >= 0) { const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "201101.txt", "Input Memes dataset files"); const TStr MediaUrlFNm = Env.GetIfArgPrefixStr("-u:", "news_media.txt", "Fule with news media urls"); TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output file name prefix"); const int MinWrdLen = Env.GetIfArgPrefixInt("-minl:", 4, "Min quote word length"); const int MaxWrdLen = Env.GetIfArgPrefixInt("-maxl:", 200, "Max quote word length"); const int MinMemeFq = Env.GetIfArgPrefixInt("-f:", 5, "Min meme frequency"); const int MinClustFq = Env.GetIfArgPrefixInt("-cf:", 50, "Min quote cluster frequency"); TStr BlackListFNm = Env.GetIfArgPrefixStr("-b:", "quote_blacklist.txt", "Blacklist file name"); bool IsQtBsReady = Env.GetIfArgPrefixBool("-qtbsready:", false, "Indicate whether quote base is ready and can be loaded readily"); bool IsShglReady = Env.GetIfArgPrefixBool("-shglready:", false, "Indicate whether shingle hash table is ready"); bool IsNetReady = Env.GetIfArgPrefixBool("-netready:", false, "Indicate whether cluster net is ready"); double BktThresh = Env.GetIfArgPrefixFlt("-bktthresh:", 0.4, "Threshold for bad shingle bucket elimination"); 
double MxTmDelay = Env.GetIfArgPrefixFlt("-delaythresh:", 5, "Max time delay between two quotes in the same cluster"); double MxTmDev = Env.GetIfArgPrefixFlt("-devthresh:", 3, "Max time deviation for a quote to be specific rather than general"); double RefineThresh = Env.GetIfArgPrefixFlt("-refinethresh:", 0.2, "Threshold for merging quote cluster in refining process"); const TStr MinTmStr = Env.GetIfArgPrefixStr("-mint:", "20010101", "Min time of quotes, format = YYYYMMDD"); const TStr MaxTmStr = Env.GetIfArgPrefixStr("-maxt:", "20200101", "Max time of quotes, format = YYYYMMDD"); TSecTm MinTm(atoi(MinTmStr.GetSubStr(0,3).CStr()),atoi(MinTmStr.GetSubStr(4,5).CStr()),atoi(MinTmStr.GetSubStr(6,7).CStr())); TSecTm MaxTm(atoi(MaxTmStr.GetSubStr(0,3).CStr()),atoi(MaxTmStr.GetSubStr(4,5).CStr()),atoi(MaxTmStr.GetSubStr(6,7).CStr())); // Construct the quote base from Zarya data PQuoteBs QtBs = TQuoteBs::New(); if (!IsQtBsReady) { int HashTableSize = 100; // 100 for each quarter, for one year data, use 400 if (ToDo == "memeclustzarya") { int UrlSetSize = 4 * HashTableSize; QtBs->ConstructQtBsZarya(InFNm, Pref, MediaUrlFNm, MinTm, MaxTm, MinWrdLen, MinMemeFq, HashTableSize, UrlSetSize); } else if (ToDo == "memeclustqtonly") { QtBs->ConstructQtBsQtOnly(InFNm, Pref, MediaUrlFNm, MinWrdLen, MinMemeFq, HashTableSize); } else if (ToDo == "memeclustqttime") { QtBs->ConstructQtBsQtTime(InFNm, Pref, MediaUrlFNm, MinWrdLen, MinMemeFq, HashTableSize); } else { printf("Please specify one of the three options for -do : memeclustzarya, memeclustqtonly, memeclustqttime!\n"); return; } } else { TStr InQtBsNm = TStr::Fmt("%s-w%dmfq%d.QtBs", Pref.CStr(), MinWrdLen, MinMemeFq); if (TZipIn::IsZipFNm(InQtBsNm)) { TZipIn ZipIn(InQtBsNm); QtBs = TQuoteBs::Load(ZipIn); } else { TFIn FIn(InQtBsNm); QtBs = TQuoteBs::Load(FIn); } } // Cluster the quotes QtBs->ClusterQuotes(MinMemeFq, MinWrdLen, MaxWrdLen, BlackListFNm, Pref, IsShglReady, IsNetReady, BktThresh, MxTmDelay, MxTmDev, RefineThresh); // 
Dump the clusters bool SkipUrl = true, FlashDisp = true; QtBs->DumpQuoteClusters(MinWrdLen, MinClustFq, SkipUrl, FlashDisp, Pref); } #pragma endregion memeclust }