// Builds an aligned corpus pair from an AC XML file. The language pair is
// taken from the file's base name, which must have the form "<x>-<lang1>-<lang2>".
// When MxSents != -1, a random subset of at most MxSents sentences is kept.
PAlignPair TAlignPair::LoadAcXml(const TStr& FNm, const int& MxSents) {
  printf("Loading %s ...\n", FNm.CStr());
  // language names are encoded in the base file name
  TStr MidNm = FNm.GetFMid();
  TStrV MidPartV; MidNm.SplitOnAllCh('-', MidPartV);
  IAssertR(MidPartV.Len() == 3, "Bad file name: " + MidNm);
  // prepare an empty alignment pair for the two languages
  PAlignPair Pair = TAlignPair::New(MidPartV[1], MidPartV[2]);
  // parse the XML; load extra sentences so a random subset can be drawn below
  PTransCorpus Corpus = TTransCorpus::LoadAC(FNm, MxSents * 4);
  // enumerate all sentence ids
  const int AllSents = Corpus->GetSentences();
  TIntV SentIdV(AllSents, 0);
  for (int SentId = 0; SentId < AllSents; SentId++) { SentIdV.Add(SentId); }
  // keep a random subset when a limit is given (fixed seed => reproducible)
  if (MxSents != -1 && AllSents > MxSents) {
    TRnd Rnd(1);
    SentIdV.Shuffle(Rnd);
    SentIdV.Trunc(MxSents);
  }
  // feed the selected sentence pairs into the alignment
  for (int SentIdN = 0; SentIdN < SentIdV.Len(); SentIdN++) {
    const int SentId = SentIdV[SentIdN];
    const TStr& OrgSent = Corpus->GetOrgStr(SentId);
    const TStr& TransSent = Corpus->GetRefTransStrV(SentId)[0];
    Pair->AddSent(OrgSent, TransSent);
  }
  // finalize the alignment pair and hand it back
  Pair->Def();
  return Pair;
}
void DoLayout(const TStr& GraphInFNm, TStr OutFNm, const TGVizLayout& Layout) { TStr LayoutExe = GetLayoutStr(Layout), Ext = OutFNm.GetFExt(), GvPath; #if defined(GLib_WIN) GvPath = "C:\\Prog\\GraphViz\\bin\\"; #else GvPath = "/usr/bin/"; Ext = ".ps"; OutFNm = OutFNm.GetFMid() + Ext; #endif IAssert(Ext==".ps" || Ext==".gif" || Ext==".png"); const TStr ExeCmd = TStr::Fmt("%s -T%s %s -o %s", LayoutExe.CStr(), Ext.CStr()+1, GraphInFNm.CStr(), OutFNm.CStr()); if (system(ExeCmd.CStr())==0) { return; } #if defined(GLib_WIN) if (system(TStr::Fmt(".\\%s", ExeCmd.CStr()).CStr())==0) { return; } #else if (system(TStr::Fmt("./%s", ExeCmd.CStr()).CStr())==0) { return; } #endif if (system(TStr::Fmt("%s%s", GvPath.CStr(), ExeCmd.CStr()).CStr())==0) { return; } fprintf(stderr, "[%s:%d] Cat not find GraphViz (%s). Set the PATH.\n", __FILE__, __LINE__, ExeCmd.CStr()); //#if defined(GLib_WIN) //if (ShowPlot) system(TStr::Fmt("start %s", OutFNm.CStr()).CStr()); //#endif }
// parse from script filename, assuming default structure TJsParam(const TStr& RootFNm, const TStr& _FNm) { // remember script name FNm = _FNm; // derive namespace from filename Nm = FNm.GetFMid(); // no initialization parameters InitVal = TJsonVal::NewObj(); // handle default includes AddLocalLibFPath(); AddQMinerLibFPath(); // add sandbox access AddSandboxAccessFPath(RootFNm); }
void TGnuPlot::SavePng(const TStr& FNm, const int& SizeX, const int& SizeY, const TStr& Comment, const TStr& Terminal) { if (Terminal.Empty()) { //#ifdef GLib_WIN #ifndef GLib_MACOSX // The standard GNUPlot for MacOS does not support PNG AddCmd(TStr::Fmt("set terminal png small size %d,%d", SizeX, SizeY)); AddCmd(TStr::Fmt("set output '%s'", FNm.CStr())); #else // EPS AddCmd("set terminal postscript eps 10 enhanced color"); AddCmd(TStr::Fmt("set output '%s.eps'", FNm.GetFMid().CStr())); #endif } else { AddCmd(Terminal); AddCmd(TStr::Fmt("set output '%s'", FNm.CStr())); } Pause(false); CreatePlotFile(Comment.Empty()? Title : Comment); RunGnuPlot(); MoreCmds.DelLast(); MoreCmds.DelLast(); }
void TTb::SaveAssis(const TStr& FNm){ TStr DoFNm=FNm.GetFPath()+"AsDo"+FNm.GetFMid().GetSubStr(0, 3)+".Dat"; TStr DaFNm=FNm.GetFPath()+"AsDa"+FNm.GetFMid().GetSubStr(0, 3)+".Dat"; TOLx DoLx(PSOut(new TFOut(DoFNm)), TFSet()|oloFrcEoln|oloSigNum|oloUniStr); int Dscs=GetVar(0)->GetVarType()->GetDscs(); Assert(Dscs>0); DoLx.PutInt(Dscs); DoLx.PutDosLn(); for (int DscN=0; DscN<Dscs; DscN++){ TTbVal Val=GetVar(0)->GetVarType()->GetVal(DscN); DoLx.PutStr(GetVar(0)->GetVarType()->GetValStr(Val)); DoLx.PutDosLn(); } DoLx.PutInt(TInt(GetVars()-1)); DoLx.PutDosLn(); for (int VarN=1; VarN<GetVars(); VarN++){ DoLx.PutStr(GetVar(VarN)->GetNm()); DoLx.PutDosLn(); int Dscs=GetVar(VarN)->GetVarType()->GetDscs(); if (Dscs>0){ DoLx.PutInt(Dscs); DoLx.PutDosLn(); for (int DscN=0; DscN<Dscs; DscN++){ TTbVal Val=GetVar(VarN)->GetVarType()->GetVal(DscN); DoLx.PutStr(GetVar(VarN)->GetVarType()->GetValStr(DscN)); DoLx.PutDosLn();} } else { DoLx.PutInt(TInt(0)); DoLx.PutInt(TInt(100)); DoLx.PutDosLn(); } } TOLx DaLx(PSOut(new TFOut(DaFNm)), TFSet()|oloFrcEoln|oloSigNum|oloUniStr); for (int TupN=0; TupN<GetTups(); TupN++){ for (int VarN=0; VarN<GetVars(); VarN++){ TTbVal Val=GetVal(TupN, VarN); switch (Val.GetValTag()){ case tvtUnknw: DaLx.PutSym(syQuestion); break; case tvtUnimp: DaLx.PutSym(syAsterisk); break; case tvtUnapp: DaLx.PutSym(syAsterisk); break; case tvtDsc: DaLx.PutInt(TInt(1+Val.GetDsc())); break; case tvtFlt: DaLx.PutFlt(Val.GetFlt()); break; default: Fail; } } DaLx.PutDosLn(); } }
// parse from json configuration file TJsParam(const TStr& RootFNm, const PJsonVal& JsVal) { EAssertR(JsVal->IsObj(), "Unsupported type: " + TJsonVal::GetStrFromVal(JsVal)); // we must have at least the script name EAssert(JsVal->IsObjKey("file")); // get script name FNm = JsVal->GetObjStr("file"); // get namespace (get from script name if not available) Nm = JsVal->IsObjKey("name") ? JsVal->GetObjStr("name") : FNm.GetFMid(); // get initialization parameters (if available) InitVal = JsVal->IsObjKey("init") ? JsVal->GetObjKey("init") : TJsonVal::NewObj(); // get library include folders if (JsVal->IsObjKey("include")) { PJsonVal IncludesVal = JsVal->GetObjKey("include"); EAssertR(IncludesVal->IsArr(), "Expected array of strings, not: " + TJsonVal::GetStrFromVal(IncludesVal)); for (int IncludeN = 0; IncludeN < IncludesVal->GetArrVals(); IncludeN++) { PJsonVal IncludeVal = IncludesVal->GetArrVal(IncludeN); EAssertR(IncludeVal->IsStr(), "Expected string, not: " + TJsonVal::GetStrFromVal(IncludeVal)); IncludeFPathV.Add(IncludeVal->GetStr()); } } // handle default includes AddLocalLibFPath(); AddQMinerLibFPath(); // get folders with access permissions if (JsVal->IsObjKey("dirs")) { PJsonVal DirsVal = JsVal->GetObjKey("dirs"); EAssertR(DirsVal->IsArr(), "Expected array of strings, not: " + TJsonVal::GetStrFromVal(DirsVal)); for (int DirN = 0; DirN < DirsVal->GetArrVals(); DirN++) { PJsonVal DirVal = DirsVal->GetArrVal(DirN); EAssertR(DirVal->IsStr(), "Expected string, not: " + TJsonVal::GetStrFromVal(DirVal)); AccessFPathV.Add(DirVal->GetStr()); } } // add sandbox access AddSandboxAccessFPath(RootFNm); }
// Returns the shell command used to compress into ZipFNm. The command is
// looked up by the (lower-cased) file extension in FExtToCmdH; unknown
// extensions trigger an assertion.
TStr TZipOut::GetCmd(const TStr& ZipFNm) {
  if (FExtToCmdH.Empty()) { FillFExtToCmdH(); }  // lazy one-time table setup
  const TStr FExt = ZipFNm.GetFExt().GetLc();
  EAssertR(FExtToCmdH.IsKey(FExt), TStr::Fmt("Unsupported file extension '%s'", FExt.CStr()));
  return FExtToCmdH.GetDat(FExt) + ZipFNm.GetFMid();
}
int main(int argc, char* argv[]) { Env = TEnv(argc, argv, TNotify::StdNotify); Env.PrepArgs( TStr::Fmt("Kronecker graphs. build: %s, %s. Time: %s", __TIME__, __DATE__, TExeTm::GetCurTm())); TExeTm ExeTm; Try Env = TEnv(argc, argv, TNotify::StdNotify); const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "../as20graph.txt", "Input graph file (single directed edge per line)"); TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "", "Output file prefix"); const TInt NZero = Env.GetIfArgPrefixInt("-n0:", 2, "Innitiator matrix size"); const TStr InitMtx = Env.GetIfArgPrefixStr("-m:", "0.9 0.7; 0.5 0.2", "Init Gradient Descent Matrix (R=random)").GetLc(); const TStr Perm = Env.GetIfArgPrefixStr("-p:", "d", "Initial node permutation: d:Degree, r:Random, o:Order").GetLc(); const TInt GradIter = Env.GetIfArgPrefixInt("-gi:", 50, "Gradient descent iterations"); const TFlt LrnRate = Env.GetIfArgPrefixFlt("-l:", 1e-5, "Learning rate"); const TFlt MnStep = Env.GetIfArgPrefixFlt("-mns:", 0.005, "Minimum gradient step"); const TFlt MxStep = Env.GetIfArgPrefixFlt("-mxs:", 0.05, "Maximum gradient step"); const TInt WarmUp = Env.GetIfArgPrefixInt("-w:", 10000, "Samples to warm up"); const TInt NSamples = Env.GetIfArgPrefixInt("-s:", 100000, "Samples per gradient estimation"); //const TInt GradType = Env.GetIfArgPrefixInt("-gt:", 1, "1:Grad1, 2:Grad2"); const bool ScaleInitMtx = Env.GetIfArgPrefixBool("-sim:", true, "Scale the initiator to match the number of edges"); const TFlt PermSwapNodeProb = Env.GetIfArgPrefixFlt("-nsp:", 1.0, "Probability of using NodeSwap (vs. 
EdgeSwap) MCMC proposal distribution"); if (OutFNm.Empty()) { OutFNm = TStr::Fmt("%s-fit%d", InFNm.GetFMid().CStr(), NZero()); } // load graph PNGraph G; if (InFNm.GetFExt().GetLc() == ".ungraph") { TFIn FIn(InFNm); G = TSnap::ConvertGraph<PNGraph>(TUNGraph::Load(FIn), true); } else if (InFNm.GetFExt().GetLc() == ".ngraph") { TFIn FIn(InFNm); G = TNGraph::Load(FIn); } else { G = TSnap::LoadEdgeList<PNGraph>(InFNm, 0, 1); } // fit TKronMtx InitKronMtx = InitMtx == "r" ? TKronMtx::GetRndMtx(NZero, 0.1) : TKronMtx::GetMtx(InitMtx); InitKronMtx.Dump("INIT PARAM", true); TKroneckerLL KronLL(G, InitKronMtx, PermSwapNodeProb); if (ScaleInitMtx) { InitKronMtx.SetForEdges(G->GetNodes(), G->GetEdges()); } KronLL.InitLL(G, InitKronMtx); InitKronMtx.Dump("SCALED PARAM", true); KronLL.SetPerm(Perm.GetCh(0)); double LogLike = 0; //if (GradType == 1) { LogLike = KronLL.GradDescent(GradIter, LrnRate, MnStep, MxStep, WarmUp, NSamples); //} else if (GradType == 2) { // LogLike = KronLL.GradDescent2(GradIter, LrnRate, MnStep, MxStep, WarmUp, NSamples); } //else{ Fail; } const TKronMtx& FitMtx = KronLL.GetProbMtx(); FILE *F = fopen(OutFNm.CStr(), "w"); fprintf(F, "Input\t%s\n", InFNm.CStr()); TStrV ParamV; Env.GetCmLn().SplitOnAllCh(' ', ParamV); fprintf(F, "Command line options\n"); for (int i = 0; i < ParamV.Len(); i++) { fprintf(F, "\t%s\n", ParamV[i].CStr() + (ParamV[i][0] == '-' ? 1 : 0)); } fprintf(F, "Loglikelihood\t%10.2f\n", LogLike); fprintf(F, "Absolute error (based on expected number of edges)\t%f\n", KronLL.GetAbsErr()); fprintf(F, "RunTime\t%g\n", ExeTm.GetSecs()); fprintf(F, "Estimated initiator\t%s\n", FitMtx.GetMtxStr().CStr()); fclose(F); Catch printf("\nrun time: %s (%s)\n", ExeTm.GetTmStr(), TSecTm::GetCurTm().GetTmStr().CStr()); return 0; }
// width=0.3, height=0.3, label=\"\", style=filled, color=black void TGraphKey::PlotGViz(const TStr& OutFNm, const TStr& Desc, const TStr& NodeAttrs, const int& Size) const { const TStr DotFNm = OutFNm.GetFMid()+".dot"; SaveGViz(DotFNm, Desc, NodeAttrs, Size); TGraphViz::DoLayout(DotFNm, OutFNm, gvlDot); }
// Command-line driver for the quotes/memes pipeline. The "-do:" option selects
// one of several processing modes (mkdataset, extractsubset, memestoqtbs,
// mkclustnet, memeclust*); each mode parses its own options and runs one stage.
void BigMain(int argc, char* argv[]) {
  TExeTm ExeTm;
  Env = TEnv(argc, argv, TNotify::StdNotify);
  Env.PrepArgs("QuotesApp");
  const TStr ToDo = Env.GetIfArgPrefixStr("-do:", "", "To do").GetLc();
  // with no arguments, print the available modes and exit
  if (Env.IsEndOfRun()) {
    printf("To do:\n");
    printf(" MkDataset : Make memes dataset (extract quotes and save txt)\n");
    printf(" ExtractSubset : Extract a subset of memes containing particular words\n");
    printf(" MemesToQtBs : Load memes dataset and create quote base\n");
    printf(" MkClustNet : Build cluster network from the quote base\n");
    return;
  }
#pragma region mkdataset
  // extract quotes and links and make them into a single file
  if (ToDo == "mkdataset") {
    const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "files.txt", "Spinn3r input files (one file per line)");
    const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "Spinn3r-dataset.txt", "Output file");
    const int MinQtWrdLen = Env.GetIfArgPrefixInt("-w:", 3, "Minimum quote word length");
    const TStr UrlFNm = Env.GetIfArgPrefixStr("-u:", "", "Seen url set (THashSet<TMd5Sig>) file name");
    const bool UrlOnlyOnce = Env.GetIfArgPrefixBool("-q:", true, "Only keep unique Urls");
    //// parse directly from Spinn3r
    TStr Spinn3rFNm;
    THashSet<TMd5Sig> SeenUrlSet;
    if (UrlOnlyOnce && ! UrlFNm.Empty()) {
      // keep track of already seen urls (so that there are no duplicate urls)
      TFIn FIn(UrlFNm);
      SeenUrlSet.Load(FIn);
    }
    FILE *F = fopen(OutFNm.CStr(), "wt");
    // InFNm lists one Spinn3r file per line; process each in turn
    TFIn FIn(InFNm);
    int Items=0;
    for (int f=0; FIn.GetNextLn(Spinn3rFNm); f++) {
      TQuoteExtractor QE(Spinn3rFNm.ToTrunc());
      printf("Processing %02d: %s [%s]\n", f+1, Spinn3rFNm.CStr(), TExeTm::GetCurTm());
      fflush(stdout);
      for (int item = 0; QE.Next(); item++) {
        // posts are deduplicated by the MD5 of their url
        const TMd5Sig PostMd5(QE.PostUrlStr);
        if (QE.QuoteV.Empty() && QE.LinkV.Empty()) { continue; } // no quotes, no links
        if (UrlOnlyOnce) {
          if (SeenUrlSet.IsKey(PostMd5)) { continue; }
          SeenUrlSet.AddKey(PostMd5);
        }
        // record format: P=post url, T=publish time, Q=quote, L=link, blank line ends a post
        fprintf(F, "P\t%s\n", QE.PostUrlStr.CStr());
        //if (QE.PubTm > TSecTm(2008,8,30) || QE.PubTm < TSecTm(2008,7,25)) { printf("%s\n", QE.PubTm.GetStr().CStr()); }
        fprintf(F, "T\t%s\n", QE.PubTm.GetYmdTmStr().CStr());
        for (int q = 0; q < QE.QuoteV.Len(); q++) {
          // keep only quotes with at least MinQtWrdLen words
          if (TStrUtil::CountWords(QE.QuoteV[q]) >= MinQtWrdLen) {
            fprintf(F, "Q\t%s\n", QE.QuoteV[q].CStr());
          }
        }
        for (int l = 0; l < QE.LinkV.Len(); l++) {
          fprintf(F, "L\t%s\n", QE.LinkV[l].CStr());
        }
        fprintf(F, "\n");
        // periodic progress statistics
        if (item>0 && item % Kilo(100) == 0) { QE.DumpStat(); QE.ExeTm.Tick(); }
        Items++;
      }
      printf("file done. Total %d all posts, %d all items\n", SeenUrlSet.Len(), Items);
      fflush(stdout);
    }
    printf("all done. Saving %d post urls\n", SeenUrlSet.Len());
    fflush(stdout);
    // persist the seen-url set so later runs can continue deduplicating
    if (! SeenUrlSet.Empty()) {
      TFOut FOut(OutFNm.GetFMid()+".SeenUrlSet");
      SeenUrlSet.Save(FOut);
    }
    fclose(F);
  }
#pragma endregion mkdataset
#pragma region extractsubset
  // save posts with memes containing particular words
  else if (ToDo == "extractsubset") {
    const TStr InFNmWc = Env.GetIfArgPrefixStr("-i:", "memes_*.rar", "Input file prefix");
    const bool IsInFNmWc = Env.GetIfArgPrefixBool("-w:", true, "Input is wildcard (else a file with list of input files)");
    const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "memes-subset.txt", "Output memes file");
    const TStr WordsFNm = Env.GetIfArgPrefixStr("-p:", "phrases-in.txt", "Phrases that memes have to contain");
    TChAV CatchMemeV;// = TStr::GetV("great depression", "economic meltdown", "recession had bottomed out", "green shoots", "slow recovery", "gradual recovery");
    // load the catch phrases (lower-cased) from WordsFNm
    printf("Loading %s\n", WordsFNm.CStr());
    {
      TFIn FIn(WordsFNm);
      for (TStr Ln; FIn.GetNextLn(Ln); ) {
        printf(" %s\n", Ln.GetLc().CStr());
        CatchMemeV.Add(Ln.GetLc());
      }
    }
    printf("%d strings loaded\n", CatchMemeV.Len());
    TFOut FOut(OutFNm);
    TMemesDataLoader Memes(InFNmWc, IsInFNmWc);
    // keep a post if any of its memes contains any of the catch phrases
    for (int posts = 0, nsave=0; Memes.LoadNext(); posts++) {
      bool DoSave = false;
      for (int m = 0; m < Memes.MemeV.Len(); m++) {
        for (int i = 0; i < CatchMemeV.Len(); i++) {
          if (Memes.MemeV[m].SearchStr(CatchMemeV[i]) != -1) {
            DoSave=true; break;
          }
        }
        if (DoSave) { break; }
      }
      if (DoSave) { Memes.SaveTxt(FOut); nsave++; }
      // progress report every million posts
      if (posts % Mega(1) == 0) {
        printf("%dm posts, %d saved\n", posts/Mega(1), nsave);
        FOut.Flush();
      }
    }
  }
#pragma endregion extractsubset
#pragma region memestoqtbs
  // load memes dataset (MkDataset) and create quote base
  else if (ToDo == "memestoqtbs") {
    const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "201007_201107.txt", "Input Memes dataset files");
    const TStr MediaUrlFNm = Env.GetIfArgPrefixStr("-u:", "news_media.txt", "Fule with news media urls");
    const TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output file name prefix");
    const int MinWrdLen = Env.GetIfArgPrefixInt("-l:", 4, "Min quote word length");
    const int MinMemeFq = Env.GetIfArgPrefixInt("-f:", 5, "Min meme frequency");
    const TStr MinTmStr = Env.GetIfArgPrefixStr("-mint:", "20100714", "Min time of quotes, format = YYYYMMDD");
    const TStr MaxTmStr = Env.GetIfArgPrefixStr("-maxt:", "20110728", "Max time of quotes, format = YYYYMMDD");
    // split YYYYMMDD into year (chars 0-3), month (4-5), day (6-7)
    TSecTm MinTm(atoi(MinTmStr.GetSubStr(0,3).CStr()),atoi(MinTmStr.GetSubStr(4,5).CStr()),atoi(MinTmStr.GetSubStr(6,7).CStr()));
    TSecTm MaxTm(atoi(MaxTmStr.GetSubStr(0,3).CStr()),atoi(MaxTmStr.GetSubStr(4,5).CStr()),atoi(MaxTmStr.GetSubStr(6,7).CStr()));
    PQuoteBs QtBs = TQuoteBs::New();
    int HashTableSize = 100; // 100 for each quarter, for one year data, use 400
    int UrlSetSize = 4 * HashTableSize;
    QtBs->ConstructQtBsZarya(InFNm, Pref, MediaUrlFNm, MinTm, MaxTm, MinWrdLen, MinMemeFq, HashTableSize, UrlSetSize);
  }
#pragma endregion memestoqtbs
#pragma region mkclustnet
  // make cluster network
  else if (ToDo == "mkclustnet") {
    TStr InQtBsNm = Env.GetIfArgPrefixStr("-i:", "", "Input quote base file name");
    TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output network/updated QtBs filename");
    TStr BlackListFNm = Env.GetIfArgPrefixStr("-b:", "quote_blacklist.txt", "Blacklist file name");
    bool IsShglReady = Env.GetIfArgPrefixBool("-shglready:", false, "Indicate whether shingle hash table is ready");
    bool IsNetReady = Env.GetIfArgPrefixBool("-netready:", false, "Indicate whether cluster net is ready");
    double BktThresh = Env.GetIfArgPrefixFlt("-bktthresh:", 0.4, "Threshold for bad shingle bucket elimination");
    double MxTmDelay = Env.GetIfArgPrefixFlt("-delaythresh:", 5, "Max time delay between two quotes in the same cluster");
    double MxTmDev = Env.GetIfArgPrefixFlt("-devthresh:", 3, "Max time deviation for a quote to be specific rather than general");
    double RefineThresh = Env.GetIfArgPrefixFlt("-refinethresh:", 0.2, "Threshold for merging quote cluster in refining process");
    const int MinWrdLen = Env.GetIfArgPrefixInt("-minl:", 4, "Min quote word length");
    const int MaxWrdLen = Env.GetIfArgPrefixInt("-maxl:", 200, "Max quote word length");
    const int MinMemeFq = Env.GetIfArgPrefixInt("-mf:", 5, "Min meme frequency");
    const int MinClustFq = Env.GetIfArgPrefixInt("-cf:", 50, "Min quote cluster frequency");
    // Load quote base (transparently handles zipped files)
    PQuoteBs QtBs;
    if (TZipIn::IsZipFNm(InQtBsNm)) {
      TZipIn ZipIn(InQtBsNm);
      QtBs = TQuoteBs::Load(ZipIn);
    } else {
      TFIn FIn(InQtBsNm);
      QtBs = TQuoteBs::Load(FIn);
    }
    // Cluster the quotes
    QtBs->ClusterQuotes(MinMemeFq, MinWrdLen, MaxWrdLen, BlackListFNm, Pref, IsShglReady, IsNetReady, BktThresh, MxTmDelay, MxTmDev, RefineThresh);
    // Dump the clusters
    bool SkipUrl = true, FlashDisp = true;
    QtBs->DumpQuoteClusters(MinWrdLen, MinClustFq, SkipUrl, FlashDisp, Pref);
  }
#pragma endregion mkclustnet
#pragma region memeclust
  // combined pipeline: build (or load) the quote base, cluster, and dump;
  // matches any of memeclustzarya / memeclustqtonly / memeclustqttime
  else if (ToDo.SearchStr(TStr("memeclust")) >= 0) {
    const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "201101.txt", "Input Memes dataset files");
    const TStr MediaUrlFNm = Env.GetIfArgPrefixStr("-u:", "news_media.txt", "Fule with news media urls");
    TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output file name prefix");
    const int MinWrdLen = Env.GetIfArgPrefixInt("-minl:", 4, "Min quote word length");
    const int MaxWrdLen = Env.GetIfArgPrefixInt("-maxl:", 200, "Max quote word length");
    const int MinMemeFq = Env.GetIfArgPrefixInt("-f:", 5, "Min meme frequency");
    const int MinClustFq = Env.GetIfArgPrefixInt("-cf:", 50, "Min quote cluster frequency");
    TStr BlackListFNm = Env.GetIfArgPrefixStr("-b:", "quote_blacklist.txt", "Blacklist file name");
    bool IsQtBsReady = Env.GetIfArgPrefixBool("-qtbsready:", false, "Indicate whether quote base is ready and can be loaded readily");
    bool IsShglReady = Env.GetIfArgPrefixBool("-shglready:", false, "Indicate whether shingle hash table is ready");
    bool IsNetReady = Env.GetIfArgPrefixBool("-netready:", false, "Indicate whether cluster net is ready");
    double BktThresh = Env.GetIfArgPrefixFlt("-bktthresh:", 0.4, "Threshold for bad shingle bucket elimination");
    double MxTmDelay = Env.GetIfArgPrefixFlt("-delaythresh:", 5, "Max time delay between two quotes in the same cluster");
    double MxTmDev = Env.GetIfArgPrefixFlt("-devthresh:", 3, "Max time deviation for a quote to be specific rather than general");
    double RefineThresh = Env.GetIfArgPrefixFlt("-refinethresh:", 0.2, "Threshold for merging quote cluster in refining process");
    const TStr MinTmStr = Env.GetIfArgPrefixStr("-mint:", "20010101", "Min time of quotes, format = YYYYMMDD");
    const TStr MaxTmStr = Env.GetIfArgPrefixStr("-maxt:", "20200101", "Max time of quotes, format = YYYYMMDD");
    // split YYYYMMDD into year (chars 0-3), month (4-5), day (6-7)
    TSecTm MinTm(atoi(MinTmStr.GetSubStr(0,3).CStr()),atoi(MinTmStr.GetSubStr(4,5).CStr()),atoi(MinTmStr.GetSubStr(6,7).CStr()));
    TSecTm MaxTm(atoi(MaxTmStr.GetSubStr(0,3).CStr()),atoi(MaxTmStr.GetSubStr(4,5).CStr()),atoi(MaxTmStr.GetSubStr(6,7).CStr()));
    // Construct the quote base from Zarya data
    PQuoteBs QtBs = TQuoteBs::New();
    if (!IsQtBsReady) {
      int HashTableSize = 100; // 100 for each quarter, for one year data, use 400
      // the exact -do: suffix selects the input format
      if (ToDo == "memeclustzarya") {
        int UrlSetSize = 4 * HashTableSize;
        QtBs->ConstructQtBsZarya(InFNm, Pref, MediaUrlFNm, MinTm, MaxTm, MinWrdLen, MinMemeFq, HashTableSize, UrlSetSize);
      } else if (ToDo == "memeclustqtonly") {
        QtBs->ConstructQtBsQtOnly(InFNm, Pref, MediaUrlFNm, MinWrdLen, MinMemeFq, HashTableSize);
      } else if (ToDo == "memeclustqttime") {
        QtBs->ConstructQtBsQtTime(InFNm, Pref, MediaUrlFNm, MinWrdLen, MinMemeFq, HashTableSize);
      } else {
        printf("Please specify one of the three options for -do : memeclustzarya, memeclustqtonly, memeclustqttime!\n");
        return;
      }
    } else {
      // load a previously constructed quote base; the name encodes the
      // word-length and frequency parameters used to build it
      TStr InQtBsNm = TStr::Fmt("%s-w%dmfq%d.QtBs", Pref.CStr(), MinWrdLen, MinMemeFq);
      if (TZipIn::IsZipFNm(InQtBsNm)) {
        TZipIn ZipIn(InQtBsNm);
        QtBs = TQuoteBs::Load(ZipIn);
      } else {
        TFIn FIn(InQtBsNm);
        QtBs = TQuoteBs::Load(FIn);
      }
    }
    // Cluster the quotes
    QtBs->ClusterQuotes(MinMemeFq, MinWrdLen, MaxWrdLen, BlackListFNm, Pref, IsShglReady, IsNetReady, BktThresh, MxTmDelay, MxTmDev, RefineThresh);
    // Dump the clusters
    bool SkipUrl = true, FlashDisp = true;
    QtBs->DumpQuoteClusters(MinWrdLen, MinClustFq, SkipUrl, FlashDisp, Pref);
  }
#pragma endregion memeclust
}