// Exercises the TStr search API on the fixture "abcdaaba":
// CountCh/IsChIn/IsStrIn, forward searches (SearchCh/SearchStr) with and
// without a start position, and backward searches (SearchChBack).
TEST(TStr, Search) {
    TStr Str = "abcdaaba";
    int Len = Str.Len();

    // character counting
    EXPECT_EQ(Str.CountCh('a'), 4);
    EXPECT_EQ(Str.CountCh('b'), 2);
    EXPECT_EQ(Str.CountCh('e'), 0);

    // character membership
    EXPECT_TRUE(Str.IsChIn('a'));
    EXPECT_TRUE(Str.IsChIn('b'));
    EXPECT_FALSE(Str.IsChIn('e'));

    // substring membership: every string contains itself and the empty string
    EXPECT_TRUE(Str.IsStrIn(Str));
    EXPECT_TRUE(Str.IsStrIn(""));
    EXPECT_TRUE(Str.IsStrIn("bcd"));
    EXPECT_TRUE(Str.IsStrIn("ab"));
    EXPECT_FALSE(Str.IsStrIn("eba"));

    // counting from a start position; out-of-range start position throws
    EXPECT_EQ(Str.CountCh('a', 1), 3);
    EXPECT_ANY_THROW(Str.CountCh('a', 10));
    EXPECT_EQ(Str.CountCh('b', 2), 1);
    EXPECT_EQ(Str.CountCh('e', 1), 0);

    // forward character search; -1 means not found
    EXPECT_EQ(Str.SearchCh('a'), 0);
    EXPECT_EQ(Str.SearchCh('b'), 1);
    EXPECT_EQ(Str.SearchCh('e'), -1);
    EXPECT_EQ(Str.SearchCh('a', 1), 4);
    EXPECT_EQ(Str.SearchCh('b', 2), 6);
    EXPECT_EQ(Str.SearchCh('e', 1), -1);

    // backward character search, optionally bounded by a start position
    EXPECT_EQ(Str.SearchChBack('a'), Len - 1);
    EXPECT_EQ(Str.SearchChBack('b'), Len - 2);
    EXPECT_EQ(Str.SearchChBack('e'), -1);
    EXPECT_EQ(Str.SearchChBack('a', Len - 2), Len - 3);
    EXPECT_EQ(Str.SearchChBack('b', Len - 3), 1);  // removed stray second semicolon
    EXPECT_EQ(Str.SearchChBack('e', 3), -1);

    // substring search; the empty pattern matches at position 0
    EXPECT_EQ(Str.SearchStr("a"), 0);
    EXPECT_EQ(Str.SearchStr("b"), 1);
    EXPECT_EQ(Str.SearchStr("e"), -1);
    EXPECT_EQ(Str.SearchStr(""), 0);
    EXPECT_EQ(Str.SearchStr("a", 1), 4);
    EXPECT_EQ(Str.SearchStr("b", 2), 6);
    EXPECT_EQ(Str.SearchStr("e", 1), -1);
}
// copy files for a particular folder info void TBackupProfile::CopyFolder(const TStr& BaseTargetFolder, const TStr& SourceFolder, const TStrV& Extensions, const TStrV& SkipIfContainingV, const bool& IncludeSubfolders, const bool& ReportP, TStr& ErrMsg) { try { // get the name of the source folder TStrV PathV; TDir::SplitPath(SourceFolder, PathV); EAssert(PathV.Len() > 0); // create the folder in the base target folder TStr TargetFolder = BaseTargetFolder + PathV[PathV.Len() - 1] + "/"; if (!TDir::Exists(TargetFolder)) TDir::GenDir(TargetFolder); // find files to be copied TStrV FileV; TFFile::GetFNmV(SourceFolder, Extensions, false, FileV); TStrV FolderV; // copy them for (int N = 0; N < FileV.Len(); N++) { // we found a file if (TFile::Exists(FileV[N])) { const TStr FileName = TDir::GetFileName(FileV[N]); // is this a file that we wish to ignore? bool ShouldCopy = true; for (int S = 0; S < SkipIfContainingV.Len(); S++) { if (FileName.SearchStr(SkipIfContainingV[S]) >= 0) ShouldCopy = false; } if (!ShouldCopy) continue; const TStr TargetFNm = TargetFolder + FileName; if (ReportP) TNotify::StdNotify->OnStatusFmt("Copying file: %s\r", FileName.CStr()); TFile::Copy(FileV[N], TargetFNm); } // we found a folder else { FolderV.Add(FileV[N]); } } if (IncludeSubfolders) { for (int N = 0; N < FolderV.Len(); N++) CopyFolder(TargetFolder, FolderV[N], Extensions, SkipIfContainingV, IncludeSubfolders, ReportP, ErrMsg); } } catch (PExcept E) { if (ErrMsg != "") ErrMsg += "\n"; ErrMsg += "Exception while copying from " + SourceFolder + ": " + E->GetMsgStr(); } catch (...) { if (ErrMsg != "") ErrMsg += "\n"; ErrMsg += "Exception while copying from " + SourceFolder + ": " + "Unrecognized exception occured."; } }
// Counts how many of Node's neighbors -- plus the node itself -- have a node id
// whose decimal string representation occurs as a substring of NNodes.
// NOTE(review): this is plain substring matching, so id "1" also matches inside
// "12"; confirm that callers delimit ids inside NNodes if exact matching is intended.
int Intersect1(TUNGraph::TNodeI Node, TStr NNodes) {
    int Cnt = 0;
    // check every neighbor id (removed dead `TStr buf = ""` that was immediately
    // overwritten, and the needless TStr -> CStr() -> TStr round trip)
    for (int i = 0; i < Node.GetDeg(); i++) {
        const TInt NbrId = Node.GetNbrNId(i);
        if (NNodes.SearchStr(NbrId.GetStr()) != -1) { Cnt++; }
    }
    // the node itself participates in the count as well
    const TInt NId = Node.GetId();
    if (NNodes.SearchStr(NId.GetStr()) != -1) { Cnt++; }
    return Cnt;
}
// Extracts the website (host) portion of fulladdress.
// If the address contains "http", everything up to "//" is treated as the
// scheme and the host is the text between "//" and the next '/'; otherwise
// the host is simply the text before the first '/'.
inline TStr getWebsite(TStr fulladdress) {
    TStr Host;
    if (fulladdress.SearchStr(TStr("http"), 0) >= 0) {
        // "scheme://host/rest": drop the scheme, then cut the host at the first '/'
        TStr Scheme, AfterScheme, Rest;
        fulladdress.SplitOnStr(Scheme, TStr("//"), AfterScheme);
        AfterScheme.SplitOnCh(Host, '/', Rest);
    } else {
        // bare "host/rest" form
        TStr Rest;
        fulladdress.SplitOnCh(Host, '/', Rest);
    }
    return Host;
}
// Paints the "magnifying glass" overlay at the current screen position
// (ScreenX/ScreenY members): a shaded ellipse sized relative to the zoom
// rectangle, plus -- when keywords are available -- a tooltip-style window
// with the nearest point's name, a document count, and the keyword list.
void TVizMapContext::PaintMgGlass(PGks Gks, const int& KeyWdFontSize) {
    // drawing the dark circle
    TFltRect ZoomRect = GetZoomRect();
    // ellipse radii: MgGlassSize expressed as a fraction of the visible zoom
    // area, converted to pixels
    int SizeX = TFlt::Round((MgGlassSize/ZoomRect.GetXLen()) * Gks->GetWidth());
    int SizeY = TFlt::Round((MgGlassSize/ZoomRect.GetYLen()) * Gks->GetHeight());
    Gks->SetBrush(TGksBrush::New(ColorMgGlass));
    Gks->FillEllipse(ScreenX-SizeX, ScreenY-SizeY, ScreenX+SizeX, ScreenY+SizeY);
    // drawing the keywords
    if (MgGlassKeyWdV.Len() > 0) {
        // prepare the string
        Gks->SetFont(TGksFont::New("ARIAL", KeyWdFontSize, TGksColor::GetBlack(), TFSet()|gfsBold));
        TStr KeyWdStr = Gks->BreakTxt(MgGlassKeyWdV, ", ", ",", MgGlassWindowWidth);
        TStr NearPointStr;
        if (NearPointN != -1) {
            PVizMapPoint NearPoint = VizMapFrame->GetPoint(NearPointN);
            if (NearPoint->IsPointNm()) {
                TStr NearPointNm = NearPoint->GetPointNm();
                // strip any "[[...]]" suffix from the point name; Left(StartPos - 1)
                // also drops the character before "[[" -- presumably a separator
                // space. NOTE(review): if "[[" sits at position 0, StartPos - 1 is
                // negative -- confirm TStr::Left's behavior for that case.
                if (NearPointNm.IsStrIn("[[")) {
                    const int StartPos = NearPointNm.SearchStr("[[");
                    NearPointNm = NearPointNm.Left(StartPos - 1);
                }
                // keep the name on a single line (1-line break, newlines removed)
                NearPointStr = Gks->BreakTxt(NearPointNm, " ", "", MgGlassWindowWidth, 1);
                NearPointStr.DelChAll('\n');
                NearPointStr += "\n";
            }
        }
        TStr DocCountStr = "#documents = " + MgGlassPoints.GetStr() + "\n";
        // compose the final message: point name, document count, then keywords
        KeyWdStr = NearPointStr + DocCountStr + KeyWdStr;
        // find position of the window (3px padding on each side => +6)
        int WndWidth = Gks->GetTxtWidth(KeyWdStr) + 6;
        int WndHeight = Gks->GetTxtHeight(KeyWdStr) + 6;
        // default: below-right of the cursor; flip to the other side when the
        // window would run off the screen edge
        int PosX = ScreenX + 20, PosY = ScreenY + 20;
        if (PosX + WndWidth > Gks->GetWidth()) { PosX = ScreenX - 20 - WndWidth; }
        if (PosY + WndHeight > Gks->GetHeight()) { PosY = ScreenY - 20 - WndHeight; }
        // draw the keyword string: drop shadow first (offset by 5px), then the
        // framed window, then the text with 3px padding
        Gks->SetBrush(TGksBrush::New(ColorMgGlassWndShadow));
        Gks->FillRect(PosX + 5, PosY + 5, PosX + WndWidth + 5, PosY + WndHeight + 5);
        Gks->SetBrush(TGksBrush::New(ColorMgGlassWnd));
        Gks->SetPen(TGksPen::New(ColorMgGlassWndFrm));
        Gks->Rectangle(PosX, PosY, PosX + WndWidth, PosY + WndHeight);
        Gks->PutTxt(KeyWdStr, PosX+3, PosY+3);
    }
}
// Draws a point's (possibly multi-line) name centered at (X, Y) and returns
// the rectangle the text occupies on screen. Font size scales with the
// point's weight; selected points use the selection font color.
TFltRect TVizMapContext::PaintPointNm(PGks Gks, PVizMapPoint Point, const int& X, const int& Y,
        const int& PointFontSize, const int& PointNmFontScale, const bool& SelPointP, const bool& IsCatP) {
    // get and clean point name: underscores become spaces, and any "[[...]]"
    // suffix is stripped. NOTE(review): Left(StartPos - 1) also drops the
    // character before "[[" and would get a negative argument if "[[" is at
    // position 0 -- confirm TStr::Left's behavior for that case.
    TStr PointNm = Point->GetPointNm();
    PointNm.ChangeChAll('_', ' ');
    if (PointNm.IsStrIn("[[")) {
        const int StartPos = PointNm.SearchStr("[[");
        PointNm = PointNm.Left(StartPos - 1);
    }
    // set font
    TGksColor FontColor = SelPointP ? ColorSelPointFont : ColorPointFont;
    const int FontSize = PointFontSize + TFlt::Round(Point->GetWgt()*PointNmFontScale);
    // bold style for category points is intentionally disabled; IsCatP is
    // currently unused but kept in the signature
    //TFSet FontStyle = IsCatP ? (TFSet() | gfsBold) : TFSet();
    //Gks->SetFont(TGksFont::New("ARIAL", FontSize, FontColor, FontStyle));
    Gks->SetFont(TGksFont::New("ARIAL", FontSize, FontColor));
    // refit it for the screen (wrap to PointNmWidth, at most PointNmMxLines lines)
    TStr ScreenPointNm = Gks->BreakTxt(PointNm, " ", "", PointNmWidth, PointNmMxLines);
    // calculate string position on the screen
    const int HalfTxtWidth = Gks->GetTxtWidth(ScreenPointNm) / 2;
    const int HalfTxtHeight = Gks->GetTxtHeight(ScreenPointNm) / 2;
    // draw it! each wrapped line is centered horizontally within the block
    const int MnX = X - HalfTxtWidth;
    int CurrY = Y - HalfTxtHeight;
    TStrV LineV;
    ScreenPointNm.SplitOnAllCh('\n', LineV);
    for (int LineN = 0; LineN < LineV.Len(); LineN++) {
        const int HalfLineWidth = Gks->GetTxtWidth(LineV[LineN]) / 2;
        const int LineHeight = Gks->GetTxtHeight(LineV[LineN]);
        Gks->PutTxt(LineV[LineN], MnX + (HalfTxtWidth - HalfLineWidth), CurrY);
        // lines are drawn 3px closer together than their nominal height
        CurrY += LineHeight-3;
    }
    // finish: bottom edge is shrunk by the 3px-per-line tightening above
    return TFltRect(X - HalfTxtWidth, Y - HalfTxtHeight, X + HalfTxtWidth, Y + HalfTxtHeight - LineV.Len()*3);
}
// Command-line entry point for QuotesApp. Dispatches on the "-do:" argument
// (lowercased) to one of: mkdataset, extractsubset, memestoqtbs, mkclustnet,
// or any of the memeclust* variants. Each branch parses its own arguments
// via TEnv and runs one stage of the quote-extraction/clustering pipeline.
void BigMain(int argc, char* argv[]) {
    TExeTm ExeTm;
    Env = TEnv(argc, argv, TNotify::StdNotify);
    Env.PrepArgs("QuotesApp");
    const TStr ToDo = Env.GetIfArgPrefixStr("-do:", "", "To do").GetLc();
    // no task given: print the menu of supported tasks and exit
    if (Env.IsEndOfRun()) {
        printf("To do:\n");
        printf("  MkDataset      : Make memes dataset (extract quotes and save txt)\n");
        printf("  ExtractSubset  : Extract a subset of memes containing particular words\n");
        printf("  MemesToQtBs    : Load memes dataset and create quote base\n");
        printf("  MkClustNet     : Build cluster network from the quote base\n");
        return;
    }
#pragma region mkdataset
    // extract quotes and links and make them into a single file
    if (ToDo == "mkdataset") {
        const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "files.txt", "Spinn3r input files (one file per line)");
        const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "Spinn3r-dataset.txt", "Output file");
        const int MinQtWrdLen = Env.GetIfArgPrefixInt("-w:", 3, "Minimum quote word length");
        const TStr UrlFNm = Env.GetIfArgPrefixStr("-u:", "", "Seen url set (THashSet<TMd5Sig>) file name");
        const bool UrlOnlyOnce = Env.GetIfArgPrefixBool("-q:", true, "Only keep unique Urls");
        //// parse directly from Spinn3r
        TStr Spinn3rFNm;
        THashSet<TMd5Sig> SeenUrlSet;
        // keep track of already seen urls (so that there are no duplicate urls)
        if (UrlOnlyOnce && !UrlFNm.Empty()) {
            TFIn FIn(UrlFNm);
            SeenUrlSet.Load(FIn);
        }
        FILE *F = fopen(OutFNm.CStr(), "wt");
        TFIn FIn(InFNm);
        int Items=0;
        // one input line per Spinn3r file; process each file's posts in turn
        for (int f=0; FIn.GetNextLn(Spinn3rFNm); f++) {
            TQuoteExtractor QE(Spinn3rFNm.ToTrunc());
            printf("Processing %02d: %s [%s]\n", f+1, Spinn3rFNm.CStr(), TExeTm::GetCurTm());
            fflush(stdout);
            for (int item = 0; QE.Next(); item++) {
                // posts are deduplicated by the MD5 of their url
                const TMd5Sig PostMd5(QE.PostUrlStr);
                if (QE.QuoteV.Empty() && QE.LinkV.Empty()) { continue; } // no quotes, no links
                if (UrlOnlyOnce) {
                    if (SeenUrlSet.IsKey(PostMd5)) { continue; }
                    SeenUrlSet.AddKey(PostMd5);
                }
                // output record: P = post url, T = publish time, Q* = quotes, L* = links
                fprintf(F, "P\t%s\n", QE.PostUrlStr.CStr());
                //if (QE.PubTm > TSecTm(2008,8,30) || QE.PubTm < TSecTm(2008,7,25)) { printf("%s\n", QE.PubTm.GetStr().CStr()); }
                fprintf(F, "T\t%s\n", QE.PubTm.GetYmdTmStr().CStr());
                for (int q = 0; q < QE.QuoteV.Len(); q++) {
                    // keep only quotes with at least MinQtWrdLen words
                    if (TStrUtil::CountWords(QE.QuoteV[q]) >= MinQtWrdLen) {
                        fprintf(F, "Q\t%s\n", QE.QuoteV[q].CStr());
                    }
                }
                for (int l = 0; l < QE.LinkV.Len(); l++) {
                    fprintf(F, "L\t%s\n", QE.LinkV[l].CStr());
                }
                fprintf(F, "\n");
                // periodic progress statistics every 100k items
                if (item>0 && item % Kilo(100) == 0) { QE.DumpStat(); QE.ExeTm.Tick(); }
                Items++;
            }
            printf("file done. Total %d all posts, %d all items\n", SeenUrlSet.Len(), Items);
            fflush(stdout);
        }
        printf("all done. Saving %d post urls\n", SeenUrlSet.Len());
        fflush(stdout);
        // persist the seen-url set so later runs can extend the dataset without duplicates
        if (!SeenUrlSet.Empty()) {
            TFOut FOut(OutFNm.GetFMid()+".SeenUrlSet");
            SeenUrlSet.Save(FOut);
        }
        fclose(F);
    }
#pragma endregion mkdataset
#pragma region extractsubset
    // save posts with memes containing particular words
    else if (ToDo == "extractsubset") {
        const TStr InFNmWc = Env.GetIfArgPrefixStr("-i:", "memes_*.rar", "Input file prefix");
        const bool IsInFNmWc = Env.GetIfArgPrefixBool("-w:", true, "Input is wildcard (else a file with list of input files)");
        const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "memes-subset.txt", "Output memes file");
        const TStr WordsFNm = Env.GetIfArgPrefixStr("-p:", "phrases-in.txt", "Phrases that memes have to contain");
        TChAV CatchMemeV;// = TStr::GetV("great depression", "economic meltdown", "recession had bottomed out", "green shoots", "slow recovery", "gradual recovery");
        printf("Loading %s\n", WordsFNm.CStr());
        // load the catch phrases, lowercased, one per line
        {
            TFIn FIn(WordsFNm);
            for (TStr Ln; FIn.GetNextLn(Ln); ) {
                printf("  %s\n", Ln.GetLc().CStr());
                CatchMemeV.Add(Ln.GetLc());
            }
        }
        printf("%d strings loaded\n", CatchMemeV.Len());
        TFOut FOut(OutFNm);
        TMemesDataLoader Memes(InFNmWc, IsInFNmWc);
        // save every post in which at least one meme contains one of the phrases
        for (int posts = 0, nsave=0; Memes.LoadNext(); posts++) {
            bool DoSave = false;
            for (int m = 0; m < Memes.MemeV.Len(); m++) {
                for (int i = 0; i < CatchMemeV.Len(); i++) {
                    if (Memes.MemeV[m].SearchStr(CatchMemeV[i]) != -1) {
                        DoSave=true; break; }
                }
                if (DoSave) { break; }
            }
            if (DoSave) { Memes.SaveTxt(FOut); nsave++; }
            // progress report every million posts
            if (posts % Mega(1) == 0) {
                printf("%dm posts, %d saved\n", posts/Mega(1), nsave);
                FOut.Flush();
            }
        }
    }
#pragma endregion extractsubset
#pragma region memestoqtbs
    // load memes dataset (MkDataset) and create quote base
    else if (ToDo == "memestoqtbs") {
        const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "201007_201107.txt", "Input Memes dataset files");
        const TStr MediaUrlFNm = Env.GetIfArgPrefixStr("-u:", "news_media.txt", "Fule with news media urls");
        const TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output file name prefix");
        const int MinWrdLen = Env.GetIfArgPrefixInt("-l:", 4, "Min quote word length");
        const int MinMemeFq = Env.GetIfArgPrefixInt("-f:", 5, "Min meme frequency");
        const TStr MinTmStr = Env.GetIfArgPrefixStr("-mint:", "20100714", "Min time of quotes, format = YYYYMMDD");
        const TStr MaxTmStr = Env.GetIfArgPrefixStr("-maxt:", "20110728", "Max time of quotes, format = YYYYMMDD");
        // parse YYYYMMDD by slicing digits: [0..3]=year, [4..5]=month, [6..7]=day
        TSecTm MinTm(atoi(MinTmStr.GetSubStr(0,3).CStr()),atoi(MinTmStr.GetSubStr(4,5).CStr()),atoi(MinTmStr.GetSubStr(6,7).CStr()));
        TSecTm MaxTm(atoi(MaxTmStr.GetSubStr(0,3).CStr()),atoi(MaxTmStr.GetSubStr(4,5).CStr()),atoi(MaxTmStr.GetSubStr(6,7).CStr()));
        PQuoteBs QtBs = TQuoteBs::New();
        int HashTableSize = 100; // 100 for each quarter, for one year data, use 400
        int UrlSetSize = 4 * HashTableSize;
        QtBs->ConstructQtBsZarya(InFNm, Pref, MediaUrlFNm, MinTm, MaxTm, MinWrdLen, MinMemeFq, HashTableSize, UrlSetSize);
    }
#pragma endregion memestoqtbs
#pragma region mkclustnet
    // make cluster network
    else if (ToDo == "mkclustnet") {
        TStr InQtBsNm = Env.GetIfArgPrefixStr("-i:", "", "Input quote base file name");
        TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output network/updated QtBs filename");
        TStr BlackListFNm = Env.GetIfArgPrefixStr("-b:", "quote_blacklist.txt", "Blacklist file name");
        bool IsShglReady = Env.GetIfArgPrefixBool("-shglready:", false, "Indicate whether shingle hash table is ready");
        bool IsNetReady = Env.GetIfArgPrefixBool("-netready:", false, "Indicate whether cluster net is ready");
        double BktThresh = Env.GetIfArgPrefixFlt("-bktthresh:", 0.4, "Threshold for bad shingle bucket elimination");
        double MxTmDelay = Env.GetIfArgPrefixFlt("-delaythresh:", 5, "Max time delay between two quotes in the same cluster");
        double MxTmDev = Env.GetIfArgPrefixFlt("-devthresh:", 3, "Max time deviation for a quote to be specific rather than general");
        double RefineThresh = Env.GetIfArgPrefixFlt("-refinethresh:", 0.2, "Threshold for merging quote cluster in refining process");
        const int MinWrdLen = Env.GetIfArgPrefixInt("-minl:", 4, "Min quote word length");
        const int MaxWrdLen = Env.GetIfArgPrefixInt("-maxl:", 200, "Max quote word length");
        const int MinMemeFq = Env.GetIfArgPrefixInt("-mf:", 5, "Min meme frequency");
        const int MinClustFq = Env.GetIfArgPrefixInt("-cf:", 50, "Min quote cluster frequency");
        // Load quote base (transparently handles zipped and plain files)
        PQuoteBs QtBs;
        if (TZipIn::IsZipFNm(InQtBsNm)) { TZipIn ZipIn(InQtBsNm); QtBs = TQuoteBs::Load(ZipIn); }
        else { TFIn FIn(InQtBsNm); QtBs = TQuoteBs::Load(FIn); }
        // Cluster the quotes
        QtBs->ClusterQuotes(MinMemeFq, MinWrdLen, MaxWrdLen, BlackListFNm, Pref, IsShglReady, IsNetReady, BktThresh, MxTmDelay, MxTmDev, RefineThresh);
        // Dump the clusters
        bool SkipUrl = true, FlashDisp = true;
        QtBs->DumpQuoteClusters(MinWrdLen, MinClustFq, SkipUrl, FlashDisp, Pref);
    }
#pragma endregion mkclustnet
#pragma region memeclust
    // any "-do:" value containing "memeclust" runs the full pipeline: build (or
    // load) a quote base, cluster it, and dump the clusters
    else if (ToDo.SearchStr(TStr("memeclust")) >= 0) {
        const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "201101.txt", "Input Memes dataset files");
        const TStr MediaUrlFNm = Env.GetIfArgPrefixStr("-u:", "news_media.txt", "Fule with news media urls");
        TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output file name prefix");
        const int MinWrdLen = Env.GetIfArgPrefixInt("-minl:", 4, "Min quote word length");
        const int MaxWrdLen = Env.GetIfArgPrefixInt("-maxl:", 200, "Max quote word length");
        const int MinMemeFq = Env.GetIfArgPrefixInt("-f:", 5, "Min meme frequency");
        const int MinClustFq = Env.GetIfArgPrefixInt("-cf:", 50, "Min quote cluster frequency");
        TStr BlackListFNm = Env.GetIfArgPrefixStr("-b:", "quote_blacklist.txt", "Blacklist file name");
        bool IsQtBsReady = Env.GetIfArgPrefixBool("-qtbsready:", false, "Indicate whether quote base is ready and can be loaded readily");
        bool IsShglReady = Env.GetIfArgPrefixBool("-shglready:", false, "Indicate whether shingle hash table is ready");
        bool IsNetReady = Env.GetIfArgPrefixBool("-netready:", false, "Indicate whether cluster net is ready");
        double BktThresh = Env.GetIfArgPrefixFlt("-bktthresh:", 0.4, "Threshold for bad shingle bucket elimination");
        double MxTmDelay = Env.GetIfArgPrefixFlt("-delaythresh:", 5, "Max time delay between two quotes in the same cluster");
        double MxTmDev = Env.GetIfArgPrefixFlt("-devthresh:", 3, "Max time deviation for a quote to be specific rather than general");
        double RefineThresh = Env.GetIfArgPrefixFlt("-refinethresh:", 0.2, "Threshold for merging quote cluster in refining process");
        const TStr MinTmStr = Env.GetIfArgPrefixStr("-mint:", "20010101", "Min time of quotes, format = YYYYMMDD");
        const TStr MaxTmStr = Env.GetIfArgPrefixStr("-maxt:", "20200101", "Max time of quotes, format = YYYYMMDD");
        // parse YYYYMMDD by slicing digits: [0..3]=year, [4..5]=month, [6..7]=day
        TSecTm MinTm(atoi(MinTmStr.GetSubStr(0,3).CStr()),atoi(MinTmStr.GetSubStr(4,5).CStr()),atoi(MinTmStr.GetSubStr(6,7).CStr()));
        TSecTm MaxTm(atoi(MaxTmStr.GetSubStr(0,3).CStr()),atoi(MaxTmStr.GetSubStr(4,5).CStr()),atoi(MaxTmStr.GetSubStr(6,7).CStr()));
        // Construct the quote base from Zarya data
        PQuoteBs QtBs = TQuoteBs::New();
        if (!IsQtBsReady) {
            int HashTableSize = 100; // 100 for each quarter, for one year data, use 400
            // the suffix of the "-do:" value selects the input format
            if (ToDo == "memeclustzarya") {
                int UrlSetSize = 4 * HashTableSize;
                QtBs->ConstructQtBsZarya(InFNm, Pref, MediaUrlFNm, MinTm, MaxTm, MinWrdLen, MinMemeFq, HashTableSize, UrlSetSize);
            } else if (ToDo == "memeclustqtonly") {
                QtBs->ConstructQtBsQtOnly(InFNm, Pref, MediaUrlFNm, MinWrdLen, MinMemeFq, HashTableSize);
            } else if (ToDo == "memeclustqttime") {
                QtBs->ConstructQtBsQtTime(InFNm, Pref, MediaUrlFNm, MinWrdLen, MinMemeFq, HashTableSize);
            } else {
                printf("Please specify one of the three options for -do : memeclustzarya, memeclustqtonly, memeclustqttime!\n");
                return;
            }
        } else {
            // quote base was built by a previous run; load it instead
            TStr InQtBsNm = TStr::Fmt("%s-w%dmfq%d.QtBs", Pref.CStr(), MinWrdLen, MinMemeFq);
            if (TZipIn::IsZipFNm(InQtBsNm)) { TZipIn ZipIn(InQtBsNm); QtBs = TQuoteBs::Load(ZipIn); }
            else { TFIn FIn(InQtBsNm); QtBs = TQuoteBs::Load(FIn); }
        }
        // Cluster the quotes
        QtBs->ClusterQuotes(MinMemeFq, MinWrdLen, MaxWrdLen, BlackListFNm, Pref, IsShglReady, IsNetReady, BktThresh, MxTmDelay, MxTmDev, RefineThresh);
        // Dump the clusters
        bool SkipUrl = true, FlashDisp = true;
        QtBs->DumpQuoteClusters(MinWrdLen, MinClustFq, SkipUrl, FlashDisp, Pref);
    }
#pragma endregion memeclust
}