// Compares every pair of ids that share a shingle bucket and hands each
// distinct pair to AddEdgeIfSimilar exactly once.
//   Shingles: shingle signature -> set of ids stored in that bucket.
// Progress is logged through Err every 100 buckets. At the end, the nominal
// number of pairs (duplicates across buckets included) and the number of
// pairs actually compared are written to stderr.
void QuoteGraph::CompareUsingShingles(THash<TMd5Sig, TIntSet>& Shingles) {
  int Count = 0;      // sum over buckets of n*(n-1)/2
  int RealCount = 0;  // pairs actually passed to AddEdgeIfSimilar
  TVec<TMd5Sig> ShingleKeys;
  Shingles.GetKeyV(ShingleKeys);
  // Pairs already compared. Each unordered pair is stored once, as
  // (smaller id, larger id), instead of once per orientation.
  THashSet<TIntPr> EdgeCache;
  for (int i = 0; i < ShingleKeys.Len(); i++) {
    if (i % 100 == 0) {
      Err("Processed %d out of %d shingles, count = %d\n", i, ShingleKeys.Len(), Count);
    }
    // Borrow the bucket instead of deep-copying it: the key was just read
    // from Shingles, so GetDat always finds it.
    // NOTE(review): assumes AddEdgeIfSimilar does not modify Shingles - confirm.
    const TIntSet& Bucket = Shingles.GetDat(ShingleKeys[i]);
    for (TIntSet::TIter Quote1 = Bucket.BegI(); Quote1 < Bucket.EndI(); Quote1++) {
      TIntSet::TIter Quote2 = Quote1;
      Quote2++;
      for (; Quote2 < Bucket.EndI(); Quote2++) {
        const TInt& Id1 = Quote1.GetKey();
        const TInt& Id2 = Quote2.GetKey();
        const TIntPr Pair = (Id1 < Id2) ? TIntPr(Id1, Id2) : TIntPr(Id2, Id1);
        if (!EdgeCache.IsKey(Pair)) {
          EdgeCache.AddKey(Pair);
          RealCount++;
          AddEdgeIfSimilar(Quote1.GetKey(), Quote2.GetKey());
        }
      }
    }
    int Len = Bucket.Len() * (Bucket.Len() - 1) / 2;
    Count += Len;
  }
  fprintf(stderr, "NUMBER OF COMPARES: %d\n", Count);
  fprintf(stderr, "NUMBER OF REAL COMPARES: %d\n", RealCount);
}
// DyNetML format, loads all the networks in the file TVec<PNGraph> LoadDyNetGraphV(const TStr& FNm) { TXmlLx XmlLx(TFIn::New(FNm), xspTruncate); TVec<PNGraph> GraphV; THashSet<TStr> NIdStr; while (XmlLx.GetSym()!=xsyEof) { if (XmlLx.Sym==xsySTag && XmlLx.TagNm=="network") { PNGraph G = TNGraph::New(); GraphV.Add(G); XmlLx.GetSym(); while (XmlLx.TagNm=="link") { TStr Str1, Val1, Str2, Val2; XmlLx.GetArg(0, Str1, Val1); XmlLx.GetArg(1, Str2, Val2); IAssert(Str1=="source" && Str2=="target"); NIdStr.AddKey(Val1); NIdStr.AddKey(Val2); const int src=NIdStr.GetKeyId(Val1); const int dst=NIdStr.GetKeyId(Val2); if (! G->IsNode(src)) { G->AddNode(src); } if (! G->IsNode(dst)) { G->AddNode(dst); } G->AddEdge(src, dst); XmlLx.GetSym(); } } } return GraphV; }
// I embarassingly don't know how templating works. void QuoteGraph::CompareUsingMinHash(TVec<THash<TMd5Sig, TIntSet> >& BucketsVector) { THashSet<TIntPr> EdgeCache; int Count = 0; int RealCount = 0; Err("Beginning edge creation step...\n"); for (int i = 0; i < BucketsVector.Len(); i++) { Err("Processing band signature %d of %d - %d signatures\n", i+1, BucketsVector.Len(), BucketsVector[i].Len()); TVec<TMd5Sig> Buckets; BucketsVector[i].GetKeyV(Buckets); TVec<TMd5Sig>::TIter BucketEnd = Buckets.EndI(); for (TVec<TMd5Sig>::TIter BucketSig = Buckets.BegI(); BucketSig < BucketEnd; BucketSig++) { TIntSet Bucket = BucketsVector[i].GetDat(*BucketSig); Count += Bucket.Len() * (Bucket.Len() - 1) / 2; for (TIntSet::TIter Quote1 = Bucket.BegI(); Quote1 < Bucket.EndI(); Quote1++) { TIntSet::TIter Quote1Copy = Quote1; Quote1Copy++; for (TIntSet::TIter Quote2 = Quote1Copy; Quote2 < Bucket.EndI(); Quote2++) { if (!EdgeCache.IsKey(TIntPr(Quote1.GetKey(), Quote2.GetKey())) && !EdgeCache.IsKey(TIntPr(Quote2.GetKey(), Quote1.GetKey()))) { EdgeCache.AddKey(TIntPr(Quote1.GetKey(), Quote2.GetKey())); EdgeCache.AddKey(TIntPr(Quote2.GetKey(), Quote1.GetKey())); RealCount++; AddEdgeIfSimilar(Quote1.GetKey(), Quote2.GetKey()); } } } } } fprintf(stderr, "NUMBER OF COMPARES: %d\n", Count); fprintf(stderr, "NUMBER OF REAL COMPARES: %d\n", RealCount); }
// Returns the data vectors of every pair of ids that fall into the same
// bucket in at least one band. Each unordered pair appears once: ids are
// ordered (smaller, larger) before being put into the de-duplicating set.
TVec<TPair<TFltV, TFltV> > TLSHash::GetAllCandidatePairs() {
  THashSet<TPair<TInt, TInt> > CandidateIdPairs;
  for (int Band = 0; Band < Bands; Band++) {
    TVec<TIntV> BucketVV;
    SigBucketVHV[Band].GetDatV(BucketVV);
    for (int b = 0; b < BucketVV.Len(); b++) {
      const TIntV& BucketV = BucketVV[b];
      for (int k = 0; k < BucketV.Len(); k++) {
        for (int l = k+1; l < BucketV.Len(); l++) {
          const int IdA = BucketV[k];
          const int IdB = BucketV[l];
          if (IdA > IdB) {
            CandidateIdPairs.AddKey(TPair<TInt, TInt> (IdB, IdA));
          } else {
            CandidateIdPairs.AddKey(TPair<TInt, TInt> (IdA, IdB));
          }
        }
      }
    }
  }
  // Materialise the data vectors for every collected id pair.
  TVec<TPair<TFltV, TFltV> > CandidatePairs;
  for (int KeyId = CandidateIdPairs.FFirstKeyId(); CandidateIdPairs.FNextKeyId(KeyId); ) {
    const TPair<TInt, TInt> IdPair = CandidateIdPairs[KeyId];
    CandidatePairs.Add(TPair<TFltV, TFltV>(DataV[IdPair.GetVal1()], DataV[IdPair.GetVal2()]));
  }
  return CandidatePairs;
}
static void AddTreeCtrs(const TTrainData& data, const TSplitTree& currentTree, TFold* fold, TLearnContext* ctx, TStatsFromPrevTree* statsFromPrevTree, TCandidateList* candList) { using TSeenProjHash = THashSet<TProjection>; TSeenProjHash seenProj; // greedy construction TProjection binAndOneHotFeaturesTree; binAndOneHotFeaturesTree.BinFeatures = currentTree.GetBinFeatures(); binAndOneHotFeaturesTree.OneHotFeatures = currentTree.GetOneHotFeatures(); seenProj.insert(binAndOneHotFeaturesTree); for (const auto& ctrSplit : currentTree.GetCtrSplits()) { seenProj.insert(ctrSplit.Projection); } TSeenProjHash addedProjHash; for (const auto& baseProj : seenProj) { if (baseProj.IsEmpty()) { continue; } for (int cf = 0; cf < data.AllFeatures.CatFeatures.ysize(); ++cf) { if (data.AllFeatures.CatFeatures[cf].empty() || data.AllFeatures.IsOneHot[cf] || ctx->Rand.GenRandReal1() > ctx->Params.ObliviousTreeOptions->Rsm) { continue; } TProjection proj = baseProj; proj.AddCatFeature(cf); if (proj.IsRedundant() || proj.GetFullProjectionLength() > ctx->Params.CatFeatureParams->MaxTensorComplexity) { continue; } if (addedProjHash.has(proj)) { continue; } addedProjHash.insert(proj); AddCtrsToCandList(*fold, *ctx, proj, candList); fold->GetCtrRef(proj); } } THashSet<TSplitCandidate> candidatesToErase; for (auto& splitCandidate : statsFromPrevTree->Stats) { if (splitCandidate.first.Type == ESplitType::OnlineCtr) { if (!addedProjHash.has(splitCandidate.first.Ctr.Projection)) { candidatesToErase.insert(splitCandidate.first); } } } for (const auto& splitCandidate : candidatesToErase) { statsFromPrevTree->Stats.erase(splitCandidate); } }
/// Shingles by words void LSH::HashShinglesOfClusters(TQuoteBase *QuoteBase, TClusterBase *ClusterBase, TIntV& ClusterIds, TInt ShingleLen, THash<TMd5Sig, TIntV>& ShingleToClusterIds) { Err("Hashing shingles of clusters...\n"); for (int i = 0; i < ClusterIds.Len(); i++) { if (i % 1000 == 0) { fprintf(stderr, "%d out of %d completed\n", i, ClusterIds.Len()); } TCluster C; ClusterBase->GetCluster(ClusterIds[i], C); //fprintf(stderr, "%d vs. %d\n", ClusterIds[i].Val, C.GetId().Val); // Put x-word shingles into hash table; x is specified by ShingleLen parameter THashSet < TMd5Sig > CHashedShingles; GetHashedShinglesOfCluster(QuoteBase, C, ShingleLen, CHashedShingles); for (THashSet<TMd5Sig>::TIter Hash = CHashedShingles.BegI(); Hash < CHashedShingles.EndI(); Hash++) { TIntV ShingleClusterIds; if (ShingleToClusterIds.IsKey(*Hash)) { ShingleClusterIds = ShingleToClusterIds.GetDat(*Hash); } ShingleClusterIds.Add(ClusterIds[i]); ShingleToClusterIds.AddDat(*Hash, ShingleClusterIds); } } Err("Done hashing!\n"); }
void TCliqueOverlap::GetIntersection(const THashSet<TInt>& A, const THashSet<TInt>& B, THashSet<TInt>& C) { if (A.Len() < B.Len()) { for (THashSetKeyI<TInt> it=A.BegI(); it<A.EndI(); it++) if (B.IsKey(it.GetKey())) C.AddKey(it.GetKey()); } else { for (THashSetKeyI<TInt> it=B.BegI(); it<B.EndI(); it++) if (A.IsKey(it.GetKey())) C.AddKey(it.GetKey()); } }
int TCliqueOverlap::GetNodeIdWithMaxDeg(const THashSet<TInt>& Set) const{ int id = -1; int maxDeg = -1; // for (THashSetKeyI<TInt> it=Set.BegI(); it<Set.EndI(); it++) { int nId = it.GetKey(); int deg = m_G->GetNI(nId).GetDeg(); if (maxDeg < deg) { maxDeg=deg; id=nId; } } return id; }
int main(int argc, char *argv[]) { TStr BaseString = "/lfs/1/tmp/curis/week/QBDB.bin"; TFIn BaseFile(BaseString); TQuoteBase *QB = new TQuoteBase; TDocBase *DB = new TDocBase; QB->Load(BaseFile); DB->Load(BaseFile); TIntV QuoteIds; QB->GetAllQuoteIds(QuoteIds); int NumQuotes = QuoteIds.Len(); THash<TInt, TStrSet> PeakCounts; for (int i = 0; i < NumQuotes; i++) { TQuote CurQuote; if (QB->GetQuote(QuoteIds[i], CurQuote)) { TVec<TSecTm> Peaks; CurQuote.GetPeaks(DB, Peaks); TStr QuoteString; CurQuote.GetParsedContentString(QuoteString); TStrSet StringSet; if (PeakCounts.IsKey(Peaks.Len())) { StringSet = PeakCounts.GetDat(Peaks.Len()); } StringSet.AddKey(QuoteString); PeakCounts.AddDat(Peaks.Len(), StringSet); } } TIntV PeakCountKeys; PeakCounts.GetKeyV(PeakCountKeys); PeakCountKeys.Sort(true); for (int i = 0; i < PeakCountKeys.Len(); i++) { TStrSet CurSet = PeakCounts.GetDat(PeakCountKeys[i]); if (CurSet.Len() > 0) { printf("QUOTES WITH %d PEAKS\n", PeakCountKeys[i].Val); printf("#########################################\n"); THashSet<TStr> StringSet = PeakCounts.GetDat(PeakCountKeys[i]); for (THashSet<TStr>::TIter l = StringSet.BegI(); l < StringSet.EndI(); l++) { printf("%s\n", l.GetKey().CStr()); } printf("\n"); } } delete QB; delete DB; return 0; }
// Adds to HashedShingles the MD5 signature of every ShingleLen-word shingle
// of every quote in cluster C. A shingle is ShingleLen consecutive words of
// the quote's punctuation-free content, joined by single spaces.
void LSH::GetHashedShinglesOfCluster(TQuoteBase *QuoteBase, TCluster& C, TInt ShingleLen, THashSet<TMd5Sig>& HashedShingles) {
  TIntV QuoteIds;
  C.GetQuoteIds(QuoteIds);
  const int NumQuotes = QuoteIds.Len();
  for (int QuoteN = 0; QuoteN < NumQuotes; QuoteN++) {
    TQuote Q;
    QuoteBase->GetQuote(QuoteIds[QuoteN], Q);

    // Content -> punctuation removed -> split on whitespace.
    TStr RawContent;
    Q.GetContentString(RawContent);
    TStr CleanContent;
    TStringUtil::RemovePunctuation(RawContent, CleanContent);
    TStrV WordV;
    CleanContent.SplitOnWs(WordV);

    // One shingle per start position; no iterations when the quote has
    // fewer words than ShingleLen.
    for (int StartN = 0; StartN < WordV.Len() - ShingleLen + 1; StartN++) {
      TStr Shingle;
      for (int OffN = 0; OffN < ShingleLen; OffN++) {
        if (OffN > 0) {
          Shingle.InsStr(Shingle.Len(), " ");
        }
        Shingle.InsStr(Shingle.Len(), WordV[StartN + OffN]);
      }
      HashedShingles.AddKey(TMd5Sig(Shingle));
    }
  }
}
void MakeSlashdotSignNet(const TStr InFNm, TStr OutFNm, TStr Desc, THashSet<TChA> NIdSet) { //THashSet<TChA> NIdSet; TChA LnStr; TVec<char *> WrdV; int Sign; //PSignNet Net = TSignNet::New(); TPt<TNodeEDatNet<TInt, TInt> > Net = TNodeEDatNet<TInt, TInt>::New(); int i = 0; for (TFIn FIn(InFNm); FIn.GetNextLn(LnStr); ) { if (LnStr.Empty() || LnStr[0]=='#') { continue; } LnStr.ToLc(); TStrUtil::SplitOnCh(LnStr, WrdV, '\t', false); //NIdSet.AddKey(WrdV[0]); if (strcmp(WrdV[1], "friends")==0) { Sign = 1; } else if (strcmp(WrdV[1], "fans")==0) { continue; } // skip (fans are in-friends) else if (strcmp(WrdV[1], "foes")==0) { Sign = -1; } else { Fail; } const int SrcNId = NIdSet.AddKey(WrdV[0]); if (! Net->IsNode(SrcNId)) { Net->AddNode(SrcNId); } for (int e = 2; e < WrdV.Len(); e++) { const int DstNId = NIdSet.AddKey(WrdV[e]); i ++ ; if ((SrcNId != DstNId) && ! Net->IsEdge(SrcNId, DstNId)) { if (! Net->IsNode(DstNId)) Net->AddNode(DstNId); Net->AddEdge(SrcNId, DstNId, Sign); } } } TSnap::PrintInfo(Net, "Slashdot (" + TInt::GetStr(i) + ")"); // copied from gio.h - line 111 FILE *F = fopen(OutFNm.CStr(), "wt"); fprintf(F, "# Directed graph: %s\n", OutFNm.CStr()); if (! Desc.Empty()) fprintf(F, "# %s\n", (Desc).CStr()); fprintf(F, "# Nodes: %d Edges: %d\n", Net->GetNodes(), Net->GetEdges()); fprintf(F, "# UserId\tGroupId\tSign\n"); for (TNodeEDatNet<TInt,TInt>::TEdgeI ei = Net->BegEI(); ei < Net->EndEI(); ei++) { fprintf(F, "%d\t%d\t%d\n", ei.GetSrcNId(), ei.GetDstNId(), ei()()); } fclose(F); PrintGraphStatTable(Net, OutFNm, Desc); }
void TCliqueOverlap::GetMaximalCliques(const PUNGraph& G, int MinMaxCliqueSize, TVec<TIntV>& MaxCliques) { if (G->GetNodes() == 0) return; // m_G = G; m_minMaxCliqueSize = MinMaxCliqueSize; m_maxCliques =& MaxCliques; m_Q.Clr(); // THashSet<TInt> SUBG; THashSet<TInt> CAND; for (TUNGraph::TNodeI NI=m_G->BegNI(); NI<m_G->EndNI(); NI++) { TInt nId = NI.GetId(); SUBG.AddKey(nId); CAND.AddKey(nId); } // Expand(SUBG, CAND); }
int TCliqueOverlap::Intersection(const THashSet<TInt>& A, const THashSet<TInt>& B) { int n = 0; if (A.Len() < B.Len()) { for (THashSetKeyI<TInt> it=A.BegI(); it<A.EndI(); it++) if (B.IsKey(it.GetKey())) n++; } else { for (THashSetKeyI<TInt> it=B.BegI(); it<B.EndI(); it++) if (A.IsKey(it.GetKey())) n++; } return n; }
int TCliqueOverlap::MaxNbrsInCANDNodeId(const THashSet<TInt>& SUBG, const THashSet<TInt>& CAND) const{ int id = -1; int maxIntersection = -1; // for (THashSetKeyI<TInt> it=SUBG.BegI(); it<SUBG.EndI(); it++) { int nId = it.GetKey(); TUNGraph::TNodeI nIt = m_G->GetNI(nId); int deg = nIt.GetDeg(); // int curIntersection = 0; for (int i=0; i<deg; i++) { int nbrId = nIt.GetNbrNId(i); if (CAND.IsKey(nbrId)) curIntersection++; } // if (maxIntersection < curIntersection) { maxIntersection=curIntersection; id=nId; } } return id; }
// Returns the stored data vectors whose ids share a bucket with Datum in at
// least one band. Every id contributes at most one vector, because ids are
// collected in a set first.
TVec<TFltV> TLSHash::GetCandidates(TFltV Datum) {
  THashSet<TInt> CandidateIds;
  for (int Band = 0; Band < Bands; Band++) {
    const TInt Sig = ComputeSignature(Datum, Band);
    THash<TInt, TIntV>& BandBuckets = SigBucketVHV[Band];
    if (BandBuckets.IsKey(Sig)) {
      CandidateIds.AddKeyV(BandBuckets.GetDat(Sig));
    }
  }
  TVec<TFltV> Candidates;
  for (int KeyId = CandidateIds.FFirstKeyId(); CandidateIds.FNextKeyId(KeyId); ) {
    const int Id = CandidateIds[KeyId];
    Candidates.Add(DataV[Id]);
  }
  return Candidates;
}
void TCliqueOverlap::Expand(const THashSet<TInt>& SUBG, THashSet<TInt>& CAND) { if (SUBG.Len()==0) { if (m_Q.Len() >= m_minMaxCliqueSize) { m_Q.Pack(); m_maxCliques->Add(m_Q); } return; } if (CAND.Len()==0) return; //Get u that maximaze CAND intersection with neighbours of vertex u int u = MaxNbrsInCANDNodeId(SUBG, CAND); //Get neighbours of node u THashSet<TInt> nbrsU; GetNbrs(u, nbrsU); //Get relative complement of nbrsU in CAND THashSet<TInt> EXT; GetRelativeComplement(CAND, nbrsU, EXT); while(EXT.Len() != 0) { int q = GetNodeIdWithMaxDeg(EXT); // m_Q.Add(q); // THashSet<TInt> nbrsQ; GetNbrs(q, nbrsQ); // THashSet<TInt> SUBGq; GetIntersection(SUBG, nbrsQ, SUBGq); // THashSet<TInt> CANDq; GetIntersection(CAND, nbrsQ, CANDq); // Expand(SUBGq, CANDq); // CAND.DelKey(q); m_Q.DelLast(); // EXT.Clr(); GetRelativeComplement(CAND, nbrsU, EXT); } }
// Adds the MD5 signature of every parsed-content word of every quote in
// QuoteBase to Shingles. Progress is logged through Err every 1000 quotes.
void LSH::WordHashing(TQuoteBase* QuoteBase, THashSet<TMd5Sig>& Shingles) {
  Err("Hashing shingles using words...\n");
  TIntV QuoteIds;
  QuoteBase->GetAllQuoteIds(QuoteIds);
  for (int QuoteN = 0; QuoteN < QuoteIds.Len(); QuoteN++) {
    if (QuoteN % 1000 == 0) {
      Err("%d out of %d completed\n", QuoteN, QuoteIds.Len());
    }
    TQuote Q;
    QuoteBase->GetQuote(QuoteIds[QuoteN], Q);
    TStrV WordV;
    Q.GetParsedContent(WordV);
    const int NumWords = WordV.Len();
    for (int WordN = 0; WordN < NumWords; WordN++) {
      Shingles.AddKey(TMd5Sig(WordV[WordN]));
    }
  }
  Err("Done with word hashing! Number of shingles: %d\n", Shingles.Len());
}
// Computes NumSignatures positions for every shingle.
// For each signature the shingle keys are put into a fresh random order and
// each shingle's position in that order is appended to its vector in
// Signatures, so afterwards every shingle has gained NumSignatures entries.
//   Shingles: the set of shingle hashes.
//   Signatures: output, shingle hash -> one position per signature.
//   NumSignatures: number of random orders to generate; nothing is done
//                  when it is below 1.
void LSH::ComputeSignatures(THashSet<TMd5Sig>& Shingles, THash<TMd5Sig, TIntV>& Signatures, int NumSignatures) {
  if (NumSignatures < 1) return;
  TRnd RandomGenerator; // TODO: make this "more random" by incorporating time
  TInt NumShingles = Shingles.Len();
  for (int i = 0; i < NumSignatures; ++i) {
    // Create new signature. The key vector is re-read from the set on every
    // round so that each shuffle starts from the same initial order.
    TVec<TMd5Sig> Shuffle;
    Shingles.GetKeyV(Shuffle);
    Shuffle.Shuffle(RandomGenerator);
    for (int j = 0; j < NumShingles; j++) {
      // AddDat(Key) returns the stored vector (created empty when the key is
      // new); appending in place avoids copying the growing vector out of
      // the table and back in for every shingle of every signature.
      Signatures.AddDat(Shuffle[j]).Add(j);
    }
  }
  Err("Computed %d signatures!\n", NumSignatures);
}
void BigMain(int argc, char* argv[]) { TExeTm ExeTm; Env = TEnv(argc, argv, TNotify::StdNotify); Env.PrepArgs("QuotesApp"); const TStr ToDo = Env.GetIfArgPrefixStr("-do:", "", "To do").GetLc(); if (Env.IsEndOfRun()) { printf("To do:\n"); printf(" MkDataset : Make memes dataset (extract quotes and save txt)\n"); printf(" ExtractSubset : Extract a subset of memes containing particular words\n"); printf(" MemesToQtBs : Load memes dataset and create quote base\n"); printf(" MkClustNet : Build cluster network from the quote base\n"); return; } #pragma region mkdataset // extract quotes and links and make them into a single file if (ToDo == "mkdataset") { const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "files.txt", "Spinn3r input files (one file per line)"); const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "Spinn3r-dataset.txt", "Output file"); const int MinQtWrdLen = Env.GetIfArgPrefixInt("-w:", 3, "Minimum quote word length"); const TStr UrlFNm = Env.GetIfArgPrefixStr("-u:", "", "Seen url set (THashSet<TMd5Sig>) file name"); const bool UrlOnlyOnce = Env.GetIfArgPrefixBool("-q:", true, "Only keep unique Urls"); //// parse directly from Spinn3r TStr Spinn3rFNm; THashSet<TMd5Sig> SeenUrlSet; if (UrlOnlyOnce && ! 
UrlFNm.Empty()) { // keep track of already seen urls (so that there are no duplicate urls) TFIn FIn(UrlFNm); SeenUrlSet.Load(FIn); } FILE *F = fopen(OutFNm.CStr(), "wt"); TFIn FIn(InFNm); int Items=0; for (int f=0; FIn.GetNextLn(Spinn3rFNm); f++) { TQuoteExtractor QE(Spinn3rFNm.ToTrunc()); printf("Processing %02d: %s [%s]\n", f+1, Spinn3rFNm.CStr(), TExeTm::GetCurTm()); fflush(stdout); for (int item = 0; QE.Next(); item++) { const TMd5Sig PostMd5(QE.PostUrlStr); if (QE.QuoteV.Empty() && QE.LinkV.Empty()) { continue; } // no quotes, no links if (UrlOnlyOnce) { if (SeenUrlSet.IsKey(PostMd5)) { continue; } SeenUrlSet.AddKey(PostMd5); } fprintf(F, "P\t%s\n", QE.PostUrlStr.CStr()); //if (QE.PubTm > TSecTm(2008,8,30) || QE.PubTm < TSecTm(2008,7,25)) { printf("%s\n", QE.PubTm.GetStr().CStr()); } fprintf(F, "T\t%s\n", QE.PubTm.GetYmdTmStr().CStr()); for (int q = 0; q < QE.QuoteV.Len(); q++) { if (TStrUtil::CountWords(QE.QuoteV[q]) >= MinQtWrdLen) { fprintf(F, "Q\t%s\n", QE.QuoteV[q].CStr()); } } for (int l = 0; l < QE.LinkV.Len(); l++) { fprintf(F, "L\t%s\n", QE.LinkV[l].CStr()); } fprintf(F, "\n"); if (item>0 && item % Kilo(100) == 0) { QE.DumpStat(); QE.ExeTm.Tick(); } Items++; } printf("file done. Total %d all posts, %d all items\n", SeenUrlSet.Len(), Items); fflush(stdout); } printf("all done. Saving %d post urls\n", SeenUrlSet.Len()); fflush(stdout); if (! 
SeenUrlSet.Empty()) { TFOut FOut(OutFNm.GetFMid()+".SeenUrlSet"); SeenUrlSet.Save(FOut); } fclose(F); } #pragma endregion mkdataset #pragma region extractsubset // save posts with memes containing particular words else if (ToDo == "extractsubset") { const TStr InFNmWc = Env.GetIfArgPrefixStr("-i:", "memes_*.rar", "Input file prefix"); const bool IsInFNmWc = Env.GetIfArgPrefixBool("-w:", true, "Input is wildcard (else a file with list of input files)"); const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "memes-subset.txt", "Output memes file"); const TStr WordsFNm = Env.GetIfArgPrefixStr("-p:", "phrases-in.txt", "Phrases that memes have to contain"); TChAV CatchMemeV;// = TStr::GetV("great depression", "economic meltdown", "recession had bottomed out", "green shoots", "slow recovery", "gradual recovery"); printf("Loading %s\n", WordsFNm.CStr()); { TFIn FIn(WordsFNm); for (TStr Ln; FIn.GetNextLn(Ln); ) { printf(" %s\n", Ln.GetLc().CStr()); CatchMemeV.Add(Ln.GetLc()); } } printf("%d strings loaded\n", CatchMemeV.Len()); TFOut FOut(OutFNm); TMemesDataLoader Memes(InFNmWc, IsInFNmWc); for (int posts = 0, nsave=0; Memes.LoadNext(); posts++) { bool DoSave = false; for (int m = 0; m < Memes.MemeV.Len(); m++) { for (int i = 0; i < CatchMemeV.Len(); i++) { if (Memes.MemeV[m].SearchStr(CatchMemeV[i]) != -1) { DoSave=true; break; } } if (DoSave) { break; } } if (DoSave) { Memes.SaveTxt(FOut); nsave++; } if (posts % Mega(1) == 0) { printf("%dm posts, %d saved\n", posts/Mega(1), nsave); FOut.Flush(); } } } #pragma endregion extractsubset #pragma region memestoqtbs // load memes dataset (MkDataset) and create quote base else if (ToDo == "memestoqtbs") { const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "201007_201107.txt", "Input Memes dataset files"); const TStr MediaUrlFNm = Env.GetIfArgPrefixStr("-u:", "news_media.txt", "Fule with news media urls"); const TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output file name prefix"); const int MinWrdLen = Env.GetIfArgPrefixInt("-l:", 
4, "Min quote word length"); const int MinMemeFq = Env.GetIfArgPrefixInt("-f:", 5, "Min meme frequency"); const TStr MinTmStr = Env.GetIfArgPrefixStr("-mint:", "20100714", "Min time of quotes, format = YYYYMMDD"); const TStr MaxTmStr = Env.GetIfArgPrefixStr("-maxt:", "20110728", "Max time of quotes, format = YYYYMMDD"); TSecTm MinTm(atoi(MinTmStr.GetSubStr(0,3).CStr()),atoi(MinTmStr.GetSubStr(4,5).CStr()),atoi(MinTmStr.GetSubStr(6,7).CStr())); TSecTm MaxTm(atoi(MaxTmStr.GetSubStr(0,3).CStr()),atoi(MaxTmStr.GetSubStr(4,5).CStr()),atoi(MaxTmStr.GetSubStr(6,7).CStr())); PQuoteBs QtBs = TQuoteBs::New(); int HashTableSize = 100; // 100 for each quarter, for one year data, use 400 int UrlSetSize = 4 * HashTableSize; QtBs->ConstructQtBsZarya(InFNm, Pref, MediaUrlFNm, MinTm, MaxTm, MinWrdLen, MinMemeFq, HashTableSize, UrlSetSize); } #pragma endregion memestoqtbs #pragma region mkclustnet // make cluster network else if (ToDo == "mkclustnet") { TStr InQtBsNm = Env.GetIfArgPrefixStr("-i:", "", "Input quote base file name"); TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output network/updated QtBs filename"); TStr BlackListFNm = Env.GetIfArgPrefixStr("-b:", "quote_blacklist.txt", "Blacklist file name"); bool IsShglReady = Env.GetIfArgPrefixBool("-shglready:", false, "Indicate whether shingle hash table is ready"); bool IsNetReady = Env.GetIfArgPrefixBool("-netready:", false, "Indicate whether cluster net is ready"); double BktThresh = Env.GetIfArgPrefixFlt("-bktthresh:", 0.4, "Threshold for bad shingle bucket elimination"); double MxTmDelay = Env.GetIfArgPrefixFlt("-delaythresh:", 5, "Max time delay between two quotes in the same cluster"); double MxTmDev = Env.GetIfArgPrefixFlt("-devthresh:", 3, "Max time deviation for a quote to be specific rather than general"); double RefineThresh = Env.GetIfArgPrefixFlt("-refinethresh:", 0.2, "Threshold for merging quote cluster in refining process"); const int MinWrdLen = Env.GetIfArgPrefixInt("-minl:", 4, "Min quote word length"); 
const int MaxWrdLen = Env.GetIfArgPrefixInt("-maxl:", 200, "Max quote word length"); const int MinMemeFq = Env.GetIfArgPrefixInt("-mf:", 5, "Min meme frequency"); const int MinClustFq = Env.GetIfArgPrefixInt("-cf:", 50, "Min quote cluster frequency"); // Load quote base PQuoteBs QtBs; if (TZipIn::IsZipFNm(InQtBsNm)) { TZipIn ZipIn(InQtBsNm); QtBs = TQuoteBs::Load(ZipIn); } else { TFIn FIn(InQtBsNm); QtBs = TQuoteBs::Load(FIn); } // Cluster the quotes QtBs->ClusterQuotes(MinMemeFq, MinWrdLen, MaxWrdLen, BlackListFNm, Pref, IsShglReady, IsNetReady, BktThresh, MxTmDelay, MxTmDev, RefineThresh); // Dump the clusters bool SkipUrl = true, FlashDisp = true; QtBs->DumpQuoteClusters(MinWrdLen, MinClustFq, SkipUrl, FlashDisp, Pref); } #pragma endregion mkclustnet #pragma region memeclust else if (ToDo.SearchStr(TStr("memeclust")) >= 0) { const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "201101.txt", "Input Memes dataset files"); const TStr MediaUrlFNm = Env.GetIfArgPrefixStr("-u:", "news_media.txt", "Fule with news media urls"); TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output file name prefix"); const int MinWrdLen = Env.GetIfArgPrefixInt("-minl:", 4, "Min quote word length"); const int MaxWrdLen = Env.GetIfArgPrefixInt("-maxl:", 200, "Max quote word length"); const int MinMemeFq = Env.GetIfArgPrefixInt("-f:", 5, "Min meme frequency"); const int MinClustFq = Env.GetIfArgPrefixInt("-cf:", 50, "Min quote cluster frequency"); TStr BlackListFNm = Env.GetIfArgPrefixStr("-b:", "quote_blacklist.txt", "Blacklist file name"); bool IsQtBsReady = Env.GetIfArgPrefixBool("-qtbsready:", false, "Indicate whether quote base is ready and can be loaded readily"); bool IsShglReady = Env.GetIfArgPrefixBool("-shglready:", false, "Indicate whether shingle hash table is ready"); bool IsNetReady = Env.GetIfArgPrefixBool("-netready:", false, "Indicate whether cluster net is ready"); double BktThresh = Env.GetIfArgPrefixFlt("-bktthresh:", 0.4, "Threshold for bad shingle bucket elimination"); 
double MxTmDelay = Env.GetIfArgPrefixFlt("-delaythresh:", 5, "Max time delay between two quotes in the same cluster"); double MxTmDev = Env.GetIfArgPrefixFlt("-devthresh:", 3, "Max time deviation for a quote to be specific rather than general"); double RefineThresh = Env.GetIfArgPrefixFlt("-refinethresh:", 0.2, "Threshold for merging quote cluster in refining process"); const TStr MinTmStr = Env.GetIfArgPrefixStr("-mint:", "20010101", "Min time of quotes, format = YYYYMMDD"); const TStr MaxTmStr = Env.GetIfArgPrefixStr("-maxt:", "20200101", "Max time of quotes, format = YYYYMMDD"); TSecTm MinTm(atoi(MinTmStr.GetSubStr(0,3).CStr()),atoi(MinTmStr.GetSubStr(4,5).CStr()),atoi(MinTmStr.GetSubStr(6,7).CStr())); TSecTm MaxTm(atoi(MaxTmStr.GetSubStr(0,3).CStr()),atoi(MaxTmStr.GetSubStr(4,5).CStr()),atoi(MaxTmStr.GetSubStr(6,7).CStr())); // Construct the quote base from Zarya data PQuoteBs QtBs = TQuoteBs::New(); if (!IsQtBsReady) { int HashTableSize = 100; // 100 for each quarter, for one year data, use 400 if (ToDo == "memeclustzarya") { int UrlSetSize = 4 * HashTableSize; QtBs->ConstructQtBsZarya(InFNm, Pref, MediaUrlFNm, MinTm, MaxTm, MinWrdLen, MinMemeFq, HashTableSize, UrlSetSize); } else if (ToDo == "memeclustqtonly") { QtBs->ConstructQtBsQtOnly(InFNm, Pref, MediaUrlFNm, MinWrdLen, MinMemeFq, HashTableSize); } else if (ToDo == "memeclustqttime") { QtBs->ConstructQtBsQtTime(InFNm, Pref, MediaUrlFNm, MinWrdLen, MinMemeFq, HashTableSize); } else { printf("Please specify one of the three options for -do : memeclustzarya, memeclustqtonly, memeclustqttime!\n"); return; } } else { TStr InQtBsNm = TStr::Fmt("%s-w%dmfq%d.QtBs", Pref.CStr(), MinWrdLen, MinMemeFq); if (TZipIn::IsZipFNm(InQtBsNm)) { TZipIn ZipIn(InQtBsNm); QtBs = TQuoteBs::Load(ZipIn); } else { TFIn FIn(InQtBsNm); QtBs = TQuoteBs::Load(FIn); } } // Cluster the quotes QtBs->ClusterQuotes(MinMemeFq, MinWrdLen, MaxWrdLen, BlackListFNm, Pref, IsShglReady, IsNetReady, BktThresh, MxTmDelay, MxTmDev, RefineThresh); // 
Dump the clusters bool SkipUrl = true, FlashDisp = true; QtBs->DumpQuoteClusters(MinWrdLen, MinClustFq, SkipUrl, FlashDisp, Pref); } #pragma endregion memeclust }
//---------------------------------------------------------------------------- void Node::WarmUpRendering(Renderer* pRenderer) { #ifndef WIRE_WII // Wii does not need to warm up by submitting draw calls WIRE_ASSERT(pRenderer); UpdateGS(0, true, false); Vector3F cameraLocation = WorldBound->GetCenter(); cameraLocation.Z() += WorldBound->GetRadius(); Vector3F viewDirection = -Vector3F::UNIT_Z; Vector3F up = Vector3F::UNIT_Y; Vector3F right = viewDirection.Cross(up); CameraPtr spCamera = WIRE_NEW Camera; spCamera->SetFrame(cameraLocation, viewDirection, up, right); Float fieldOfView = 60.0F; Float aspectRatio = 2; Float nearPlane = 0.1F; Float farPlane = WorldBound->GetRadius() * 2.0F; spCamera->SetFrustum(fieldOfView, aspectRatio, nearPlane, farPlane); CullerSorting culler; culler.SetCamera(spCamera); culler.ComputeVisibleSet(this); pRenderer->PreDraw(spCamera); // draw scene to warm up batching buffers pRenderer->Draw(culler.GetVisibleSets()); // collect and draw all materials separately so none will be missed // by CULL_ALWAYS or Switch/LOD nodes. THashSet<Material*> materials; TStack<Node*> scene(1000); scene.Push(this); while (!scene.IsEmpty()) { Node* pNode = NULL; scene.Pop(pNode); RenderObject* pRenderObject = pNode->GetRenderObject(); if (pRenderObject && pRenderObject->GetMaterial()) { materials.Insert(pRenderObject->GetMaterial()); } for (UInt i = 0; i < pNode->GetQuantity(); i++) { Node* pChild = DynamicCast<Node>(pNode->GetChild(i)); if (pChild) { scene.Push(pChild); } } } RenderObjectPtr spCube = StandardMesh::CreateCube24(4, pRenderer-> GetMaxTextureStages(), true); THashSet<Material*>::Iterator it(&materials); Transformation transformation; transformation.SetTranslate(cameraLocation - Vector3F(0, 0, 3)); for (Material** pMaterial = it.GetFirst(); pMaterial; pMaterial = it.GetNext()) { spCube->SetMaterial(*pMaterial); pRenderer->Draw(spCube, transformation); } pRenderer->PostDraw(); #endif }
void LSH::MinHash(TQuoteBase *QB, THashSet<TMd5Sig>& Shingles, TVec<THash<TMd5Sig, TIntSet> >& SignatureBandBuckets) { Err("Creating buckets...\n"); THash < TMd5Sig, TIntV > Signatures; ComputeSignatures(Shingles, Signatures, NumBands * BandSize); // bucket creation for (int i = 0; i < NumBands; ++i) { SignatureBandBuckets.Add(THash<TMd5Sig, TIntSet>()); } // bucket filling int NumShingles = Shingles.Len(); THash<TInt, TQuote> Quotes; QB->GetIdToTQuotes(Quotes); THash<TInt, TQuote>::TIter CurI = Quotes.BegI(); THash<TInt, TQuote>::TIter EndI = Quotes.EndI(); TQuote Q; // SKYFALL for (; CurI < EndI; CurI++) { Q = CurI.GetDat(); TStrV Content; Q.GetParsedContent(Content); TInt Id = Q.GetId(); // signature for quote int ContentLen = Content.Len(); TVec < TIntV > Signature; for (int i = 0; i < ContentLen; i++) { const TMd5Sig ShingleMd5(Content[i]); Signature.Add(Signatures.GetDat(ShingleMd5)); } // place in bucket if (ContentLen < WordWindow) { for (int i = 0; i < NumBands; ++i) { TStr Sig; for (int j = 0; j < BandSize; ++j) { int CurSig = i * BandSize + j; TInt min = NumShingles; for (int k = 0; k < ContentLen; k++) { if (Signature[k][CurSig] < min) { min = Signature[k][CurSig]; } } Sig += min.GetStr() + "-"; } //Err(Sig.CStr()); const TMd5Sig SigMd5(Sig); TIntSet Bucket; SignatureBandBuckets[i].IsKeyGetDat(SigMd5, Bucket); Bucket.AddKey(Id); SignatureBandBuckets[i].AddDat(SigMd5, Bucket); } } else { } } Err("Minhash step complete!\n"); }
///////////////////////////////////////////////// // TCommunity implementation void TCliqueOverlap::GetRelativeComplement(const THashSet<TInt>& A, const THashSet<TInt>& B, THashSet<TInt>& Complement) { for (THashSet<TInt>::TIter it=A.BegI(); it<A.EndI(); it++) { const int nId = it.GetKey(); if (!B.IsKey(nId)) Complement.AddKey(nId); } }
void TCliqueOverlap::GetNbrs(int NId, THashSet<TInt>& Nbrs) const{ TUNGraph::TNodeI node = m_G->GetNI(NId); int deg = node.GetDeg(); for (int i=0; i<deg; i++) Nbrs.AddKey(node.GetNbrNId(i)); }
/** * Used for benchmarking sorting by source algorithm. * Takes as input starting point of * a top cascade and outputs time taken for casacade detection. * Input : Source, Dest, Start, Duration * Output : Prints the time for cascade detection */ int main(int argc,char* argv[]) { TTableContext Context; Schema TimeS; TimeS.Add(TPair<TStr,TAttrType>("Source",atInt)); TimeS.Add(TPair<TStr,TAttrType>("Dest",atInt)); TimeS.Add(TPair<TStr,TAttrType>("Start",atInt)); TimeS.Add(TPair<TStr,TAttrType>("Duration",atInt)); PTable P1 = TTable::LoadSS(TimeS,"./../../../../datasets/temporal/yemen_call_201001.txt",&Context,' '); TIntV MapV; TStrV SortBy; SortBy.Add("Source"); P1->Order(SortBy); TIntV Source; // Sorted vec of start time P1->ReadIntCol("Source",Source); for (TRowIterator RI = P1->BegRI(); RI < P1->EndRI(); RI++) { MapV.Add(RI.GetRowIdx()); } // Attribute to Int mapping TInt SIdx = P1->GetColIdx("Source"); TInt DIdx = P1->GetColIdx("Dest"); TInt StIdx = P1->GetColIdx("Start"); TInt DuIdx = P1->GetColIdx("Duration"); int W = atoi(argv[1]); int len = 0; // Find the starting point int TSource = atoi(argv[2]); int TDest = atoi(argv[3]); int TStart = atoi(argv[4]); int TDur = atoi(argv[5]); TInt RIdx; for (TRowIterator RI = P1->BegRI(); RI < P1->EndRI(); RI++) { RIdx = RI.GetRowIdx(); int RSource = P1->GetIntValAtRowIdx(SIdx,RIdx).Val; int RDest = P1->GetIntValAtRowIdx(DIdx,RIdx).Val; int RStart = P1->GetIntValAtRowIdx(StIdx,RIdx).Val; int RDur = P1->GetIntValAtRowIdx(DuIdx,RIdx).Val; if (TSource == RSource && TDest == RDest && TStart == RStart && TDur == RDur) break; } // Start building the cascade from the start point clock_t st,et; st = clock(); for (int i = 0; i < 1; i++) { THashSet<TInt> VisitedH; TSnapQueue<TInt> EventQ; EventQ.Push(RIdx); VisitedH.AddKey(RIdx); while (!EventQ.Empty()) { TInt CIdx = EventQ.Top(); EventQ.Pop(); int CDest = P1->GetIntValAtRowIdx(DIdx,CIdx).Val; int CStart = P1->GetIntValAtRowIdx(StIdx,CIdx).Val; int CDur = 
P1->GetIntValAtRowIdx(DuIdx,CIdx).Val; // In line binary search int val = CDest; int lo = 0; int hi = Source.Len() - 1; int index = -1; while (hi >= lo) { int mid = lo + (hi - lo)/2; if (Source.GetVal(mid) > val) { hi = mid - 1;} else if (Source.GetVal(mid) < val) { lo = mid + 1;} else { index = mid; hi = mid - 1;} } // End of binary search int BIdx = index; for(int i = BIdx; i < Source.Len(); i++) { int PId = MapV.GetVal(i).Val; if (! VisitedH.IsKey(PId)) { int TSource = P1->GetIntValAtRowIdx(SIdx,PId).Val; int TStart = P1->GetIntValAtRowIdx(StIdx,PId).Val; if (TSource != CDest) { break; } if (TStart >= (CDur + CStart) && TStart - (CDur + CStart) <= W) { VisitedH.AddKey(PId); EventQ.Push(PId); } } } } len = VisitedH.Len(); } et = clock(); float diff = ((float) et - (float) st)/CLOCKS_PER_SEC; printf("Size %d,Time %f\n",len,diff); return 0; }