int main(int argc, char *argv[]) { LogOutput Log; THash<TStr, TStr> Arguments; ArgumentParser::ParseArguments(argc, argv, Arguments, Log); TStr StartString = ArgumentParser::GetArgument(Arguments, "start", "2009-01-14"); TStr EndString = ArgumentParser::GetArgument(Arguments, "end", "2012-09-30"); TStr QBDBCDirectory = ArgumentParser::GetArgument(Arguments, "qbdbc", "/lfs/1/tmp/curis/QBDBC-final/"); TStr QBDBDirectory = ArgumentParser::GetArgument(Arguments, "qbdb", "/lfs/1/tmp/curis/QBDB/"); TSecTm StartDate = TSecTm::GetDtTmFromYmdHmsStr(StartString); TSecTm EndDate = TSecTm::GetDtTmFromYmdHmsStr(EndString); TQuoteBase QB; TDocBase DB; TClusterBase CB; PNGraph QGraph; TQuoteBase NewQB; TDocBase NewDB; TSecTm CurrentDate = StartDate; TInt NumUnprocessedQuotes = 0; TInt NumUnprocessedDocs = 0; TInt NumQuotes = 0; TInt NumClusters = 0; TInt NumTopClusters = 0; TInt NumDiscardedClustersByPeak = 0; TInt NumDiscardedClustersByVariant = 0; TInt NumRemainingClusters = 0; TInt NumVariants = 0; TInt NumDocs = 0; TInt Count = 0; while(CurrentDate < EndDate) { if (Count % 100 == 0) Err("%d days evaluated!\n", Count.Val); TDataLoader::LoadCumulative(QBDBCDirectory, CurrentDate.GetDtYmdStr(), QB, DB, CB, QGraph); TDataLoader::LoadQBDB(QBDBDirectory, CurrentDate.GetDtYmdStr(), NewQB, NewDB); if (NewQB.Len() > 0) { NumUnprocessedQuotes += NewQB.Len(); NumUnprocessedDocs += NewDB.Len(); } if (QB.Len() > 0) { Count++; Err("Loaded base for %s! 
Calculating stats...\n", CurrentDate.GetDtYmdStr().CStr()); NumQuotes += QB.Len(); NumClusters += CB.Len(); NumDocs += DB.Len(); TIntV TopClusters; CB.GetTopClusterIdsByFreq(TopClusters); int NmTopClusters = TopClusters.Len(); NumTopClusters += NmTopClusters; for (int i = 0; i < NmTopClusters; i++) { TCluster C; CB.GetCluster(TopClusters[i], C); if (C.GetDiscardState() == 1) { NumDiscardedClustersByPeak++; } else if (C.GetDiscardState() == 2) { NumDiscardedClustersByVariant++; } else { NumRemainingClusters++; NumVariants += C.GetNumUniqueQuotes(); } } } CurrentDate.AddDays(1); } Err("Number of quotes processed through in total: %d\n", NumUnprocessedQuotes.Val); Err("Number of docs processed through in total: %d\n", NumUnprocessedDocs.Val); Err("Number of quotes in total: %d\n", NumQuotes.Val); Err("Number of clusters in total: %d\n", NumClusters.Val); Err("Number of docs in total: %d\n", NumDocs.Val); Err("Number of top clusters in total: %d\n", NumTopClusters.Val); Err("Number of discarded clusters by peak in total: %d\n", NumDiscardedClustersByPeak.Val); Err("Number of discarded clusters by variant total: %d\n", NumDiscardedClustersByVariant.Val); Err("Number of top clusters remaining in total: %d\n", NumRemainingClusters.Val); Err("Number of top variants found: %d\n", NumVariants.Val); Err("=============\n===========\n"); Err("Number of days: %d\n", Count.Val); double AvgQuotes = (NumQuotes.Val * 1.0 / Count.Val); double AvgClusters = (NumClusters.Val * 1.0 / Count.Val); double AvgDocs = (NumDocs.Val * 1.0 / Count.Val); double AvgTopClusters = (NumTopClusters.Val * 1.0 / Count.Val); double AvgDiscardedPeaks = (NumDiscardedClustersByPeak.Val * 1.0 / Count.Val); double AvgDiscardedVariants = (NumDiscardedClustersByVariant.Val * 1.0 / Count.Val); double AvgRemaining = (NumRemainingClusters.Val * 1.0 / Count.Val); double AvgNumVariants = (NumVariants.Val * 1.0 / Count.Val); Err("Average number of quotes in total: %f\n", AvgQuotes); Err("Average number of clusters in 
total: %f\n", AvgClusters); Err("Average number of docs in total: %f\n", AvgDocs); Err("Average number of top clusters in total: %f\n", AvgTopClusters); Err("Average number of discarded clusters by peak in total: %f\n", AvgDiscardedPeaks); Err("Average number of discarded clusters by variant total: %f\n", AvgDiscardedVariants); Err("Average number of top clusters remaining in total: %f\n", AvgRemaining); Err("Average number of top variants found: %f\n", AvgNumVariants); return 0; }
int main(int argc, char* argv[]) { TTableContext Context; Schema NetworkS; NetworkS.Add(TPair<TStr, TAttrType>("Year", atInt)); NetworkS.Add(TPair<TStr, TAttrType>("Month", atInt)); NetworkS.Add(TPair<TStr, TAttrType>("DayOfMonth", atInt)); NetworkS.Add(TPair<TStr, TAttrType>("DayOfWeek", atInt)); NetworkS.Add(TPair<TStr, TAttrType>("DepTime", atStr)); NetworkS.Add(TPair<TStr, TAttrType>("CRSDepTime", atStr)); NetworkS.Add(TPair<TStr, TAttrType>("ArrTime", atStr)); NetworkS.Add(TPair<TStr, TAttrType>("CRSArrTime", atStr)); NetworkS.Add(TPair<TStr, TAttrType>("UniqueCarrier", atStr)); NetworkS.Add(TPair<TStr, TAttrType>("FlightNum", atStr)); NetworkS.Add(TPair<TStr, TAttrType>("TailNum", atStr)); NetworkS.Add(TPair<TStr, TAttrType>("ActualElapsedTime", atInt)); NetworkS.Add(TPair<TStr, TAttrType>("CRSElapsedTime", atInt)); NetworkS.Add(TPair<TStr, TAttrType>("AirTime", atInt)); NetworkS.Add(TPair<TStr, TAttrType>("ArrDelay", atInt)); NetworkS.Add(TPair<TStr, TAttrType>("DepDelay", atInt)); NetworkS.Add(TPair<TStr, TAttrType>("Origin", atStr)); NetworkS.Add(TPair<TStr, TAttrType>("Dest", atStr)); NetworkS.Add(TPair<TStr, TAttrType>("Distance", atStr)); NetworkS.Add(TPair<TStr, TAttrType>("TaxiIn", atInt)); NetworkS.Add(TPair<TStr, TAttrType>("TaxiOut", atInt)); NetworkS.Add(TPair<TStr, TAttrType>("Cancelled", atStr)); NetworkS.Add(TPair<TStr, TAttrType>("CancellationCode", atStr)); NetworkS.Add(TPair<TStr, TAttrType>("Diverted", atStr)); NetworkS.Add(TPair<TStr, TAttrType>("CarrierDelay", atInt)); NetworkS.Add(TPair<TStr, TAttrType>("WeatherDelay", atInt)); NetworkS.Add(TPair<TStr, TAttrType>("NASDelay", atInt)); NetworkS.Add(TPair<TStr, TAttrType>("SecurityDelay", atInt)); NetworkS.Add(TPair<TStr, TAttrType>("LateAircraftDelay", atInt)); TIntV RelevantCols; RelevantCols.Add(0); RelevantCols.Add(1); RelevantCols.Add(2);RelevantCols.Add(3); RelevantCols.Add(4); RelevantCols.Add(5); RelevantCols.Add(6); RelevantCols.Add(7); RelevantCols.Add(8); RelevantCols.Add(9); 
RelevantCols.Add(10); RelevantCols.Add(11); RelevantCols.Add(12); RelevantCols.Add(13); RelevantCols.Add(14); RelevantCols.Add(15); RelevantCols.Add(16); RelevantCols.Add(17); RelevantCols.Add(18); RelevantCols.Add(19); RelevantCols.Add(20); RelevantCols.Add(21); RelevantCols.Add(22); RelevantCols.Add(23); RelevantCols.Add(24); RelevantCols.Add(25); RelevantCols.Add(26); RelevantCols.Add(27); RelevantCols.Add(28); PTable P = TTable::LoadSS(NetworkS, "table/2007.csv", Context, RelevantCols, ',', false); TStrV SV; TStrV DV; TStrV VE; double start = omp_get_wtime(); PNSparseNet G = TSnap::ToNetwork<PNSparseNet>(P, TStr("Origin"), TStr("Dest"), SV, DV, VE, aaLast); double end = omp_get_wtime(); printf("Conversion time without attributes %f\n", (end-start)); start = omp_get_wtime(); TSnap::AddAttrTable<PNSparseNet>(P, G, TStr("Origin"), TStr("Dest"), SV, DV, VE, aaLast); end = omp_get_wtime(); printf("Conversion time with attributes %f\n", (end-start)); /*(PTable Table, PGraph& Graph, const TStr& SrcCol, const TStr& DstCol, TStrV& SrcAttrV, TStrV& DstAttrV, TStrV& EdgeAttrV, TAttrAggr AggrPolicy, TInt DefaultInt, TFlt DefaultFlt, TStr DefaultStr)*/ }
void TNGramBs::GetNGramIdV( const TStr& HtmlStr, TIntV& NGramIdV, TIntPrV& NGramBEChXPrV) const { // create MxNGramLen queues TVec<TIntQ> WIdQV(MxNGramLen); TVec<TIntPrQ> BEChXPrQV(MxNGramLen); for (int NGramLen=1; NGramLen<MxNGramLen; NGramLen++){ WIdQV[NGramLen].Gen(100*NGramLen, NGramLen+1); BEChXPrQV[NGramLen].Gen(100*NGramLen, NGramLen+1); } bool AllWIdQClrP=true; // extract words from text-string PSIn HtmlSIn=TStrIn::New(HtmlStr, false); THtmlLx HtmlLx(HtmlSIn); while (HtmlLx.Sym!=hsyEof){ if ((HtmlLx.Sym==hsyStr)||(HtmlLx.Sym==hsyNum)){ // get word-string & word-id TStr WordStr=HtmlLx.UcChA; int WId; int SymBChX=HtmlLx.SymBChX; int SymEChX=HtmlLx.SymEChX; if ((SwSet.Empty())||(!SwSet->IsIn(WordStr))){ if (!Stemmer.Empty()){ WordStr=Stemmer->GetStem(WordStr);} if (IsWord(WordStr, WId)){ if (!IsSkipWord(WId)){ NGramIdV.Add(0+WId); // add single word NGramBEChXPrV.Add(TIntPr(SymBChX, SymEChX)); // add positions for (int NGramLen=1; NGramLen<MxNGramLen; NGramLen++){ TIntQ& WIdQ=WIdQV[NGramLen]; TIntPrQ& BEChXPrQ=BEChXPrQV[NGramLen]; WIdQ.Push(WId); BEChXPrQ.Push(TIntPr(SymBChX, SymEChX)); AllWIdQClrP=false; // if queue full if (WIdQ.Len()==NGramLen+1){ // create sequence TIntV WIdV; WIdQ.GetSubValVec(0, WIdQ.Len()-1, WIdV); TIntPrV BEChXPrV; BEChXPrQ.GetSubValVec(0, BEChXPrQ.Len()-1, BEChXPrV); // add ngram-id or reset queues int WIdVP; if (WIdVToFqH.IsKey(WIdV, WIdVP)){ // if sequence is frequent int NGramId=GetWords()+WIdVP; // get sequence ngram-id NGramIdV.Add(NGramId); // add sequence ngram-id NGramBEChXPrV.Add(TIntPr(BEChXPrV[0].Val1, BEChXPrV.Last().Val2)); // add positions } } } } } else { // break queue sequences if infrequent word occures if (!AllWIdQClrP){ for (int NGramLen=1; NGramLen<MxNGramLen; NGramLen++){ TIntQ& WIdQ=WIdQV[NGramLen]; TIntPrQ& BEChXPrQ=BEChXPrQV[NGramLen]; if (!WIdQ.Empty()){WIdQ.Clr(); BEChXPrQ.Clr();} } AllWIdQClrP=true; } } } } // get next symbol HtmlLx.GetSym(); } }
/// Fills EIdV with the key of every entry in the edge hash table EdgeH.
void TNEANet::GetEIdV(TIntV& EIdV) const {
  // reset the output; presumably reserves room for GetEdges() ids with length 0
  EIdV.Gen(GetEdges(), 0);
  int EdgeKeyId = EdgeH.FFirstKeyId();
  while (EdgeH.FNextKeyId(EdgeKeyId)) {
    EIdV.Add(EdgeH.GetKey(EdgeKeyId));
  }
}
///////////////////////////////////////////////// // BLEU-score double TEvalScoreBleu::Eval(const PTransCorpus& TransCorpus, const TIntV& _SentIdV) { // check if the corpus has translations IAssert(TransCorpus->IsTrans()); // ngram counts (cliped and full) TIntH ClipCountNGramH, CountNGramH; // candidate and effective reference length int FullTransLen = 0, FullRefLen = 0; // iterate over sentences TIntV SentIdV = _SentIdV; if (SentIdV.Empty()) { TransCorpus->GetSentIdV(SentIdV); } const int Sents = SentIdV.Len(); for (int SentIdN = 0; SentIdN < Sents; SentIdN++) { const int SentId = SentIdV[SentIdN]; // tokenize translation TIntV TransWIdV; Parse(TransCorpus->GetTransStr(SentId), TransWIdV); TIntH TransNGramH; GetNGramH(TransWIdV, MxNGramLen, TransNGramH); TIntH FreeTransNGramH = TransNGramH; // number of non-matched ngrams // counters for getting the closest length of reference sentences const int TransLen = TransWIdV.Len(); int BestLen = 0, BestLenDiff = TInt::Mx; // go over reference translations and count ngram matches TStrV RefTransStrV = TransCorpus->GetRefTransStrV(SentId); for (int RefN = 0; RefN < RefTransStrV.Len(); RefN++) { // parse reference translation sentence TIntV RefWIdV; Parse(RefTransStrV[RefN], RefWIdV); TIntH RefNGramH; GetNGramH(RefWIdV, MxNGramLen, RefNGramH); // check for matches int TransNGramKeyId = TransNGramH.FFirstKeyId(); while(TransNGramH.FNextKeyId(TransNGramKeyId)) { const int NGramId = TransNGramH.GetKey(TransNGramKeyId); const int FreeTransNGrams = FreeTransNGramH(NGramId); if (RefNGramH.IsKey(NGramId) && (FreeTransNGrams>0)) { // ngram match and still some free ngrams left to clip const int RefNGrams = RefNGramH(NGramId); FreeTransNGramH(NGramId) = TInt::GetMx(0, FreeTransNGrams - RefNGrams); } } // check the length difference const int RefLen = RefWIdV.Len(); const int LenDiff = TInt::Abs(TransLen - RefLen); if (LenDiff < BestLenDiff) { BestLen = RefLen; BestLenDiff = LenDiff; } } // count ngrams int TransNGramKeyId = 
TransNGramH.FFirstKeyId(); while(TransNGramH.FNextKeyId(TransNGramKeyId)) { // get ngram const int NGramId = TransNGramH.GetKey(TransNGramKeyId); IAssert(NGramId != -1); // check if two hash tables are aligned (should be...) const int FreeNGramId = FreeTransNGramH.GetKey(TransNGramKeyId); IAssert(NGramId == FreeNGramId); // get ngram count and clip-count const int Count = TransNGramH[TransNGramKeyId]; const int ClipCount = Count - FreeTransNGramH[TransNGramKeyId]; // add ngram to the coprus ngram counts CountNGramH.AddDat(NGramId) += Count; ClipCountNGramH.AddDat(NGramId) += ClipCount; } // count length FullTransLen += TransLen; FullRefLen += BestLen; } // calcualte ngram precisions TIntV ClipCountV(MxNGramLen); ClipCountV.PutAll(0); int ClipCountKeyId = ClipCountNGramH.FFirstKeyId(); while (ClipCountNGramH.FNextKeyId(ClipCountKeyId)) { const int NGramId = ClipCountNGramH.GetKey(ClipCountKeyId); const int NGramLen = GetNGramLen(NGramId); IAssert(0 < NGramLen && NGramLen <= MxNGramLen); ClipCountV[NGramLen-1] += ClipCountNGramH[ClipCountKeyId]; } TIntV CountV(MxNGramLen); CountV.PutAll(0); int CountKeyId = CountNGramH.FFirstKeyId(); while (CountNGramH.FNextKeyId(CountKeyId)) { const int NGramId = CountNGramH.GetKey(CountKeyId); const int NGramLen = GetNGramLen(NGramId); IAssert(0 < NGramLen && NGramLen <= MxNGramLen); CountV[NGramLen-1] += CountNGramH[CountKeyId]; } TFltV PrecV(MxNGramLen, 0); for (int NGramLen = 0; NGramLen < MxNGramLen; NGramLen++) { const int ClipCount = ClipCountV[NGramLen]; const int Count = CountV[NGramLen]; const double Prec = (Count > 0) ? 
double(ClipCount)/double(Count) : 0.0; PrecV.Add(Prec); //printf("%d-gram Match:%d Total:%d Prec:%.5f\n", NGramLen+1, ClipCount, Count, Prec); } // calcualte brevity penalty double LogBP = TFlt::GetMn(0.0, 1.0 - double(FullRefLen)/double(FullTransLen)); double BP = exp(LogBP); // calculate full BLEU score double BleuScore = BP; const double Wgt = 1.0 / double(MxNGramLen); for (int NGramLen = 0; NGramLen < MxNGramLen; NGramLen++) { BleuScore *= pow(PrecV[NGramLen], Wgt); } printf("BLEU Score: %.5f\n", BleuScore); // done! return BleuScore; }
///////////////////////////////////////////////// // NGram-Base TStr TNGramBs::GetWIdVStr(const TIntV& WIdV) const { TChA ChA; for (int WIdN=0; WIdN<WIdV.Len(); WIdN++){ if (WIdN>0){ChA+=' '/*'_'*/;} ChA+=GetWordStr(WIdV[WIdN]);} return ChA; }
void DemoFullBfsDfs() { const int NNodes = 500; PGraph G = GenFull<PGraph>(NNodes); PNGraph GOut; int TreeSz, TreeDepth; // Get BFS tree from first node without following links (demos different options) GOut = GetBfsTree(G, 1, false, false); GetSubTreeSz(G, 1, false, false, TreeSz, TreeDepth); printf("FollowOut=false, FollowIn=false, GOut->GetNodes() == %d, GOut->GetEdges() = %d\n", GOut->GetNodes(), G->GetEdges()); printf("TreeSz == %d, TreeDepth = %d\n", TreeSz, TreeDepth); GOut = GetBfsTree(G, NNodes-1, true, true); GetSubTreeSz(G, 1, true, true, TreeSz, TreeDepth); printf("FollowOut=true, FollowIn=true, GOut->GetNodes() == %d, GOut->GetEdges() = %d\n", GOut->GetNodes(), G->GetEdges()); printf("TreeSz == %d, TreeDepth = %d\n", TreeSz, TreeDepth); GOut = GetBfsTree(G, NNodes/2, true, false); GetSubTreeSz(G, 1, true, false, TreeSz, TreeDepth); printf("FollowOut=true, FollowIn=false, GOut->GetNodes() == %d, GOut->GetEdges() = %d\n", GOut->GetNodes(), G->GetEdges()); printf("TreeSz == %d, TreeDepth = %d\n", TreeSz, TreeDepth); GOut = GetBfsTree(G, 1, false, true); GetSubTreeSz(G, 1, false, true, TreeSz, TreeDepth); printf("FollowOut=false, FollowIn=true, GOut->GetNodes() == %d, GOut->GetEdges() = %d\n", GOut->GetNodes(), G->GetEdges()); printf("TreeSz == %d, TreeDepth = %d\n", TreeSz, TreeDepth); TIntV NIdV; int StartNId, Hop, Nodes; StartNId = 1; Hop = 1; Nodes = GetNodesAtHop(G, StartNId, Hop, NIdV, HasGraphFlag(typename PGraph::TObj, gfDirected)); printf("StartNId = %d, Nodes = %d, GetNodesAtHop NIdV.Len() = %d\n", StartNId, Nodes, NIdV.Len()); TIntPrV HopCntV; Nodes = GetNodesAtHops(G, StartNId, HopCntV, HasGraphFlag(typename PGraph::TObj, gfDirected)); printf("StartNId = %d, Nodes = %d, GetNodesAtHops HopCntV.Len() = %d\n", StartNId, Nodes, HopCntV.Len()); int Length, SrcNId, DstNId; SrcNId = 1; DstNId = NNodes-1; Length = GetShortPath(G, SrcNId, DstNId, HasGraphFlag(typename PGraph::TObj, gfDirected)); printf("SPL Length = %d\n", Length); TIntH NIdToDistH; int 
MaxDist = 9; Length = GetShortPath(G, SrcNId, NIdToDistH, HasGraphFlag(typename PGraph::TObj, gfDirected), MaxDist); // for (int i = 0; i < min(5,NIdToDistH.Len()); i++) { // printf("NIdToDistH[%d] = %d\n", i, NIdToDistH[i].Val); // } int FullDiam; double EffDiam, AvgDiam; int NTestNodes = 10; for (int IsDir = 0; IsDir < 2; IsDir++) { printf("IsDir = %d:\n", IsDir); FullDiam = GetBfsFullDiam(G, NTestNodes, IsDir); printf("FullDiam = %d\n", FullDiam); EffDiam = GetBfsEffDiam (G, NTestNodes, IsDir); printf("EffDiam = %.3f\n", EffDiam); EffDiam = GetBfsEffDiam (G, NTestNodes, IsDir, EffDiam, FullDiam); printf("EffDiam = %.3f, FullDiam = %d\n", EffDiam, FullDiam); EffDiam = GetBfsEffDiam (G, NTestNodes, IsDir, EffDiam, FullDiam, AvgDiam); printf("EffDiam = %.3f, FullDiam = %d, AvgDiam = %.3f\n", EffDiam, FullDiam, AvgDiam); TIntV SubGraphNIdV; for (int i = 0; i < NTestNodes; i++) { SubGraphNIdV.Add(G->GetRndNId()); } // for (int i = 0; i < SubGraphNIdV.Len(); i++) { // printf("SubGraphNIdV[%d] = %d\n", i, SubGraphNIdV[i].Val); // } EffDiam = GetBfsEffDiam(G, NTestNodes, SubGraphNIdV, IsDir, EffDiam, FullDiam); printf("For subgraph: EffDiam = %.3f, FullDiam = %d\n", EffDiam, FullDiam); } }
///////////////////////////////////////////////// // Best-Paths void GetBestPaths( const TStr& SrcNmObjStr, const TStr& DstNmObjStr, const PNmObjBs& NmObjBs){ int SrcNmObjId=NmObjBs->GetNmObjId(SrcNmObjStr); int DstNmObjId=NmObjBs->GetNmObjId(DstNmObjStr); int NmObjs=NmObjBs->GetNmObjs(); TIntPrV ParLevPrV(NmObjs); TIntPrV DstParLevPrV; ParLevPrV.PutAll(TIntPr(-1, -1)); int CurLev=0; ParLevPrV[SrcNmObjId]=TIntPr(SrcNmObjId, CurLev); forever{ CurLev++; int NewEdges=0; for (int NmObjId1=0; NmObjId1<NmObjs; NmObjId1++){ if (ParLevPrV[NmObjId1].Val2==CurLev-1){ TIntV DocIdV1; NmObjBs->GetNmObjDocIdV(NmObjId1, DocIdV1); for (int NmObjId2=0; NmObjId2<NmObjs; NmObjId2++){ if ((NmObjId2==DstNmObjId)||(ParLevPrV[NmObjId2].Val2==-1)){ TIntV DocIdV2; NmObjBs->GetNmObjDocIdV(NmObjId2, DocIdV2); TIntV IntrsDocIdV; DocIdV1.Intrs(DocIdV2, IntrsDocIdV); if (!IntrsDocIdV.Empty()){ ParLevPrV[NmObjId2]=TIntPr(NmObjId1, CurLev); NewEdges++; if (NmObjId2==DstNmObjId){ DstParLevPrV.Add(TIntPr(NmObjId1, CurLev)); } } } } } } if ((NewEdges==0)||(ParLevPrV[DstNmObjId].Val2!=-1)){ break; } } // prepare graph THash<TStr, PVrtx> VrtxNmToVrtxH; TStrPrV VrtxNmPrV; VrtxNmToVrtxH.AddKey(SrcNmObjStr); VrtxNmToVrtxH.AddKey(DstNmObjStr); // write path ContexterF->NmObjLinkageREd->Clear(); for (int DstParLevPrN=0; DstParLevPrN<DstParLevPrV.Len(); DstParLevPrN++){ ParLevPrV[DstNmObjId]=DstParLevPrV[DstParLevPrN]; int DstParLev=ParLevPrV[DstNmObjId].Val2; TStr DstNmObjStr=NmObjBs->GetNmObjStr(DstNmObjId); ContexterF->NmObjLinkageREd->Lines->Add(DstNmObjStr.CStr()); int ParNmObjId=DstNmObjId; TStr PrevNmObjStr=DstNmObjStr; forever { if (ParNmObjId==SrcNmObjId){break;} ParNmObjId=ParLevPrV[ParNmObjId].Val1; int ParLev=ParLevPrV[ParNmObjId].Val2; TStr CurNmObjStr=NmObjBs->GetNmObjStr(ParNmObjId); TStr ParNmObjStr=TStr::GetSpaceStr((DstParLev-ParLev)*4)+CurNmObjStr; ContexterF->NmObjLinkageREd->Lines->Add(ParNmObjStr.CStr()); // create vertex & edge VrtxNmToVrtxH.AddKey(CurNmObjStr); if 
(!PrevNmObjStr.Empty()){ if (PrevNmObjStr<CurNmObjStr){ VrtxNmPrV.AddUnique(TStrPr(PrevNmObjStr, CurNmObjStr)); } else if (PrevNmObjStr>CurNmObjStr){ VrtxNmPrV.AddUnique(TStrPr(CurNmObjStr, PrevNmObjStr)); } } // save curent named-object PrevNmObjStr=CurNmObjStr; } } // generate graph // create graph PGraph Graph=TGGraph::New(); // create vertices for (int VrtxN=0; VrtxN<VrtxNmToVrtxH.Len(); VrtxN++){ TStr VrtxNm=VrtxNmToVrtxH.GetKey(VrtxN); PVrtx Vrtx=TGVrtx::New(VrtxNm); VrtxNmToVrtxH.GetDat(VrtxNm)=Vrtx; Graph->AddVrtx(Vrtx); } // create edges for (int EdgeN=0; EdgeN<VrtxNmPrV.Len(); EdgeN++){ PVrtx Vrtx1=VrtxNmToVrtxH.GetDat(VrtxNmPrV[EdgeN].Val1); PVrtx Vrtx2=VrtxNmToVrtxH.GetDat(VrtxNmPrV[EdgeN].Val2); PEdge Edge=new TGEdge(Vrtx1, Vrtx2, TStr::Fmt("_%d", EdgeN), false); Graph->AddEdge(Edge); } // place graph ContexterF->State->ElGraph=Graph; TRnd Rnd(1); ContexterF->State->ElGraph->PlaceSimAnnXY(Rnd, ContexterF->State->ElGks); // draw graph ContexterF->State->ElGks->Clr(); ContexterF->ElPbPaint(NULL); }
// Demos BFS functions on undirected graph that is not fully connected void DemoBFSUndirectedRandom() { PUNGraph G; TStr FName = TStr::Fmt("%s/sample_bfsdfs_unpower.txt", DIRNAME); const int NNodes = 50; G = GenRndPowerLaw(NNodes, 2.5); // Can save/here // SaveEdgeList(G, FName); // G = LoadEdgeList<PUNGraph>(FName); TIntStrH NodeLabelH; for (int i = 0; i < G->GetNodes(); i++) { NodeLabelH.AddDat(i, TStr::Fmt("%d", i)); } DrawGViz(G, gvlNeato, TStr::Fmt("%s/sample_bfsdfs_unpower.png", DIRNAME), "Sample bfsdfs Graph", NodeLabelH); TIntV NIdV; int StartNId, Hop, Nodes; int IsDir = 0; printf("IsDir = %d:\n", IsDir); StartNId = 1; Hop = 1; Nodes = GetNodesAtHop(G, StartNId, Hop, NIdV, IsDir); printf("StartNId = %d, Nodes = %d, GetNodesAtHop NIdV.Len() = %d, NIdV[0] = %d\n", StartNId, Nodes, NIdV.Len(), NIdV[0].Val); TIntPrV HopCntV; Nodes = GetNodesAtHops(G, StartNId, HopCntV, IsDir); printf("StartNId = %d, Nodes = %d, GetNodesAtHops HopCntV.Len() = %d\n", StartNId , Nodes, HopCntV.Len()); // for (int N = 0; N < HopCntV.Len(); N++) { // printf("HopCntV[%d] = (%d, %d)\n", N, HopCntV[N].Val1.Val, HopCntV[N].Val2.Val); // } int Length, SrcNId, DstNId; SrcNId = 1; DstNId = G->GetNodes() - 1; Length = GetShortPath(G, SrcNId, DstNId, IsDir); printf("%d -> %d: SPL Length = %d\n", SrcNId, DstNId, Length); SrcNId = 1; DstNId = 33; Length = GetShortPath(G, SrcNId, DstNId, IsDir); printf("%d -> %d: SPL Length = %d\n", SrcNId, DstNId, Length); TIntH NIdToDistH; int MaxDist = 9; Length = GetShortPath(G, SrcNId, NIdToDistH, IsDir, MaxDist); // for (int i = 0; i < min(5,NIdToDistH.Len()); i++) { // printf("NIdToDistH[%d] = %d\n", i, NIdToDistH[i].Val); // } TInt::Rnd.PutSeed(0); int FullDiam; double EffDiam, AvgSPL; int NTestNodes = G->GetNodes() / 3 * 2; FullDiam = GetBfsFullDiam(G, NTestNodes, IsDir); printf("FullDiam = %d\n", FullDiam); EffDiam = GetBfsEffDiam(G, NTestNodes, IsDir); printf("EffDiam = %.3f\n", EffDiam); EffDiam = GetBfsEffDiam(G, NTestNodes, IsDir, EffDiam, 
FullDiam); printf("EffDiam = %.3f, FullDiam = %d\n", EffDiam, FullDiam); EffDiam = GetBfsEffDiam(G, NTestNodes, IsDir, EffDiam, FullDiam, AvgSPL); printf("EffDiam = %.3f, FullDiam = %d, AvgDiam = %.3f\n", EffDiam, FullDiam, AvgSPL); TIntV SubGraphNIdV; SubGraphNIdV.Add(0); SubGraphNIdV.Add(4); SubGraphNIdV.Add(31); SubGraphNIdV.Add(45); SubGraphNIdV.Add(18); SubGraphNIdV.Add(11); SubGraphNIdV.Add(11); SubGraphNIdV.Add(48); SubGraphNIdV.Add(34); SubGraphNIdV.Add(30); EffDiam = GetBfsEffDiam(G, NTestNodes, SubGraphNIdV, IsDir, EffDiam, FullDiam); printf("For subgraph: EffDiam = %.4f, FullDiam = %d\n", EffDiam, FullDiam); }
// Demos BFS functions on directed graph that is not fully connected void DemoBFSDirectedRandom() { PNGraph G = TNGraph::New(); TStr FName = TStr::Fmt("%s/sample_bfsdfs_ngraph.txt", DIRNAME); // Create benchmark graph, initially visually to confirm values are correct const int NNodes = 30; G = GenRndGnm<PNGraph>(NNodes, NNodes*2); // Add some more random edges for (int i = 0; i < 10; i++) { TInt Src, Dst; do { Src = G->GetRndNId(); Dst = G->GetRndNId(); } while (Src == Dst || G->IsEdge(Src, Dst)); G->AddEdge(Src, Dst); } // Add isolated component G->AddNode(NNodes); G->AddNode(NNodes+1); G->AddNode(NNodes+2); G->AddEdge(NNodes, NNodes+1); G->AddEdge(NNodes+1, NNodes+2); G->AddEdge(NNodes+2, NNodes+1); printf("G->GetNodes() = %d, G->GetEdges() = %d\n", G->GetNodes(), G->GetEdges()); // SaveEdgeList(G, FName); // G = LoadEdgeList<PNGraph>(FName); TIntStrH NodeLabelH; for (int i = 0; i < G->GetNodes(); i++) { NodeLabelH.AddDat(i, TStr::Fmt("%d", i)); } DrawGViz(G, gvlDot, TStr::Fmt("%s/sample_bfsdfs_ngraph.png", DIRNAME), "Sample BFS Graph", NodeLabelH); printf("G->GetNodes() = %d, G->GetEdges() = %d\n", G->GetNodes(), G->GetEdges()); TIntV NIdV; int StartNId, Hop, Nodes; // for (int IsDir = 0; IsDir < 2; IsDir++) { int IsDir = 1; printf("IsDir = %d:\n", IsDir); StartNId = 11; Hop = 1; Nodes = GetNodesAtHop(G, StartNId, Hop, NIdV, IsDir); printf("Nodes = %d, GetNodesAtHop NIdV.Len() = %d\n", Nodes, NIdV.Len()); for (int i = 0; i < NIdV.Len(); i++) { printf("NIdV[%d] = %d\n", i, NIdV[i].Val); } printf("Nodes == 2"); printf("NIdV.Len() == 2"); TIntPrV HopCntV; Nodes = GetNodesAtHops(G, StartNId, HopCntV, IsDir); printf("Nodes = %d, GetNodesAtHops HopCntV.Len() = %d\n", Nodes, HopCntV.Len()); printf("Nodes == 10"); printf("HopCntV.Len() == 10"); // for (int N = 0; N < HopCntV.Len(); N++) { // printf("HopCntV[%d] = (%d, %d)\n", N, HopCntV[N].Val1.Val, HopCntV[N].Val2.Val); // } int Length, SrcNId, DstNId; SrcNId = 11; DstNId = G->GetNodes() - 1; Length = GetShortPath(G, 
SrcNId, DstNId, IsDir); printf("%d -> %d: SPL Length = %d\n", SrcNId, DstNId, Length); SrcNId = 11; DstNId = 27; Length = GetShortPath(G, SrcNId, DstNId, IsDir); printf("%d -> %d: SPL Length = %d\n", SrcNId, DstNId, Length); TIntH NIdToDistH; int MaxDist = 9; Length = GetShortPath(G, SrcNId, NIdToDistH, IsDir, MaxDist); // for (int i = 0; i < min(5,NIdToDistH.Len()); i++) { // printf("NIdToDistH[%d] = %d\n", i, NIdToDistH[i].Val); // } TInt::Rnd.PutSeed(0); int FullDiam; double EffDiam, AvgSPL; int NTestNodes = G->GetNodes() / 2; FullDiam = GetBfsFullDiam(G, NTestNodes, IsDir); printf("FullDiam = %d\n", FullDiam); EffDiam = GetBfsEffDiam(G, NTestNodes, IsDir); printf("EffDiam = %.3f\n", EffDiam); EffDiam = GetBfsEffDiam(G, NTestNodes, IsDir, EffDiam, FullDiam); printf("EffDiam = %.3f, FullDiam = %d\n", EffDiam, FullDiam); EffDiam = GetBfsEffDiam(G, NTestNodes, IsDir, EffDiam, FullDiam, AvgSPL); printf("EffDiam = %.3f, FullDiam = %d, AvgDiam = %.3f\n", EffDiam, FullDiam, AvgSPL); TIntV SubGraphNIdV; SubGraphNIdV.Add(8); SubGraphNIdV.Add(29); SubGraphNIdV.Add(16); SubGraphNIdV.Add(0); SubGraphNIdV.Add(19); SubGraphNIdV.Add(17); SubGraphNIdV.Add(26); SubGraphNIdV.Add(14); SubGraphNIdV.Add(10); SubGraphNIdV.Add(24); SubGraphNIdV.Add(27); SubGraphNIdV.Add(2); SubGraphNIdV.Add(18); EffDiam = GetBfsEffDiam(G, NTestNodes, SubGraphNIdV, IsDir, EffDiam, FullDiam); printf("For subgraph: EffDiam = %.4f, FullDiam = %d\n", EffDiam, FullDiam); }
/// Sizes FFreeBlobPtV to one more entry than BlockLenV has.
void TBlobBs::GenFFreeBlobPtV(const TIntV& BlockLenV, TBlobPtV& FFreeBlobPtV){
  const int FFreeBlobPts=BlockLenV.Len()+1;
  FFreeBlobPtV.Gen(FFreeBlobPts);
}
// Create TMMNet, add modes, crossnets, create subgraphs, covert to TNEANet void ManipulateMMNet() { int NNodes = 1000; int NEdges = 1000; // Create a multimodal network PMMNet Graph; Graph = PMMNet::New(); PrintMMNetStats("Empty MMNet",Graph); // Add mode TStr TestMode1("TestMode1"); Graph->AddModeNet(TestMode1); TInt TestModeId1 = Graph->GetModeId(TestMode1); // Add same-mode crossnet, directed TStr TestCross1("TestCross1"); Graph->AddCrossNet(TestMode1, TestMode1, TestCross1, true); TInt TestCrossId1 = Graph->GetCrossId(TestCross1); // Add same-mode crossnet, undirected TStr TestCross2("TestCross2"); Graph->AddCrossNet(TestModeId1, TestModeId1, TestCross2, false); TInt TestCrossId2 = Graph->GetCrossId(TestCross2); // Add mode TStr TestMode2("TestMode2"); Graph->AddModeNet(TestMode2); TInt TestModeId2 = Graph->GetModeId(TestMode2); // Add crossnet, directed TStr TestCross3("TestCross3"); Graph->AddCrossNet(TestMode1, TestMode2, TestCross3, true); TInt TestCrossId3 = Graph->GetCrossId(TestCross3); // Add crossnet, undirected TStr TestCross4("TestCross4"); Graph->AddCrossNet(TestModeId1, TestModeId2, TestCross4, false); TInt TestCrossId4 = Graph->GetCrossId(TestCross4); PrintMMNetStats("MMNet with modes/crossnets",Graph); // Add Nodes TModeNet& ModeNet1 = Graph->GetModeNetByName(TestMode1); TModeNet& ModeNet2 = Graph->GetModeNetById(TestModeId2); for (int i=0; i < NNodes; i++) { ModeNet1.AddNode(i); ModeNet2.AddNode(i*2); } // Add edges TCrossNet& CrossNet1 = Graph->GetCrossNetByName(TestCross1); TCrossNet& CrossNet2 = Graph->GetCrossNetById(TestCrossId2); TCrossNet& CrossNet3 = Graph->GetCrossNetByName(TestCross3); TCrossNet& CrossNet4 = Graph->GetCrossNetById(TestCrossId4); for (int i=0; i < NEdges; i++) { CrossNet1.AddEdge(i, (i+1)%NNodes, i); CrossNet2.AddEdge((i+5)%NNodes, i, i); CrossNet3.AddEdge(i, (i%NNodes)*2, i); CrossNet4.AddEdge((i+5)%NNodes, (i%NNodes)*2, i); } //Iterate over modes for (TMMNet::TModeNetI MI = Graph->BegModeNetI(); MI < 
Graph->EndModeNetI(); MI++) { PrintGStats(MI.GetModeName().GetCStr(), MI.GetModeNet()); } //Iterate over crossnets for (TMMNet::TCrossNetI CI = Graph->BegCrossNetI(); CI < Graph->EndCrossNetI(); CI++) { PrintGStats(CI.GetCrossName().GetCStr(), CI.GetCrossNet()); } // Get subgraph TStrV CrossNets; CrossNets.Add(TestCross1); PMMNet Subgraph = Graph->GetSubgraphByCrossNet(CrossNets); PrintMMNetStats("Subgraph", Subgraph); TModeNet& M1 = Subgraph->GetModeNetByName(TestMode1); PrintGStats("M1", M1); // Get neighbor types TStrV M1Names; M1.GetCrossNetNames(M1Names); printf("Num neighbor types %d\n", M1Names.Len()); // Get Neighbors for node 0 TIntV Neighbors; M1.GetNeighborsByCrossNet(0, TestCross1, Neighbors); printf("Num Neighbors %d\n", Neighbors.Len()); // Convert to TNEANet TIntV CrossNetIds; CrossNetIds.Add(TestCrossId1); CrossNetIds.Add(TestCrossId2); CrossNetIds.Add(TestCrossId3); CrossNetIds.Add(TestCrossId4); TVec<TTriple<TInt, TStr, TStr> > NodeAttrMapping; //Triples of (ModeId, OldAttrName, NewAttrName) TVec<TTriple<TInt, TStr, TStr> > EdgeAttrMapping; //Triples of (CrossId, OldAttrName, NewAttrName) PNEANet Net = Graph->ToNetwork(CrossNetIds, NodeAttrMapping, EdgeAttrMapping); PrintGStats("TNEANet", Net); }
/// rewire bipartite community affiliation graphs void TAGMUtil::RewireCmtyNID(THash<TInt,TIntV >& CmtyVH, TRnd& Rnd) { THash<TInt,TIntV > NewCmtyVH(CmtyVH.Len()); TIntV NDegV; TIntV CDegV; for (int i = 0; i < CmtyVH.Len(); i++) { int CID = CmtyVH.GetKey(i); for (int j = 0; j < CmtyVH[i].Len(); j++) { int NID = CmtyVH[i][j]; NDegV.Add(NID); CDegV.Add(CID); } } TIntPrSet CNIDSet(CDegV.Len()); int c=0; while (c++ < 15 && CDegV.Len() > 1) { for (int i = 0; i < CDegV.Len(); i++) { int u = Rnd.GetUniDevInt(CDegV.Len()); int v = Rnd.GetUniDevInt(NDegV.Len()); if (CNIDSet.IsKey(TIntPr(CDegV[u], NDegV[v]))) { continue; } CNIDSet.AddKey(TIntPr(CDegV[u], NDegV[v])); if (u == CDegV.Len() - 1) { CDegV.DelLast(); } else { CDegV[u] = CDegV.Last(); CDegV.DelLast(); } if ( v == NDegV.Len() - 1) { NDegV.DelLast(); } else { NDegV[v] = NDegV.Last(); NDegV.DelLast(); } } } for (int i = 0; i < CNIDSet.Len(); i++) { TIntPr CNIDPr = CNIDSet[i]; IAssert(CmtyVH.IsKey(CNIDPr.Val1)); NewCmtyVH.AddDat(CNIDPr.Val1); NewCmtyVH.GetDat(CNIDPr.Val1).Add(CNIDPr.Val2); } CmtyVH = NewCmtyVH; }
// Builds an ontology-grounding classifier from a light-weight ontology and a
// bag-of-words document base.
//   LwOnto           - ontology providing terms, links and link types
//   BowDocBs         - documents; their categories name ontology terms
//   LangNm           - language used to resolve category names to term ids
//   DocCatIsTermIdP  - if true, a category name is parsed as a numeric term id;
//                      otherwise it is looked up as a term name in language LangNm
//   CutWordWgtSumPrc - passed through to TBowClust::GetConceptSpV
// Returns the classifier created from the per-term concept vectors.
// Progress is reported on stdout.
PLwOntoGround TLwOntoGround::GetOntoGround(
 const PLwOnto& LwOnto, const PBowDocBs& BowDocBs,
 const TStr& LangNm, const bool& DocCatIsTermIdP,
 const double& CutWordWgtSumPrc){
  printf("Generating Ontology-Classifier...\n");
  // shortcuts
  PLwTermBs TermBs=LwOnto->GetTermBs();
  const int Terms=TermBs->GetTerms();
  PLwLinkBs LinkBs=LwOnto->GetLinkBs();
  PLwLinkTypeBs LinkTypeBs=LwOnto->GetLinkTypeBs();
  const int LangId=LwOnto->GetLangBs()->GetLangId(LangNm);
  const int Docs=BowDocBs->GetDocs();

  // create tfidf weights and the similarity measure
  printf(" Creating BowDocWgtBs ...");
  PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(BowDocBs, bwwtNrmTFIDF);
  PBowSim BowSim=TBowSim::New(bstCos);
  printf(" Done.\n");

  // collect documents per ontology-term
  printf(" Collecting documents per ontology-term ...\n");
  TIntIntVH TermIdToDIdVH;
  int PosCats=0; int NegCats=0; // categories that did / did not resolve to a term
  for (int DId=0; DId<Docs; DId++){
    printf(" Docs:%d/%d Pos:%d Neg:%d\r", 1+DId, Docs, PosCats, NegCats);
    for (int DocCIdN=0; DocCIdN<BowDocBs->GetDocCIds(DId); DocCIdN++){
      // get document-category
      int CId=BowDocBs->GetDocCId(DId, DocCIdN);
      TStr CatNm=BowDocBs->GetCatNm(CId);
      // resolve the category to a term-id
      int TermId=-1; bool IsTerm=false;
      if (DocCatIsTermIdP){
        TermId=CatNm.GetInt();
        IsTerm=TermBs->IsTermId(TermId);
      } else {
        IsTerm=TermBs->IsTermId(CatNm, LangId);
        if (IsTerm){TermId=TermBs->GetTermId(CatNm, LangId);}
      }
      if (IsTerm){
        TermIdToDIdVH.AddDat(TermId).Add(DId); PosCats++;
      } else {
        NegCats++;
      }
    }
  }
  printf(" Docs:%d/%d Pos:%d Neg:%d\n", Docs, Docs, PosCats, NegCats);
  printf(" Done.\n");

  // create sub-terms & up-terms vectors from the "NT" links
  // (the former Const_TermIdToSubTermIdVH copy was never read and is gone)
  printf(" Creating sub-terms & up-terms vectors ...");
  TIntIntVH TermIdToSubTermIdVH;
  TIntIntVH TermIdToUpTermIdVH;
  for (int TermN=0; TermN<Terms; TermN++){
    int TermId=TermBs->GetTermId(TermN);
    for (int LinkN=0; LinkN<LinkBs->GetFromLinks(TermId); LinkN++){
      int LinkTypeId; int DstTermId;
      LinkBs->GetFromLink(TermId, LinkN, LinkTypeId, DstTermId);
      TStr LinkTypeNm=LinkTypeBs->GetLinkType(LinkTypeId)->GetLinkTypeNm();
      if (LinkTypeNm=="NT"){
        TermIdToSubTermIdVH.AddDat(TermId).Add(DstTermId);
        TermIdToUpTermIdVH.AddDat(DstTermId).Add(TermId);
      }
    }
  }
  printf(" Done.\n");

  // create centroids bottom-up: a term is processed once it has no pending
  // sub-terms; its documents are then pushed to each of its upper terms
  printf(" Creating centroids ...\n");
  THash<TInt, PBowSpV> TermIdToConceptSpVH;
  TIntIntVH TermIdToSubTermDIdVH; // documents inherited from processed sub-terms
  TIntH ProcTermIdH;              // terms already processed
  int PrevActiveTerms=-1;
  forever{
    // count terms that still have unprocessed sub-terms
    int ActiveTerms=0;
    for (int TermN=0; TermN<Terms; TermN++){
      int TermId=TermBs->GetTermId(TermN);
      if ((TermIdToSubTermIdVH.IsKey(TermId))&&
       (TermIdToSubTermIdVH.GetDat(TermId).Len()>0)){
        ActiveTerms++;
      }
    }
    // stop if no change from previous round
    printf(" Active-Terms:%d\n", ActiveTerms);
    if (ActiveTerms==PrevActiveTerms){break;}
    PrevActiveTerms=ActiveTerms;
    // process every not-yet-processed term without pending sub-terms
    for (int TermN=0; TermN<Terms; TermN++){
      int TermId=TermBs->GetTermId(TermN);
      if (ProcTermIdH.IsKey(TermId)){continue;}
      const bool PendingSubTermsP=
       (TermIdToSubTermIdVH.IsKey(TermId))&&
       (TermIdToSubTermIdVH.GetDat(TermId).Len()>0);
      if (PendingSubTermsP){continue;}
      printf(" %d/%d\r", 1+TermN, Terms);
      ProcTermIdH.AddKey(TermId);
      // collect document-ids: own documents plus those inherited from sub-terms
      TIntV TermDIdV;
      if (TermIdToDIdVH.IsKey(TermId)){
        TermDIdV.AddV(TermIdToDIdVH.GetDat(TermId));}
      if (TermIdToSubTermDIdVH.IsKey(TermId)){
        TermDIdV.AddV(TermIdToSubTermDIdVH.GetDat(TermId));}
      // create concept-vector if any documents
      if (TermDIdV.Len()>0){
        PBowSpV ConceptSpV=
         TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, TermDIdV, CutWordWgtSumPrc);
        TermIdToConceptSpVH.AddDat(TermId, ConceptSpV);
      }
      // detach this term from its upper terms and hand its documents upward
      if (TermIdToUpTermIdVH.IsKey(TermId)){
        TIntV& UpTermIdV=TermIdToUpTermIdVH.GetDat(TermId);
        for (int UpTermIdN=0; UpTermIdN<UpTermIdV.Len(); UpTermIdN++){
          int UpTermId=UpTermIdV[UpTermIdN];
          TermIdToSubTermIdVH.GetDat(UpTermId).DelIfIn(TermId);
          if (TermDIdV.Len()>0){
            TermIdToSubTermDIdVH.AddDat(UpTermId).AddV(TermDIdV);}
        }
      }
    }
  }
  printf(" Done.\n");

  // create & return classifier
  PLwOntoGround OntoGround=
   TLwOntoGround::New(LwOnto, BowDocBs, BowDocWgtBs, TermIdToConceptSpVH);
  printf("Done.\n");
  return OntoGround;
}
// Constructs a key from an integer graph signature: every signature entry is
// stored in SigV as a float; Nodes starts at -1, VariantId at 0, EdgeV empty.
TGraphKey::TGraphKey(const TIntV& GraphSigV) : Nodes(-1), EdgeV(), SigV(), VariantId(0) {
  const int SigLen = GraphSigV.Len();
  SigV.Gen(SigLen);
  for (int SigN = 0; SigN < SigLen; ++SigN) {
    SigV[SigN] = TFlt(GraphSigV[SigN]());
  }
}
int main(int argc, char *argv[]) { // #### SETUP: Parse Arguments LogOutput Log; THash<TStr, TStr> Arguments; ArgumentParser::ParseArguments(argc, argv, Arguments, Log); TStr OutputDirectory; TStr StartString = ArgumentParser::GetArgument(Arguments, "start", "2009-02-01"); TStr QBDBDirectory = ArgumentParser::GetArgument(Arguments, "qbdb", QBDB_DIR_DEFAULT); TStr OutDirectory = ArgumentParser::GetArgument(Arguments, "out", "/lfs/1/tmp/curis/"); TInt WindowSize = ArgumentParser::GetArgument(Arguments, "window", "14").GetInt(); if (ArgumentParser::GetArgument(Arguments, "nolog", "") == "") { Log.DisableLogging(); } else if (!Arguments.IsKeyGetDat("directory", OutputDirectory)) { Log.SetupNewOutputDirectory(""); } else { Log.SetDirectory(OutputDirectory); } // #### DATA LOADING: Load ALL the things! TQuoteBase QB; TDocBase DB; fprintf(stderr, "Loading QB and DB from file for %d days, starting from %s...\n", WindowSize.Val, StartString.CStr()); Err("%s\n", QBDBDirectory.CStr()); TSecTm PresentTime = TDataLoader::LoadQBDBByWindow(QBDBDirectory, StartString, WindowSize, QB, DB); fprintf(stderr, "QBDB successfully loaded!\n"); TVec<TSecTm> PubTmV; TVec<TStr> PostUrlV; TVec<TStr> QuoteV; fprintf(stderr, "Dumping quotes to file...\n"); TIntV QuoteIds; QB.GetAllQuoteIds(QuoteIds); for (int i = 0; i < QuoteIds.Len(); i++) { TQuote Q; QB.GetQuote(QuoteIds[i], Q); TStr QContentString; Q.GetContentString(QContentString); TVec<TUInt> Sources; Q.GetSources(Sources); for (int j = 0; j < Sources.Len(); j++) { TDoc D; DB.GetDoc(Sources[j], D); TStr PostUrl; D.GetUrl(PostUrl); TSecTm PostTime = D.GetDate(); QuoteV.Add(QContentString); PubTmV.Add(PostTime); PostUrlV.Add(PostUrl); } } TFOut FOut(OutDirectory + "QuoteList" + ".bin"); PubTmV.Save(FOut); PostUrlV.Save(FOut); QuoteV.Save(FOut); fprintf(stderr, "Done!\n"); return 0; }
void TempMotifCounter::Count3TEdge3NodeStars(double delta, Counter3D& pre_counts, Counter3D& pos_counts, Counter3D& mid_counts) { TIntV centers; GetAllNodes(centers); pre_counts = Counter3D(2, 2, 2); pos_counts = Counter3D(2, 2, 2); mid_counts = Counter3D(2, 2, 2); // Get counts for each node as the center #pragma omp parallel for schedule(dynamic) for (int c = 0; c < centers.Len(); c++) { // Gather all adjacent events int center = centers[c]; TVec<TIntPair> ts_indices; TVec<StarEdgeData> events; TNGraph::TNodeI NI = static_graph_->GetNI(center); int index = 0; TIntV nbrs; GetAllNeighbors(center, nbrs); int nbr_index = 0; for (int i = 0; i < nbrs.Len(); i++) { int nbr = nbrs[i]; AddStarEdgeData(ts_indices, events, index, center, nbr, nbr_index, 0); AddStarEdgeData(ts_indices, events, index, nbr, center, nbr_index, 1); nbr_index++; } ts_indices.Sort(); TIntV timestamps; TVec<StarEdgeData> ordered_events; for (int j = 0; j < ts_indices.Len(); j++) { timestamps.Add(ts_indices[j].Key); ordered_events.Add(events[ts_indices[j].Dat]); } ThreeTEdgeStarCounter tesc(nbr_index); // dirs: outgoing --> 0, incoming --> 1 tesc.Count(ordered_events, timestamps, delta); #pragma omp critical { // Update counts for (int dir1 = 0; dir1 < 2; ++dir1) { for (int dir2 = 0; dir2 < 2; ++dir2) { for (int dir3 = 0; dir3 < 2; ++dir3) { pre_counts(dir1, dir2, dir3) += tesc.PreCount(dir1, dir2, dir3); pos_counts(dir1, dir2, dir3) += tesc.PosCount(dir1, dir2, dir3); mid_counts(dir1, dir2, dir3) += tesc.MidCount(dir1, dir2, dir3); } } } } // Subtract off edge-wise counts for (int nbr_id = 0; nbr_id < nbrs.Len(); nbr_id++) { int nbr = nbrs[nbr_id]; Counter3D edge_counts; Count3TEdge2Node(center, nbr, delta, edge_counts); #pragma omp critical { for (int dir1 = 0; dir1 < 2; ++dir1) { for (int dir2 = 0; dir2 < 2; ++dir2) { for (int dir3 = 0; dir3 < 2; ++dir3) { pre_counts(dir1, dir2, dir3) -= edge_counts(dir1, dir2, dir3); pos_counts(dir1, dir2, dir3) -= edge_counts(dir1, dir2, dir3); mid_counts(dir1, 
dir2, dir3) -= edge_counts(dir1, dir2, dir3); } } } } } } }
///////////////////////////////////////////////// // NIST-score double TEvalScoreNist::Eval(const PTransCorpus& TransCorpus, const TIntV& _SentIdV) { // check if the corpus has translations IAssert(TransCorpus->IsTrans()); // ngram counts (cliped and full) TIntH ClipCountNGramH, CountNGramH; // ngram info score TIntFltH NGramInfoH; // candidate and effective reference length double FullTransLen = 0.0, FullRefLen = 0.0; // iterate over sentences TIntV SentIdV = _SentIdV; if (SentIdV.Empty()) { TransCorpus->GetSentIdV(SentIdV); } const int Sents = SentIdV.Len(); for (int SentIdN = 0; SentIdN < Sents; SentIdN++) { const int SentId = SentIdV[SentIdN]; // tokenize translation TIntV TransWIdV; Parse(TransCorpus->GetTransStr(SentId), TransWIdV); TIntH TransNGramH; GetNGramH(TransWIdV, MxNGramLen, TransNGramH); TIntH FreeTransNGramH = TransNGramH; // number of non-matched ngrams // counters for getting the closest length of reference sentences const int TransLen = TransWIdV.Len(); int RefLenSum = 0; // go over reference translations and count ngram matches TStrV RefTransStrV = TransCorpus->GetRefTransStrV(SentId); // we assume that there is at least one reference translation IAssert(!RefTransStrV.Empty()); for (int RefN = 0; RefN < RefTransStrV.Len(); RefN++) { // parse reference translation sentence TIntV RefWIdV; Parse(RefTransStrV[RefN], RefWIdV); TIntH RefNGramH; GetNGramH(RefWIdV, MxNGramLen, RefNGramH); // check for matches int TransNGramKeyId = TransNGramH.FFirstKeyId(); while(TransNGramH.FNextKeyId(TransNGramKeyId)) { const int NGramId = TransNGramH.GetKey(TransNGramKeyId); const int FreeTransNGrams = FreeTransNGramH(NGramId); if (RefNGramH.IsKey(NGramId) && (FreeTransNGrams>0)) { // ngram match and still some free ngrams left to clip const int RefNGrams = RefNGramH(NGramId); FreeTransNGramH(NGramId) = TInt::GetMx(0, FreeTransNGrams - RefNGrams); } } // check the length difference const int RefLen = RefWIdV.Len(); RefLenSum += RefLen; } // count ngrams int 
TransNGramKeyId = TransNGramH.FFirstKeyId(); while(TransNGramH.FNextKeyId(TransNGramKeyId)) { // get ngram const int NGramId = TransNGramH.GetKey(TransNGramKeyId); IAssert(NGramId != -1); // check if two hash tables are aligned (should be...) const int FreeNGramId = FreeTransNGramH.GetKey(TransNGramKeyId); IAssert(NGramId == FreeNGramId); // get ngram count and clip-count const int Count = TransNGramH[TransNGramKeyId]; const int ClipCount = Count - FreeTransNGramH[TransNGramKeyId]; // add ngram to the coprus ngram counts CountNGramH.AddDat(NGramId) += Count; ClipCountNGramH.AddDat(NGramId) += ClipCount; } // count length FullTransLen += double(TransLen); FullRefLen += double(RefLenSum) / double(RefTransStrV.Len()); } // calculate ngram info scores int CountKeyId = CountNGramH.FFirstKeyId(); while (CountNGramH.FNextKeyId(CountKeyId)) { // get the n-gram const int NGramId = CountNGramH.GetKey(CountKeyId); TIntV NGram = GetNGram(NGramId); // prepare counts if (NGram.Len() == 1) { // n-gram is a word const int WordCount = CountNGramH[CountKeyId]; const double NGramInfoScore = TMath::Log2(FullTransLen / double(WordCount)); NGramInfoH.AddDat(NGramId, NGramInfoScore); } else { // more then one word in the n-gram // get a n-gram with removed last element TIntV N1Gram = NGram; N1Gram.DelLast(); const int N1GramId = NGramH.GetKeyId(N1Gram); // get the counts const int NGramCount = CountNGramH(NGramId); const int N1GramCount = CountNGramH(N1GramId); // get the score const double NGramInfoScore = TMath::Log2(double(N1GramCount) / double(NGramCount)); NGramInfoH.AddDat(NGramId, NGramInfoScore); } } // calcualte ngram precisions TFltV ClipCountV(MxNGramLen); ClipCountV.PutAll(0); int ClipCountKeyId = ClipCountNGramH.FFirstKeyId(); while (ClipCountNGramH.FNextKeyId(ClipCountKeyId)) { const int NGramId = ClipCountNGramH.GetKey(ClipCountKeyId); const int NGramLen = GetNGramLen(NGramId); const double NGramInfo = NGramInfoH(NGramId); IAssert(0 < NGramLen && NGramLen <= MxNGramLen); 
const int ClipCountNGram = ClipCountNGramH[ClipCountKeyId]; ClipCountV[NGramLen-1] += double(ClipCountNGram) * NGramInfo; } TIntV CountV(MxNGramLen); CountV.PutAll(0); CountKeyId = CountNGramH.FFirstKeyId(); while (CountNGramH.FNextKeyId(CountKeyId)) { const int NGramId = CountNGramH.GetKey(CountKeyId); const int NGramLen = GetNGramLen(NGramId); IAssert(0 < NGramLen && NGramLen <= MxNGramLen); CountV[NGramLen-1] += CountNGramH[CountKeyId]; } TFltV PrecV(MxNGramLen, 0); for (int NGramLen = 0; NGramLen < MxNGramLen; NGramLen++) { const double ClipCount = ClipCountV[NGramLen]; const int Count = CountV[NGramLen]; const double Prec = (Count > 0) ? ClipCount / double(Count) : 0.0; PrecV.Add(Prec); } // calcualte brevity penalty const double LenFrac = double(FullTransLen)/double(FullRefLen); double BP = 0.0; if (LenFrac >= 1.0) { BP = 1.0; } else if (LenFrac <= 0.0) { BP = 0.0; } else { // calculate beta const double LenFracX = 1.5, BPX = 0.5; const double Beta = log(BPX) / TMath::Sqr(log(LenFracX)); // calculate BP score BP = exp(Beta * TMath::Sqr(log(LenFrac))); } // calculate full NIST score double NistScore = 0.0; for (int NGramLen = 0; NGramLen < MxNGramLen; NGramLen++) { NistScore += PrecV[NGramLen]; } NistScore *= BP; printf("NIST Score: %.5f\n", NistScore); // done! return NistScore; }
void TempMotifCounter::Count3TEdgeTriads(double delta, Counter3D& counts) { counts = Counter3D(2, 2, 2); // Get the counts on each undirected edge TVec< THash<TInt, TInt> > edge_counts(static_graph_->GetMxNId()); TVec< THash<TInt, TIntV> > assignments(static_graph_->GetMxNId()); for (TNGraph::TEdgeI it = static_graph_->BegEI(); it < static_graph_->EndEI(); it++) { int src = it.GetSrcNId(); int dst = it.GetDstNId(); int min_node = MIN(src, dst); int max_node = MAX(src, dst); edge_counts[min_node](max_node) += temporal_data_[src](dst).Len(); assignments[min_node](max_node) = TIntV(); } // Assign triangles to the edge with the most events TIntV Us, Vs, Ws; GetAllStaticTriangles(Us, Vs, Ws); #pragma omp parallel for schedule(dynamic) for (int i = 0; i < Us.Len(); i++) { int u = Us[i]; int v = Vs[i]; int w = Ws[i]; int counts_uv = edge_counts[MIN(u, v)].GetDat(MAX(u, v)); int counts_uw = edge_counts[MIN(u, w)].GetDat(MAX(u, w)); int counts_vw = edge_counts[MIN(v, w)].GetDat(MAX(v, w)); if (counts_uv >= MAX(counts_uw, counts_vw)) { #pragma omp critical { TIntV& assignment = assignments[MIN(u, v)].GetDat(MAX(u, v)); assignment.Add(w); } } else if (counts_uw >= MAX(counts_uv, counts_vw)) { #pragma omp critical { TIntV& assignment = assignments[MIN(u, w)].GetDat(MAX(u, w)); assignment.Add(v); } } else if (counts_vw >= MAX(counts_uv, counts_uw)) { #pragma omp critical { TIntV& assignment = assignments[MIN(v, w)].GetDat(MAX(v, w)); assignment.Add(u); } } } TVec<TIntPair> all_edges; TIntV all_nodes; GetAllNodes(all_nodes); for (int node_id = 0; node_id < all_nodes.Len(); node_id++) { int u = all_nodes[node_id]; TIntV nbrs; GetAllNeighbors(u, nbrs); for (int nbr_id = 0; nbr_id < nbrs.Len(); nbr_id++) { int v = nbrs[nbr_id]; if (assignments[u].IsKey(v) && assignments[u].GetDat(v).Len() > 0) { all_edges.Add(TIntPair(u, v)); } } } // Count triangles on edges with the assigned neighbors #pragma omp parallel for schedule(dynamic) for (int edge_id = 0; edge_id < all_edges.Len(); 
edge_id++) { TIntPair edge = all_edges[edge_id]; int u = edge.Key; int v = edge.Dat; // Continue if no assignment if (!assignments[u].IsKey(v)) { continue; } TIntV& uv_assignment = assignments[u].GetDat(v); // Continue if no data if (uv_assignment.Len() == 0) { continue; } // Get all events on (u, v) TVec<TriadEdgeData> events; TVec<TIntPair> ts_indices; int index = 0; int nbr_index = 0; // Assign indices from 0, 1, ..., num_nbrs + 2 AddTriadEdgeData(events, ts_indices, index, u, v, nbr_index, 0, 1); nbr_index++; AddTriadEdgeData(events, ts_indices, index, v, u, nbr_index, 0, 0); nbr_index++; // Get all events on triangles assigned to (u, v) for (int w_id = 0; w_id < uv_assignment.Len(); w_id++) { int w = uv_assignment[w_id]; AddTriadEdgeData(events, ts_indices, index, w, u, nbr_index, 0, 0); AddTriadEdgeData(events, ts_indices, index, w, v, nbr_index, 0, 1); AddTriadEdgeData(events, ts_indices, index, u, w, nbr_index, 1, 0); AddTriadEdgeData(events, ts_indices, index, v, w, nbr_index, 1, 1); nbr_index++; } // Put events in sorted order ts_indices.Sort(); TIntV timestamps(ts_indices.Len()); TVec<TriadEdgeData> sorted_events(ts_indices.Len()); for (int i = 0; i < ts_indices.Len(); i++) { timestamps[i] = ts_indices[i].Key; sorted_events[i] = events[ts_indices[i].Dat]; } // Get the counts and update the counter ThreeTEdgeTriadCounter tetc(nbr_index, 0, 1); tetc.Count(sorted_events, timestamps, delta); #pragma omp critical { for (int dir1 = 0; dir1 < 2; dir1++) { for (int dir2 = 0; dir2 < 2; dir2++) { for (int dir3 = 0; dir3 < 2; dir3++) { counts(dir1, dir2, dir3) += tetc.Counts(dir1, dir2, dir3); } } } } } }
double TStringKernel::KDynamic(const TIntV& s, const TIntV& t, const TFltV& lc, const double& lb) { const int k = lc.Len() + 1; int x,y,i; int ls = s.Len(), lt = t.Len(); //TVec<TFltVV> Kd(2); //for (i = 0; i < 2; i++) Kd[i].Gen(ls+1, lt+1); // s is on X-axis and t is on Y-axis TVec<double *> Kd(2); if ((ls+1)*(lt+1) > BufN) { if (Buf1 != NULL) delete[] Buf1; if (Buf2 != NULL) delete[] Buf2; BufN = (ls+2)*(lt+2) + 10; Buf1 = new double[BufN]; Buf2 = new double[BufN]; } Kd[0] = Buf1; Kd[1] = Buf2; double *Kdii, *Kdi; //ii == (i-1)%2, i == i%2 // initialize Kd for i == 0 int MxSize = (ls+1)*(lt+1) + 10; for (i = 0, Kdi = Kd[0]; i < MxSize; i++) Kdi[i] = 1.0; //for (x = 0; x <= ls; x++) { // for (y = 0; y <= lt; y++) { // Kd[0](x,y) = 1.0; // } //} // calculate Kd for i == 1..k-1 double K = 0.0; for (i = 1; i < k; i++) { Kdi = Kd[i%2]; Kdii = Kd[(i-1)%2]; for (x = 0; x <= ls; x++) Kdi[x*(lt+1) + (i-1)] = 0.0; for (y = 0; y <= lt; y++) Kdi[(i-1)*(lt+1) + y] = 0.0; //for (x = 0; x <= ls; x++) Kd[i%2](x,i-1) = 0.0; //for (y = 0; y <= lt; y++) Kd[i%2](i-1,y) = 0.0; double Ki = 0.0; for (x = i; x <= ls; x++) { double Kdd = 0.0; int u = s[x-1]; for (y = i; y <= lt; y++) { if (u == t[y-1]) { Kdd = lb * (Kdd + lb*Kdii[(x-1)*(lt+1) + (y-1)]); Ki += lb*lb * Kdi[(x-1)*(lt+1) + (y-1)]; //Kdd = lb * (Kdd + lb*Kd[(i-1)%2](x-1,y-1)); //Ki += lb*lb * Kd[i%2](x-1, y-1); } else { Kdd *= lb; } Kdi[x*(lt+1) + y] = lb*Kdi[(x-1)*(lt+1) + y] + Kdd; //Kd[i%2](x,y) = lb*Kd[i%2](x-1, y) + Kdd; } } K += lc[i-1] * Ki; } return K; }
void TempMotifCounter::GetAllStaticTriangles(TIntV& Us, TIntV& Vs, TIntV& Ws) { Us.Clr(); Vs.Clr(); Ws.Clr(); // Get degree ordering of the graph int max_nodes = static_graph_->GetMxNId(); TVec<TIntPair> degrees(max_nodes); degrees.PutAll(TIntPair(0, 0)); // Set the degree of a node to be the number of nodes adjacent to the node in // the undirected graph. TIntV nodes; GetAllNodes(nodes); #pragma omp parallel for schedule(dynamic) for (int node_id = 0; node_id < nodes.Len(); node_id++) { int src = nodes[node_id]; TIntV nbrs; GetAllNeighbors(src, nbrs); degrees[src] = TIntPair(nbrs.Len(), src); } degrees.Sort(); TIntV order = TIntV(max_nodes); #pragma omp parallel for schedule(dynamic) for (int i = 0; i < order.Len(); i++) { order[degrees[i].Dat] = i; } // Get triangles centered at a given node where that node is the smallest in // the degree ordering. #pragma omp parallel for schedule(dynamic) for (int node_id = 0; node_id < nodes.Len(); node_id++) { int src = nodes[node_id]; int src_pos = order[src]; // Get all neighbors who come later in the ordering TIntV nbrs; GetAllNeighbors(src, nbrs); TIntV neighbors_higher; for (int i = 0; i < nbrs.Len(); i++) { int nbr = nbrs[i]; if (order[nbr] > src_pos) { neighbors_higher.Add(nbr); } } for (int ind1 = 0; ind1 < neighbors_higher.Len(); ind1++) { for (int ind2 = ind1 + 1; ind2 < neighbors_higher.Len(); ind2++) { int dst1 = neighbors_higher[ind1]; int dst2 = neighbors_higher[ind2]; // Check for triangle formation if (static_graph_->IsEdge(dst1, dst2) || static_graph_->IsEdge(dst2, dst1)) { #pragma omp critical { Us.Add(src); Vs.Add(dst1); Ws.Add(dst2); } } } } } }
void TNGramBs::ConcPass(){ IAssert(!IsFinished()); if (PassN==1){ // first pass // collect stop words and words with too low frequency TIntV DelWIdV; // vector for word-ids for deleting int WIds=WordStrToFqH.Len(); // get number of words-ids for (int WId=0; WId<WIds; WId++){ TStr WordStr=WordStrToFqH.GetKey(WId); // get word string //if ((!SwSet.Empty())&&(SwSet->IsIn(WordStr))){ // WordStrToFqH[WId]=-1; // reset stop-word frequency to -1 //} else if (WordStrToFqH[WId]<MnNGramFq){ DelWIdV.Add(WId); // add infrequent word-id to delete-list } } // delete words WordStrToFqH.DelKeyIdV(DelWIdV); WordStrToFqH.Defrag(); } else if (PassN==2){ // second pass int Cands=CandWIdPrToFqH.Len(); // get number of candidates TIntV WIdV(2); // pre-decl for frequent-candidates vector for (int CandId=0; CandId<Cands; CandId++){ int CandFq=CandWIdPrToFqH[CandId]; // get candidate frequency if (CandFq>=MnNGramFq){ // if candidate is frequent const TIntPr& WIdPr=CandWIdPrToFqH.GetKey(CandId); // get word-id pair WIdV[0]=WIdPr.Val1; WIdV[1]=WIdPr.Val2; // assign word-id to vector WIdVToFqH.AddDat(WIdV, CandFq); // add frequent vector } } // clear candidate word-id pairs CandWIdPrToFqH.Clr(); } else if (PassN>2){ // higher passes int Cands=CandWIdVToFqH.Len(); // get number of candidates for (int CandId=0; CandId<Cands; CandId++){ int CandFq=CandWIdVToFqH[CandId]; // get candidate frequency if (CandFq>=MnNGramFq){ // if candidate is frequent const TIntV& CandWIdV=CandWIdVToFqH.GetKey(CandId); // get word-id vector WIdVToFqH.AddDat(CandWIdV, CandWIdVToFqH[CandId]); // add frequent vector } } // clear candidate word-id vectors CandWIdVToFqH.Clr(); } else { Fail; } // increment pass-number PassN++; // conclude or prepare for new pass if (IsFinished()){ // clear queue CandWIdQ.Clr(); // reset stop-words int WIds=WordStrToFqH.Len(); // get number of words-ids for (int WId=0; WId<WIds; WId++){ TStr WordStr=WordStrToFqH.GetKey(WId); // get word string if 
((!SwSet.Empty())&&(SwSet->IsIn(WordStr))){ WordStrToFqH[WId]=-1; // reset stop-word frequency to -1 } } // reset ngrams starting with with stop-words for (int WIdVId=0; WIdVId<WIdVToFqH.Len(); WIdVId++){ int FirstWId=WIdVToFqH.GetKey(WIdVId)[0]; // get first word-id int LastWId=WIdVToFqH.GetKey(WIdVId).Last(); // get last word-id if ((WordStrToFqH[FirstWId]==-1)||(WordStrToFqH[LastWId]==-1)){ TStr NGramStr=GetWIdVStr(WIdVToFqH.GetKey(WIdVId)); WIdVToFqH[WIdVId]=-1; } } // print frequent ngrams for (int WIdVId=0; WIdVId<WIdVToFqH.Len(); WIdVId++){ if (WIdVToFqH.GetKey(WIdVId).Len()>1){ //TStr NGramStr=GetWIdVStr(WIdVToFqH.GetKey(WIdVId)); //int NGramFq=WIdVToFqH[WIdVId]; //printf("%s: %d\n", NGramStr.CStr(), NGramFq); } } } else { // prepare new queue length CandWIdQ.Gen(100*PassN, PassN); } }
// Fills NIdV with the ids of all left-side nodes (the keys of LeftH).
// Any previous content of NIdV is discarded.
void TBPGraph::GetLNIdV(TIntV& NIdV) const {
  NIdV.Gen(GetLNodes(), 0);  // capacity for all left nodes, length 0
  int KeyId = LeftH.FFirstKeyId();
  while (LeftH.FNextKeyId(KeyId)) {
    NIdV.Add(LeftH.GetKey(KeyId));
  }
}
// Fills NIdV with the ids of all nodes (the keys of NodeH).
// Any previous content of NIdV is discarded.
void TNEANet::GetNIdV(TIntV& NIdV) const {
  NIdV.Gen(GetNodes(), 0);  // capacity for all nodes, length 0
  int KeyId = NodeH.FFirstKeyId();
  while (NodeH.FNextKeyId(KeyId)) {
    NIdV.Add(NodeH.GetKey(KeyId));
  }
}
// Fills NIdV with the ids of all right-side nodes (the keys of RightH).
// Any previous content of NIdV is discarded.
void TBPGraph::GetRNIdV(TIntV& NIdV) const {
  NIdV.Gen(GetRNodes(), 0);  // capacity for all right nodes, length 0
  int KeyId = RightH.FFirstKeyId();
  while (RightH.FNextKeyId(KeyId)) {
    NIdV.Add(RightH.GetKey(KeyId));
  }
}
int main(int argc, char* argv[]) { //// what type of graph do you want to use? //typedef PUNGraph PGraph; // undirected graph typedef PNGraph PGraph; // directed graph //typedef PNEGraph PGraph; // directed multigraph //typedef TPt<TNodeNet<TInt> > PGraph; //typedef TPt<TNodeEdgeNet<TInt, TInt> > PGraph; // this code is independent of what particular graph implementation/type we use printf("Creating graph:\n"); PGraph G = PGraph::TObj::New(); for (int n = 0; n < 14; n++) { G->AddNode(); // if no parameter is given, node ids are 0,1,...,9 } G->AddEdge(1, 4); printf(" Edge 1 -- 4 added\n"); G->AddEdge(1, 3); printf(" Edge 1 -- 3 added\n"); G->AddEdge(2, 5); printf(" Edge 2 -- 5 added\n"); G->AddEdge(3, 2); printf(" Edge 3 -- 2 added\n"); G->AddEdge(3, 5); printf(" Edge 3 -- 5 added\n"); G->AddEdge(3, 10); printf(" Edge 3 -- 10 added\n"); /*for (int e = 0; e < 10; e++) { const int NId1 = G->GetRndNId(); const int NId2 = G->GetRndNId(); if (G->AddEdge(NId1, NId2) != -2) { printf(" Edge %d -- %d added\n", NId1, NId2); } else { printf(" Edge %d -- %d already exists\n", NId1, NId2); } }*/ IAssert(G->IsOk()); //G->Dump(); // delete PGraph::TObj::TNodeI NI = G->GetNI(0); printf("Delete edge %d -- %d\n", NI.GetId(), NI.GetOutNId(0)); G->DelEdge(NI.GetId(), NI.GetOutNId(0)); const int RndNId = G->GetRndNId(); printf("Delete node %d\n", RndNId); G->DelNode(RndNId); IAssert(G->IsOk()); // dump the graph printf("Graph (%d, %d)\n", G->GetNodes(), G->GetEdges()); for (PGraph::TObj::TNodeI NI = G->BegNI(); NI < G->EndNI(); NI++) { printf(" %d: ", NI.GetId()); for (int e = 0; e < NI.GetDeg(); e++) { printf(" %d", NI.GetNbrNId(e)); } printf("\n"); } // dump subgraph TIntV NIdV; for (PGraph::TObj::TNodeI NI = G->BegNI(); NI < G->EndNI(); NI++) { if (NIdV.Len() < G->GetNodes()/2) { NIdV.Add(NI.GetId()); } } PGraph SubG = TSnap::GetSubGraph(G, NIdV); //SubG->Dump(); // get UNGraph { PUNGraph UNG = TSnap::ConvertGraph<PUNGraph>(SubG); UNG->Dump(); IAssert(UNG->IsOk()); 
TSnap::ConvertSubGraph<PNGraph>(G, NIdV)->Dump(); } // get NGraph { PNGraph NG = TSnap::ConvertGraph<PNGraph>(SubG); NG->Dump(); IAssert(NG->IsOk()); TSnap::ConvertSubGraph<PNGraph>(G, NIdV)->Dump(); } // get NEGraph { PNEGraph NEG = TSnap::ConvertGraph<PNEGraph>(SubG); NEG->Dump(); IAssert(NEG->IsOk()); TSnap::ConvertSubGraph<PNGraph>(G, NIdV)->Dump(); } TSnap::TestAnf<PUNGraph>(); return 0; }
void ComputeMissingProperties (const TStr &Dir, const TStr &TriplesFilename) { // Parse the rdf file and create the graph. TFIn File(TriplesFilename); TRDFParser DBpediaDataset(File); printf("Creating graph from input file...\n"); TGraph G; TStrSet NodeStrs; TStrSet PropStrs; bool Parsed = TSnap::GetGraphFromRDFParser(DBpediaDataset, G, NodeStrs, PropStrs); if (!Parsed) { return; } // Store the graph and associated data G.Save(*TFOut::New(Dir + "graph.bin")); NodeStrs.Save(*TFOut::New(Dir + "nodeStrs.bin")); PropStrs.Save(*TFOut::New(Dir + "propStrs.bin")); printf("Computing objects...\n"); // Get the objects of the graph. TIntV Objects; // We defined the objects to be the nodes with prefix http://dbpedia.org/resource/. TObjectFunctor ObjectFunctor(NodeStrs); TObjectUtils::GetObjects(G, ObjectFunctor, Objects); // Store and print the objects. Objects.Save(*TFOut::New(Dir + "objects.bin")); TObjectUtils::PrintObjects(Objects, NodeStrs, *TFOut::New(Dir + "objects.txt")); printf("Computing object matrix...\n"); // Here we choose the descriptors for the objects. // We chose property + nbh (value) descriptors for objects // We could also use more complicated descriptors such as subgraphs or subnetworks. TSparseColMatrix ObjectMatrix1; TSparseColMatrix ObjectMatrix2; TObjectUtils::GetPropertyCount(Objects, G, ObjectMatrix1); TObjectUtils::GetNbhCount(Objects, G, ObjectMatrix2); TLAUtils::NormalizeMatrix(ObjectMatrix1); TLAUtils::NormalizeMatrix(ObjectMatrix2); TSparseColMatrix ObjectMatrix; TLAUtils::ConcatenateMatricesRowWise(ObjectMatrix1, ObjectMatrix2, ObjectMatrix); TLAUtils::NormalizeMatrix(ObjectMatrix); ObjectMatrix.Save(*TFOut::New(Dir + "objectMatrix.bin")); printf("Clustering objects...\n"); // Partition the objects into 64 partitions (clusters). int K = 64; int NumIterations = 20; TIntV Assigments; TVec<TIntV> Clusters; TClusterUtils::GetClusters(ObjectMatrix, K, NumIterations, Assigments, Clusters); // Store the clustering data. 
Assigments.Save(*TFOut::New(Dir + "assigments.bin")); Clusters.Save(*TFOut::New(Dir + "clusters.bin")); // Print some details about the clusters. TClusterUtils::PrintClusterSizes(Clusters, *TFOut::New(Dir + "clusterSizes.txt")); TClusterUtils::PrintClusters(Clusters, Objects, NodeStrs, *TFOut::New(Dir + "clusters.txt")); printf("Computing similarities...\n"); // Compute the similarity betweeen the objects. const int MaxNumSimilarObjects = 100; const int NumThreads = 10; TVec<TIntFltKdV> Similarities; TSimilarityUtils::ComputeSimilarities(ObjectMatrix, Assigments, Clusters, MaxNumSimilarObjects, NumThreads, Similarities); // Store the object similarities. Similarities.Save(*TFOut::New(Dir + "objectSimilarities.bin")); // Print the object similarities. TSimilarityUtils::PrintSimilarities(Similarities, Objects, NodeStrs, 10, *TFOut::New(Dir + "objectSimilarities.txt")); printf("Computing existing property matrix...\n"); // Our goal is to compute the missing out-going properties. // Therefore, we create the matrix of existing out-going properties of the objects. TSparseColMatrix OutPropertyCountMatrix; TObjectUtils::GetOutPropertyCount(Objects, G, OutPropertyCountMatrix); TObjectUtils::PrintPropertyMatrix(OutPropertyCountMatrix, Objects, NodeStrs, PropStrs, *TFOut::New(Dir + "outPropertyCountMatrix.txt")); OutPropertyCountMatrix.Save(*TFOut::New(Dir + "outPropertyCountMatrix.bin")); printf("Computing missing properties...\n"); // And finally, compute the missing properties. int MaxNumMissingProperties = 100; TVec<TIntFltKdV> MissingProperties; TPropertyUtils::GetMissingProperties(Similarities, OutPropertyCountMatrix, MaxNumMissingProperties, NumThreads, MissingProperties); // Store the missing properties data. MissingProperties.Save(*TFOut::New(Dir + "missingProperties.bin")); // Print missing properties. TPropertyUtils::PrintMissingProperties(MissingProperties, Objects, NodeStrs, PropStrs, 10, *TFOut::New(Dir + "missingProperties.txt")); }
/// Trains a Winnow model for one category of a bag-of-words document base.
///
/// Every word gets a positive and a negative "mini-expert" weight, both
/// starting at 1. For each training document the expert that disagrees with
/// the document's true class is multiplied by Beta, after which the weights of
/// the document's words are rescaled so that their sum is unchanged. Training
/// stops after 50 passes or after 3 consecutive passes judged worse than the
/// preceding one. Per-pass precision/recall/F1/accuracy are printed to stdout.
///
/// \param BowDocBs Document base; all of its documents are used for training.
/// \param CatNm Name of the category to learn.
/// \param Beta Multiplicative penalty applied to the wrong expert.
/// \return The trained model.
/// Throws via TExcept::Throw when CatNm is not a category of BowDocBs.
PBowMd TBowWinnowMd::New(
 const PBowDocBs& BowDocBs, const TStr& CatNm, const double& Beta){
  // create model (BowMd takes ownership straight away, so the throw below
  // cannot leak it)
  TBowWinnowMd* WinnowMd=new TBowWinnowMd(BowDocBs); PBowMd BowMd(WinnowMd);
  WinnowMd->CatNm=CatNm;
  WinnowMd->Beta=Beta;
  WinnowMd->VoteTsh=0.5;
  // prepare Winnow parameters
  const double MnExpertWgtSum=1e-15;
  // get cat-id
  int CId=BowDocBs->GetCId(CatNm);
  if (CId==-1){
    TExcept::Throw(TStr::GetStr(CatNm, "Invalid Category Name ('%s')!"));}
  // get training documents
  TIntV TrainDIdV; BowDocBs->GetAllDIdV(TrainDIdV);
  int TrainDocs=TrainDIdV.Len();
  // prepare mini-experts
  int Words=BowDocBs->GetWords();
  WinnowMd->PosExpertWgtV.Gen(Words); WinnowMd->PosExpertWgtV.PutAll(1);
  WinnowMd->NegExpertWgtV.Gen(Words); WinnowMd->NegExpertWgtV.PutAll(1);
  // winnow loop
  double PrevAcc=0; double PrevPrec=0; double PrevRec=0; double PrevF1=0;
  const double MxDiff=-0.005; const int MxWorseIters=3; int WorseIters=0;
  const int MxIters=50; int IterN=0;
  while ((IterN<MxIters)&&(WorseIters<MxWorseIters)){
    IterN++;
    int FalsePos=0; int FalseNeg=0; int TruePos=0; int TrueNeg=0;
    for (int DIdN=0; DIdN<TrainDocs; DIdN++){
      int DId=TrainDIdV[DIdN];
      bool ClassVal=BowDocBs->IsCatInDoc(DId, CId);
      double PosWgt=0; double NegWgt=0;
      double OldSum=0; double NewSum=0;
      int WIds=BowDocBs->GetDocWIds(DId);
      // change only experts of words that occur in the document
      for (int WIdN=0; WIdN<WIds; WIdN++){
        int WId=BowDocBs->GetDocWId(DId, WIdN);
        OldSum+=WinnowMd->PosExpertWgtV[WId]+WinnowMd->NegExpertWgtV[WId];
        // penalize expert giving wrong class prediction
        if (ClassVal){
          WinnowMd->NegExpertWgtV[WId]*=Beta;
        } else {
          WinnowMd->PosExpertWgtV[WId]*=Beta;
        }
        NewSum+=WinnowMd->PosExpertWgtV[WId]+WinnowMd->NegExpertWgtV[WId];
        // votes are accumulated after the penalty and before rescaling; the
        // rescale factor is common to both, so the vote ratio is unaffected
        PosWgt+=WinnowMd->PosExpertWgtV[WId];
        NegWgt+=WinnowMd->NegExpertWgtV[WId];
      }
      // normalize all experts of this document back to their previous sum;
      // skipped when the new sum is too small to divide by safely
      if (NewSum>MnExpertWgtSum){
        const double Scale=OldSum/NewSum;
        for (int WIdN=0; WIdN<WIds; WIdN++){
          int WId=BowDocBs->GetDocWId(DId, WIdN);
          WinnowMd->PosExpertWgtV[WId]*=Scale;
          WinnowMd->NegExpertWgtV[WId]*=Scale;
        }
      }
      // predict by weighted vote; random guess when there is no vote at all
      bool PredClassVal;
      if (PosWgt+NegWgt==0){PredClassVal=TBool::GetRnd();}
      else {PredClassVal=(PosWgt/(PosWgt+NegWgt))>WinnowMd->VoteTsh;}
      if (PredClassVal==ClassVal){
        if (PredClassVal){TruePos++;} else {TrueNeg++;}
      } else {
        if (PredClassVal){FalsePos++;} else {FalseNeg++;}
      }
    }
    // calculate temporary results
    if (TrainDocs==0){break;}
    const double Acc=100*(TruePos+TrueNeg)/double(TrainDocs);
    double Prec=0; double Rec=0; double F1=0;
    // each ratio is guarded by its own denominator: recall used to be
    // computed under the precision guard and became 0/0 (NaN) when the
    // category had no positive documents but false positives occurred
    if (TruePos+FalsePos>0){
      Prec=(TruePos/double(TruePos+FalsePos));}
    if (TruePos+FalseNeg>0){
      Rec=(TruePos/double(TruePos+FalseNeg));}
    if (Prec+Rec>0){
      F1=(2*Prec*Rec/(Prec+Rec));}
    // check if the current iteration gave worse results than the previous
    if (((Acc-PrevAcc)<MxDiff)||((F1-PrevF1)<MxDiff)||(((Prec-PrevPrec)<MxDiff)&&
     ((Rec-PrevRec)<MxDiff))){WorseIters++;}
    else {WorseIters=0;}
    PrevAcc=Acc; PrevPrec=Prec; PrevRec=Rec; PrevF1=F1;
    printf("%d. Precision:%0.3f Recall:%0.3f F1:%0.3f Accuracy:%0.3f%%\n",
     IterN, Prec, Rec, F1, Acc);
  }
  // return model
  return BowMd;
}
/** * Used for benchmarking sorting by source algorithm. * Takes as input starting point of * a top cascade and outputs time taken for casacade detection. * Input : Source, Dest, Start, Duration * Output : Prints the time for cascade detection */ int main(int argc,char* argv[]) { TTableContext Context; Schema TimeS; TimeS.Add(TPair<TStr,TAttrType>("Source",atInt)); TimeS.Add(TPair<TStr,TAttrType>("Dest",atInt)); TimeS.Add(TPair<TStr,TAttrType>("Start",atInt)); TimeS.Add(TPair<TStr,TAttrType>("Duration",atInt)); PTable P1 = TTable::LoadSS(TimeS,"./../../../../datasets/temporal/yemen_call_201001.txt",&Context,' '); TIntV MapV; TStrV SortBy; SortBy.Add("Source"); P1->Order(SortBy); TIntV Source; // Sorted vec of start time P1->ReadIntCol("Source",Source); for (TRowIterator RI = P1->BegRI(); RI < P1->EndRI(); RI++) { MapV.Add(RI.GetRowIdx()); } // Attribute to Int mapping TInt SIdx = P1->GetColIdx("Source"); TInt DIdx = P1->GetColIdx("Dest"); TInt StIdx = P1->GetColIdx("Start"); TInt DuIdx = P1->GetColIdx("Duration"); int W = atoi(argv[1]); int len = 0; // Find the starting point int TSource = atoi(argv[2]); int TDest = atoi(argv[3]); int TStart = atoi(argv[4]); int TDur = atoi(argv[5]); TInt RIdx; for (TRowIterator RI = P1->BegRI(); RI < P1->EndRI(); RI++) { RIdx = RI.GetRowIdx(); int RSource = P1->GetIntValAtRowIdx(SIdx,RIdx).Val; int RDest = P1->GetIntValAtRowIdx(DIdx,RIdx).Val; int RStart = P1->GetIntValAtRowIdx(StIdx,RIdx).Val; int RDur = P1->GetIntValAtRowIdx(DuIdx,RIdx).Val; if (TSource == RSource && TDest == RDest && TStart == RStart && TDur == RDur) break; } // Start building the cascade from the start point clock_t st,et; st = clock(); for (int i = 0; i < 1; i++) { THashSet<TInt> VisitedH; TSnapQueue<TInt> EventQ; EventQ.Push(RIdx); VisitedH.AddKey(RIdx); while (!EventQ.Empty()) { TInt CIdx = EventQ.Top(); EventQ.Pop(); int CDest = P1->GetIntValAtRowIdx(DIdx,CIdx).Val; int CStart = P1->GetIntValAtRowIdx(StIdx,CIdx).Val; int CDur = 
P1->GetIntValAtRowIdx(DuIdx,CIdx).Val; // In line binary search int val = CDest; int lo = 0; int hi = Source.Len() - 1; int index = -1; while (hi >= lo) { int mid = lo + (hi - lo)/2; if (Source.GetVal(mid) > val) { hi = mid - 1;} else if (Source.GetVal(mid) < val) { lo = mid + 1;} else { index = mid; hi = mid - 1;} } // End of binary search int BIdx = index; for(int i = BIdx; i < Source.Len(); i++) { int PId = MapV.GetVal(i).Val; if (! VisitedH.IsKey(PId)) { int TSource = P1->GetIntValAtRowIdx(SIdx,PId).Val; int TStart = P1->GetIntValAtRowIdx(StIdx,PId).Val; if (TSource != CDest) { break; } if (TStart >= (CDur + CStart) && TStart - (CDur + CStart) <= W) { VisitedH.AddKey(PId); EventQ.Push(PId); } } } } len = VisitedH.Len(); } et = clock(); float diff = ((float) et - (float) st)/CLOCKS_PER_SEC; printf("Size %d,Time %f\n",len,diff); return 0; }