void TSAppSrvFun::GetFldValSet(const TStrKdV& FldNmValPrV, const TStr& FldNm, TStrSet& FldValSet) { FldValSet.Clr(); int ValN = FldNmValPrV.SearchForw(TStrKd(FldNm, "")); while (ValN != -1) { FldValSet.AddKey(FldNmValPrV[ValN].Dat); ValN = FldNmValPrV.SearchForw(TStrKd(FldNm, ""), ValN + 1); } }
int main(int argc, char *argv[]) { TStr BaseString = "/lfs/1/tmp/curis/week/QBDB.bin"; TFIn BaseFile(BaseString); TQuoteBase *QB = new TQuoteBase; TDocBase *DB = new TDocBase; QB->Load(BaseFile); DB->Load(BaseFile); TIntV QuoteIds; QB->GetAllQuoteIds(QuoteIds); int NumQuotes = QuoteIds.Len(); THash<TInt, TStrSet> PeakCounts; for (int i = 0; i < NumQuotes; i++) { TQuote CurQuote; if (QB->GetQuote(QuoteIds[i], CurQuote)) { TVec<TSecTm> Peaks; CurQuote.GetPeaks(DB, Peaks); TStr QuoteString; CurQuote.GetParsedContentString(QuoteString); TStrSet StringSet; if (PeakCounts.IsKey(Peaks.Len())) { StringSet = PeakCounts.GetDat(Peaks.Len()); } StringSet.AddKey(QuoteString); PeakCounts.AddDat(Peaks.Len(), StringSet); } } TIntV PeakCountKeys; PeakCounts.GetKeyV(PeakCountKeys); PeakCountKeys.Sort(true); for (int i = 0; i < PeakCountKeys.Len(); i++) { TStrSet CurSet = PeakCounts.GetDat(PeakCountKeys[i]); if (CurSet.Len() > 0) { printf("QUOTES WITH %d PEAKS\n", PeakCountKeys[i].Val); printf("#########################################\n"); THashSet<TStr> StringSet = PeakCounts.GetDat(PeakCountKeys[i]); for (THashSet<TStr>::TIter l = StringSet.BegI(); l < StringSet.EndI(); l++) { printf("%s\n", l.GetKey().CStr()); } printf("\n"); } } delete QB; delete DB; return 0; }
// Eve communication network PWgtNet TWgtNet::LoadEveCommNet(const TStr& FNm) { PWgtNet Net = TWgtNet::New(); TStrSet AuthorSet; TChA Ln; TVec<char*> WrdV; TFIn FIn(FNm); for (int c=0; FIn.GetNextLn(Ln); c++) { TStrUtil::SplitOnCh(Ln, WrdV, ';'); const int n1 = AuthorSet.AddKey(WrdV[0]); const int n2 = AuthorSet.AddKey(WrdV[1]); if (! Net->IsNode(n1)) { Net->AddNode(n1, WrdV[0]); } if (! Net->IsNode(n2)) { Net->AddNode(n2, WrdV[1]); } if (Net->IsEdge(n1, n2)) { Net->GetEDat(n1, n2) += 1; } else { Net->AddEdge(n1, n2, 1); } if (c % Kilo(10) == 0) { printf("\r%dk", c/1000); } } printf("\n"); TGBase::PrintInfo(Net); printf(" Edge weight: %f\n", Net->GetEdgeWgt()); return Net; }
// Arxiv co-authorship network // Network is undirected (edges of equal weight go both ways) // "W:\\Data\\Arxiv\\Arxiv-CoAuth\\gr-qc.lis" PWgtNet TWgtNet::LoadArxivCoAuth(const TStr& FNm) { TArxivPaperList Arxiv(FNm); PWgtNet Net = TWgtNet::New(); TStrSet AuthorSet; while (Arxiv.Next()) { for (int a1 = 0; a1 < Arxiv.AuthorV.Len(); a1++) { const int n1 = AuthorSet.AddKey(Arxiv.AuthorV[a1]); for (int a2 = 0; a2 < Arxiv.AuthorV.Len(); a2++) { if (a1 == a2) { continue; } const int n2 = AuthorSet.AddKey(Arxiv.AuthorV[a2]); if (! Net->IsNode(n1)) { Net->AddNode(n1, Arxiv.AuthorV[a1]); } if (! Net->IsNode(n2)) { Net->AddNode(n2, Arxiv.AuthorV[a2]); } if (Net->IsEdge(n1, n2)) { Net->GetEDat(n1, n2) += 1; } else { Net->AddEdge(n1, n2, 1); } } } } TGBase::PrintInfo(Net); printf(" Edge weight: %f\n", Net->GetEdgeWgt()); return Net; }
// "W:\\Data\\CiteSeer\\old\\citeseer-links.csv" PWgtNet TWgtNet::LoadCiteSeerCoAuth(const TStr& FNm) { PWgtNet Net = TWgtNet::New(); TStrSet AuthorSet; TSsParser Ss(FNm, ssfCommaSep); while (Ss.Next()) { for (int a1 = 2; a1 < Ss.Len(); a1++) { const int n1 = AuthorSet.AddKey(Ss[a1]); for (int a2 = 2; a2 < Ss.Len(); a2++) { if (a1 == a2) { continue; } const int n2 = AuthorSet.AddKey(Ss[a2]); if (! Net->IsNode(n1)) { Net->AddNode(n1, Ss[a1]); } if (! Net->IsNode(n2)) { Net->AddNode(n2, Ss[a2]); } if (Net->IsEdge(n1, n2)) { Net->GetEDat(n1, n2) += 1; } else { Net->AddEdge(n1, n2, 1); } } } } TGBase::PrintInfo(Net); printf(" Edge weight: %f\n", Net->GetEdgeWgt()); return Net; }
// Network is undirected (edges of equal weight go both ways) // "W:\\Data\\DBLP\\dblp.xml.gz" PWgtNet TWgtNet::LoadDblpCoAuth(const TStr& FNm) { TDblpLoader Dblp(FNm); TStrSet AuthorSet; PWgtNet Net = TWgtNet::New(); for (int c = 0; Dblp.Next(); c++) { for (int a1 = 0; a1 < Dblp.AuthorV.Len(); a1++) { const int n1 = AuthorSet.AddKey(Dblp.AuthorV[a1]); for (int a2 = 0; a2 < Dblp.AuthorV.Len(); a2++) { if (a1 == a2) { continue; } const int n2 = AuthorSet.AddKey(Dblp.AuthorV[a2]); if (! Net->IsNode(n1)) { Net->AddNode(n1, Dblp.AuthorV[a1]); } if (! Net->IsNode(n2)) { Net->AddNode(n2, Dblp.AuthorV[a2]); } if (Net->IsEdge(n1, n2)) { Net->GetEDat(n1, n2) += 1; } else { Net->AddEdge(n1, n2, 1); } } } if (c % 1000 == 0) { printf("\r%d", c); } } printf("\n"); TGBase::PrintInfo(Net); printf(" Edge weight: %f\n", Net->GetEdgeWgt()); return Net; }
void ComputeMissingProperties (const TStr &Dir, const TStr &TriplesFilename) { // Parse the rdf file and create the graph. TFIn File(TriplesFilename); TRDFParser DBpediaDataset(File); printf("Creating graph from input file...\n"); TGraph G; TStrSet NodeStrs; TStrSet PropStrs; bool Parsed = TSnap::GetGraphFromRDFParser(DBpediaDataset, G, NodeStrs, PropStrs); if (!Parsed) { return; } // Store the graph and associated data G.Save(*TFOut::New(Dir + "graph.bin")); NodeStrs.Save(*TFOut::New(Dir + "nodeStrs.bin")); PropStrs.Save(*TFOut::New(Dir + "propStrs.bin")); printf("Computing objects...\n"); // Get the objects of the graph. TIntV Objects; // We defined the objects to be the nodes with prefix http://dbpedia.org/resource/. TObjectFunctor ObjectFunctor(NodeStrs); TObjectUtils::GetObjects(G, ObjectFunctor, Objects); // Store and print the objects. Objects.Save(*TFOut::New(Dir + "objects.bin")); TObjectUtils::PrintObjects(Objects, NodeStrs, *TFOut::New(Dir + "objects.txt")); printf("Computing object matrix...\n"); // Here we choose the descriptors for the objects. // We chose property + nbh (value) descriptors for objects // We could also use more complicated descriptors such as subgraphs or subnetworks. TSparseColMatrix ObjectMatrix1; TSparseColMatrix ObjectMatrix2; TObjectUtils::GetPropertyCount(Objects, G, ObjectMatrix1); TObjectUtils::GetNbhCount(Objects, G, ObjectMatrix2); TLAUtils::NormalizeMatrix(ObjectMatrix1); TLAUtils::NormalizeMatrix(ObjectMatrix2); TSparseColMatrix ObjectMatrix; TLAUtils::ConcatenateMatricesRowWise(ObjectMatrix1, ObjectMatrix2, ObjectMatrix); TLAUtils::NormalizeMatrix(ObjectMatrix); ObjectMatrix.Save(*TFOut::New(Dir + "objectMatrix.bin")); printf("Clustering objects...\n"); // Partition the objects into 64 partitions (clusters). int K = 64; int NumIterations = 20; TIntV Assigments; TVec<TIntV> Clusters; TClusterUtils::GetClusters(ObjectMatrix, K, NumIterations, Assigments, Clusters); // Store the clustering data. Assigments.Save(*TFOut::New(Dir + "assigments.bin")); Clusters.Save(*TFOut::New(Dir + "clusters.bin")); // Print some details about the clusters. TClusterUtils::PrintClusterSizes(Clusters, *TFOut::New(Dir + "clusterSizes.txt")); TClusterUtils::PrintClusters(Clusters, Objects, NodeStrs, *TFOut::New(Dir + "clusters.txt")); printf("Computing similarities...\n"); // Compute the similarity betweeen the objects. const int MaxNumSimilarObjects = 100; const int NumThreads = 10; TVec<TIntFltKdV> Similarities; TSimilarityUtils::ComputeSimilarities(ObjectMatrix, Assigments, Clusters, MaxNumSimilarObjects, NumThreads, Similarities); // Store the object similarities. Similarities.Save(*TFOut::New(Dir + "objectSimilarities.bin")); // Print the object similarities. TSimilarityUtils::PrintSimilarities(Similarities, Objects, NodeStrs, 10, *TFOut::New(Dir + "objectSimilarities.txt")); printf("Computing existing property matrix...\n"); // Our goal is to compute the missing out-going properties. // Therefore, we create the matrix of existing out-going properties of the objects. TSparseColMatrix OutPropertyCountMatrix; TObjectUtils::GetOutPropertyCount(Objects, G, OutPropertyCountMatrix); TObjectUtils::PrintPropertyMatrix(OutPropertyCountMatrix, Objects, NodeStrs, PropStrs, *TFOut::New(Dir + "outPropertyCountMatrix.txt")); OutPropertyCountMatrix.Save(*TFOut::New(Dir + "outPropertyCountMatrix.bin")); printf("Computing missing properties...\n"); // And finally, compute the missing properties. int MaxNumMissingProperties = 100; TVec<TIntFltKdV> MissingProperties; TPropertyUtils::GetMissingProperties(Similarities, OutPropertyCountMatrix, MaxNumMissingProperties, NumThreads, MissingProperties); // Store the missing properties data. MissingProperties.Save(*TFOut::New(Dir + "missingProperties.bin")); // Print missing properties. TPropertyUtils::PrintMissingProperties(MissingProperties, Objects, NodeStrs, PropStrs, 10, *TFOut::New(Dir + "missingProperties.txt")); }