int main(int argc, char *argv[]) { // #### SETUP: Parse Arguments LogOutput Log; THash<TStr, TStr> Arguments; ArgumentParser::ParseArguments(argc, argv, Arguments, Log); TStr OutputDirectory; TStr StartString = ArgumentParser::GetArgument(Arguments, "start", "2009-02-01"); TStr QBDBDirectory = ArgumentParser::GetArgument(Arguments, "qbdb", QBDB_DIR_DEFAULT); TStr OutDirectory = ArgumentParser::GetArgument(Arguments, "out", "/lfs/1/tmp/curis/"); TInt WindowSize = ArgumentParser::GetArgument(Arguments, "window", "14").GetInt(); if (ArgumentParser::GetArgument(Arguments, "nolog", "") == "") { Log.DisableLogging(); } else if (!Arguments.IsKeyGetDat("directory", OutputDirectory)) { Log.SetupNewOutputDirectory(""); } else { Log.SetDirectory(OutputDirectory); } // #### DATA LOADING: Load ALL the things! TQuoteBase QB; TDocBase DB; fprintf(stderr, "Loading QB and DB from file for %d days, starting from %s...\n", WindowSize.Val, StartString.CStr()); Err("%s\n", QBDBDirectory.CStr()); TSecTm PresentTime = TDataLoader::LoadQBDBByWindow(QBDBDirectory, StartString, WindowSize, QB, DB); fprintf(stderr, "QBDB successfully loaded!\n"); TVec<TSecTm> PubTmV; TVec<TStr> PostUrlV; TVec<TStr> QuoteV; fprintf(stderr, "Dumping quotes to file...\n"); TIntV QuoteIds; QB.GetAllQuoteIds(QuoteIds); for (int i = 0; i < QuoteIds.Len(); i++) { TQuote Q; QB.GetQuote(QuoteIds[i], Q); TStr QContentString; Q.GetContentString(QContentString); TVec<TUInt> Sources; Q.GetSources(Sources); for (int j = 0; j < Sources.Len(); j++) { TDoc D; DB.GetDoc(Sources[j], D); TStr PostUrl; D.GetUrl(PostUrl); TSecTm PostTime = D.GetDate(); QuoteV.Add(QContentString); PubTmV.Add(PostTime); PostUrlV.Add(PostUrl); } } TFOut FOut(OutDirectory + "QuoteList" + ".bin"); PubTmV.Save(FOut); PostUrlV.Save(FOut); QuoteV.Save(FOut); fprintf(stderr, "Done!\n"); return 0; }
void ComputeMissingProperties (const TStr &Dir, const TStr &TriplesFilename) { // Parse the rdf file and create the graph. TFIn File(TriplesFilename); TRDFParser DBpediaDataset(File); printf("Creating graph from input file...\n"); TGraph G; TStrSet NodeStrs; TStrSet PropStrs; bool Parsed = TSnap::GetGraphFromRDFParser(DBpediaDataset, G, NodeStrs, PropStrs); if (!Parsed) { return; } // Store the graph and associated data G.Save(*TFOut::New(Dir + "graph.bin")); NodeStrs.Save(*TFOut::New(Dir + "nodeStrs.bin")); PropStrs.Save(*TFOut::New(Dir + "propStrs.bin")); printf("Computing objects...\n"); // Get the objects of the graph. TIntV Objects; // We defined the objects to be the nodes with prefix http://dbpedia.org/resource/. TObjectFunctor ObjectFunctor(NodeStrs); TObjectUtils::GetObjects(G, ObjectFunctor, Objects); // Store and print the objects. Objects.Save(*TFOut::New(Dir + "objects.bin")); TObjectUtils::PrintObjects(Objects, NodeStrs, *TFOut::New(Dir + "objects.txt")); printf("Computing object matrix...\n"); // Here we choose the descriptors for the objects. // We chose property + nbh (value) descriptors for objects // We could also use more complicated descriptors such as subgraphs or subnetworks. TSparseColMatrix ObjectMatrix1; TSparseColMatrix ObjectMatrix2; TObjectUtils::GetPropertyCount(Objects, G, ObjectMatrix1); TObjectUtils::GetNbhCount(Objects, G, ObjectMatrix2); TLAUtils::NormalizeMatrix(ObjectMatrix1); TLAUtils::NormalizeMatrix(ObjectMatrix2); TSparseColMatrix ObjectMatrix; TLAUtils::ConcatenateMatricesRowWise(ObjectMatrix1, ObjectMatrix2, ObjectMatrix); TLAUtils::NormalizeMatrix(ObjectMatrix); ObjectMatrix.Save(*TFOut::New(Dir + "objectMatrix.bin")); printf("Clustering objects...\n"); // Partition the objects into 64 partitions (clusters). int K = 64; int NumIterations = 20; TIntV Assigments; TVec<TIntV> Clusters; TClusterUtils::GetClusters(ObjectMatrix, K, NumIterations, Assigments, Clusters); // Store the clustering data. Assigments.Save(*TFOut::New(Dir + "assigments.bin")); Clusters.Save(*TFOut::New(Dir + "clusters.bin")); // Print some details about the clusters. TClusterUtils::PrintClusterSizes(Clusters, *TFOut::New(Dir + "clusterSizes.txt")); TClusterUtils::PrintClusters(Clusters, Objects, NodeStrs, *TFOut::New(Dir + "clusters.txt")); printf("Computing similarities...\n"); // Compute the similarity betweeen the objects. const int MaxNumSimilarObjects = 100; const int NumThreads = 10; TVec<TIntFltKdV> Similarities; TSimilarityUtils::ComputeSimilarities(ObjectMatrix, Assigments, Clusters, MaxNumSimilarObjects, NumThreads, Similarities); // Store the object similarities. Similarities.Save(*TFOut::New(Dir + "objectSimilarities.bin")); // Print the object similarities. TSimilarityUtils::PrintSimilarities(Similarities, Objects, NodeStrs, 10, *TFOut::New(Dir + "objectSimilarities.txt")); printf("Computing existing property matrix...\n"); // Our goal is to compute the missing out-going properties. // Therefore, we create the matrix of existing out-going properties of the objects. TSparseColMatrix OutPropertyCountMatrix; TObjectUtils::GetOutPropertyCount(Objects, G, OutPropertyCountMatrix); TObjectUtils::PrintPropertyMatrix(OutPropertyCountMatrix, Objects, NodeStrs, PropStrs, *TFOut::New(Dir + "outPropertyCountMatrix.txt")); OutPropertyCountMatrix.Save(*TFOut::New(Dir + "outPropertyCountMatrix.bin")); printf("Computing missing properties...\n"); // And finally, compute the missing properties. int MaxNumMissingProperties = 100; TVec<TIntFltKdV> MissingProperties; TPropertyUtils::GetMissingProperties(Similarities, OutPropertyCountMatrix, MaxNumMissingProperties, NumThreads, MissingProperties); // Store the missing properties data. MissingProperties.Save(*TFOut::New(Dir + "missingProperties.bin")); // Print missing properties. TPropertyUtils::PrintMissingProperties(MissingProperties, Objects, NodeStrs, PropStrs, 10, *TFOut::New(Dir + "missingProperties.txt")); }