Ejemplo n.º 1
0
int main(int argc, char *argv[]) {
  // #### SETUP: Parse Arguments
  LogOutput Log;
  THash<TStr, TStr> Arguments;
  ArgumentParser::ParseArguments(argc, argv, Arguments, Log);

  TStr OutputDirectory;
  TStr StartString = ArgumentParser::GetArgument(Arguments, "start", "2009-02-01");
  TStr QBDBDirectory = ArgumentParser::GetArgument(Arguments, "qbdb", QBDB_DIR_DEFAULT);
  TStr OutDirectory = ArgumentParser::GetArgument(Arguments, "out", "/lfs/1/tmp/curis/");
  TInt WindowSize = ArgumentParser::GetArgument(Arguments, "window", "14").GetInt();

  if (ArgumentParser::GetArgument(Arguments, "nolog", "") == "") {
    Log.DisableLogging();
  } else if (!Arguments.IsKeyGetDat("directory", OutputDirectory)) {
    Log.SetupNewOutputDirectory("");
  } else {
    Log.SetDirectory(OutputDirectory);
  }

  // #### DATA LOADING: Load ALL the things!
  TQuoteBase QB;
  TDocBase DB;
  fprintf(stderr, "Loading QB and DB from file for %d days, starting from %s...\n", WindowSize.Val, StartString.CStr());
  Err("%s\n", QBDBDirectory.CStr());
  TSecTm PresentTime = TDataLoader::LoadQBDBByWindow(QBDBDirectory, StartString, WindowSize, QB, DB);
  fprintf(stderr, "QBDB successfully loaded!\n");

  TVec<TSecTm> PubTmV;
  TVec<TStr> PostUrlV;
  TVec<TStr> QuoteV;

  fprintf(stderr, "Dumping quotes to file...\n");
  TIntV QuoteIds;
  QB.GetAllQuoteIds(QuoteIds);
  for (int i = 0; i < QuoteIds.Len(); i++) {
    TQuote Q;
    QB.GetQuote(QuoteIds[i], Q);
    TStr QContentString;
    Q.GetContentString(QContentString);

    TVec<TUInt> Sources;
    Q.GetSources(Sources);
    for (int j = 0; j < Sources.Len(); j++) {
      TDoc D;
      DB.GetDoc(Sources[j], D);
      TStr PostUrl;
      D.GetUrl(PostUrl);
      TSecTm PostTime = D.GetDate();
      QuoteV.Add(QContentString);
      PubTmV.Add(PostTime);
      PostUrlV.Add(PostUrl);
    }
  }

  TFOut FOut(OutDirectory + "QuoteList" + ".bin");
  PubTmV.Save(FOut);
  PostUrlV.Save(FOut);
  QuoteV.Save(FOut);

  fprintf(stderr, "Done!\n");
  return 0;
}
Ejemplo n.º 2
0
void ComputeMissingProperties (const TStr &Dir, const TStr &TriplesFilename)
{
  // Parse the rdf file and create the graph.
  TFIn File(TriplesFilename);
  TRDFParser DBpediaDataset(File);

  printf("Creating graph from input file...\n");
  TGraph G;
  TStrSet NodeStrs;
  TStrSet PropStrs;
  bool Parsed = TSnap::GetGraphFromRDFParser(DBpediaDataset, G, NodeStrs, PropStrs);
  if (!Parsed) {
    return;
  }

  // Store the graph and associated data
  G.Save(*TFOut::New(Dir + "graph.bin"));
  NodeStrs.Save(*TFOut::New(Dir + "nodeStrs.bin"));
  PropStrs.Save(*TFOut::New(Dir + "propStrs.bin"));

  printf("Computing objects...\n");
  // Get the objects of the graph. 
  TIntV Objects;
  // We defined the objects to be the nodes with prefix http://dbpedia.org/resource/. 
  TObjectFunctor ObjectFunctor(NodeStrs);
  TObjectUtils::GetObjects(G, ObjectFunctor, Objects);
  // Store and print the objects.
  Objects.Save(*TFOut::New(Dir + "objects.bin"));
  TObjectUtils::PrintObjects(Objects, NodeStrs, *TFOut::New(Dir + "objects.txt"));

  printf("Computing object matrix...\n");
  // Here we choose the descriptors for the objects.
  // We chose property + nbh (value) descriptors for objects
  // We could also use more complicated descriptors such as subgraphs or subnetworks.
  TSparseColMatrix ObjectMatrix1;
  TSparseColMatrix ObjectMatrix2;
  TObjectUtils::GetPropertyCount(Objects, G, ObjectMatrix1);
  TObjectUtils::GetNbhCount(Objects, G, ObjectMatrix2);
  TLAUtils::NormalizeMatrix(ObjectMatrix1);
  TLAUtils::NormalizeMatrix(ObjectMatrix2);

  TSparseColMatrix ObjectMatrix;
  TLAUtils::ConcatenateMatricesRowWise(ObjectMatrix1, ObjectMatrix2, ObjectMatrix);
  TLAUtils::NormalizeMatrix(ObjectMatrix);
  ObjectMatrix.Save(*TFOut::New(Dir + "objectMatrix.bin"));

  printf("Clustering objects...\n");
  // Partition the objects into 64 partitions (clusters).
  int K = 64;
  int NumIterations = 20;
  TIntV Assigments;
  TVec<TIntV> Clusters;
  TClusterUtils::GetClusters(ObjectMatrix, K, NumIterations, Assigments, Clusters);
  // Store the clustering data.
  Assigments.Save(*TFOut::New(Dir + "assigments.bin"));
  Clusters.Save(*TFOut::New(Dir + "clusters.bin"));
  // Print some details about the clusters.
  TClusterUtils::PrintClusterSizes(Clusters, *TFOut::New(Dir + "clusterSizes.txt"));
  TClusterUtils::PrintClusters(Clusters, Objects, NodeStrs, *TFOut::New(Dir + "clusters.txt"));

  printf("Computing similarities...\n");
  // Compute the similarity betweeen the objects.
  const int MaxNumSimilarObjects = 100;
  const int NumThreads = 10;
  TVec<TIntFltKdV> Similarities;
  TSimilarityUtils::ComputeSimilarities(ObjectMatrix, Assigments, Clusters, MaxNumSimilarObjects, NumThreads, Similarities);
  // Store the object similarities.
  Similarities.Save(*TFOut::New(Dir + "objectSimilarities.bin"));
  // Print the object similarities.
  TSimilarityUtils::PrintSimilarities(Similarities, Objects, NodeStrs, 10, *TFOut::New(Dir + "objectSimilarities.txt"));

  printf("Computing existing property matrix...\n");
  // Our goal is to compute the missing out-going properties.
  // Therefore, we create the matrix of existing out-going properties of the objects.
  TSparseColMatrix OutPropertyCountMatrix;
  TObjectUtils::GetOutPropertyCount(Objects, G, OutPropertyCountMatrix);
  TObjectUtils::PrintPropertyMatrix(OutPropertyCountMatrix, Objects, NodeStrs, PropStrs, *TFOut::New(Dir + "outPropertyCountMatrix.txt"));
  OutPropertyCountMatrix.Save(*TFOut::New(Dir + "outPropertyCountMatrix.bin"));

  printf("Computing missing properties...\n");
  // And finally, compute the missing properties.
  int MaxNumMissingProperties = 100;
  TVec<TIntFltKdV> MissingProperties;
  TPropertyUtils::GetMissingProperties(Similarities, OutPropertyCountMatrix, MaxNumMissingProperties, NumThreads, MissingProperties);
  // Store the missing properties data.
  MissingProperties.Save(*TFOut::New(Dir + "missingProperties.bin"));
  // Print missing properties.
  TPropertyUtils::PrintMissingProperties(MissingProperties, Objects, NodeStrs, PropStrs, 10, *TFOut::New(Dir + "missingProperties.txt"));
}