Example #1
0
void TSAppSrvFun::GetFldValSet(const TStrKdV& FldNmValPrV, const TStr& FldNm, TStrSet& FldValSet) {
	FldValSet.Clr();
	int ValN = FldNmValPrV.SearchForw(TStrKd(FldNm, ""));
	while (ValN != -1) {
		FldValSet.AddKey(FldNmValPrV[ValN].Dat);
		ValN = FldNmValPrV.SearchForw(TStrKd(FldNm, ""), ValN + 1);
	}
}
int main(int argc, char *argv[]) {
  TStr BaseString = "/lfs/1/tmp/curis/week/QBDB.bin";
  TFIn BaseFile(BaseString);
  TQuoteBase *QB = new TQuoteBase;
  TDocBase *DB = new TDocBase;
  QB->Load(BaseFile);
  DB->Load(BaseFile);

  TIntV QuoteIds;
  QB->GetAllQuoteIds(QuoteIds);

  int NumQuotes = QuoteIds.Len();
  THash<TInt, TStrSet> PeakCounts;
  for (int i = 0; i < NumQuotes; i++) {
    TQuote CurQuote;
    if (QB->GetQuote(QuoteIds[i], CurQuote)) {
      TVec<TSecTm> Peaks;
      CurQuote.GetPeaks(DB, Peaks);
      TStr QuoteString;
      CurQuote.GetParsedContentString(QuoteString);
      TStrSet StringSet;
      if (PeakCounts.IsKey(Peaks.Len())) {
        StringSet = PeakCounts.GetDat(Peaks.Len());
      }
      StringSet.AddKey(QuoteString);
      PeakCounts.AddDat(Peaks.Len(), StringSet);
    }
  }

  TIntV PeakCountKeys;
  PeakCounts.GetKeyV(PeakCountKeys);
  PeakCountKeys.Sort(true);
  for (int i = 0; i < PeakCountKeys.Len(); i++) {
    TStrSet CurSet = PeakCounts.GetDat(PeakCountKeys[i]);
    if (CurSet.Len() > 0) {
      printf("QUOTES WITH %d PEAKS\n", PeakCountKeys[i].Val);
      printf("#########################################\n");
      THashSet<TStr> StringSet = PeakCounts.GetDat(PeakCountKeys[i]);
      for (THashSet<TStr>::TIter l = StringSet.BegI(); l < StringSet.EndI(); l++) {
        printf("%s\n", l.GetKey().CStr());
      }
      printf("\n");
    }
  }
  delete QB;
  delete DB;
  return 0;
}
Example #3
0
// Eve communication network
// Reads a ';'-separated text file line by line. The first two fields of a
// line name the two endpoints; each occurrence of a pair adds 1 to the data
// of edge (first, second), creating nodes and the edge on first sight.
// Node ids are the key ids handed out by a local string set, and the field
// text is passed to AddNode as the node's data.
// Lines with fewer than two fields (e.g. blank lines) are skipped.
// Prints a progress counter, the network info and the total edge weight,
// then returns the network.
PWgtNet TWgtNet::LoadEveCommNet(const TStr& FNm) {
  PWgtNet Net = TWgtNet::New();
  TStrSet AuthorSet;
  TChA Ln;
  TVec<char*> WrdV;
  TFIn FIn(FNm);
  for (int c=0; FIn.GetNextLn(Ln); c++) {
    TStrUtil::SplitOnCh(Ln, WrdV, ';');
    // Guard: indexing WrdV[1] on a short or empty line would be out of range.
    if (WrdV.Len() >= 2) {
      const int n1 = AuthorSet.AddKey(WrdV[0]);
      const int n2 = AuthorSet.AddKey(WrdV[1]);
      if (! Net->IsNode(n1)) { Net->AddNode(n1, WrdV[0]); }
      if (! Net->IsNode(n2)) { Net->AddNode(n2, WrdV[1]); }
      if (Net->IsEdge(n1, n2)) { Net->GetEDat(n1, n2) += 1; }
      else { Net->AddEdge(n1, n2, 1); }
    }
    // Progress is reported per input line, including skipped ones.
    if (c % Kilo(10) == 0) { printf("\r%dk", c/1000); }
  }
  printf("\n");
  TGBase::PrintInfo(Net);
  printf("  Edge weight: %f\n", Net->GetEdgeWgt());
  return Net;
}
Example #4
0
// Arxiv co-authorship network
// For every paper in the list, each ordered pair of distinct author
// positions (a, b) adds 1 to the data of edge a->b, so the network is
// undirected in effect: both directions receive the same weight.
// Node ids are the key ids assigned by a local string set of author names.
// Nodes are created only when an author is part of such a pair.
// Example input: "W:\\Data\\Arxiv\\Arxiv-CoAuth\\gr-qc.lis"
PWgtNet TWgtNet::LoadArxivCoAuth(const TStr& FNm) {
  TArxivPaperList Arxiv(FNm);
  PWgtNet Net = TWgtNet::New();
  TStrSet AuthorSet;
  while (Arxiv.Next()) {
    const int NumAuthors = Arxiv.AuthorV.Len();
    for (int SrcN = 0; SrcN < NumAuthors; SrcN++) {
      // The source author is registered in the string set even when the
      // paper has no second author to pair it with.
      const int SrcNId = AuthorSet.AddKey(Arxiv.AuthorV[SrcN]);
      for (int DstN = 0; DstN < NumAuthors; DstN++) {
        if (SrcN != DstN) {
          const int DstNId = AuthorSet.AddKey(Arxiv.AuthorV[DstN]);
          if (! Net->IsNode(SrcNId)) { Net->AddNode(SrcNId, Arxiv.AuthorV[SrcN]); }
          if (! Net->IsNode(DstNId)) { Net->AddNode(DstNId, Arxiv.AuthorV[DstN]); }
          if (! Net->IsEdge(SrcNId, DstNId)) {
            Net->AddEdge(SrcNId, DstNId, 1);
          } else {
            Net->GetEDat(SrcNId, DstNId) += 1;
          }
        }
      }
    }
  }
  TGBase::PrintInfo(Net);
  printf("  Edge weight: %f\n", Net->GetEdgeWgt());
  return Net;
}
Example #5
0
// "W:\\Data\\CiteSeer\\old\\citeseer-links.csv"
// CiteSeer co-authorship network from a comma-separated file.
// Fields from index 2 onward of each row are treated as author names (the
// first two fields are skipped - presumably non-author columns; confirm
// against the file format). Each ordered pair of distinct author positions
// (a, b) adds 1 to the data of edge a->b; node ids come from a local string
// set of names.
PWgtNet TWgtNet::LoadCiteSeerCoAuth(const TStr& FNm) {
  PWgtNet Net = TWgtNet::New();
  TStrSet AuthorSet;
  TSsParser Ss(FNm, ssfCommaSep);
  while (Ss.Next()) {
    const int NumFlds = Ss.Len();
    for (int SrcFld = 2; SrcFld < NumFlds; SrcFld++) {
      // Registered in the string set even if no other author follows.
      const int SrcNId = AuthorSet.AddKey(Ss[SrcFld]);
      for (int DstFld = 2; DstFld < NumFlds; DstFld++) {
        if (SrcFld != DstFld) {
          const int DstNId = AuthorSet.AddKey(Ss[DstFld]);
          if (! Net->IsNode(SrcNId)) { Net->AddNode(SrcNId, Ss[SrcFld]); }
          if (! Net->IsNode(DstNId)) { Net->AddNode(DstNId, Ss[DstFld]); }
          if (! Net->IsEdge(SrcNId, DstNId)) {
            Net->AddEdge(SrcNId, DstNId, 1);
          } else {
            Net->GetEDat(SrcNId, DstNId) += 1;
          }
        }
      }
    }
  }
  TGBase::PrintInfo(Net);
  printf("  Edge weight: %f\n", Net->GetEdgeWgt());
  return Net;
}
Example #6
0
// DBLP co-authorship network.
// Network is undirected in effect: for every record, each ordered pair of
// distinct author positions (a, b) adds 1 to the data of edge a->b, so both
// directions end up with equal weight. Node ids are the key ids assigned by
// a local string set of author names. Progress is printed every 1000 records.
// Example input: "W:\\Data\\DBLP\\dblp.xml.gz"
PWgtNet TWgtNet::LoadDblpCoAuth(const TStr& FNm) {
  TDblpLoader Dblp(FNm);
  TStrSet AuthorSet;
  PWgtNet Net = TWgtNet::New();
  for (int RecN = 0; Dblp.Next(); RecN++) {
    const int NumAuthors = Dblp.AuthorV.Len();
    for (int SrcN = 0; SrcN < NumAuthors; SrcN++) {
      // Registered in the string set even for single-author records.
      const int SrcNId = AuthorSet.AddKey(Dblp.AuthorV[SrcN]);
      for (int DstN = 0; DstN < NumAuthors; DstN++) {
        if (SrcN != DstN) {
          const int DstNId = AuthorSet.AddKey(Dblp.AuthorV[DstN]);
          if (! Net->IsNode(SrcNId)) { Net->AddNode(SrcNId, Dblp.AuthorV[SrcN]); }
          if (! Net->IsNode(DstNId)) { Net->AddNode(DstNId, Dblp.AuthorV[DstN]); }
          if (! Net->IsEdge(SrcNId, DstNId)) {
            Net->AddEdge(SrcNId, DstNId, 1);
          } else {
            Net->GetEDat(SrcNId, DstNId) += 1;
          }
        }
      }
    }
    if (RecN % 1000 == 0) { printf("\r%d", RecN); }
  }
  printf("\n");
  TGBase::PrintInfo(Net);
  printf("  Edge weight: %f\n", Net->GetEdgeWgt());
  return Net;
}
void ComputeMissingProperties (const TStr &Dir, const TStr &TriplesFilename)
{
  // Parse the rdf file and create the graph.
  TFIn File(TriplesFilename);
  TRDFParser DBpediaDataset(File);

  printf("Creating graph from input file...\n");
  TGraph G;
  TStrSet NodeStrs;
  TStrSet PropStrs;
  bool Parsed = TSnap::GetGraphFromRDFParser(DBpediaDataset, G, NodeStrs, PropStrs);
  if (!Parsed) {
    return;
  }

  // Store the graph and associated data
  G.Save(*TFOut::New(Dir + "graph.bin"));
  NodeStrs.Save(*TFOut::New(Dir + "nodeStrs.bin"));
  PropStrs.Save(*TFOut::New(Dir + "propStrs.bin"));

  printf("Computing objects...\n");
  // Get the objects of the graph. 
  TIntV Objects;
  // We defined the objects to be the nodes with prefix http://dbpedia.org/resource/. 
  TObjectFunctor ObjectFunctor(NodeStrs);
  TObjectUtils::GetObjects(G, ObjectFunctor, Objects);
  // Store and print the objects.
  Objects.Save(*TFOut::New(Dir + "objects.bin"));
  TObjectUtils::PrintObjects(Objects, NodeStrs, *TFOut::New(Dir + "objects.txt"));

  printf("Computing object matrix...\n");
  // Here we choose the descriptors for the objects.
  // We chose property + nbh (value) descriptors for objects
  // We could also use more complicated descriptors such as subgraphs or subnetworks.
  TSparseColMatrix ObjectMatrix1;
  TSparseColMatrix ObjectMatrix2;
  TObjectUtils::GetPropertyCount(Objects, G, ObjectMatrix1);
  TObjectUtils::GetNbhCount(Objects, G, ObjectMatrix2);
  TLAUtils::NormalizeMatrix(ObjectMatrix1);
  TLAUtils::NormalizeMatrix(ObjectMatrix2);

  TSparseColMatrix ObjectMatrix;
  TLAUtils::ConcatenateMatricesRowWise(ObjectMatrix1, ObjectMatrix2, ObjectMatrix);
  TLAUtils::NormalizeMatrix(ObjectMatrix);
  ObjectMatrix.Save(*TFOut::New(Dir + "objectMatrix.bin"));

  printf("Clustering objects...\n");
  // Partition the objects into 64 partitions (clusters).
  int K = 64;
  int NumIterations = 20;
  TIntV Assigments;
  TVec<TIntV> Clusters;
  TClusterUtils::GetClusters(ObjectMatrix, K, NumIterations, Assigments, Clusters);
  // Store the clustering data.
  Assigments.Save(*TFOut::New(Dir + "assigments.bin"));
  Clusters.Save(*TFOut::New(Dir + "clusters.bin"));
  // Print some details about the clusters.
  TClusterUtils::PrintClusterSizes(Clusters, *TFOut::New(Dir + "clusterSizes.txt"));
  TClusterUtils::PrintClusters(Clusters, Objects, NodeStrs, *TFOut::New(Dir + "clusters.txt"));

  printf("Computing similarities...\n");
  // Compute the similarity betweeen the objects.
  const int MaxNumSimilarObjects = 100;
  const int NumThreads = 10;
  TVec<TIntFltKdV> Similarities;
  TSimilarityUtils::ComputeSimilarities(ObjectMatrix, Assigments, Clusters, MaxNumSimilarObjects, NumThreads, Similarities);
  // Store the object similarities.
  Similarities.Save(*TFOut::New(Dir + "objectSimilarities.bin"));
  // Print the object similarities.
  TSimilarityUtils::PrintSimilarities(Similarities, Objects, NodeStrs, 10, *TFOut::New(Dir + "objectSimilarities.txt"));

  printf("Computing existing property matrix...\n");
  // Our goal is to compute the missing out-going properties.
  // Therefore, we create the matrix of existing out-going properties of the objects.
  TSparseColMatrix OutPropertyCountMatrix;
  TObjectUtils::GetOutPropertyCount(Objects, G, OutPropertyCountMatrix);
  TObjectUtils::PrintPropertyMatrix(OutPropertyCountMatrix, Objects, NodeStrs, PropStrs, *TFOut::New(Dir + "outPropertyCountMatrix.txt"));
  OutPropertyCountMatrix.Save(*TFOut::New(Dir + "outPropertyCountMatrix.bin"));

  printf("Computing missing properties...\n");
  // And finally, compute the missing properties.
  int MaxNumMissingProperties = 100;
  TVec<TIntFltKdV> MissingProperties;
  TPropertyUtils::GetMissingProperties(Similarities, OutPropertyCountMatrix, MaxNumMissingProperties, NumThreads, MissingProperties);
  // Store the missing properties data.
  MissingProperties.Save(*TFOut::New(Dir + "missingProperties.bin"));
  // Print missing properties.
  TPropertyUtils::PrintMissingProperties(MissingProperties, Objects, NodeStrs, PropStrs, 10, *TFOut::New(Dir + "missingProperties.txt"));
}