コード例 #1
0
/// Compares every pair of quotes that share a shingle bucket.
///
/// For each key of Shingles, all unordered pairs of quote ids in that key's
/// bucket are enumerated. AddEdgeIfSimilar() is called once per distinct pair,
/// no matter how many buckets the pair appears in.
///
/// @param Shingles  Map from shingle signature to the set of quote ids that
///                  contain the shingle.
///
/// Progress and the final counters are written to the error stream.
void QuoteGraph::CompareUsingShingles(THash<TMd5Sig, TIntSet>& Shingles) {
  int Count = 0;      // pairs summed over all buckets (repeats included)
  int RealCount = 0;  // distinct pairs actually handed to AddEdgeIfSimilar
  TVec<TMd5Sig> ShingleKeys;
  Shingles.GetKeyV(ShingleKeys);
  // Pairs already compared, stored once as (smaller id, larger id) so a single
  // lookup covers both orderings.
  THashSet<TIntPr> EdgeCache;

  for (int i = 0; i < ShingleKeys.Len(); i++) {
    if (i % 100 == 0) {
      Err("Processed %d out of %d shingles, count = %d\n", i, ShingleKeys.Len(), Count);
    }
    // The key was just read from Shingles, so the lookup cannot miss. Binding
    // a reference avoids copying the whole bucket.
    // NOTE(review): assumes AddEdgeIfSimilar does not modify Shingles - confirm.
    const TIntSet& Bucket = Shingles.GetDat(ShingleKeys[i]);
    const TIntSet::TIter BucketEnd = Bucket.EndI();

    for (TIntSet::TIter Quote1 = Bucket.BegI(); Quote1 < BucketEnd; Quote1++) {
      TIntSet::TIter Quote2 = Quote1;
      Quote2++;
      for (; Quote2 < BucketEnd; Quote2++) {
        const int Id1 = Quote1.GetKey();
        const int Id2 = Quote2.GetKey();
        const TIntPr PairKey(Id1 < Id2 ? Id1 : Id2, Id1 < Id2 ? Id2 : Id1);
        if (EdgeCache.IsKey(PairKey)) { continue; }
        EdgeCache.AddKey(PairKey);
        RealCount++;
        AddEdgeIfSimilar(Quote1.GetKey(), Quote2.GetKey());
      }
    }
    Count += Bucket.Len() * (Bucket.Len() - 1) / 2;
  }
  fprintf(stderr, "NUMBER OF COMPARES: %d\n", Count);
  fprintf(stderr, "NUMBER OF REAL COMPARES: %d\n", RealCount);
}
コード例 #2
0
ファイル: gio.cpp プロジェクト: pikma/Snap
// Loads all networks found in a DyNetML file.
// Every <network> start tag yields one directed graph; the "link" tags that
// follow it contribute edges from their "source" to their "target" attribute.
// Node names are mapped to integer ids through a single string set shared by
// all networks, so a given name gets the same node id in every returned graph.
TVec<PNGraph> LoadDyNetGraphV(const TStr& FNm) {
  TXmlLx Lexer(TFIn::New(FNm), xspTruncate);
  TVec<PNGraph> Graphs;
  THashSet<TStr> NodeNames;
  while (Lexer.GetSym() != xsyEof) {
    // Only the start tag of a network is of interest at this level.
    if (!(Lexer.Sym == xsySTag && Lexer.TagNm == "network")) { continue; }
    PNGraph Graph = TNGraph::New();
    Graphs.Add(Graph);
    for (Lexer.GetSym(); Lexer.TagNm == "link"; Lexer.GetSym()) {
      TStr SrcArgNm, SrcName, DstArgNm, DstName;
      Lexer.GetArg(0, SrcArgNm, SrcName);
      Lexer.GetArg(1, DstArgNm, DstName);
      IAssert(SrcArgNm == "source" && DstArgNm == "target");
      // AddKey returns the id of the key, whether it is new or already present.
      const int SrcNId = NodeNames.AddKey(SrcName);
      const int DstNId = NodeNames.AddKey(DstName);
      if (!Graph->IsNode(SrcNId)) { Graph->AddNode(SrcNId); }
      if (!Graph->IsNode(DstNId)) { Graph->AddNode(DstNId); }
      Graph->AddEdge(SrcNId, DstNId);
    }
  }
  return Graphs;
}
コード例 #3
0
/// Compares every pair of quotes that fall into the same min-hash bucket.
///
/// BucketsVector holds one hash table per band; each table maps a band
/// signature to the set of quote ids in that bucket. AddEdgeIfSimilar() is
/// called once per distinct pair, no matter how many buckets or bands the pair
/// appears in.
///
/// @param BucketsVector  Per-band maps from band signature to quote-id set.
///
/// Progress and the final counters are written to the error stream.
void QuoteGraph::CompareUsingMinHash(TVec<THash<TMd5Sig, TIntSet> >& BucketsVector) {
  // Pairs already compared, stored once as (smaller id, larger id) so a single
  // lookup covers both orderings.
  THashSet<TIntPr> EdgeCache;
  int Count = 0;      // pairs summed over all buckets (repeats included)
  int RealCount = 0;  // distinct pairs actually handed to AddEdgeIfSimilar

  Err("Beginning edge creation step...\n");
  for (int i = 0; i < BucketsVector.Len(); i++) {
    Err("Processing band signature %d of %d - %d signatures\n", i+1, BucketsVector.Len(), BucketsVector[i].Len());
    TVec<TMd5Sig> Buckets;
    BucketsVector[i].GetKeyV(Buckets);
    const TVec<TMd5Sig>::TIter BucketSigEnd = Buckets.EndI();
    for (TVec<TMd5Sig>::TIter BucketSig = Buckets.BegI(); BucketSig < BucketSigEnd; BucketSig++) {
      // Bind a reference instead of copying the whole bucket.
      // NOTE(review): assumes AddEdgeIfSimilar does not modify BucketsVector - confirm.
      const TIntSet& Bucket = BucketsVector[i].GetDat(*BucketSig);
      const TIntSet::TIter BucketEnd = Bucket.EndI();
      Count += Bucket.Len() * (Bucket.Len() - 1) / 2;
      for (TIntSet::TIter Quote1 = Bucket.BegI(); Quote1 < BucketEnd; Quote1++) {
        TIntSet::TIter Quote2 = Quote1;
        Quote2++;
        for (; Quote2 < BucketEnd; Quote2++) {
          const int Id1 = Quote1.GetKey();
          const int Id2 = Quote2.GetKey();
          const TIntPr PairKey(Id1 < Id2 ? Id1 : Id2, Id1 < Id2 ? Id2 : Id1);
          if (EdgeCache.IsKey(PairKey)) { continue; }
          EdgeCache.AddKey(PairKey);
          RealCount++;
          AddEdgeIfSimilar(Quote1.GetKey(), Quote2.GetKey());
        }
      }
    }
  }
  fprintf(stderr, "NUMBER OF COMPARES: %d\n", Count);
  fprintf(stderr, "NUMBER OF REAL COMPARES: %d\n", RealCount);
}
コード例 #4
0
ファイル: lsh.cpp プロジェクト: EDzhangjianyu/snap
/// Returns every pair of data points that share a bucket in at least one band.
///
/// Pairs are collected as (smaller id, larger id) in a set, so a pair that
/// collides in several bands or buckets is returned only once. Each id pair is
/// then mapped to the corresponding entries of DataV.
TVec<TPair<TFltV, TFltV> > TLSHash::GetAllCandidatePairs() {
  THashSet<TPair<TInt, TInt> > CandidateIdPairs;
  for (int i=0; i<Bands; i++) {
    THash<TInt, TIntV>& SigBucketVH = SigBucketVHV[i];
    // Walk the buckets in place rather than copying them all out first.
    for (THash<TInt, TIntV>::TIter BucketI = SigBucketVH.BegI(); BucketI < SigBucketVH.EndI(); BucketI++) {
      const TIntV& BucketV = BucketI.GetDat();
      const int BucketLen = BucketV.Len();

      for (int k=0; k<BucketLen; k++) {
        for (int l=k+1; l<BucketLen; l++) {
          const int IdK = BucketV[k], IdL = BucketV[l];
          // Normalize the order so (a,b) and (b,a) map to the same key.
          const int First = IdK > IdL ? IdL : IdK;
          const int Second = IdK > IdL ? IdK : IdL;
          CandidateIdPairs.AddKey(TPair<TInt, TInt> (First, Second));
        }
      }
    }
  }

  TVec<TPair<TFltV, TFltV> > CandidatePairs;
  int Ind = CandidateIdPairs.FFirstKeyId();
  while (CandidateIdPairs.FNextKeyId(Ind)) {
    const TPair<TInt, TInt>& IdPair = CandidateIdPairs[Ind];
    TPair<TFltV, TFltV> Pair(DataV[IdPair.GetVal1()], DataV[IdPair.GetVal2()]);
    CandidatePairs.Add(Pair);
  }
  return CandidatePairs;
}
コード例 #5
0
static void AddTreeCtrs(const TTrainData& data,
                        const TSplitTree& currentTree,
                        TFold* fold,
                        TLearnContext* ctx,
                        TStatsFromPrevTree* statsFromPrevTree,
                        TCandidateList* candList) {
    using TSeenProjHash = THashSet<TProjection>;
    TSeenProjHash seenProj;

    // greedy construction
    TProjection binAndOneHotFeaturesTree;
    binAndOneHotFeaturesTree.BinFeatures = currentTree.GetBinFeatures();
    binAndOneHotFeaturesTree.OneHotFeatures = currentTree.GetOneHotFeatures();
    seenProj.insert(binAndOneHotFeaturesTree);

    for (const auto& ctrSplit : currentTree.GetCtrSplits()) {
        seenProj.insert(ctrSplit.Projection);
    }

    TSeenProjHash addedProjHash;
    for (const auto& baseProj : seenProj) {
        if (baseProj.IsEmpty()) {
            continue;
        }
        for (int cf = 0; cf < data.AllFeatures.CatFeatures.ysize(); ++cf) {
            if (data.AllFeatures.CatFeatures[cf].empty() ||
                data.AllFeatures.IsOneHot[cf] ||
                ctx->Rand.GenRandReal1() > ctx->Params.ObliviousTreeOptions->Rsm) {
                continue;
            }

            TProjection proj = baseProj;
            proj.AddCatFeature(cf);

            if (proj.IsRedundant() || proj.GetFullProjectionLength() > ctx->Params.CatFeatureParams->MaxTensorComplexity) {
                continue;
            }

            if (addedProjHash.has(proj)) {
                continue;
            }

            addedProjHash.insert(proj);

            AddCtrsToCandList(*fold, *ctx, proj, candList);
            fold->GetCtrRef(proj);
        }
    }
    THashSet<TSplitCandidate> candidatesToErase;
    for (auto& splitCandidate : statsFromPrevTree->Stats) {
        if (splitCandidate.first.Type == ESplitType::OnlineCtr) {
            if (!addedProjHash.has(splitCandidate.first.Ctr.Projection)) {
                candidatesToErase.insert(splitCandidate.first);
            }
        }
    }
    for (const auto& splitCandidate : candidatesToErase) {
        statsFromPrevTree->Stats.erase(splitCandidate);
    }
}
コード例 #6
0
ファイル: lsh.cpp プロジェクト: snap-stanford/curis-2012
/// Builds an inverted index from hashed word shingles to cluster ids.
///
/// For every cluster id in ClusterIds, the cluster is fetched from
/// ClusterBase, its hashed shingles are computed with
/// GetHashedShinglesOfCluster(), and the cluster id is appended to the id list
/// of each of those shingles.
///
/// @param QuoteBase            Forwarded to GetHashedShinglesOfCluster().
/// @param ClusterBase          Source of the clusters.
/// @param ClusterIds           Ids of the clusters to index.
/// @param ShingleLen           Shingle length, forwarded to GetHashedShinglesOfCluster().
/// @param ShingleToClusterIds  Output: shingle hash -> ids of clusters containing it.
void LSH::HashShinglesOfClusters(TQuoteBase *QuoteBase,
    TClusterBase *ClusterBase, TIntV& ClusterIds, TInt ShingleLen,
    THash<TMd5Sig, TIntV>& ShingleToClusterIds) {
  Err("Hashing shingles of clusters...\n");
  for (int i = 0; i < ClusterIds.Len(); i++) {
    if (i % 1000 == 0) {
      fprintf(stderr, "%d out of %d completed\n", i, ClusterIds.Len());
    }
    TCluster C;
    ClusterBase->GetCluster(ClusterIds[i], C);

    // The shingles of a cluster are collected in a set, so each shingle
    // contributes the cluster id at most once.
    THashSet < TMd5Sig > CHashedShingles;
    GetHashedShinglesOfCluster(QuoteBase, C, ShingleLen, CHashedShingles);
    for (THashSet<TMd5Sig>::TIter Hash = CHashedShingles.BegI();
        Hash < CHashedShingles.EndI(); Hash++) {
      // AddDat(Key) returns a reference to the (possibly new, empty) entry,
      // so the id is appended in place instead of copying the whole vector
      // out and writing it back.
      ShingleToClusterIds.AddDat(*Hash).Add(ClusterIds[i]);
    }
  }
  Err("Done hashing!\n");
}
コード例 #7
0
ファイル: cliques.cpp プロジェクト: Aleyasen/Alaki
void TCliqueOverlap::GetIntersection(const THashSet<TInt>& A, const THashSet<TInt>& B, THashSet<TInt>& C) {
	if (A.Len() < B.Len()) {
		for (THashSetKeyI<TInt> it=A.BegI(); it<A.EndI(); it++) 
			if (B.IsKey(it.GetKey())) C.AddKey(it.GetKey());
	} else {
		for (THashSetKeyI<TInt> it=B.BegI(); it<B.EndI(); it++) 
			if (A.IsKey(it.GetKey())) C.AddKey(it.GetKey());
	}
}
コード例 #8
0
ファイル: cliques.cpp プロジェクト: Aleyasen/Alaki
int TCliqueOverlap::GetNodeIdWithMaxDeg(const THashSet<TInt>& Set) const{
	int id = -1;
	int maxDeg = -1;
	//
	for (THashSetKeyI<TInt> it=Set.BegI(); it<Set.EndI(); it++) {
		int nId = it.GetKey();
		int deg = m_G->GetNI(nId).GetDeg();
		if (maxDeg < deg) { maxDeg=deg; id=nId; }
	}
	return id;
}
コード例 #9
0
int main(int argc, char *argv[]) {
  TStr BaseString = "/lfs/1/tmp/curis/week/QBDB.bin";
  TFIn BaseFile(BaseString);
  TQuoteBase *QB = new TQuoteBase;
  TDocBase *DB = new TDocBase;
  QB->Load(BaseFile);
  DB->Load(BaseFile);

  TIntV QuoteIds;
  QB->GetAllQuoteIds(QuoteIds);

  int NumQuotes = QuoteIds.Len();
  THash<TInt, TStrSet> PeakCounts;
  for (int i = 0; i < NumQuotes; i++) {
    TQuote CurQuote;
    if (QB->GetQuote(QuoteIds[i], CurQuote)) {
      TVec<TSecTm> Peaks;
      CurQuote.GetPeaks(DB, Peaks);
      TStr QuoteString;
      CurQuote.GetParsedContentString(QuoteString);
      TStrSet StringSet;
      if (PeakCounts.IsKey(Peaks.Len())) {
        StringSet = PeakCounts.GetDat(Peaks.Len());
      }
      StringSet.AddKey(QuoteString);
      PeakCounts.AddDat(Peaks.Len(), StringSet);
    }
  }

  TIntV PeakCountKeys;
  PeakCounts.GetKeyV(PeakCountKeys);
  PeakCountKeys.Sort(true);
  for (int i = 0; i < PeakCountKeys.Len(); i++) {
    TStrSet CurSet = PeakCounts.GetDat(PeakCountKeys[i]);
    if (CurSet.Len() > 0) {
      printf("QUOTES WITH %d PEAKS\n", PeakCountKeys[i].Val);
      printf("#########################################\n");
      THashSet<TStr> StringSet = PeakCounts.GetDat(PeakCountKeys[i]);
      for (THashSet<TStr>::TIter l = StringSet.BegI(); l < StringSet.EndI(); l++) {
        printf("%s\n", l.GetKey().CStr());
      }
      printf("\n");
    }
  }
  delete QB;
  delete DB;
  return 0;
}
コード例 #10
0
ファイル: lsh.cpp プロジェクト: snap-stanford/curis-2012
/// Collects the MD5 signatures of all word shingles of a cluster's quotes.
///
/// For each quote of C, punctuation is removed from its content string, the
/// result is split on whitespace, and every run of ShingleLen consecutive
/// words (joined with single spaces) is hashed and added to HashedShingles.
///
/// @param QuoteBase       Source of the quotes.
/// @param C               Cluster whose quote ids are processed.
/// @param ShingleLen      Number of words per shingle.
/// @param HashedShingles  Output set; existing contents are kept.
void LSH::GetHashedShinglesOfCluster(TQuoteBase *QuoteBase, TCluster& C,
    TInt ShingleLen, THashSet<TMd5Sig>& HashedShingles) {
  TIntV QuoteIds;
  C.GetQuoteIds(QuoteIds);
  for (int QuoteN = 0; QuoteN < QuoteIds.Len(); QuoteN++) {
    TQuote Quote;
    QuoteBase->GetQuote(QuoteIds[QuoteN], Quote);

    TStr RawText, CleanText;
    Quote.GetContentString(RawText);
    TStringUtil::RemovePunctuation(RawText, CleanText);

    TStrV Words;
    CleanText.SplitOnWs(Words);

    // Slide a window of ShingleLen words over the quote.
    for (int Start = 0; Start < Words.Len() - ShingleLen + 1; Start++) {
      TStr Shingle;
      for (int Offset = 0; Offset < ShingleLen; Offset++) {
        if (Offset > 0) {
          Shingle.InsStr(Shingle.Len(), " ");
        }
        Shingle.InsStr(Shingle.Len(), Words[Start + Offset]);
      }
      HashedShingles.AddKey(TMd5Sig(Shingle));
    }
  }
}
コード例 #11
0
ファイル: mkdatasets.cpp プロジェクト: SherlockYang/Archive
// Builds a signed directed network from a tab-separated Slashdot relation file
// and writes it out as an edge list.
//
// Each non-comment input line is lower-cased and split on tabs: field 0 is the
// source user, field 1 the relation ("friends" -> sign +1, "foes" -> sign -1,
// "fans" -> line skipped), and the remaining fields are destination users.
// Any other relation triggers Fail. Lines with fewer than two fields are
// skipped. Self-loops and repeated edges are ignored.
//
// InFNm   - input file name
// OutFNm  - output edge-list file name (also passed to PrintGraphStatTable)
// Desc    - optional description written into the output header
// NIdSet  - user-name -> node-id mapping; taken by value, so ids added here
//           are not visible to the caller
void MakeSlashdotSignNet(const TStr InFNm, TStr OutFNm, TStr Desc, THashSet<TChA> NIdSet) {
  TChA LnStr;
  TVec<char *> WrdV;
  int Sign = 0;
  TPt<TNodeEDatNet<TInt, TInt> >  Net = TNodeEDatNet<TInt, TInt>::New();
  int i = 0;  // number of destination entries seen, including ignored ones
  for (TFIn FIn(InFNm); FIn.GetNextLn(LnStr); ) {
    if (LnStr.Empty() || LnStr[0]=='#') { continue; }
    LnStr.ToLc();
    TStrUtil::SplitOnCh(LnStr, WrdV, '\t', false);
    // A line without a relation field cannot be interpreted; skipping it
    // avoids reading WrdV[1] out of range.
    if (WrdV.Len() < 2) { continue; }
    if (strcmp(WrdV[1], "friends")==0) { Sign = 1; }
    else if (strcmp(WrdV[1], "fans")==0) { continue; } // skip (fans are in-friends)
    else if (strcmp(WrdV[1], "foes")==0) { Sign = -1; } else { Fail; }
    const int SrcNId = NIdSet.AddKey(WrdV[0]);
    if (! Net->IsNode(SrcNId)) {
      Net->AddNode(SrcNId);
    }
    for (int e = 2; e < WrdV.Len(); e++) {
      const int DstNId = NIdSet.AddKey(WrdV[e]);
      i++;
      if ((SrcNId != DstNId) && ! Net->IsEdge(SrcNId, DstNId)) {
        if (! Net->IsNode(DstNId)) {
          Net->AddNode(DstNId);
        }
        Net->AddEdge(SrcNId, DstNId, Sign);
      }
    }
  }
  TSnap::PrintInfo(Net, "Slashdot (" + TInt::GetStr(i) + ")");

  // copied from gio.h - line 111
  FILE *F = fopen(OutFNm.CStr(), "wt");
  if (F == NULL) {
    // Writing through a null FILE* is undefined behavior; report and stop.
    fprintf(stderr, "Cannot open output file %s for writing\n", OutFNm.CStr());
    return;
  }
  fprintf(F, "# Directed graph: %s\n", OutFNm.CStr());
  if (! Desc.Empty()) {
    fprintf(F, "# %s\n", (Desc).CStr());
  }
  fprintf(F, "# Nodes: %d Edges: %d\n", Net->GetNodes(), Net->GetEdges());
  fprintf(F, "# UserId\tGroupId\tSign\n");
  for (TNodeEDatNet<TInt,TInt>::TEdgeI ei = Net->BegEI(); ei < Net->EndEI(); ei++) {
    fprintf(F, "%d\t%d\t%d\n", ei.GetSrcNId(), ei.GetDstNId(), ei()());
  }
  fclose(F);

  PrintGraphStatTable(Net, OutFNm, Desc);
}
コード例 #12
0
ファイル: cliques.cpp プロジェクト: Aleyasen/Alaki
void TCliqueOverlap::GetMaximalCliques(const PUNGraph& G, int MinMaxCliqueSize, TVec<TIntV>& MaxCliques) {
	if (G->GetNodes() == 0) return;
	//
	m_G = G;
	m_minMaxCliqueSize = MinMaxCliqueSize;
	m_maxCliques =& MaxCliques;
	m_Q.Clr();
	//
	THashSet<TInt> SUBG;
	THashSet<TInt> CAND;
	for (TUNGraph::TNodeI NI=m_G->BegNI(); NI<m_G->EndNI(); NI++) {
		TInt nId = NI.GetId();
		SUBG.AddKey(nId);
		CAND.AddKey(nId);
	}
	//
	Expand(SUBG, CAND);
}
コード例 #13
0
ファイル: cliques.cpp プロジェクト: Aleyasen/Alaki
int TCliqueOverlap::Intersection(const THashSet<TInt>& A, const THashSet<TInt>& B) {
	int n = 0;
	if (A.Len() < B.Len()) {
		for (THashSetKeyI<TInt> it=A.BegI(); it<A.EndI(); it++) 
			if (B.IsKey(it.GetKey())) n++;
	} else {
		for (THashSetKeyI<TInt> it=B.BegI(); it<B.EndI(); it++) 
			if (A.IsKey(it.GetKey())) n++;
	}
	return n;
}
コード例 #14
0
ファイル: cliques.cpp プロジェクト: Aleyasen/Alaki
int TCliqueOverlap::MaxNbrsInCANDNodeId(const THashSet<TInt>& SUBG, const THashSet<TInt>& CAND) const{
	int id = -1;
	int maxIntersection = -1;
	//
	for (THashSetKeyI<TInt> it=SUBG.BegI(); it<SUBG.EndI(); it++) {
		int nId = it.GetKey();
		TUNGraph::TNodeI nIt = m_G->GetNI(nId);
		int deg = nIt.GetDeg();
		//
		int curIntersection = 0;
		for (int i=0; i<deg; i++) {
			int nbrId = nIt.GetNbrNId(i);
			if (CAND.IsKey(nbrId)) curIntersection++;
		}
		//
		if (maxIntersection < curIntersection) { maxIntersection=curIntersection; id=nId; }
	}
	return id;
}
コード例 #15
0
ファイル: lsh.cpp プロジェクト: EDzhangjianyu/snap
/// Returns the stored data points that share a bucket with Datum in at least
/// one band.
///
/// For every band the signature of Datum is computed; if a bucket exists for
/// that signature, its ids are merged into a set. The matching entries of
/// DataV are returned, each at most once.
TVec<TFltV> TLSHash::GetCandidates(TFltV Datum) {
  THashSet<TInt> CandidateIds;
  for (int BandN = 0; BandN < Bands; BandN++) {
    TInt Sig = ComputeSignature(Datum, BandN);
    THash<TInt, TIntV>& Buckets = SigBucketVHV[BandN];
    if (Buckets.IsKey(Sig)) {
      CandidateIds.AddKeyV(Buckets.GetDat(Sig));
    }
  }

  TVec<TFltV> Candidates;
  for (int KeyId = CandidateIds.FFirstKeyId(); CandidateIds.FNextKeyId(KeyId); ) {
    const int DataId = CandidateIds[KeyId];
    Candidates.Add(DataV[DataId]);
  }
  return Candidates;
}
コード例 #16
0
ファイル: cliques.cpp プロジェクト: Aleyasen/Alaki
void TCliqueOverlap::Expand(const THashSet<TInt>& SUBG, THashSet<TInt>& CAND) {
	if (SUBG.Len()==0) { if (m_Q.Len() >= m_minMaxCliqueSize) { m_Q.Pack(); m_maxCliques->Add(m_Q); } return; }
	if (CAND.Len()==0) return;
	//Get u that maximaze CAND intersection with neighbours of vertex u
	int u = MaxNbrsInCANDNodeId(SUBG, CAND);
	//Get neighbours of node u
	THashSet<TInt> nbrsU;
	GetNbrs(u, nbrsU);
	//Get relative complement of nbrsU in CAND
	THashSet<TInt> EXT;
	GetRelativeComplement(CAND, nbrsU, EXT);
	while(EXT.Len() != 0) {
		int q = GetNodeIdWithMaxDeg(EXT);
		//
		m_Q.Add(q);
		//
		THashSet<TInt> nbrsQ;
		GetNbrs(q, nbrsQ);
		//
		THashSet<TInt> SUBGq;
		GetIntersection(SUBG, nbrsQ, SUBGq);
		//
		THashSet<TInt> CANDq;
		GetIntersection(CAND, nbrsQ, CANDq);
		//
		Expand(SUBGq, CANDq);
		//
 		CAND.DelKey(q);
		m_Q.DelLast();
		//
		EXT.Clr();
		GetRelativeComplement(CAND, nbrsU, EXT);
	}
}
コード例 #17
0
ファイル: lsh.cpp プロジェクト: snap-stanford/curis-2012
/// Hashes every word of every quote in the quote base.
///
/// The parsed content of each quote is taken word by word; the MD5 signature
/// of each word is added to Shingles. Progress is reported every 1000 quotes.
///
/// @param QuoteBase  Source of the quotes.
/// @param Shingles   Output set of word signatures; existing contents are kept.
void LSH::WordHashing(TQuoteBase* QuoteBase, THashSet<TMd5Sig>& Shingles) {
  Err("Hashing shingles using words...\n");
  TIntV QuoteIds;
  QuoteBase->GetAllQuoteIds(QuoteIds);
  for (int QuoteN = 0; QuoteN < QuoteIds.Len(); QuoteN++) {
    if (QuoteN % 1000 == 0) {
      Err("%d out of %d completed\n", QuoteN, QuoteIds.Len());
    }
    TQuote Quote;
    QuoteBase->GetQuote(QuoteIds[QuoteN], Quote);

    TStrV Words;
    Quote.GetParsedContent(Words);

    const int NumWords = Words.Len();
    for (int WordN = 0; WordN < NumWords; WordN++) {
      Shingles.AddKey(TMd5Sig(Words[WordN]));
    }
  }
  Err("Done with word hashing! Number of shingles: %d\n", Shingles.Len());
}
コード例 #18
0
ファイル: lsh.cpp プロジェクト: snap-stanford/curis-2012
/// Computes NumSignatures permutation ranks for every shingle.
///
/// On each pass the shingle keys are fetched afresh and shuffled with the same
/// random generator; the position of a shingle in the shuffled order is
/// appended to that shingle's vector in Signatures. After the call every
/// shingle's vector has gained NumSignatures entries.
///
/// @param Shingles       Set of shingle hashes.
/// @param Signatures     Output: shingle hash -> one rank per pass.
/// @param NumSignatures  Number of passes; nothing is done when it is below 1.
void LSH::ComputeSignatures(THashSet<TMd5Sig>& Shingles,
    THash<TMd5Sig, TIntV>& Signatures, int NumSignatures) {
  if (NumSignatures < 1)
    return;
  TRnd RandomGenerator; // TODO: make this "more random" by incorporating time
  const int NumShingles = Shingles.Len();

  for (int i = 0; i < NumSignatures; ++i) {
    // Create new signature
    TVec < TMd5Sig > Shuffle;
    Shingles.GetKeyV(Shuffle);
    Shuffle.Shuffle(RandomGenerator);

    for (int j = 0; j < NumShingles; j++) {
      // AddDat(Key) returns a reference to the (possibly new, empty) vector,
      // so the rank is appended in place instead of copying the whole vector
      // out and writing it back on every pass.
      Signatures.AddDat(Shuffle[j]).Add(j);
    }
  }
  Err("Computed %d signatures!\n", NumSignatures);
}
コード例 #19
0
// Command-line driver of the quotes application.
//
// The "-do:" argument (compared in lower case) selects one of the modes:
//   mkdataset      - extract quotes and links from Spinn3r files into one text file
//   extractsubset  - save posts whose memes contain any of a list of phrases
//   memestoqtbs    - build a quote base from a memes dataset
//   mkclustnet     - load a quote base, cluster its quotes and dump the clusters
//   memeclust*     - build or load a quote base, then cluster and dump
//                    (memeclustzarya, memeclustqtonly, memeclustqttime)
// When the environment reports end-of-run after argument preparation, the
// list of modes is printed and nothing else is done.
void BigMain(int argc, char* argv[]) {
  TExeTm ExeTm;
  Env = TEnv(argc, argv, TNotify::StdNotify);
  Env.PrepArgs("QuotesApp");
  const TStr ToDo = Env.GetIfArgPrefixStr("-do:", "", "To do").GetLc();
  if (Env.IsEndOfRun()) {
    printf("To do:\n");
    printf("    MkDataset         : Make memes dataset (extract quotes and save txt)\n");
    printf("    ExtractSubset     : Extract a subset of memes containing particular words\n");
    printf("    MemesToQtBs       : Load memes dataset and create quote base\n");
    printf("    MkClustNet        : Build cluster network from the quote base\n");
    return;
  }
#pragma region mkdataset
  // extract quotes and links and make them into a single file
  if (ToDo == "mkdataset") {
    const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "files.txt", "Spinn3r input files (one file per line)");
    const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "Spinn3r-dataset.txt", "Output file");
    const int MinQtWrdLen = Env.GetIfArgPrefixInt("-w:", 3, "Minimum quote word length");
    const TStr UrlFNm = Env.GetIfArgPrefixStr("-u:", "", "Seen url set (THashSet<TMd5Sig>) file name");
    const bool UrlOnlyOnce = Env.GetIfArgPrefixBool("-q:", true, "Only keep unique Urls");
    //// parse directly from Spinn3r
    TStr Spinn3rFNm;
    THashSet<TMd5Sig> SeenUrlSet;
    if (UrlOnlyOnce && ! UrlFNm.Empty()) {  // keep track of already seen urls (so that there are no duplicate urls)
      TFIn FIn(UrlFNm);  SeenUrlSet.Load(FIn);
    }
    FILE *F = fopen(OutFNm.CStr(), "wt");
    if (F == NULL) {
      // Writing through a null FILE* is undefined behavior; report and stop.
      fprintf(stderr, "Cannot open output file %s for writing\n", OutFNm.CStr());
      return;
    }
    TFIn FIn(InFNm);
    int Items=0;
    // One Spinn3r file name per input line.
    for (int f=0; FIn.GetNextLn(Spinn3rFNm); f++) {
      TQuoteExtractor QE(Spinn3rFNm.ToTrunc());
      printf("Processing %02d: %s [%s]\n", f+1, Spinn3rFNm.CStr(), TExeTm::GetCurTm());
      fflush(stdout);
      for (int item = 0; QE.Next(); item++) {
        const TMd5Sig PostMd5(QE.PostUrlStr);
        if (QE.QuoteV.Empty() && QE.LinkV.Empty()) { continue; } // no quotes, no links
        if (UrlOnlyOnce) {
          if (SeenUrlSet.IsKey(PostMd5)) { continue; }
          SeenUrlSet.AddKey(PostMd5);
        }
        // Record layout: P = post url, T = time, Q = quote, L = link,
        // followed by an empty line.
        fprintf(F, "P\t%s\n", QE.PostUrlStr.CStr());
        //if (QE.PubTm > TSecTm(2008,8,30) || QE.PubTm < TSecTm(2008,7,25)) { printf("%s\n", QE.PubTm.GetStr().CStr()); }
        fprintf(F, "T\t%s\n", QE.PubTm.GetYmdTmStr().CStr());
        for (int q = 0; q < QE.QuoteV.Len(); q++) {
          if (TStrUtil::CountWords(QE.QuoteV[q]) >= MinQtWrdLen) {
            fprintf(F, "Q\t%s\n", QE.QuoteV[q].CStr()); }
        }
        for (int l = 0; l < QE.LinkV.Len(); l++) {
          fprintf(F, "L\t%s\n", QE.LinkV[l].CStr()); }
        fprintf(F, "\n");
        if (item>0 && item % Kilo(100) == 0) {
          QE.DumpStat();  QE.ExeTm.Tick(); }
        Items++;
      }
      printf("file done. Total %d all posts, %d all items\n", SeenUrlSet.Len(), Items);
      fflush(stdout);
    }
    printf("all done. Saving %d post urls\n", SeenUrlSet.Len());  fflush(stdout);
    if (! SeenUrlSet.Empty()) {
      TFOut FOut(OutFNm.GetFMid()+".SeenUrlSet");
      SeenUrlSet.Save(FOut);
    }
    fclose(F);
  }
#pragma endregion mkdataset

#pragma region extractsubset
  // save posts with memes containing particular words
  else if (ToDo == "extractsubset") {
    const TStr InFNmWc = Env.GetIfArgPrefixStr("-i:", "memes_*.rar", "Input file prefix");
    const bool IsInFNmWc = Env.GetIfArgPrefixBool("-w:", true, "Input is wildcard (else a file with list of input files)");
    const TStr OutFNm = Env.GetIfArgPrefixStr("-o:", "memes-subset.txt", "Output memes file");
    const TStr WordsFNm = Env.GetIfArgPrefixStr("-p:", "phrases-in.txt", "Phrases that memes have to contain");

    TChAV CatchMemeV;// = TStr::GetV("great depression", "economic meltdown", "recession had bottomed out", "green shoots", "slow recovery", "gradual recovery");
    printf("Loading %s\n", WordsFNm.CStr());
    // Phrases are stored in lower case, one per line of the phrase file.
    { TFIn FIn(WordsFNm);
    for (TStr Ln; FIn.GetNextLn(Ln); ) {
      printf("  %s\n", Ln.GetLc().CStr());
      CatchMemeV.Add(Ln.GetLc()); }
    }
    printf("%d strings loaded\n", CatchMemeV.Len());
    TFOut FOut(OutFNm);
    TMemesDataLoader Memes(InFNmWc, IsInFNmWc);
    for (int posts = 0, nsave=0; Memes.LoadNext(); posts++) {
      // A post is saved as soon as any of its memes contains any phrase.
      bool DoSave = false;
      for (int m = 0; m < Memes.MemeV.Len(); m++) {
        for (int i = 0; i < CatchMemeV.Len(); i++) {
          if (Memes.MemeV[m].SearchStr(CatchMemeV[i]) != -1) {
            DoSave=true; break; }
        }
        if (DoSave) { break; }
      }
      if (DoSave) { Memes.SaveTxt(FOut); nsave++; }
      if (posts % Mega(1) == 0) {
        printf("%dm posts, %d saved\n", posts/Mega(1), nsave);
        FOut.Flush();
      }
    }
  }
#pragma endregion extractsubset

#pragma region memestoqtbs
  // load memes dataset (MkDataset) and create quote base
  else if (ToDo == "memestoqtbs") {
    const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "201007_201107.txt", "Input Memes dataset files");
    const TStr MediaUrlFNm = Env.GetIfArgPrefixStr("-u:", "news_media.txt", "File with news media urls");
    const TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output file name prefix");
    const int MinWrdLen = Env.GetIfArgPrefixInt("-l:", 4, "Min quote word length");
    const int MinMemeFq = Env.GetIfArgPrefixInt("-f:", 5, "Min meme frequency");
    const TStr MinTmStr = Env.GetIfArgPrefixStr("-mint:", "20100714", "Min time of quotes, format = YYYYMMDD");
    const TStr MaxTmStr = Env.GetIfArgPrefixStr("-maxt:", "20110728", "Max time of quotes, format = YYYYMMDD");
    // The three substrings are the YYYY, MM and DD parts of the date string.
    TSecTm MinTm(atoi(MinTmStr.GetSubStr(0,3).CStr()),atoi(MinTmStr.GetSubStr(4,5).CStr()),atoi(MinTmStr.GetSubStr(6,7).CStr()));
    TSecTm MaxTm(atoi(MaxTmStr.GetSubStr(0,3).CStr()),atoi(MaxTmStr.GetSubStr(4,5).CStr()),atoi(MaxTmStr.GetSubStr(6,7).CStr()));

    PQuoteBs QtBs = TQuoteBs::New();
    int HashTableSize = 100; // 100 for each quarter, for one year data, use 400
    int UrlSetSize = 4 * HashTableSize;
    QtBs->ConstructQtBsZarya(InFNm, Pref, MediaUrlFNm, MinTm, MaxTm, MinWrdLen, MinMemeFq, HashTableSize, UrlSetSize);
  }
#pragma endregion memestoqtbs

#pragma region mkclustnet
  // make cluster network
  else if (ToDo == "mkclustnet") {
    TStr InQtBsNm = Env.GetIfArgPrefixStr("-i:", "", "Input quote base file name");
    TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output network/updated QtBs filename");
    TStr BlackListFNm = Env.GetIfArgPrefixStr("-b:", "quote_blacklist.txt", "Blacklist file name");
    bool IsShglReady = Env.GetIfArgPrefixBool("-shglready:", false, "Indicate whether shingle hash table is ready");
    bool IsNetReady = Env.GetIfArgPrefixBool("-netready:", false, "Indicate whether cluster net is ready");
    double BktThresh = Env.GetIfArgPrefixFlt("-bktthresh:", 0.4, "Threshold for bad shingle bucket elimination");
    double MxTmDelay = Env.GetIfArgPrefixFlt("-delaythresh:", 5, "Max time delay between two quotes in the same cluster");
    double MxTmDev = Env.GetIfArgPrefixFlt("-devthresh:", 3, "Max time deviation for a quote to be specific rather than general");
    double RefineThresh = Env.GetIfArgPrefixFlt("-refinethresh:", 0.2, "Threshold for merging quote cluster in refining process");
    const int MinWrdLen = Env.GetIfArgPrefixInt("-minl:", 4, "Min quote word length");
    const int MaxWrdLen = Env.GetIfArgPrefixInt("-maxl:", 200, "Max quote word length");
    const int MinMemeFq = Env.GetIfArgPrefixInt("-mf:", 5, "Min meme frequency");
    const int MinClustFq = Env.GetIfArgPrefixInt("-cf:", 50, "Min quote cluster frequency");

    // Load quote base
    PQuoteBs QtBs;
    if (TZipIn::IsZipFNm(InQtBsNm)) { TZipIn ZipIn(InQtBsNm);  QtBs = TQuoteBs::Load(ZipIn); }
    else { TFIn FIn(InQtBsNm);  QtBs = TQuoteBs::Load(FIn); }

    // Cluster the quotes
    QtBs->ClusterQuotes(MinMemeFq, MinWrdLen, MaxWrdLen, BlackListFNm, Pref, IsShglReady, IsNetReady, BktThresh, MxTmDelay, MxTmDev, RefineThresh);

    // Dump the clusters
    bool SkipUrl = true, FlashDisp = true;
    QtBs->DumpQuoteClusters(MinWrdLen, MinClustFq, SkipUrl, FlashDisp, Pref);
  }
#pragma endregion mkclustnet

#pragma region memeclust
  // Any mode containing "memeclust"; the exact variant is checked further down.
  else if (ToDo.SearchStr(TStr("memeclust")) >= 0) {
    const TStr InFNm = Env.GetIfArgPrefixStr("-i:", "201101.txt", "Input Memes dataset files");
    const TStr MediaUrlFNm = Env.GetIfArgPrefixStr("-u:", "news_media.txt", "File with news media urls");
    TStr Pref = Env.GetIfArgPrefixStr("-o:", "qt", "Output file name prefix");
    const int MinWrdLen = Env.GetIfArgPrefixInt("-minl:", 4, "Min quote word length");
    const int MaxWrdLen = Env.GetIfArgPrefixInt("-maxl:", 200, "Max quote word length");
    const int MinMemeFq = Env.GetIfArgPrefixInt("-f:", 5, "Min meme frequency");

    const int MinClustFq = Env.GetIfArgPrefixInt("-cf:", 50, "Min quote cluster frequency");
    TStr BlackListFNm = Env.GetIfArgPrefixStr("-b:", "quote_blacklist.txt", "Blacklist file name");
    bool IsQtBsReady = Env.GetIfArgPrefixBool("-qtbsready:", false, "Indicate whether quote base is ready and can be loaded readily");
    bool IsShglReady = Env.GetIfArgPrefixBool("-shglready:", false, "Indicate whether shingle hash table is ready");
    bool IsNetReady = Env.GetIfArgPrefixBool("-netready:", false, "Indicate whether cluster net is ready");
    double BktThresh = Env.GetIfArgPrefixFlt("-bktthresh:", 0.4, "Threshold for bad shingle bucket elimination");
    double MxTmDelay = Env.GetIfArgPrefixFlt("-delaythresh:", 5, "Max time delay between two quotes in the same cluster");
    double MxTmDev = Env.GetIfArgPrefixFlt("-devthresh:", 3, "Max time deviation for a quote to be specific rather than general");
    double RefineThresh = Env.GetIfArgPrefixFlt("-refinethresh:", 0.2, "Threshold for merging quote cluster in refining process");

    const TStr MinTmStr = Env.GetIfArgPrefixStr("-mint:", "20010101", "Min time of quotes, format = YYYYMMDD");
    const TStr MaxTmStr = Env.GetIfArgPrefixStr("-maxt:", "20200101", "Max time of quotes, format = YYYYMMDD");
    // The three substrings are the YYYY, MM and DD parts of the date string.
    TSecTm MinTm(atoi(MinTmStr.GetSubStr(0,3).CStr()),atoi(MinTmStr.GetSubStr(4,5).CStr()),atoi(MinTmStr.GetSubStr(6,7).CStr()));
    TSecTm MaxTm(atoi(MaxTmStr.GetSubStr(0,3).CStr()),atoi(MaxTmStr.GetSubStr(4,5).CStr()),atoi(MaxTmStr.GetSubStr(6,7).CStr()));

    // Construct the quote base from Zarya data
    PQuoteBs QtBs = TQuoteBs::New();

    if (!IsQtBsReady) {
      int HashTableSize = 100; // 100 for each quarter, for one year data, use 400
      if (ToDo == "memeclustzarya") {
        int UrlSetSize = 4 * HashTableSize;
        QtBs->ConstructQtBsZarya(InFNm, Pref, MediaUrlFNm, MinTm, MaxTm, MinWrdLen, MinMemeFq, HashTableSize, UrlSetSize);
      } else if (ToDo == "memeclustqtonly") {
        QtBs->ConstructQtBsQtOnly(InFNm, Pref, MediaUrlFNm, MinWrdLen, MinMemeFq, HashTableSize);
      } else if (ToDo == "memeclustqttime") {
        QtBs->ConstructQtBsQtTime(InFNm, Pref, MediaUrlFNm, MinWrdLen, MinMemeFq, HashTableSize);
      } else {
        printf("Please specify one of the three options for -do : memeclustzarya, memeclustqtonly, memeclustqttime!\n");
        return;
      }
    } else {
      // A ready quote base is loaded from a file named after the prefix and
      // the word-length / frequency thresholds.
      TStr InQtBsNm = TStr::Fmt("%s-w%dmfq%d.QtBs", Pref.CStr(), MinWrdLen, MinMemeFq);
      if (TZipIn::IsZipFNm(InQtBsNm)) { TZipIn ZipIn(InQtBsNm);  QtBs = TQuoteBs::Load(ZipIn); }
      else { TFIn FIn(InQtBsNm);  QtBs = TQuoteBs::Load(FIn); }
    }

    // Cluster the quotes
    QtBs->ClusterQuotes(MinMemeFq, MinWrdLen, MaxWrdLen, BlackListFNm, Pref, IsShglReady, IsNetReady, BktThresh, MxTmDelay, MxTmDev, RefineThresh);

    // Dump the clusters
    bool SkipUrl = true, FlashDisp = true;
    QtBs->DumpQuoteClusters(MinWrdLen, MinClustFq, SkipUrl, FlashDisp, Pref);
  }
#pragma endregion memeclust
}
コード例 #20
0
ファイル: WireNode.cpp プロジェクト: cbalderrama/wire3d
//----------------------------------------------------------------------------
// Renders the subtree rooted at this node once so that the renderer can set
// up its resources before the first real frame. A temporary camera is placed
// one bounding radius in front of the world bound's center, looking down the
// negative Z axis. After drawing the visible sets, a cube is drawn once with
// every material found in the subtree. Compiled to a no-op when WIRE_WII is
// defined.
void Node::WarmUpRendering(Renderer* pRenderer)
{
#ifndef WIRE_WII // Wii does not need to warm up by submitting draw calls
	WIRE_ASSERT(pRenderer);
	UpdateGS(0, true, false);

	// camera frame
	Vector3F eye = WorldBound->GetCenter();
	eye.Z() += WorldBound->GetRadius();
	Vector3F lookDir = -Vector3F::UNIT_Z;
	Vector3F upDir = Vector3F::UNIT_Y;
	Vector3F rightDir = lookDir.Cross(upDir);
	CameraPtr spCamera = WIRE_NEW Camera;
	spCamera->SetFrame(eye, lookDir, upDir, rightDir);

	// camera frustum; the far plane is two bounding radii away
	Float fov = 60.0F;
	Float aspect = 2;
	Float zNear = 0.1F;
	Float zFar = WorldBound->GetRadius() * 2.0F;
	spCamera->SetFrustum(fov, aspect, zNear, zFar);

	CullerSorting culler;
	culler.SetCamera(spCamera);
	culler.ComputeVisibleSet(this);

	pRenderer->PreDraw(spCamera);

	// draw scene to warm up batching buffers
	pRenderer->Draw(culler.GetVisibleSets());

	// collect and draw all materials separately so none will be missed
	// by CULL_ALWAYS or Switch/LOD nodes.
	THashSet<Material*> materials;
	TStack<Node*> pending(1000);
	pending.Push(this);
	while (!pending.IsEmpty())
	{
		Node* pCurrent = NULL;
		pending.Pop(pCurrent);

		RenderObject* pObject = pCurrent->GetRenderObject();
		if (pObject && pObject->GetMaterial())
		{
			materials.Insert(pObject->GetMaterial());
		}

		// only children that are themselves nodes are traversed further
		for (UInt childIdx = 0; childIdx < pCurrent->GetQuantity(); childIdx++)
		{
			Node* pChildNode = DynamicCast<Node>(pCurrent->GetChild(childIdx));
			if (pChildNode)
			{
				pending.Push(pChildNode);
			}
		}
	}

	RenderObjectPtr spCube = StandardMesh::CreateCube24(4, pRenderer->
		GetMaxTextureStages(), true);
	THashSet<Material*>::Iterator materialIt(&materials);
	// the cube is placed 3 units along negative Z from the camera
	Transformation cubeTransform;
	cubeTransform.SetTranslate(eye - Vector3F(0, 0, 3));
	Material** ppMaterial = materialIt.GetFirst();
	while (ppMaterial)
	{
		spCube->SetMaterial(*ppMaterial);
		pRenderer->Draw(spCube, cubeTransform);
		ppMaterial = materialIt.GetNext();
	}

	pRenderer->PostDraw();
#endif
}
コード例 #21
0
ファイル: lsh.cpp プロジェクト: snap-stanford/curis-2012
// Fills SignatureBandBuckets with one hash table per band, mapping the MD5 of
// a quote's band signature to the set of quote ids sharing that signature.
//
// QB                   - quote base; its id -> quote map is read via
//                        GetIdToTQuotes.
// Shingles             - set of shingle hashes passed to ComputeSignatures;
//                        its size is also the initial value of every minimum.
// SignatureBandBuckets - output; NumBands tables are appended to it.
//
// Only quotes whose parsed content has fewer than WordWindow elements are
// placed into buckets; longer quotes are skipped. A quote with empty content
// gets NumShingles as every minimum.
void LSH::MinHash(TQuoteBase *QB, THashSet<TMd5Sig>& Shingles,
    TVec<THash<TMd5Sig, TIntSet> >& SignatureBandBuckets) {
  Err("Creating buckets...\n");
  THash < TMd5Sig, TIntV > Signatures;
  ComputeSignatures(Shingles, Signatures, NumBands * BandSize);

  // bucket creation: one table per band
  for (int i = 0; i < NumBands; ++i) {
    SignatureBandBuckets.Add(THash<TMd5Sig, TIntSet>());
  }

  // bucket filling
  const int NumShingles = Shingles.Len();
  THash<TInt, TQuote> Quotes;
  QB->GetIdToTQuotes(Quotes);

  THash<TInt, TQuote>::TIter CurI = Quotes.BegI();
  THash<TInt, TQuote>::TIter EndI = Quotes.EndI();
  TQuote Q;

  for (; CurI < EndI; CurI++) {
    Q = CurI.GetDat();

    TStrV Content;
    Q.GetParsedContent(Content);
    const int ContentLen = Content.Len();

    // Quotes with WordWindow or more content elements are never bucketed,
    // so skip them before doing any signature lookups.
    if (ContentLen >= WordWindow) {
      continue;
    }
    const TInt Id = Q.GetId();

    // signature for quote: one signature vector per content element
    TVec < TIntV > Signature;
    for (int i = 0; i < ContentLen; i++) {
      const TMd5Sig ShingleMd5(Content[i]);
      Signature.Add(Signatures.GetDat(ShingleMd5));
    }

    // place in bucket: per band, take the element-wise minimum over all
    // signature vectors and hash the resulting "min-min-...-" string
    for (int i = 0; i < NumBands; ++i) {
      TStr Sig;
      for (int j = 0; j < BandSize; ++j) {
        const int CurSig = i * BandSize + j;

        TInt min = NumShingles;
        for (int k = 0; k < ContentLen; k++) {
          if (Signature[k][CurSig] < min) {
            min = Signature[k][CurSig];
          }
        }
        Sig += min.GetStr() + "-";
      }

      const TMd5Sig SigMd5(Sig);
      // AddDat(Key) returns a reference to the (possibly newly created)
      // bucket, so the id is inserted in place instead of copying the whole
      // set out of the table and back in.
      SignatureBandBuckets[i].AddDat(SigMd5).AddKey(Id);
    }
  }
  Err("Minhash step complete!\n");
}
コード例 #22
0
ファイル: cliques.cpp プロジェクト: Aleyasen/Alaki
/////////////////////////////////////////////////
// TCommunity implementation
void TCliqueOverlap::GetRelativeComplement(const THashSet<TInt>& A, const THashSet<TInt>& B, THashSet<TInt>& Complement) {
  for (THashSet<TInt>::TIter it=A.BegI(); it<A.EndI(); it++) {
		const int nId = it.GetKey();
		if (!B.IsKey(nId)) Complement.AddKey(nId);
	}
}
コード例 #23
0
ファイル: cliques.cpp プロジェクト: Aleyasen/Alaki
void TCliqueOverlap::GetNbrs(int NId, THashSet<TInt>& Nbrs) const{
	TUNGraph::TNodeI node = m_G->GetNI(NId);
	int deg = node.GetDeg();
	for (int i=0; i<deg; i++) Nbrs.AddKey(node.GetNbrNId(i));
}
コード例 #24
0
ファイル: sortSource.cpp プロジェクト: JohnMatta/snap
/**
 * Used for benchmarking the sort-by-source cascade detection algorithm.
 * Takes as input a window and the starting event of a cascade, and outputs
 * the cascade size and the time taken for cascade detection.
 *
 * Usage  : <program> W Source Dest Start Duration
 *   W                            - largest allowed gap between the end of an
 *                                  event (Start + Duration) and the Start of
 *                                  an event that follows it in the cascade
 *   Source, Dest, Start, Duration - column values identifying the starting row
 * Output : Prints "Size <visited rows>,Time <seconds>"
 * Returns: 0 on success, 1 on missing arguments or if the starting row is
 *          not present in the table.
 */
int main(int argc,char* argv[]) {
  // All five arguments are mandatory; without this check atoi would be
  // handed a null pointer.
  if (argc < 6) {
    fprintf(stderr, "Usage: %s W Source Dest Start Duration\n",
            argc > 0 ? argv[0] : "sortSource");
    return 1;
  }
  TTableContext Context;
  Schema TimeS;
  TimeS.Add(TPair<TStr,TAttrType>("Source",atInt));
  TimeS.Add(TPair<TStr,TAttrType>("Dest",atInt));
  TimeS.Add(TPair<TStr,TAttrType>("Start",atInt));
  TimeS.Add(TPair<TStr,TAttrType>("Duration",atInt));
  PTable P1 = TTable::LoadSS(TimeS,"./../../../../datasets/temporal/yemen_call_201001.txt",&Context,' ');
  TIntV MapV;
  TStrV SortBy;
  SortBy.Add("Source");
  P1->Order(SortBy);
  TIntV Source; // "Source" column values, read after ordering by Source
  P1->ReadIntCol("Source",Source);
  // MapV[i] is the row index of the i-th row in iteration order
  for (TRowIterator RI = P1->BegRI(); RI < P1->EndRI(); RI++) {
    MapV.Add(RI.GetRowIdx());
  }
  // Attribute to Int mapping
  TInt SIdx = P1->GetColIdx("Source");
  TInt DIdx = P1->GetColIdx("Dest");
  TInt StIdx = P1->GetColIdx("Start");
  TInt DuIdx = P1->GetColIdx("Duration");
  int W = atoi(argv[1]);
  int len = 0;
  // Find the starting point
  int TSource = atoi(argv[2]);
  int TDest = atoi(argv[3]);
  int TStart = atoi(argv[4]);
  int TDur = atoi(argv[5]);
  TInt RIdx;
  bool Found = false;
  for (TRowIterator RI = P1->BegRI(); RI < P1->EndRI(); RI++) {
    RIdx = RI.GetRowIdx();
    int RSource = P1->GetIntValAtRowIdx(SIdx,RIdx).Val;
    int RDest = P1->GetIntValAtRowIdx(DIdx,RIdx).Val;
    int RStart = P1->GetIntValAtRowIdx(StIdx,RIdx).Val;
    int RDur = P1->GetIntValAtRowIdx(DuIdx,RIdx).Val;
    if (TSource == RSource && TDest == RDest && TStart == RStart && TDur == RDur) {
      Found = true;
      break;
    }
  }
  // Without this check the cascade would silently start from whatever row
  // happened to be scanned last.
  if (!Found) {
    fprintf(stderr, "Starting event (%d, %d, %d, %d) not found in the table\n",
            TSource, TDest, TStart, TDur);
    return 1;
  }
  // Start building the cascade from the start point
  clock_t st,et;
  st = clock();
  for (int Run = 0; Run < 1; Run++) {
    THashSet<TInt> VisitedH;
    TSnapQueue<TInt> EventQ;
    EventQ.Push(RIdx);
    VisitedH.AddKey(RIdx);
    while (!EventQ.Empty()) {
      TInt CIdx = EventQ.Top();
      EventQ.Pop();
      int CDest = P1->GetIntValAtRowIdx(DIdx,CIdx).Val;
      int CStart = P1->GetIntValAtRowIdx(StIdx,CIdx).Val;
      int CDur = P1->GetIntValAtRowIdx(DuIdx,CIdx).Val;
      int CEnd = CDur + CStart;
      // In line binary search for the first position whose Source equals CDest
      int lo = 0;
      int hi = Source.Len() - 1;
      int FirstIdx = -1;
      while (hi >= lo) {
        int mid = lo + (hi - lo)/2;
        if (Source.GetVal(mid) > CDest) { hi = mid - 1;}
        else if (Source.GetVal(mid) < CDest) { lo = mid + 1;}
        else { FirstIdx = mid; hi = mid - 1;}
      }
      // End of binary search
      // No row has CDest as its source: nothing to follow (and -1 must not
      // be used as an index into MapV).
      if (FirstIdx < 0) { continue; }
      for (int Pos = FirstIdx; Pos < Source.Len(); Pos++) {
        int PId = MapV.GetVal(Pos).Val;
        // Stop as soon as the run of rows with Source == CDest ends,
        // whether or not the row was already visited.
        if (P1->GetIntValAtRowIdx(SIdx,PId).Val != CDest) {
          break;
        }
        if (VisitedH.IsKey(PId)) {
          continue;
        }
        int NStart = P1->GetIntValAtRowIdx(StIdx,PId).Val;
        // Follow events starting at or after the end of the current event,
        // at most W later.
        if (NStart >= CEnd && NStart - CEnd <= W) {
          VisitedH.AddKey(PId);
          EventQ.Push(PId);
        }
      }
    }
    len = VisitedH.Len();
  }
  et = clock();
  // Subtract the tick counts before converting so no precision is lost.
  double diff = static_cast<double>(et - st)/CLOCKS_PER_SEC;
  printf("Size %d,Time %f\n",len,diff);
  return 0;
}