Exemple #1
0
void TTable::GroupAux(const TStrV& GroupBy, TInt GroupByStartIdx, THash<TInt,TIntV>& grouping, const TIntV& IndexSet, TBool All){
  /* recursion base - add IndexSet as group */
  if(GroupByStartIdx == GroupBy.Len()){
    if(IndexSet.Len() == 0){return;}
	  TInt key = grouping.Len();
	  grouping.AddDat(key, IndexSet);
	  return;
  }
  if(!ColTypeMap.IsKey(GroupBy[GroupByStartIdx])){TExcept::Throw("no such column " + GroupBy[GroupByStartIdx]);}
  switch(GetColType(GroupBy[GroupByStartIdx])){
    case INT:{
      // group by current column
      // not sure of to estimate the size of T for constructor hinting purpose.
      // It is bounded by the length of the IndexSet or the length of the grouping column if the IndexSet vector is empty
      // but this bound may be way too big
	    THash<TInt,TIntV> T;  
	    GroupByIntCol(GroupBy[GroupByStartIdx], T, IndexSet, All);
	    for(THash<TInt,TIntV>::TIter it = T.BegI(); it < T.EndI(); it++){
	      TIntV& CurrGroup = it->Dat;
        // each group according to current column will be used as an IndexSet
        // for grouping according to next column
		    GroupAux(GroupBy, GroupByStartIdx+1, grouping, CurrGroup, false);
	   }
	    break;
	  }
	  case FLT:{
	    THash<TFlt,TIntV> T;
	    GroupByFltCol(GroupBy[GroupByStartIdx], T, IndexSet, All);
	    for(THash<TFlt,TIntV>::TIter it = T.BegI(); it < T.EndI(); it++){
	      TIntV& CurrGroup = it->Dat;
		    GroupAux(GroupBy, GroupByStartIdx+1, grouping, CurrGroup, false);
	    }
	    break;
	  }
	  case STR:{
	    THash<TStr,TIntV> T;
	    GroupByStrCol(GroupBy[GroupByStartIdx], T, IndexSet, All);
	    for(THash<TStr,TIntV>::TIter it = T.BegI(); it < T.EndI(); it++){
	      TIntV& CurrGroup = it->Dat;
	      GroupAux(GroupBy, GroupByStartIdx+1, grouping, CurrGroup, false);
	    }
	    break;
	  }
  }
}
// Inverts a community -> member-nodes table into a node -> communities table.
//
// CmtyVH    - input: community id mapped to the vector of node ids in it.
// NIDComVH  - output: for every node id seen in CmtyVH, the set of community
//             ids it belongs to. The table is not cleared first, so entries
//             already present are kept and extended.
void TAGMUtil::GetNodeMembership(THash<TInt,TIntSet >& NIDComVH, const THash<TInt,TIntV>& CmtyVH) {
    THash<TInt,TIntV>::TIter CmtyI = CmtyVH.BegI();
    while (CmtyI < CmtyVH.EndI()) {
        const int CmtyId = CmtyI.GetKey();
        const TIntV& Members = CmtyI.GetDat();
        const int NMembers = Members.Len();
        for (int m = 0; m < NMembers; m++) {
            const int NodeId = Members[m];
            // AddDat(key) yields the node's set, creating it when missing
            NIDComVH.AddDat(NodeId).AddKey(CmtyId);
        }
        CmtyI++;
    }
}
Exemple #3
0
// Records a grouping under the name GroupColName and materialises it as a new
// integer column.
//
// GroupColName - name under which the grouping is stored in GroupMapping and
//                under which the new column is registered in ColTypeMap.
// Grouping     - group id mapped to the row indices belonging to that group.
//
// The new column is created as TIntV(NumRows) and appended to IntCols; for each
// row listed in Grouping the cell is set to the id of its group. Rows that
// appear in no group keep whatever value TIntV(NumRows) initialised them with.
void TTable::StoreGroupCol(TStr GroupColName, const THash<TInt,TIntV>& Grouping){
  GroupMapping.AddDat(GroupColName, Grouping);

  // append the group-id column and register it as an INT column
  IntCols.Add(TIntV(NumRows));
  const TInt GroupColIdx = IntCols.Len() - 1;
  ColTypeMap.AddDat(GroupColName, TPair<TYPE,TInt>(INT, GroupColIdx));

  // the value of the i'th row is the group id of row i
  TIntV& GroupIdCol = IntCols[GroupColIdx];
  THash<TInt,TIntV>::TIter GroupI = Grouping.BegI();
  while(GroupI < Grouping.EndI()){
    const TInt& GroupId = GroupI.GetKey();
    const TIntV& Rows = GroupI.GetDat();
    for(int r = 0; r < Rows.Len(); r++){
      GroupIdCol[Rows[r]] = GroupId;
    }
    GroupI++;
  }
}
Exemple #4
0
void TTable::Unique(TStr Col){
  if(!ColTypeMap.IsKey(Col)){TExcept::Throw("no such column " + Col);}
  TIntV RemainingRows = TIntV(NumValidRows,0);
  // group by given column (keys) and keep only first row for each key
  switch(GetColType(Col)){
    case INT:{
      THash<TInt,TIntV> T;  // can't really estimate the size of T for constructor hinting
      GroupByIntCol(Col, T, TIntV(0), true);
      for(THash<TInt,TIntV>::TIter it = T.BegI(); it < T.EndI(); it++){
        RemainingRows.Add(it->Dat[0]);
      }
      break;
    }
    case FLT:{
      THash<TFlt,TIntV> T;
      GroupByFltCol(Col, T, TIntV(0), true);
      for(THash<TFlt,TIntV>::TIter it = T.BegI(); it < T.EndI(); it++){
        RemainingRows.Add(it->Dat[0]);
      }
      break;
    }
    case STR:{
      THash<TStr,TIntV> T;
      GroupByStrCol(Col, T, TIntV(0), true);
      for(THash<TStr,TIntV>::TIter it = T.BegI(); it < T.EndI(); it++){
        RemainingRows.Add(it->Dat[0]);
      }
      break;
    }
  }
  // with the current implementation of GroupByX, RemainingRows is sorted:
  // GroupByX returns a hash Table T:X-->TIntV. In the current implementation,
  // if key X1 appears before key X2 in T Then T(X1)[0] <= T(X2)[0]
  // Not sure if we could always make this assumption. Might want to remove this sorting..
  RemainingRows.Sort();
  KeepSortedRows(RemainingRows);
}
Exemple #5
0
// Min-hash bucketing step of the LSH pipeline.
//
// QB                    - quote base; its quotes are fetched with
//                         GetIdToTQuotes into a local table.
// Shingles              - shingle set handed to ComputeSignatures; its size is
//                         also the starting value of every running minimum.
// SignatureBandBuckets  - output: NumBands hash tables are appended, then
//                         table i (indexed from 0) maps the MD5 of a band
//                         signature string to the set of quote ids sharing it.
//                         NOTE(review): indexing starts at 0 although the
//                         tables are appended, so the vector is presumably
//                         expected to be empty on entry - confirm with callers.
//
// For each quote whose parsed content has fewer than WordWindow entries, and
// for each band, the band signature is the concatenation over the band's
// BandSize hash positions of "<min>-", where <min> is the smallest signature
// value at that position over all content entries of the quote (or the number
// of shingles when the quote has no content). Quotes with WordWindow or more
// entries are not bucketed at all.
//
// Every content entry of a bucketed quote must be a key of the table filled by
// ComputeSignatures; Signatures.GetDat is called on it unconditionally.
void LSH::MinHash(TQuoteBase *QB, THashSet<TMd5Sig>& Shingles,
    TVec<THash<TMd5Sig, TIntSet> >& SignatureBandBuckets) {
  Err("Creating buckets...\n");
  THash < TMd5Sig, TIntV > Signatures;
  ComputeSignatures(Shingles, Signatures, NumBands * BandSize);

  // bucket creation: one table per band
  for (int Band = 0; Band < NumBands; ++Band) {
    SignatureBandBuckets.Add(THash<TMd5Sig, TIntSet>());
  }

  // bucket filling
  const int NumShingles = Shingles.Len();
  THash<TInt, TQuote> Quotes;
  QB->GetIdToTQuotes(Quotes);

  for (THash<TInt, TQuote>::TIter QuoteI = Quotes.BegI(); QuoteI < Quotes.EndI(); QuoteI++) {
    // Quotes is a local table, so working on the stored quote directly is
    // safe and avoids copying every TQuote.
    TQuote& Q = QuoteI.GetDat();

    TStrV Content;
    Q.GetParsedContent(Content);
    const int ContentLen = Content.Len();

    // Only short quotes are bucketed; skip the rest before paying for the
    // signature lookups.
    if (ContentLen >= WordWindow) { continue; }

    const TInt Id = Q.GetId();

    // signature vector of every content entry of the quote
    TVec < TIntV > Signature;
    for (int k = 0; k < ContentLen; k++) {
      const TMd5Sig ShingleMd5(Content[k]);
      Signature.Add(Signatures.GetDat(ShingleMd5));
    }

    // place in one bucket per band
    for (int Band = 0; Band < NumBands; ++Band) {
      TStr Sig;
      for (int j = 0; j < BandSize; ++j) {
        const int CurSig = Band * BandSize + j;

        TInt MinVal = NumShingles;
        for (int k = 0; k < ContentLen; k++) {
          if (Signature[k][CurSig] < MinVal) {
            MinVal = Signature[k][CurSig];
          }
        }
        Sig += MinVal.GetStr() + "-";
      }

      const TMd5Sig SigMd5(Sig);
      // AddDat(key) returns the stored set (creating it when missing), so the
      // id is inserted in place instead of copying the whole bucket out and
      // writing it back on every insertion.
      SignatureBandBuckets[Band].AddDat(SigMd5).AddKey(Id);
    }
  }
  Err("Minhash step complete!\n");
}