void TTable::GroupAux(const TStrV& GroupBy, TInt GroupByStartIdx, THash<TInt,TIntV>& grouping, const TIntV& IndexSet, TBool All){ /* recursion base - add IndexSet as group */ if(GroupByStartIdx == GroupBy.Len()){ if(IndexSet.Len() == 0){return;} TInt key = grouping.Len(); grouping.AddDat(key, IndexSet); return; } if(!ColTypeMap.IsKey(GroupBy[GroupByStartIdx])){TExcept::Throw("no such column " + GroupBy[GroupByStartIdx]);} switch(GetColType(GroupBy[GroupByStartIdx])){ case INT:{ // group by current column // not sure of to estimate the size of T for constructor hinting purpose. // It is bounded by the length of the IndexSet or the length of the grouping column if the IndexSet vector is empty // but this bound may be way too big THash<TInt,TIntV> T; GroupByIntCol(GroupBy[GroupByStartIdx], T, IndexSet, All); for(THash<TInt,TIntV>::TIter it = T.BegI(); it < T.EndI(); it++){ TIntV& CurrGroup = it->Dat; // each group according to current column will be used as an IndexSet // for grouping according to next column GroupAux(GroupBy, GroupByStartIdx+1, grouping, CurrGroup, false); } break; } case FLT:{ THash<TFlt,TIntV> T; GroupByFltCol(GroupBy[GroupByStartIdx], T, IndexSet, All); for(THash<TFlt,TIntV>::TIter it = T.BegI(); it < T.EndI(); it++){ TIntV& CurrGroup = it->Dat; GroupAux(GroupBy, GroupByStartIdx+1, grouping, CurrGroup, false); } break; } case STR:{ THash<TStr,TIntV> T; GroupByStrCol(GroupBy[GroupByStartIdx], T, IndexSet, All); for(THash<TStr,TIntV>::TIter it = T.BegI(); it < T.EndI(); it++){ TIntV& CurrGroup = it->Dat; GroupAux(GroupBy, GroupByStartIdx+1, grouping, CurrGroup, false); } break; } } }
// Inverts a community -> members map into a node -> communities map.
//   NIDComVH - output: for every node id found in CmtyVH, the set of community ids
//              it belongs to. Entries already present are kept and extended
//              (the hash is not cleared here).
//   CmtyVH   - input: community id -> vector of member node ids.
void TAGMUtil::GetNodeMembership(THash<TInt,TIntSet >& NIDComVH, const THash<TInt,TIntV>& CmtyVH) {
  THash<TInt,TIntV>::TIter CmtyI = CmtyVH.BegI();
  const THash<TInt,TIntV>::TIter CmtyEndI = CmtyVH.EndI();
  while (CmtyI < CmtyEndI) {
    const int CmtyId = CmtyI.GetKey();
    const TIntV& MemberV = CmtyI.GetDat();
    const int NumMembers = MemberV.Len();
    for (int m = 0; m < NumMembers; m++) {
      const int NodeId = MemberV[m];
      // AddDat(key) yields the (possibly new) set stored for this node
      NIDComVH.AddDat(NodeId).AddKey(CmtyId);
    }
    CmtyI++;
  }
}
// Records a grouping under GroupColName and materialises it as a new integer column.
//   GroupColName - name under which the grouping and the new column are registered.
//   Grouping     - group id -> indices of the rows that belong to that group.
// Side effects: adds an entry to GroupMapping, appends one column of NumRows
// entries to IntCols and registers it in ColTypeMap as an INT column.
void TTable::StoreGroupCol(TStr GroupColName, const THash<TInt,TIntV>& Grouping){
  GroupMapping.AddDat(GroupColName, Grouping);

  // Append the column that will hold, for row i, the id of the group containing row i.
  IntCols.Add(TIntV(NumRows));
  const TInt GroupColIdx = IntCols.Len() - 1;
  ColTypeMap.AddDat(GroupColName, TPair<TYPE,TInt>(INT, GroupColIdx));

  // No further columns are added below, so this reference stays valid.
  TIntV& GroupIdCol = IntCols[GroupColIdx];
  THash<TInt,TIntV>::TIter GroupI = Grouping.BegI();
  while(GroupI < Grouping.EndI()){
    const TIntV& RowIdxV = GroupI->Dat;
    const int NumGroupRows = RowIdxV.Len();
    for(int r = 0; r < NumGroupRows; r++){
      GroupIdCol[RowIdxV[r]] = GroupI->Key;
    }
    GroupI++;
  }
}
// Keeps only one row per distinct value of column Col: the first row listed in
// each group produced by the GroupBy*Col helper for that column's type.
//   Col - name of the key column.
// Throws (via TExcept::Throw) when Col is not a known column.
void TTable::Unique(TStr Col){
  if(!ColTypeMap.IsKey(Col)){
    TExcept::Throw("no such column " + Col);
  }

  // Rows that survive de-duplication.
  // NOTE(review): TIntV(MxVals, Vals) is expected to reserve NumValidRows slots with
  // zero initial length - confirm against the TVec constructor documentation.
  TIntV KeptRows(NumValidRows, 0);

  // Group all rows by the key column and remember the first row of every group.
  // The group hashes are built without a size hint, as their size cannot be estimated.
  switch(GetColType(Col)){
    case INT:{
      THash<TInt,TIntV> Groups;
      GroupByIntCol(Col, Groups, TIntV(0), true);
      THash<TInt,TIntV>::TIter GroupI = Groups.BegI();
      while(GroupI < Groups.EndI()){
        KeptRows.Add(GroupI->Dat[0]);
        GroupI++;
      }
      break;
    }
    case FLT:{
      THash<TFlt,TIntV> Groups;
      GroupByFltCol(Col, Groups, TIntV(0), true);
      THash<TFlt,TIntV>::TIter GroupI = Groups.BegI();
      while(GroupI < Groups.EndI()){
        KeptRows.Add(GroupI->Dat[0]);
        GroupI++;
      }
      break;
    }
    case STR:{
      THash<TStr,TIntV> Groups;
      GroupByStrCol(Col, Groups, TIntV(0), true);
      THash<TStr,TIntV>::TIter GroupI = Groups.BegI();
      while(GroupI < Groups.EndI()){
        KeptRows.Add(GroupI->Dat[0]);
        GroupI++;
      }
      break;
    }
  }

  // With the current GroupBy*Col implementations the collected rows already come
  // out in ascending order (if key X1 precedes key X2 in the hash, the first row of
  // X1 is <= the first row of X2). That is an implementation detail that may not
  // always hold, so sort explicitly; the sort could be dropped if it is guaranteed.
  KeptRows.Sort();
  KeepSortedRows(KeptRows);
}
void LSH::MinHash(TQuoteBase *QB, THashSet<TMd5Sig>& Shingles, TVec<THash<TMd5Sig, TIntSet> >& SignatureBandBuckets) { Err("Creating buckets...\n"); THash < TMd5Sig, TIntV > Signatures; ComputeSignatures(Shingles, Signatures, NumBands * BandSize); // bucket creation for (int i = 0; i < NumBands; ++i) { SignatureBandBuckets.Add(THash<TMd5Sig, TIntSet>()); } // bucket filling int NumShingles = Shingles.Len(); THash<TInt, TQuote> Quotes; QB->GetIdToTQuotes(Quotes); THash<TInt, TQuote>::TIter CurI = Quotes.BegI(); THash<TInt, TQuote>::TIter EndI = Quotes.EndI(); TQuote Q; // SKYFALL for (; CurI < EndI; CurI++) { Q = CurI.GetDat(); TStrV Content; Q.GetParsedContent(Content); TInt Id = Q.GetId(); // signature for quote int ContentLen = Content.Len(); TVec < TIntV > Signature; for (int i = 0; i < ContentLen; i++) { const TMd5Sig ShingleMd5(Content[i]); Signature.Add(Signatures.GetDat(ShingleMd5)); } // place in bucket if (ContentLen < WordWindow) { for (int i = 0; i < NumBands; ++i) { TStr Sig; for (int j = 0; j < BandSize; ++j) { int CurSig = i * BandSize + j; TInt min = NumShingles; for (int k = 0; k < ContentLen; k++) { if (Signature[k][CurSig] < min) { min = Signature[k][CurSig]; } } Sig += min.GetStr() + "-"; } //Err(Sig.CStr()); const TMd5Sig SigMd5(Sig); TIntSet Bucket; SignatureBandBuckets[i].IsKeyGetDat(SigMd5, Bucket); Bucket.AddKey(Id); SignatureBandBuckets[i].AddDat(SigMd5, Bucket); } } else { } } Err("Minhash step complete!\n"); }