/* Q: This approach of passing a hash table by reference and adding entries to it is common here. But it also implies certain assumptions about the input - i.e. empty input table. Why PHash is never used? - why not let the function allocate the new table and return a smart ptr PHash.. */ void TTable::GroupByIntCol(TStr GroupBy, THash<TInt,TIntV>& grouping, const TIntV& IndexSet, TBool All) const{ if(!ColTypeMap.IsKey(GroupBy)){TExcept::Throw("no such column " + GroupBy);} if(GetColType(GroupBy) != INT){TExcept::Throw(GroupBy + " values are not of expected type integer");} if(All){ // optimize for the common and most expensive case - itearte over only valid rows for(TRowIterator it = BegRI(); it < EndRI(); it++){ UpdateGrouping<TInt>(grouping, it.GetIntAttr(GroupBy), it.GetRowIdx()); } } else{ // consider only rows in IndexSet for(TInt i = 0; i < IndexSet.Len(); i++){ if(IsRowValid(IndexSet[i])){ TInt RowIdx = IndexSet[i]; const TIntV& Col = IntCols[GetColIdx(GroupBy)]; UpdateGrouping<TInt>(grouping, Col[RowIdx], RowIdx); } } } }
void TTable::SaveSS(const TStr& OutFNm){ FILE* F = fopen(OutFNm.CStr(), "w"); // debug if(F == NULL){ printf("failed to open file %s\n", OutFNm.CStr()); perror("fail "); return; } TInt L = S.Len(); // print title (schema) for(TInt i = 0; i < L-1; i++){ fprintf(F, "%s\t", GetSchemaColName(i).CStr()); } fprintf(F, "%s\n", GetSchemaColName(L-1).CStr()); // print table contents for(TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++){ for(TInt i = 0; i < L; i++){ char C = (i == L-1) ? '\n' : '\t'; switch(GetSchemaColType(i)){ case INT:{ fprintf(F, "%d%c", RowI.GetIntAttr(GetSchemaColName(i)).Val, C); break; } case FLT:{ fprintf(F, "%f%c", RowI.GetFltAttr(GetSchemaColName(i)).Val, C); break; } case STR:{ fprintf(F, "%s%c", RowI.GetStrAttr(GetSchemaColName(i)).CStr(), C); break; } } } } fclose(F); }
// Q: Do we want to have any gurantees in terms of order of the 0t rows - i.e. // ordered by "this" table row idx as primary key and "Table" row idx as secondary key // This means only keeping joint row indices (pairs of original row indices), sorting them // and adding all rows in the end. Sorting can be expensive, but we would be able to pre-allocate // memory for the joint table.. PTable TTable::Join(TStr Col1, const TTable& Table, TStr Col2) { if(!ColTypeMap.IsKey(Col1)){ TExcept::Throw("no such column " + Col1); } if(!ColTypeMap.IsKey(Col2)){ TExcept::Throw("no such column " + Col2); } if (GetColType(Col1) != GetColType(Col2)) { TExcept::Throw("Trying to Join on columns of different type"); } // initialize result table PTable JointTable = InitializeJointTable(Table); // hash smaller table (group by column) TYPE ColType = GetColType(Col1); TBool ThisIsSmaller = (NumValidRows <= Table.NumValidRows); const TTable& TS = ThisIsSmaller ? *this : Table; const TTable& TB = ThisIsSmaller ? Table : *this; TStr ColS = ThisIsSmaller ? Col1 : Col2; TStr ColB = ThisIsSmaller ? Col2 : Col1; // iterate over the rows of the bigger table and check for "collisions" // with the group keys for the small table. switch(ColType){ case INT:{ THash<TInt, TIntV> T; TS.GroupByIntCol(Col1, T, TIntV(), true); for(TRowIterator RowI = TB.BegRI(); RowI < TB.EndRI(); RowI++){ TInt K = RowI.GetIntAttr(ColB); if(T.IsKey(K)){ TIntV& Group = T.GetDat(K); for(TInt i = 0; i < Group.Len(); i++){ if(ThisIsSmaller){ JointTable->AddJointRow(*this, Table, Group[i], RowI.GetRowIdx()); } else{ JointTable->AddJointRow(*this, Table, RowI.GetRowIdx(), Group[i]); } } } } break; } case FLT:{ THash<TFlt, TIntV> T; TS.GroupByFltCol(Col1, T, TIntV(), true); for(TRowIterator RowI = TB.BegRI(); RowI < TB.EndRI(); RowI++){ TFlt K = RowI.GetFltAttr(ColB); if(T.IsKey(K)){ TIntV& Group = T.GetDat(K); for(TInt i = 0; i < Group.Len(); i++){ if(ThisIsSmaller){ JointTable->AddJointRow(*this, Table, Group[i], RowI.GetRowIdx()); } else{ JointTable->AddJointRow(*this, Table, RowI.GetRowIdx(), Group[i]); } } } } break; } case STR:{ THash<TStr, TIntV> T; TS.GroupByStrCol(Col1, T, TIntV(), true); for(TRowIterator RowI = TB.BegRI(); RowI < TB.EndRI(); RowI++){ TStr K = RowI.GetStrAttr(ColB); if(T.IsKey(K)){ TIntV& Group = T.GetDat(K); for(TInt i = 0; i < Group.Len(); i++){ if(ThisIsSmaller){ JointTable->AddJointRow(*this, Table, Group[i], RowI.GetRowIdx()); } else{ JointTable->AddJointRow(*this, Table, RowI.GetRowIdx(), Group[i]); } } } } } break; } return JointTable; }