// [[Rcpp::export]] DataFrame inner_join_impl(DataFrame x, DataFrame y, IntegerVector by_x, IntegerVector by_y, IntegerVector aux_x, IntegerVector aux_y, bool na_match) { check_by(by_x); typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map; DataFrameJoinVisitors visitors(x, y, by_x, by_y, false, na_match); Map map(visitors); int n_x = x.nrows(), n_y = y.nrows(); std::vector<int> indices_x; std::vector<int> indices_y; train_push_back_right(map, n_y); for (int i = 0; i < n_x; i++) { Map::iterator it = map.find(i); if (it != map.end()) { push_back_right(indices_y, it->second); push_back(indices_x, i, it->second.size()); } } return subset_join(x, y, indices_x, indices_y, by_x, by_y, aux_x, aux_y, get_class(x) ); }
// [[Rcpp::export]] DataFrame right_join_impl(DataFrame x, DataFrame y, IntegerVector by_x, IntegerVector by_y, IntegerVector aux_x, IntegerVector aux_y, bool na_match) { check_by(by_x); typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map; DataFrameJoinVisitors visitors(x, y, by_x, by_y, false, na_match); Map map(visitors); // train the map in terms of x train_push_back(map, x.nrows()); std::vector<int> indices_x; std::vector<int> indices_y; int n_y = y.nrows(); for (int i = 0; i < n_y; i++) { // find a row in y that matches row i in x Map::iterator it = map.find(-i - 1); if (it != map.end()) { push_back(indices_x, it->second); push_back(indices_y, i, it->second.size()); } else { indices_x.push_back(-i - 1); // point to the i-th row in the right table indices_y.push_back(i); } } return subset_join(x, y, indices_x, indices_y, by_x, by_y, aux_x, aux_y, get_class(x) ); }
// Accumulates pairwise proximity counts for the vectors in `data`.
//
// Every data vector is classified by this tree and the id of the node returned
// by classifyDataVector() is recorded. For each pair (j, k) with j <= k whose
// recorded node ids are equal, proximity[j * dSize + k] is incremented, where
// dSize is the number of data vectors. Entries with k < j are never touched.
//
// `proximity` is indexed up to dSize * dSize - 1 and is not resized here, so
// the caller must supply a vector that is at least that large.
void RandomTree::findProximity(DataFrame & data, std::vector<unsigned int> & proximity)
{
  const unsigned int dSize = data.getNumDataVectors();
  std::vector<unsigned int> proxVec(dSize);

  // Find out which node each vector is classified as
  for (unsigned int i = 0; i < dSize; i++)
  {
    std::string resultClass;  // output argument of classifyDataVector; unused here
    proxVec[i] = classifyDataVector(data.getDataVector(i), resultClass);
  }

  for (unsigned int j = 0; j < dSize; j++)
  {
    const unsigned int tempId = proxVec[j];

    // The inner loop must advance k. Advancing j here (as the code previously
    // did) never terminates and walks j * dSize + k past the end of proximity.
    for (unsigned int k = j; k < dSize; k++)
    {
      if (proxVec[k] == tempId)
      {
        proximity[j * dSize + k] += 1;
      }
    }
  }
}
// Subsets the data frame held by `gdf` with the logical vector `test`, copies
// the "vars" attribute of the source data onto the result, strips its index,
// and returns the data of a fresh `Data` object built from that result.
inline DataFrame grouped_subset(const Data& gdf,
                                const LogicalVector& test,
                                const CharacterVector& names,
                                CharacterVector classes) {
  DataFrame source = gdf.data();

  DataFrame filtered = subset(source, test, names, classes);
  filtered.attr("vars") = source.attr("vars");
  strip_index(filtered);

  return Data(filtered).data();
}
// Discretizes every non-nominal factor of `df` in place.
//
// `df` is remembered in `_df` for use by the helper methods. For each factor
// that is not nominal, nulls are first replaced (only when the factor's null
// treatment is NullAsMissingValue) and the column is then discretized.
// When `progress` is non-null it is updated with the fraction of factors
// visited so far and set to 1.0 at the end.
void DataFrameDiscretizer::discretize(DataFrame& df, TgsProgress* progress)
{
  _df = &df;

  for (unsigned int factor = 0; factor < df.getNumFactors(); factor++)
  {
    if (progress != 0)
    {
      progress->setProgress((double)factor / (double)df.getNumFactors());
    }

    if (_df->isNominal(factor))
    {
      continue;  // nominal factors are left untouched
    }

    if (_df->getNullTreatment(factor) == DataFrame::NullAsMissingValue)
    {
      // replace nulls with random sampling of data (imputation), otherwise nulls get put
      // into their own category.
      _replaceNulls(factor);
    }
    _discretizeColumn(factor);
  }

  if (progress != 0)
  {
    progress->setProgress(1.0);
  }
}
// [[Rcpp::export]] DataFrame inner_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, std::string& suffix_x, std::string& suffix_y, bool na_match) { if (by_x.size() == 0) stop("no variable to join by"); typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map; DataFrameJoinVisitors visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), true, na_match); Map map(visitors); int n_x = x.nrows(), n_y = y.nrows(); std::vector<int> indices_x; std::vector<int> indices_y; train_push_back_right(map, n_y); for (int i = 0; i < n_x; i++) { Map::iterator it = map.find(i); if (it != map.end()) { push_back_right(indices_y, it->second); push_back(indices_x, i, it->second.size()); } } return subset_join(x, y, indices_x, indices_y, by_x, by_y, suffix_x, suffix_y, get_class(x) ); }
// [[Rcpp::export]] DataFrame anti_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, bool na_match) { if (by_x.size() == 0) stop("no variable to join by"); typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map; DataFrameJoinVisitors visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), false, na_match); Map map(visitors); // train the map in terms of x train_push_back(map, x.nrows()); int n_y = y.nrows(); // remove the rows in x that match for (int i = 0; i < n_y; i++) { Map::iterator it = map.find(-i - 1); if (it != map.end()) map.erase(it); } // collect what's left std::vector<int> indices; for (Map::iterator it = map.begin(); it != map.end(); ++it) push_back(indices, it->second); const DataFrame& out = subset(x, indices, x.names(), get_class(x)); strip_index(out); return out; }
// [[Rcpp::export]] DataFrame right_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, std::string& suffix_x, std::string& suffix_y, bool na_match) { if (by_x.size() == 0) stop("no variable to join by"); typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map; DataFrameJoinVisitors visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), true, na_match); Map map(visitors); // train the map in terms of x train_push_back(map, x.nrows()); std::vector<int> indices_x; std::vector<int> indices_y; int n_y = y.nrows(); for (int i = 0; i < n_y; i++) { // find a row in y that matches row i in x Map::iterator it = map.find(-i - 1); if (it != map.end()) { push_back(indices_x, it->second); push_back(indices_y, i, it->second.size()); } else { indices_x.push_back(-i - 1); // point to the i-th row in the right table indices_y.push_back(i); } } return subset_join(x, y, indices_x, indices_y, by_x, by_y, suffix_x, suffix_y, get_class(x) ); }
// [[Rcpp::export]] DataFrame semi_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, bool na_match) { if (by_x.size() == 0) stop("no variable to join by"); typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map; DataFrameJoinVisitors visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), false, na_match); Map map(visitors); // train the map in terms of x train_push_back(map, x.nrows()); int n_y = y.nrows(); // this will collect indices from rows in x that match rows in y std::vector<int> indices; for (int i = 0; i < n_y; i++) { // find a row in x that matches row i from y Map::iterator it = map.find(-i - 1); if (it != map.end()) { // collect the indices and remove them from the // map so that they are only found once. push_back(indices, it->second); map.erase(it); } } const DataFrame& out = subset(x, indices, x.names(), get_class(x)); strip_index(out); return out; }
// [[Rcpp::export]] SEXP filter_impl( DataFrame df, LazyDots dots){ if( df.nrows() == 0 || Rf_isNull(df) ) { return df ; } check_valid_colnames(df) ; assert_all_white_list(df) ; if( dots.size() == 0 ) return df ; // special case if( dots.size() == 1 && TYPEOF(dots[0].expr()) == LGLSXP){ LogicalVector what = dots[0].expr() ; if( what.size() == 1 ){ if( what[0] == TRUE ){ return df ; } else { return empty_subset( df, df.names(), is<GroupedDataFrame>(df) ? classes_grouped<GroupedDataFrame>() : classes_not_grouped() ) ; } } } if( is<GroupedDataFrame>( df ) ){ return filter_grouped<GroupedDataFrame, LazyGroupedSubsets>( GroupedDataFrame(df), dots); } else if( is<RowwiseDataFrame>(df) ){ return filter_grouped<RowwiseDataFrame, LazyRowwiseSubsets>( RowwiseDataFrame(df), dots); } else { return filter_not_grouped( df, dots ) ; } }
// update the device's data void Device::update() { // read current traffic DataFrame dataFrame = m_devReader.getNewDataFrame(); if(dataFrame.isValid()) { /* Depending on the CPU architecture and the OS interface * used for reading the device statistics, the counts can * overflow. We monitor the overflows and fix them. */ fixOverflows(dataFrame, m_dataFrameOld); m_deviceStatistics.insertDataFrame(dataFrame); m_deviceGraphIn.update(m_deviceStatistics.getDataInPerSecond()); m_deviceGraphOut.update(m_deviceStatistics.getDataOutPerSecond()); m_dataFrameOld = dataFrame; } else { m_deviceStatistics.reset(); m_deviceGraphIn.resetTrafficData(); m_deviceGraphOut.resetTrafficData(); } }
// Subsets an ungrouped data frame using the logical results of the
// expressions in `args`.
//
// When all expressions come from a single environment they are fused into one
// `a & b & c` call (and_calls) and evaluated once. Otherwise each expression
// is evaluated in its own environment (dots.envir(i)) and the results are
// combined with combine_and(). Every evaluated result is passed through
// check_filter_result() against the row count of `df` before it is used; this
// now includes the second and later results of the multi-environment path,
// which previously reached combine_and() unchecked.
//
// NOTE(review): the multi-environment path reads args[0] unconditionally, so it
// assumes `args` is non-empty — confirm that callers guarantee this.
SEXP filter_not_grouped(DataFrame df, List args, const DataDots& dots) {
  CharacterVector names = df.names();
  SymbolSet set;
  for (int i = 0; i < names.size(); i++) {
    set.insert(Rf_install(names[i]));
  }

  if (dots.single_env()) {
    Environment env = dots.envir(0);

    // a, b, c -> a & b & c
    Shield<SEXP> call(and_calls(args, set));

    // replace the symbols that are in the data frame by vectors from the data frame
    // and evaluate the expression
    CallProxy proxy((SEXP)call, df, env);
    LogicalVector test = proxy.eval();
    check_filter_result(test, df.nrows());

    DataFrame res = subset(df, test, df.names(), classes_not_grouped());
    return res;
  } else {
    int nargs = args.size();

    CallProxy first_proxy(args[0], df, dots.envir(0));
    LogicalVector test = first_proxy.eval();
    check_filter_result(test, df.nrows());

    for (int i = 1; i < nargs; i++) {
      LogicalVector test2 = CallProxy(args[i], df, dots.envir(i)).eval();
      // validate each additional result exactly like the first one
      check_filter_result(test2, df.nrows());
      combine_and(test, test2);
    }

    DataFrame res = subset(df, test, df.names(), classes_not_grouped());
    return res;
  }
}
// Grouped filter for the case where every expression shares one environment.
//
// The expressions are fused into a single `a & b & c` call, which is evaluated
// once per group. Each group's result is validated against the group size and
// scattered into a full-length logical vector at the group's row positions.
// The data is then subset with that vector and the "vars" attribute of the
// source data is copied onto the result.
DataFrame filter_grouped_single_env(const GroupedDataFrame& gdf, const List& args, const Environment& env) {
  const DataFrame& data = gdf.data();
  CharacterVector names = data.names();

  SymbolSet set;
  for (int i = 0; i < names.size(); i++) {
    set.insert(Rf_install(names[i]));
  }

  // a, b, c -> a & b & c
  Call call(and_calls(args, set));

  const int nrows = data.nrows();
  LogicalVector test = no_init(nrows);

  LogicalVector group_result;
  GroupedCallProxy call_proxy(call, gdf, env);

  const int ngroups = gdf.ngroups();
  GroupedDataFrame::group_iterator git = gdf.group_begin();
  for (int g = 0; g < ngroups; g++, ++git) {
    SlicingIndex rows = *git;
    const int chunk_size = rows.size();

    group_result = call_proxy.get(rows);
    check_filter_result(group_result, chunk_size);

    for (int j = 0; j < chunk_size; j++) {
      test[rows[j]] = group_result[j];
    }
  }

  DataFrame res = subset(data, test, names, classes_grouped());
  res.attr("vars") = data.attr("vars");
  return res;
}
// [[Rcpp::export]] DataFrame semi_join_impl( DataFrame x, DataFrame y, CharacterVector by){ typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ; DataFrameJoinVisitors visitors(x, y, by) ; Map map(visitors); // train the map in terms of x train_push_back( map, x.nrows(), x.nrows() / 10) ; int n_y = y.nrows() ; // this will collect indices from rows in x that match rows in y std::vector<int> indices ; for( int i=0; i<n_y; i++){ // find a row in x that matches row i from y Map::iterator it = map.find(-i-1) ; if( it != map.end() ){ // collect the indices and remove them from the // map so that they are only found once. push_back( indices, it->second ) ; map.erase(it) ; } } return subset(x, indices, x.names(), x.attr("class") ) ; }
// [[Rcpp::export]] DataFrame right_join_impl( DataFrame x, DataFrame y, CharacterVector by){ typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ; DataFrameJoinVisitors visitors(x, y, by) ; Map map(visitors); // train the map in terms of y train_push_back( map, x.nrows(), x.nrows() / 10 ) ; std::vector<int> indices_x ; std::vector<int> indices_y ; int n_y = y.nrows() ; for( int i=0; i<n_y; i++){ // find a row in y that matches row i in x Map::iterator it = map.find(-i-1) ; if( it != map.end() ){ push_back( indices_x, it->second ) ; push_back( indices_y, i, it->second.size() ) ; } else { indices_x.push_back(-1) ; // mark NA indices_y.push_back(i) ; } } return subset( x, y, indices_x, indices_y, by, x.attr( "class" ) ) ; }
// Grouped filter evaluated in a single environment.
//
// The expressions in `args` are fused into one `a & b & c` call, which is
// evaluated once per group. Each group's result is scattered into a
// full-length logical vector at the group's row positions; the data is then
// subset with that vector and the "vars" attribute is copied onto the result.
DataFrame filter_grouped(const GroupedDataFrame& gdf, List args, Environment env) {
  // a, b, c -> a & b & c
  Language call = and_calls(args);

  const DataFrame& data = gdf.data();
  const int nrows = data.nrows();
  LogicalVector test = no_init(nrows);

  LogicalVector group_result;
  GroupedCallProxy call_proxy(call, gdf, env);

  const int ngroups = gdf.ngroups();
  GroupedDataFrame::group_iterator git = gdf.group_begin();
  for (int g = 0; g < ngroups; g++, ++git) {
    SlicingIndex rows = *git;
    group_result = call_proxy.get(rows);

    const int chunk_size = rows.size();
    for (int j = 0; j < chunk_size; j++) {
      test[rows[j]] = group_result[j];
    }
  }

  DataFrame res = subset(data, test, data.names(), classes_grouped());
  res.attr("vars") = data.attr("vars");
  return res;
}
// Builds the result of a join from row indices into `x` and `y`.
//
// All columns of `x` are emitted first (subset by `indices_x`), followed by the
// columns of `y` that are not in `by` (subset by `indices_y`). Column names go
// through JoinColumnSuffixer with the suffixes ".x" and ".y". The result gets
// `classes` as its class, row names sized from `indices_x`, and the "vars"
// attribute of `x` when that attribute is not NULL.
DataFrame subset(DataFrame x, DataFrame y,
                 const Index& indices_x, const Index& indices_y,
                 CharacterVector by, CharacterVector classes) {
  // left-hand side: every column of x
  CharacterVector x_columns = x.names();
  DataFrameVisitors visitors_x(x, x_columns);

  // right-hand side: columns of y that are not join keys
  CharacterVector all_y_columns = y.names();
  CharacterVector y_columns = setdiff(all_y_columns, by);
  JoinColumnSuffixer suffixer(x_columns, y_columns, by);
  DataFrameVisitors visitors_y(y, y_columns);

  const int nrows = indices_x.size();
  const int nv_x = visitors_x.size();
  const int nv_y = visitors_y.size();

  List out(nv_x + nv_y);
  CharacterVector names(nv_x + nv_y);

  for (int col = 0; col < nv_x; col++) {
    out[col] = visitors_x.get(col)->subset(indices_x);
    names[col] = suffixer.get(x_columns[col], ".x");
  }
  for (int col = 0; col < nv_y; col++) {
    const int pos = nv_x + col;  // y columns are appended after the x columns
    out[pos] = visitors_y.get(col)->subset(indices_y);
    names[pos] = suffixer.get(y_columns[col], ".y");
  }

  out.attr("class") = classes;
  set_rownames(out, nrows);
  out.names() = names;

  SEXP vars = x.attr("vars");
  if (!Rf_isNull(vars)) {
    out.attr("vars") = vars;
  }

  return (SEXP)out;
}
double SymmetricUncertaintyCalculator::_calculateEntropy(const DataFrame& df, int factorIndex) { typedef HashMap<int, int> ClassCounts; ClassCounts cc; for(unsigned int i = 0; i < df.getNumDataVectors(); i++) { double v = df.getDataElement(i, factorIndex); // null values are not supported Use the DataFrameDiscretizer to "fix" nulls if (DataFrame::isNull(v) == true) { throw Tgs::Exception("Null values are not supported by SymmetricUncertaintyCalculator"); } cc[(int)(v + .5)]++; } double sum = 0.0; double totalSize = df.getNumDataVectors(); for (ClassCounts::const_iterator classIt = cc.begin(); classIt != cc.end(); classIt++) { double count = classIt->second; sum += count / totalSize * log2(count / totalSize); } return -sum; }
// [[Rcpp::export]] DataFrame arrange_impl( DataFrame data, List args, DataDots dots ){ int nargs = args.size() ; List variables(nargs) ; LogicalVector ascending(nargs) ; Shelter<SEXP> __ ; for(int i=0; i<nargs; i++){ SEXP call = args[i] ; bool is_desc = TYPEOF(call) == LANGSXP && Rf_install("desc") == CAR(call) ; CallProxy call_proxy( is_desc ? CADR(call) : call, data, dots.envir(i)) ; variables[i] = __(call_proxy.eval()) ; if( Rf_length(variables[i]) != data.nrows() ){ std::stringstream s ; s << "incorrect size (" << Rf_length(variables[i]) << "), expecting :" << data.nrows() ; stop(s.str()) ; } ascending[i] = !is_desc ; } OrderVisitors o(variables,ascending, nargs) ; IntegerVector index = o.apply() ; DataFrameVisitors visitors( data, data.names() ) ; DataFrame res = visitors.subset(index, data.attr("class") ) ; return res; }
// [[Rcpp::export]] SEXP distinct_impl(DataFrame df, CharacterVector vars, CharacterVector keep) { if (df.size() == 0) return df; // No vars means ungrouped data with keep_all = TRUE. if (vars.size() == 0) return df; check_valid_colnames(df); if (!vars.size()) { vars = df.names(); } DataFrameVisitors visitors(df, vars); std::vector<int> indices; VisitorSetIndexSet<DataFrameVisitors> set(visitors); int n = df.nrows(); for (int i=0; i<n; i++) { if (set.insert(i).second) { indices.push_back(i); } } return DataFrameSubsetVisitors(df, keep).subset(indices, df.attr("class")); }
// Assembles the output list of a mutate from the variables held by `call_proxy`.
//
// The first df.size() slots are filled with the proxy's variables for the
// input columns, in input order. The remaining slots (up to
// call_proxy.nsubsets()) are filled by walking `results_names` and taking each
// name that is not already an input column. Every stored element is marked
// with SET_NAMED(..., 2). The list then receives `classes`, row names sized
// from `df`, and the collected names.
//
// NOTE(review): the second loop only stops once all slots are filled, so it
// relies on `results_names` supplying enough new names — confirm with callers.
SEXP structure_mutate(Proxy& call_proxy, const DataFrame& df,
                      const CharacterVector& results_names, CharacterVector classes) {
  const int n = call_proxy.nsubsets();

  List out(n);
  CharacterVector names(n);

  CharacterVector input_names = df.names();
  const int ncolumns = df.size();

  // existing columns first, in their original order
  int slot = 0;
  while (slot < ncolumns) {
    out[slot] = call_proxy.get_variable(input_names[slot]);
    SET_NAMED(out[slot], 2);
    names[slot] = input_names[slot];
    slot++;
  }

  // then the newly created columns
  int k = 0;
  while (slot < n) {
    String name = results_names[k];
    if (!any(input_names.begin(), input_names.end(), name.get_sexp())) {
      SEXP x = call_proxy.get_variable(name);
      out[slot] = x;
      SET_NAMED(out[slot], 2);
      names[slot] = name;
      slot++;
    }
    k++;
  }

  out.attr("class") = classes;
  set_rownames(out, df.nrows());
  out.names() = names;

  return out;
}
// version of grouped filter when contributions to ... come from several environment DataFrame filter_grouped_multiple_env( const GroupedDataFrame& gdf, const List& args, const DataDots& dots){ const DataFrame& data = gdf.data() ; CharacterVector names = data.names() ; SymbolSet set ; for( int i=0; i<names.size(); i++){ set.insert( Rf_install( names[i] ) ) ; } int nrows = data.nrows() ; LogicalVector test(nrows, TRUE); LogicalVector g_test ; for( int k=0; k<args.size(); k++){ Call call( (SEXP)args[k] ) ; GroupedCallProxy call_proxy( call, gdf, dots.envir(k) ) ; int ngroups = gdf.ngroups() ; GroupedDataFrame::group_iterator git = gdf.group_begin() ; for( int i=0; i<ngroups; i++, ++git){ SlicingIndex indices = *git ; int chunk_size = indices.size() ; g_test = call_proxy.get( indices ); check_filter_result(g_test, chunk_size ) ; for( int j=0; j<chunk_size; j++){ test[ indices[j] ] = test[ indices[j] ] & g_test[j] ; } } } DataFrame res = subset( data, test, names, classes_grouped() ) ; res.attr( "vars") = data.attr("vars") ; return res ; }
// [[Rcpp::export]] DataFrame as_regular_df(DataFrame df){ DataFrame copy = shallow_copy(df) ; SET_ATTRIB(copy, strip_group_attributes(df)) ; SET_OBJECT(copy, OBJECT(df)) ; copy.attr("class") = CharacterVector::create("data.frame") ; return copy ; }
// [[Rcpp::export]] SEXP ex13_2(DataFrame input, CharacterVector columnName, double replace){ BEGIN_RCPP List names = input.names(); List mapObj = as<List>(input); // all rows for(int i=0; i<input.nrows(); i++){ ex13helper(mapObj, as<string>(columnName), i, replace); } return(wrap(mapObj)); END_RCPP }
// Turns the accumulated columns into the result list of a mutate.
//
// The list receives `classes` as its class and row names sized from `df`, and
// the attributes "vars", "labels", "index" and "indices" are copied over from
// `df` in that order.
SEXP structure_mutate(const NamedListAccumulator<SEXP>& accumulator, const DataFrame& df, CharacterVector classes) {
  List res = accumulator;

  res.attr("class") = classes;
  set_rownames(res, df.nrows());

  // attributes carried over unchanged from the input data frame
  static const char* const carried[] = { "vars", "labels", "index", "indices" };
  for (int a = 0; a < 4; a++) {
    res.attr(carried[a]) = df.attr(carried[a]);
  }

  return res;
}
// [[Rcpp::export]] IntegerVector grouped_indices_impl(DataFrame data, ListOf<Symbol> symbols) { int nsymbols = symbols.size(); if (nsymbols == 0) return rep(1, data.nrows()); CharacterVector vars(nsymbols); for (int i=0; i<nsymbols; i++) { vars[i] = PRINTNAME(symbols[i]); const char* name = vars[i]; SEXP v; try { v = data[name]; } catch (...) { stop("unknown column '%s'", name); } if (!white_list(v) || TYPEOF(v) == VECSXP) { stop("cannot group column %s, of class '%s'", name, get_single_class(v)); } } DataFrameVisitors visitors(data, vars); ChunkIndexMap map(visitors); int n = data.nrows(); train_push_back(map, n); DataFrame labels = DataFrameSubsetVisitors(data, vars).subset(map, "data.frame"); IntegerVector labels_order = OrderVisitors(labels).apply(); labels = DataFrameSubsetVisitors(labels).subset(labels_order, "data.frame"); int ngroups = map.size(); IntegerVector res = no_init(n); std::vector<const std::vector<int>* > chunks(ngroups); ChunkIndexMap::const_iterator it = map.begin(); for (int i=0; i<ngroups; i++, ++it) { chunks[i] = &it->second; } for (int i=0; i<ngroups; i++) { int idx = labels_order[i]; const std::vector<int>& v = *chunks[idx]; int n_index = v.size(); for (int j=0; j<n_index; j++) { res[ v[j] ] = i+1; } } return res; }
// [[Rcpp::export]] DataFrame full_join_impl(DataFrame x, DataFrame y, IntegerVector by_x, IntegerVector by_y, IntegerVector aux_x, IntegerVector aux_y, bool na_match) { check_by(by_x); typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map; DataFrameJoinVisitors visitors(y, x, by_y, by_x, false, na_match); Map map(visitors); // train the map in terms of y train_push_back(map, y.nrows()); std::vector<int> indices_x; std::vector<int> indices_y; int n_x = x.nrows(), n_y = y.nrows(); // get both the matches and the rows from left but not right for (int i = 0; i < n_x; i++) { // find a row in y that matches row i in x Map::iterator it = map.find(-i - 1); if (it != map.end()) { push_back(indices_y, it->second); push_back(indices_x, i, it->second.size()); } else { indices_y.push_back(-1); // mark NA indices_x.push_back(i); } } // train a new map in terms of x this time DataFrameJoinVisitors visitors2(x, y, by_x, by_y, false, na_match); Map map2(visitors2); train_push_back(map2, x.nrows()); for (int i = 0; i < n_y; i++) { // try to find row in x that matches this row of y Map::iterator it = map2.find(-i - 1); if (it == map2.end()) { indices_x.push_back(-i - 1); indices_y.push_back(i); } } return subset_join(x, y, indices_x, indices_y, by_x, by_y, aux_x, aux_y, get_class(x) ); }
double InformationGainCalculator::calculateInformationGain(const DataFrame& df1, int factorIndex1, const DataFrame& df2, int factorIndex2) { assert(df1.isNominal(factorIndex1)); assert(df2.isNominal(factorIndex2)); double hy = _calculateEntropy(df1, factorIndex1); double hyx = _calculateConditionalEntropy(df1, factorIndex1, df2, factorIndex2); double gain = hy - hyx; return gain; }
// [[Rcpp::export]] DataFrame union_data_frame( DataFrame x, DataFrame y){ if( !compatible_data_frame(x,y) ) stop( "not compatible" ); typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set ; DataFrameJoinVisitors visitors(x, y, x.names() ) ; Set set(visitors); train_insert( set, x.nrows() ) ; train_insert_right( set, y.nrows() ) ; return visitors.subset( set, x.attr("class") ) ; }
// [[Rcpp::export]] SEXP distinct_impl( DataFrame df ){ DataFrameVisitors visitors(df) ; std::vector<int> indices ; VisitorSetIndexSet<DataFrameVisitors> set(visitors) ; int n = df.nrows() ; for( int i=0; i<n; i++){ if( set.insert(i).second ){ indices.push_back(i) ; } } return visitors.subset(indices, df.attr("class") ); }