// Subsets the data frame wrapped by `gdf` using the logical vector `test`.
//
// The "vars" attribute of the source data is copied onto the subset,
// strip_index() is applied to it, and the result is wrapped in a fresh `Data`
// whose data() is returned.
inline DataFrame grouped_subset(const Data& gdf,
                                const LogicalVector& test,
                                const CharacterVector& names,
                                CharacterVector classes) {
  DataFrame source = gdf.data();

  DataFrame filtered = subset(source, test, names, classes);
  filtered.attr("vars") = source.attr("vars");
  strip_index(filtered);

  Data regrouped(filtered);
  return regrouped.data();
}
// Computes the grouping metadata of `data` and attaches it as attributes.
//
// Steps:
//   1. every grouping variable (from get_vars()) must name a column of `data`
//      that passes white_list() and is not a list (VECSXP); otherwise stop();
//   2. rows are bucketed into a ChunkIndexMap keyed on the grouping columns;
//   3. one label row per bucket is extracted and the labels are ordered;
//   4. "indices", "group_sizes", "biggest_group_size" and "labels" are stored
//      on `data`, following the label order, and the class is set to
//      grouped_df / tbl_df / tbl / data.frame.
DataFrame build_index_cpp(DataFrame data) {
  SymbolVector vars(get_vars(data));
  const int nvars = vars.size();

  // Validate the grouping columns up front.
  CharacterVector col_names = data.names();
  IntegerVector positions = vars.match_in_table(col_names);
  for (int k = 0; k < nvars; ++k) {
    const int pos = positions[k];
    if (pos == NA_INTEGER) {
      stop("unknown column '%s' ", vars[k].get_utf8_cstring());
    }

    // match_in_table() positions are 1-based here (hence the -1).
    SEXP column = data[pos - 1];
    const bool groupable = white_list(column) && TYPEOF(column) != VECSXP;
    if (!groupable) {
      stop(
        "cannot group column %s, of class '%s'",
        vars[k].get_utf8_cstring(),
        get_single_class(column));
    }
  }

  // Bucket the rows by their grouping key.
  DataFrameVisitors visitors(data, vars);
  ChunkIndexMap map(visitors);
  train_push_back(map, data.nrows());

  // One label row per bucket, then sort the labels.
  DataFrame labels = DataFrameSubsetVisitors(data, vars).subset(map, "data.frame");
  const int ngroups = labels.nrows();
  IntegerVector labels_order = OrderVisitors(labels).apply();
  labels = DataFrameSubsetVisitors(labels).subset(labels_order, "data.frame");

  List indices(ngroups);
  IntegerVector group_sizes = no_init(ngroups);
  int biggest_group = 0;

  // Remember each bucket in map iteration order so that it can be addressed
  // through labels_order below.
  ChunkIndexMap::const_iterator it = map.begin();
  std::vector<const std::vector<int>*> chunks(ngroups);
  for (int g = 0; g < ngroups; ++g, ++it) {
    chunks[g] = &it->second;
  }

  // Emit the buckets in sorted-label order.
  for (int g = 0; g < ngroups; ++g) {
    const std::vector<int>& chunk = *chunks[labels_order[g]];
    const int chunk_len = chunk.size();

    indices[g] = chunk;
    group_sizes[g] = chunk_len;
    if (biggest_group < chunk_len) {
      biggest_group = chunk_len;
    }
  }

  data.attr("indices") = indices;
  data.attr("group_sizes") = group_sizes;
  data.attr("biggest_group_size") = biggest_group;
  data.attr("labels") = labels;
  set_class(data, CharacterVector::create("grouped_df", "tbl_df", "tbl", "data.frame"));
  return data;
}
// [[Rcpp::export]] List arrange_impl( DataFrame data, LazyDots dots ){ if( data.size() == 0 ) return data ; check_valid_colnames(data) ; assert_all_white_list(data) ; if( dots.size() == 0 || data.nrows() == 0) return data ; int nargs = dots.size() ; List variables(nargs) ; LogicalVector ascending(nargs) ; for(int i=0; i<nargs; i++){ const Lazy& lazy = dots[i] ; Shield<SEXP> call_( lazy.expr() ) ; SEXP call = call_ ; bool is_desc = TYPEOF(call) == LANGSXP && Rf_install("desc") == CAR(call) ; CallProxy call_proxy(is_desc ? CADR(call) : call, data, lazy.env()) ; Shield<SEXP> v(call_proxy.eval()) ; if( !white_list(v) ){ stop( "cannot arrange column of class '%s'", get_single_class(v) ) ; } if( Rf_inherits(v, "data.frame" ) ){ DataFrame df(v) ; int nr = df.nrows() ; if( nr != data.nrows() ){ stop( "data frame column with incompatible number of rows (%d), expecting : %d", nr, data.nrows() ); } } else if( Rf_isMatrix(v) ) { stop( "can't arrange by a matrix" ) ; } else { if( Rf_length(v) != data.nrows() ){ stop( "incorrect size (%d), expecting : %d", Rf_length(v), data.nrows() ) ; } } variables[i] = v ; ascending[i] = !is_desc ; } OrderVisitors o(variables, ascending, nargs) ; IntegerVector index = o.apply() ; DataFrameSubsetVisitors visitors( data, data.names() ) ; List res = visitors.subset(index, data.attr("class") ) ; if( is<GroupedDataFrame>(data) ){ // so that all attributes are recalculated (indices ... ) // see the lazyness feature in GroupedDataFrame // if we don't do that, we get the values of the un-arranged data // set for free from subset (#1064) res.attr("labels") = R_NilValue ; res.attr( "vars" ) = data.attr("vars" ) ; return GroupedDataFrame(res).data() ; } SET_ATTRIB(res, strip_group_attributes(res)); return res ; }
// Turns the columns gathered in `accumulator` into a list with data-frame
// attributes: class `classes`, row names sized from `df`, and the "vars",
// "labels", "index" and "indices" attributes copied over from `df`.
SEXP structure_mutate(const NamedListAccumulator<SEXP>& accumulator,
                      const DataFrame& df,
                      CharacterVector classes) {
  List out = accumulator;
  out.attr("class") = classes;
  set_rownames(out, df.nrows());

  // Attributes carried over verbatim from the input, in this order.
  static const char* const carried[] = { "vars", "labels", "index", "indices" };
  const int n_carried = sizeof(carried) / sizeof(carried[0]);
  for (int a = 0; a < n_carried; ++a) {
    out.attr(carried[a]) = df.attr(carried[a]);
  }

  return out;
}
// Grouped filter for the case where all filter expressions share one
// environment.
//
// The expressions in `args` are combined into a single call by and_calls(),
// that call is evaluated once per group, and each group's logical result is
// written back to the rows of that group. The row mask is allocated with
// no_init(), so only rows visited through a group are ever written.
DataFrame filter_grouped_single_env(const GroupedDataFrame& gdf,
                                    const List& args,
                                    const Environment& env) {
  const DataFrame& data = gdf.data();
  CharacterVector names = data.names();

  // Column names as symbols, handed to and_calls().
  SymbolSet set;
  for (int col = 0; col < names.size(); ++col) {
    set.insert(Rf_install(names[col]));
  }

  // a, b, c -> a & b & c
  Call call(and_calls(args, set));

  const int nrows = data.nrows();
  LogicalVector test = no_init(nrows);
  LogicalVector g_test;
  GroupedCallProxy call_proxy(call, gdf, env);

  const int ngroups = gdf.ngroups();
  GroupedDataFrame::group_iterator git = gdf.group_begin();
  for (int g = 0; g < ngroups; ++g, ++git) {
    SlicingIndex indices = *git;
    const int chunk_size = indices.size();

    g_test = call_proxy.get(indices);
    check_filter_result(g_test, chunk_size);

    // Scatter the group result onto the rows of the full data.
    for (int j = 0; j < chunk_size; ++j) {
      const int row = indices[j];
      test[row] = g_test[j];
    }
  }

  DataFrame res = subset(data, test, names, classes_grouped());
  res.attr("vars") = data.attr("vars");
  return res;
}
// version of grouped filter when contributions to ... come from several environment DataFrame filter_grouped_multiple_env( const GroupedDataFrame& gdf, const List& args, const DataDots& dots){ const DataFrame& data = gdf.data() ; CharacterVector names = data.names() ; SymbolSet set ; for( int i=0; i<names.size(); i++){ set.insert( Rf_install( names[i] ) ) ; } int nrows = data.nrows() ; LogicalVector test(nrows, TRUE); LogicalVector g_test ; for( int k=0; k<args.size(); k++){ Call call( (SEXP)args[k] ) ; GroupedCallProxy call_proxy( call, gdf, dots.envir(k) ) ; int ngroups = gdf.ngroups() ; GroupedDataFrame::group_iterator git = gdf.group_begin() ; for( int i=0; i<ngroups; i++, ++git){ SlicingIndex indices = *git ; int chunk_size = indices.size() ; g_test = call_proxy.get( indices ); check_filter_result(g_test, chunk_size ) ; for( int j=0; j<chunk_size; j++){ test[ indices[j] ] = test[ indices[j] ] & g_test[j] ; } } } DataFrame res = subset( data, test, names, classes_grouped() ) ; res.attr( "vars") = data.attr("vars") ; return res ; }
// Assembles a join result from two data frames and two row-index vectors.
//
// The output holds all columns of `x` (subset by `indices_x`) followed by the
// columns of `y` that are not in `by` (subset by `indices_y`). Column names
// go through JoinColumnSuffixer with the ".x" / ".y" suffixes. The result
// gets class `classes`, row names sized from indices_x, and the "vars"
// attribute of `x` when that attribute is not NULL.
DataFrame subset(DataFrame x,
                 DataFrame y,
                 const Index& indices_x,
                 const Index& indices_y,
                 CharacterVector by,
                 CharacterVector classes) {
  CharacterVector x_columns = x.names();
  DataFrameVisitors visitors_x(x, x_columns);

  // Only the non-key columns of y are appended.
  CharacterVector all_y_columns = y.names();
  CharacterVector y_columns = setdiff(all_y_columns, by);
  JoinColumnSuffixer suffixer(x_columns, y_columns, by);
  DataFrameVisitors visitors_y(y, y_columns);

  const int nrows = indices_x.size();
  const int nv_x = visitors_x.size();
  const int nv_y = visitors_y.size();

  List out(nv_x + nv_y);
  CharacterVector names(nv_x + nv_y);

  for (int i = 0; i < nv_x; ++i) {
    out[i] = visitors_x.get(i)->subset(indices_x);
    names[i] = suffixer.get(x_columns[i], ".x");
  }
  for (int j = 0; j < nv_y; ++j) {
    const int slot = nv_x + j;
    out[slot] = visitors_y.get(j)->subset(indices_y);
    names[slot] = suffixer.get(y_columns[j], ".y");
  }

  out.attr("class") = classes;
  set_rownames(out, nrows);
  out.names() = names;

  SEXP vars = x.attr("vars");
  if (!Rf_isNull(vars)) {
    out.attr("vars") = vars;
  }

  return static_cast<SEXP>(out);
}
// [[Rcpp::export]] DataFrame as_regular_df(DataFrame df){ DataFrame copy = shallow_copy(df) ; SET_ATTRIB(copy, strip_group_attributes(df)) ; SET_OBJECT(copy, OBJECT(df)) ; copy.attr("class") = CharacterVector::create("data.frame") ; return copy ; }
// [[Rcpp::export]] DataFrame right_join_impl( DataFrame x, DataFrame y, CharacterVector by){ typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ; DataFrameJoinVisitors visitors(x, y, by) ; Map map(visitors); // train the map in terms of y train_push_back( map, x.nrows(), x.nrows() / 10 ) ; std::vector<int> indices_x ; std::vector<int> indices_y ; int n_y = y.nrows() ; for( int i=0; i<n_y; i++){ // find a row in y that matches row i in x Map::iterator it = map.find(-i-1) ; if( it != map.end() ){ push_back( indices_x, it->second ) ; push_back( indices_y, i, it->second.size() ) ; } else { indices_x.push_back(-1) ; // mark NA indices_y.push_back(i) ; } } return subset( x, y, indices_x, indices_y, by, x.attr( "class" ) ) ; }
// [[Rcpp::export]] DataFrame arrange_impl( DataFrame data, List args, DataDots dots ){ int nargs = args.size() ; List variables(nargs) ; LogicalVector ascending(nargs) ; Shelter<SEXP> __ ; for(int i=0; i<nargs; i++){ SEXP call = args[i] ; bool is_desc = TYPEOF(call) == LANGSXP && Rf_install("desc") == CAR(call) ; CallProxy call_proxy( is_desc ? CADR(call) : call, data, dots.envir(i)) ; variables[i] = __(call_proxy.eval()) ; if( Rf_length(variables[i]) != data.nrows() ){ std::stringstream s ; s << "incorrect size (" << Rf_length(variables[i]) << "), expecting :" << data.nrows() ; stop(s.str()) ; } ascending[i] = !is_desc ; } OrderVisitors o(variables,ascending, nargs) ; IntegerVector index = o.apply() ; DataFrameVisitors visitors( data, data.names() ) ; DataFrame res = visitors.subset(index, data.attr("class") ) ; return res; }
// [[Rcpp::export]] DataFrame semi_join_impl( DataFrame x, DataFrame y, CharacterVector by){ typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ; DataFrameJoinVisitors visitors(x, y, by) ; Map map(visitors); // train the map in terms of x train_push_back( map, x.nrows(), x.nrows() / 10) ; int n_y = y.nrows() ; // this will collect indices from rows in x that match rows in y std::vector<int> indices ; for( int i=0; i<n_y; i++){ // find a row in x that matches row i from y Map::iterator it = map.find(-i-1) ; if( it != map.end() ){ // collect the indices and remove them from the // map so that they are only found once. push_back( indices, it->second ) ; map.erase(it) ; } } return subset(x, indices, x.names(), x.attr("class") ) ; }
// [[Rcpp::export]] SEXP distinct_impl(DataFrame df, CharacterVector vars, CharacterVector keep) { if (df.size() == 0) return df; // No vars means ungrouped data with keep_all = TRUE. if (vars.size() == 0) return df; check_valid_colnames(df); if (!vars.size()) { vars = df.names(); } DataFrameVisitors visitors(df, vars); std::vector<int> indices; VisitorSetIndexSet<DataFrameVisitors> set(visitors); int n = df.nrows(); for (int i=0; i<n; i++) { if (set.insert(i).second) { indices.push_back(i); } } return DataFrameSubsetVisitors(df, keep).subset(indices, df.attr("class")); }
// Grouped filter: combines the expressions in `args` into one call with
// and_calls(), evaluates it once per group in `env`, and writes each group's
// logical result back to that group's rows. The row mask is allocated with
// no_init(), so only rows visited through a group are written.
DataFrame filter_grouped(const GroupedDataFrame& gdf, List args, Environment env) {
  // a, b, c -> a & b & c
  Language call = and_calls(args);

  const DataFrame& data = gdf.data();
  const int nrows = data.nrows();
  LogicalVector test = no_init(nrows);
  LogicalVector g_test;

  GroupedCallProxy call_proxy(call, gdf, env);

  const int ngroups = gdf.ngroups();
  GroupedDataFrame::group_iterator git = gdf.group_begin();
  for (int g = 0; g < ngroups; ++g, ++git) {
    SlicingIndex indices = *git;
    g_test = call_proxy.get(indices);

    const int chunk_size = indices.size();
    for (int j = 0; j < chunk_size; ++j) {
      const int row = indices[j];
      test[row] = g_test[j];
    }
  }

  DataFrame res = subset(data, test, data.names(), classes_grouped());
  res.attr("vars") = data.attr("vars");
  return res;
}
// [[Rcpp::export]] DataFrame build_index_cpp( DataFrame data ){ CharacterVector vars = Rf_getAttrib( data.attr( "vars" ), R_NamesSymbol ) ; DataFrameVisitors visitors(data, vars) ; ChunkIndexMap map( visitors ) ; train_push_back( map, data.nrows() ) ; DataFrame labels = visitors.subset( map, "data.frame") ; int ngroups = labels.nrows() ; OrderVisitors order_labels( labels, vars ) ; IntegerVector orders = order_labels.apply() ; std::vector< const std::vector<int>* > chunks(ngroups) ; ChunkIndexMap::const_iterator it = map.begin() ; for( int i=0; i<ngroups; i++, ++it){ chunks[ i ] = &it->second ; } IntegerVector group_sizes = no_init( ngroups ); int biggest_group = 0 ; std::vector<int> indices ; indices.reserve( data.nrows() ); for( int i=0; i<ngroups; i++){ const std::vector<int>& chunk = *chunks[orders[i]] ; push_back( indices, chunk ) ; biggest_group = std::max( biggest_group, (int)chunk.size() ); group_sizes[i] = chunk.size() ; } DataFrameVisitors all_variables_visitors(data, data.names() ) ; data = all_variables_visitors.subset( indices, classes_grouped() ) ; // TODO: we own labels, so perhaps we can do an inplace sort, // to reuse its memory instead of creating a new data frame DataFrameVisitors labels_visitors( labels, vars) ; labels = labels_visitors.subset( orders, "data.frame" ) ; labels.attr( "vars" ) = R_NilValue ; data.attr( "group_sizes") = group_sizes ; data.attr( "biggest_group_size" ) = biggest_group ; data.attr( "labels" ) = labels ; return data ; }
// Grouped mutate: evaluates every expression in `args` through a
// GroupedCallProxy, collects the result with the Gatherer returned by
// gatherer(), and registers it in the proxy under the expression's name.
// The final structure is built by structure_mutate() and then receives the
// "vars", "labels" and "index" attributes of the input data.
SEXP mutate_grouped(GroupedDataFrame gdf, List args, Environment env) {
  const DataFrame& df = gdf.data();

  const int nexpr = args.size();
  CharacterVector results_names = args.names();

  GroupedCallProxy proxy(gdf, env);

  // Holds the collected columns for the duration of the call.
  Shelter<SEXP> shelter;

  for (int k = 0; k < nexpr; ++k) {
    proxy.set_call(args[k]);

    boost::scoped_ptr<Gatherer> gather(gatherer(proxy, gdf));
    SEXP column = shelter(gather->collect());
    proxy.input(results_names[k], column);
  }

  DataFrame res = structure_mutate(proxy, df, results_names, classes_grouped());
  res.attr("vars") = df.attr("vars");
  res.attr("labels") = df.attr("labels");
  res.attr("index") = df.attr("index");
  return res;
}
// [[Rcpp::export]] DataFrame union_data_frame( DataFrame x, DataFrame y){ if( !compatible_data_frame(x,y) ) stop( "not compatible" ); typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set ; DataFrameJoinVisitors visitors(x, y, x.names() ) ; Set set(visitors); train_insert( set, x.nrows() ) ; train_insert_right( set, y.nrows() ) ; return visitors.subset( set, x.attr("class") ) ; }
/**
 * - creates water reservoir from given vectors of variables and options
 * - every variable named in var_names is read from the matching column of
 *   reser when such a column exists, otherwise it is filled with zeros
 * - a column "I", when present, replaces the INFLOW variable
 * - "minutes" is read from a column; "area", "plant_cover" (optional,
 *   0 when NULL) and "eas" are read from attributes of reser
 */
wateres::wateres(
  DataFrame reser, vector<double> storage, bool throw_exceed, double volume)
  : storage(storage), throw_exceed(throw_exceed), volume(volume) {
  const unsigned row_count = reser.nrows();
  vector<string> col_names = as<vector<string> >(reser.attr("names"));

  var.resize(var_count);
  for (unsigned v = 0; v < var_count; v++) {
    const bool present =
      find(col_names.begin(), col_names.end(), var_names[v]) != col_names.end();
    if (!present) {
      var[v].resize(row_count, 0);
      continue;
    }
    var[v] = as<vector<double> >(reser[var_names[v]]);
  }

  //custom inflow from another reservoir instead of natural inflow
  const bool has_custom_inflow =
    find(col_names.begin(), col_names.end(), "I") != col_names.end();
  if (has_custom_inflow) {
    var[INFLOW] = as<vector<double> >(reser["I"]);
  }

  this->minutes = as<vector<unsigned> >(reser["minutes"]);
  area = as<double>(reser.attr("area"));

  //lookup table: plant cover -> coefficient, interpolated linearly below
  double cover_steps[5] = { 0, 0.1, 0.3, 0.5, 0.75 };
  plant_covers.assign(cover_steps, cover_steps + 5);
  double coeff_steps[5] = { 1, 1.03, 1.08, 1.14, 1.22 };
  plant_coeffs.assign(coeff_steps, coeff_steps + 5);

  Rcpp::Nullable<double> tmp_plant = as<Rcpp::Nullable<double> >(reser.attr("plant_cover"));
  if (tmp_plant.isNull()) {
    plant_cover = 0;
  } else {
    plant_cover = as<double>(tmp_plant);
  }
  plant_coeff = interpolate_linear(plant_covers, plant_coeffs, plant_cover);

  eas = as<DataFrame>(reser.attr("eas"));
  transfer_add = true;
}
// Builds a data frame with one row per entry of resultsData.queryStemOrder.
//
// Columns:
//   qTF    - resultsData.queryStems[term]
//   qIndex - resultsData.queryStemIndex[term]
// The row names are set to resultsData.queryStemOrder itself.
SEXP getQueryStats() {
  // Unsigned index: matches the type of size() and avoids a signed/unsigned
  // comparison in the loop condition.
  const size_t term_count = resultsData.queryStemOrder.size();

  vector<int> qTF;
  vector<int> qIndex;
  qTF.reserve(term_count);
  qIndex.reserve(term_count);

  for (size_t i = 0; i < term_count; ++i) {
    // Bind by const reference so the term is not copied on every iteration.
    const string& term = resultsData.queryStemOrder.at(i);
    qTF.push_back(resultsData.queryStems[term]);
    qIndex.push_back(resultsData.queryStemIndex[term]);
  }

  DataFrame d = DataFrame::create(Named("qTF") = qTF, Named("qIndex") = qIndex);
  d.attr("row.names") = resultsData.queryStemOrder;
  return d;
}
// [[Rcpp::export]] SEXP distinct_impl( DataFrame df ){ DataFrameVisitors visitors(df) ; std::vector<int> indices ; VisitorSetIndexSet<DataFrameVisitors> set(visitors) ; int n = df.nrows() ; for( int i=0; i<n; i++){ if( set.insert(i).second ){ indices.push_back(i) ; } } return visitors.subset(indices, df.attr("class") ); }
// Builds a data frame of per-term statistics with row names taken from
// `terms`.
//
// Columns:
//   DocFreq - resultsData.dfVector
//   IDF     - log((documentCount + 1) / (dfVector + 0.5)), element-wise
//   cTF     - resultsData.ctfVector
SEXP getTermStats() {
  arma::vec idf = arma::log((environment.documentCount() + 1) / (resultsData.dfVector + 0.5));

  DataFrame d = DataFrame::create(Named("DocFreq") = resultsData.dfVector,
                                  Named("IDF") = idf,
                                  Named("cTF") = resultsData.ctfVector);
  d.attr("row.names") = terms;
  return d;
}
// [[Rcpp::export]] DataFrame CPP_get_openmp_threads() { int num_threads = openmp_threads; #ifdef _OPENMP int max_threads = omp_get_max_threads(); #else int max_threads = 0; #endif DataFrame res = DataFrame::create(_["available"] = max_threads > 0, _["max"] = max_threads, _["threads"] = num_threads); res.attr("row.names") = "OpenMP"; return res; }
// Get a row from a data.frame with rowname ---------------------------------------------------------- // [[Rcpp::export]] CharacterVector getRowFromDfWithRowname(DataFrame df, int n){ int ncols = df.size(); // number of columns in df CharacterVector out; // output vector with length 'ncol' CharacterVector rnames = df.attr("row.names"); // get row names. // assign first item in 'out' as the rowname out.push_back(rnames[n]); // for loop to get values in nth column. for(int i=0; i<ncols; i++){ //counting starts from 0 in c++ CharacterVector df_column = df[i]; out.push_back(df_column[n]); } return out; }
// Grouped filter for expressions that may come from different environments.
//
// The row mask starts as all TRUE. Each lazy expression is evaluated per
// group in its own environment; a result of length 1 applies to the whole
// group, otherwise the result is validated against the group size and
// applied row by row. A row is cleared whenever the applicable value is not
// TRUE. User interrupts are checked once per expression.
DataFrame filter_grouped_multiple_env(const Data& gdf, const LazyDots& dots) {
  const DataFrame& data = gdf.data();
  CharacterVector names = data.names();

  SymbolSet set;
  for (int col = 0; col < names.size(); ++col) {
    set.insert(Rf_installChar(names[col]));
  }

  const int nrows = data.nrows();
  LogicalVector test(nrows, TRUE);
  LogicalVector g_test;

  for (int k = 0; k < dots.size(); ++k) {
    Rcpp::checkUserInterrupt();

    const Lazy& lazy = dots[k];
    Call call(lazy.expr());
    GroupedCallProxy<Data, Subsets> call_proxy(call, gdf, lazy.env());

    const int ngroups = gdf.ngroups();
    typename Data::group_iterator git = gdf.group_begin();
    for (int g = 0; g < ngroups; ++g, ++git) {
      SlicingIndex indices = *git;
      const int chunk_size = indices.size();

      g_test = check_filter_logical_result(call_proxy.get(indices));

      // A length-one result is recycled over the whole group.
      const bool recycled = g_test.size() == 1;
      if (!recycled) {
        check_filter_result(g_test, chunk_size);
      }

      for (int j = 0; j < chunk_size; ++j) {
        const int verdict = recycled ? g_test[0] : g_test[j];
        if (verdict != TRUE) {
          test[indices[j]] = FALSE;
        }
      }
    }
  }

  DataFrame res = subset(data, test, names, classes_grouped<Data>());
  res.attr("vars") = data.attr("vars");
  return res;
}
// Grouped filter for expressions that all share the environment of the
// first lazy expression.
//
// The expressions are combined into one call by and_calls() and evaluated
// once per group. A result of length 1 sets every row of the group to
// whether that value is TRUE; otherwise the result is validated against the
// group size and rows whose value is not TRUE are cleared.
DataFrame filter_grouped_single_env(const Data& gdf, const LazyDots& dots) {
  typedef GroupedCallProxy<Data, Subsets> Proxy;

  Environment env = dots[0].env();

  const DataFrame& data = gdf.data();
  CharacterVector names = data.names();

  SymbolSet set;
  for (int col = 0; col < names.size(); ++col) {
    set.insert(Rf_installChar(names[col]));
  }

  // a, b, c -> a & b & c
  Call call(and_calls(dots, set, env));

  const int nrows = data.nrows();
  LogicalVector test(nrows, TRUE);
  LogicalVector g_test;
  Proxy call_proxy(call, gdf, env);

  const int ngroups = gdf.ngroups();
  typename Data::group_iterator git = gdf.group_begin();
  for (int g = 0; g < ngroups; ++g, ++git) {
    SlicingIndex indices = *git;
    const int chunk_size = indices.size();

    g_test = check_filter_logical_result(call_proxy.get(indices));

    if (g_test.size() != 1) {
      // One value per row of the group.
      check_filter_result(g_test, chunk_size);
      for (int j = 0; j < chunk_size; ++j) {
        if (g_test[j] != TRUE) {
          test[indices[j]] = FALSE;
        }
      }
      continue;
    }

    // Single value: applies to every row of the group.
    const int whole_group = g_test[0] == TRUE;
    for (int j = 0; j < chunk_size; ++j) {
      test[indices[j]] = whole_group;
    }
  }

  DataFrame res = subset(data, test, names, classes_grouped<Data>());
  res.attr("vars") = data.attr("vars");
  return res;
}
// Turns the columns gathered in `accumulator` into a list with data-frame
// attributes: class `classes`, row names sized from `df`, the grouping
// variables copied by copy_vars(), and the "labels", "index", "indices",
// "drop", "group_sizes" and "biggest_group_size" attributes of `df`.
SEXP structure_mutate(const NamedListAccumulator<Data>& accumulator,
                      const DataFrame& df,
                      CharacterVector classes) {
  List out = accumulator;

  set_class(out, classes);
  set_rownames(out, df.nrows());
  copy_vars(out, df);

  // Attributes carried over verbatim from the input, in this order.
  static const char* const carried[] = {
    "labels", "index", "indices", "drop", "group_sizes", "biggest_group_size"
  };
  const int n_carried = sizeof(carried) / sizeof(carried[0]);
  for (int a = 0; a < n_carried; ++a) {
    out.attr(carried[a]) = df.attr(carried[a]);
  }

  return out;
}
// [[Rcpp::export]] DataFrame setdiff_data_frame( DataFrame x, DataFrame y){ if( !compatible_data_frame(x,y) ) stop( "not compatible" ); typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set ; DataFrameJoinVisitors visitors(y, x, y.names() ) ; Set set(visitors); train_insert( set, y.nrows() ) ; std::vector<int> indices ; int n_x = x.nrows() ; for( int i=0; i<n_x; i++) { if( !set.count(-i-1) ){ set.insert(-i-1) ; indices.push_back(-i-1) ; } } return visitors.subset( indices, x.attr("class") ) ; }
// [[Rcpp::export]] DataFrame intersect_data_frame( DataFrame x, DataFrame y){ if( !compatible_data_frame(x,y) ) stop( "not compatible" ); typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set ; DataFrameJoinVisitors visitors(x, y, x.names() ) ; Set set(visitors); train_insert( set, x.nrows() ) ; std::vector<int> indices ; int n_y = y.nrows() ; for( int i=0; i<n_y; i++) { Set::iterator it = set.find( -i-1 ) ; if( it != set.end() ){ indices.push_back(*it) ; set.erase(it) ; } } return visitors.subset( indices, x.attr("class") ) ; }
// [[Rcpp::export]] DataFrame arrange_impl( DataFrame data, List args, Environment env ){ int nargs = args.size() ; SEXP tmp ; List variables(nargs) ; LogicalVector ascending(nargs) ; for(int i=0; i<nargs; i++){ tmp = args[i] ; if( TYPEOF(tmp) == LANGSXP && CAR(tmp) == Rf_install("desc") ){ variables[i] = Rf_eval( CAR(CDR(tmp) ), env ) ; ascending[i] = false ; } else{ variables[i] = Rf_eval( tmp, env ); ascending[i] = true ; } } OrderVisitors o(variables,ascending, nargs) ; IntegerVector index = o.apply() ; DataFrameVisitors visitors( data, data.names() ) ; DataFrame res = visitors.subset(index, data.attr("class") ) ; return res; }
// [[Rcpp::export]] DataFrame anti_join_impl( DataFrame x, DataFrame y, CharacterVector by){ typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ; DataFrameJoinVisitors visitors(x, y, by) ; Map map(visitors); // train the map in terms of x train_push_back( map, x.nrows(), x.nrows() / 10 ) ; int n_y = y.nrows() ; // remove the rows in x that match for( int i=0; i<n_y; i++){ Map::iterator it = map.find(-i-1) ; if( it != map.end() ) map.erase(it) ; } // collect what's left std::vector<int> indices ; for( Map::iterator it = map.begin() ; it != map.end(); ++it) push_back( indices, it->second ) ; return subset(x, indices, x.names(), x.attr( "class" ) ) ; }
// [[Rcpp::export]] DataFrame build_index_cpp( DataFrame data ){ ListOf<Symbol> symbols( data.attr( "vars" ) ) ; int nsymbols = symbols.size() ; CharacterVector vars(nsymbols) ; for( int i=0; i<nsymbols; i++){ vars[i] = PRINTNAME(symbols[i]) ; } DataFrameVisitors visitors(data, vars) ; ChunkIndexMap map( visitors ) ; // checking 10 times for interupts train_push_back( map, data.nrows(), data.nrows() / 10 ) ; DataFrame labels = visitors.subset( map, "data.frame") ; int ngroups = labels.nrows() ; List indices(ngroups) ; IntegerVector group_sizes = no_init( ngroups ); int biggest_group = 0 ; ChunkIndexMap::const_iterator it = map.begin() ; for( int i=0; i<ngroups; i++, ++it){ const std::vector<int>& chunk = it->second ; indices[i] = chunk ; group_sizes[i] = chunk.size() ; biggest_group = std::max( biggest_group, (int)chunk.size() ); } data.attr( "indices" ) = indices ; data.attr( "group_sizes") = group_sizes ; data.attr( "biggest_group_size" ) = biggest_group ; data.attr( "labels" ) = labels ; data.attr( "class" ) = CharacterVector::create("grouped_df", "tbl_df", "tbl", "data.frame") ; return data ; }