// Builds one join visitor per pair of join columns.
//
// `names_left[k]` is looked up among the column names of `left_` and
// `names_right[k]` among those of `right_`; each matched pair of columns is
// handed to join_visitor() together with the `warn` and `na_match` flags.
//
// Stops with an error when the two matched index vectors differ in length,
// or when a requested column is missing from its data frame.
DataFrameJoinVisitors::DataFrameJoinVisitors(
  const DataFrame& left_,
  const DataFrame& right_,
  const SymbolVector& names_left,
  const SymbolVector& names_right,
  bool warn_,
  bool na_match
) :
  left(left_),
  right(right_),
  visitor_names_left(names_left),
  visitor_names_right(names_right),
  visitors(names_left.size()),
  warn(warn_)
{
  // Positions of the requested columns; they are used below as 1-based
  // indices, and NA_INTEGER marks a name that was not found.
  IntegerVector pos_left = names_left.match_in_table(RCPP_GET_NAMES(left));
  IntegerVector pos_right = names_right.match_in_table(RCPP_GET_NAMES(right));

  const int n_pairs = pos_left.size();
  if (pos_right.size() != n_pairs) {
    stop("Different size of join column index vectors");
  }

  for (int k = 0; k < n_pairs; ++k) {
    const SymbolString& lhs_name = names_left[k];
    const SymbolString& rhs_name = names_right[k];

    const int lhs_pos = pos_left[k];
    if (lhs_pos == NA_INTEGER) {
      stop("'%s' column not found in lhs, cannot join", lhs_name.get_utf8_cstring());
    }

    const int rhs_pos = pos_right[k];
    if (rhs_pos == NA_INTEGER) {
      stop("'%s' column not found in rhs, cannot join", rhs_name.get_utf8_cstring());
    }

    // Convert the 1-based match positions to 0-based column offsets.
    visitors[k] = join_visitor(
                    Column(left[lhs_pos - 1], lhs_name),
                    Column(right[rhs_pos - 1], rhs_name),
                    warn,
                    na_match
                  );
  }
}
// [[Rcpp::export]] DataFrame grouped_df_impl(DataFrame data, SymbolVector symbols) { assert_all_white_list(data); DataFrame copy(shallow_copy(data)); set_class(copy, classes_grouped<GroupedDataFrame>()); if (!symbols.size()) stop("no variables to group by"); GroupedDataFrame::set_groups(copy, build_index_cpp(copy, symbols)); return copy; }
// Creates one visitor for each column of `data_` named in `names`.
//
// The names are matched against vec_names_or_empty(data); a name with no
// match is reported through bad_col(). Visitors are appended in the order
// the names are given.
DataFrameVisitors::DataFrameVisitors(const DataFrame& data_, const SymbolVector& names) :
  data(data_),
  visitors(),
  visitor_names(names)
{
  CharacterVector column_names = vec_names_or_empty(data);
  IntegerVector positions = names.match_in_table(column_names);

  const int count = names.size();
  for (int k = 0; k < count; ++k) {
    const int pos = positions[k];
    if (pos == NA_INTEGER) {
      bad_col(names[k], "is unknown");
    }

    // `pos` is used as a 1-based position, hence the -1 offset.
    SEXP column = data[pos - 1];
    visitors.push_back(visitor(column));
  }
}
// Applies the single slice expression in `dots` to every group of `gdf` and
// returns the selected rows as a grouped data frame.
//
// For each group the expression is evaluated through a GroupedCallProxy and
// checked with check_filter_integer_result(). The resulting integer vector
// is interpreted relative to the group (1-based):
//   * positive mode (counter.is_positive()): every value in [1, nr] selects
//     that row of the group, in the order given;
//   * negative mode (counter.get_n_negative() != 0): every value in
//     [-nr, -1] removes that row; the remaining rows are kept in their
//     original order;
//   * otherwise no row of the group is kept.
// Values outside the stated range (zero, NA_INTEGER, out-of-range positions,
// or values of the wrong sign for the active mode) are ignored, so they can
// never be turned into an out-of-bounds offset into the group's index.
// NOTE(review): CountIndices may already reject some of these inputs; its
// definition is not visible here, so the range checks below are defensive.
//
// The selected rows are materialised with subset(), the grouping variables
// are copied over from `data`, and the index is stripped before the result
// is wrapped in a GroupedDataFrame again.
SEXP slice_grouped(GroupedDataFrame gdf, const LazyDots& dots) {
  typedef GroupedCallProxy<GroupedDataFrame, LazyGroupedSubsets> Proxy;

  const DataFrame& data = gdf.data();
  const Lazy& lazy = dots[0];
  Environment env = lazy.env();
  SymbolVector names = data.names();

  // we already checked that we have only one expression
  Call call(lazy.expr());

  // Row positions (into `data`) that survive the slice, across all groups.
  std::vector<int> indx;
  indx.reserve(1000);

  IntegerVector g_test;
  Proxy call_proxy(call, gdf, env);

  int ngroups = gdf.ngroups();
  GroupedDataFrame::group_iterator git = gdf.group_begin();
  for (int i = 0; i < ngroups; i++, ++git) {
    const SlicingIndex& indices = *git;
    const int nr = indices.size();
    g_test = check_filter_integer_result(call_proxy.get(indices));
    const int ntest = g_test.size();

    CountIndices counter(indices.size(), g_test);

    if (counter.is_positive()) {
      // positive indexing: keep only values that address a row of this
      // group. NA_INTEGER is negative, so the lower bound rejects it too.
      for (int j = 0; j < ntest; j++) {
        const int pos = g_test[j];
        if (pos >= 1 && pos <= nr) {
          indx.push_back(indices[pos - 1]);
        }
      }
    } else if (counter.get_n_negative() != 0) {
      // negative indexing: collect the distinct in-range rows to drop
      // (as sorted 1-based positions). The bounds also exclude NA_INTEGER,
      // which must not be negated.
      std::set<int> drop;
      for (int j = 0; j < ntest; j++) {
        const int neg = g_test[j];
        if (neg < 0 && neg >= -nr) {
          drop.insert(-neg);
        }
      }

      // Walk the group's rows once, skipping the ones marked for removal;
      // `drop` is sorted, so a single forward iterator is sufficient.
      std::set<int>::const_iterator drop_it = drop.begin();
      for (int j = 0; j < nr; j++) {
        if (drop_it != drop.end() && *drop_it == j + 1) {
          ++drop_it;
        } else {
          indx.push_back(indices[j]);
        }
      }
    }
  }

  DataFrame res = subset(data, indx, names, classes_grouped<GroupedDataFrame>());
  set_vars(res, get_vars(data));
  strip_index(res);

  return GroupedDataFrame(res).data();
}
// Builds the groups table for `data` grouped by the columns named in `vars`.
//
// Returns a list with one vector per grouping variable (same type and most
// attributes as the source column, length `ncases`) followed by a `.rows`
// list of length `ncases`. `ncases` is the size reported by the slicer built
// over the grouping columns; the slicer's make() fills both the group
// vectors and `.rows`. The result gets names, compact row names
// c(NA, -ncases) and the classes from classes_not_grouped().
//
// Reports through bad_col() any variable that is not a column of `data`, or
// whose column fails white_list() or is a list (VECSXP). Emits a warning for
// every factor grouping vector that contains NA.
SEXP build_index_cpp(const DataFrame& data, const SymbolVector& vars) {
  const int nvars = vars.size();

  CharacterVector column_names = data.names();
  IntegerVector positions = vars.match_in_table(column_names);

  // Grouping columns in the order requested, and the names of the result
  // (one extra slot for ".rows").
  std::vector<SEXP> group_columns(nvars);
  CharacterVector out_names(nvars + 1);

  for (int k = 0; k < nvars; ++k) {
    const int pos = positions[k];
    if (pos == NA_INTEGER) {
      bad_col(vars[k], "is unknown");
    }

    // `pos` is used as a 1-based position.
    SEXP column = data[pos - 1];
    group_columns[k] = column;
    out_names[k] = column_names[pos - 1];

    const bool groupable = white_list(column) && TYPEOF(column) != VECSXP;
    if (!groupable) {
      bad_col(vars[k], "can't be used as a grouping variable because it's a {type}",
              _["type"] = get_single_class(column));
    }
  }

  DataFrameVisitors visitors(data, vars);
  boost::shared_ptr<Slicer> tree = slicer(std::vector<int>(), 0, group_columns, visitors);
  int ncases = tree->size();

  // construct the groups data
  List groups(nvars + 1);
  List rows(ncases);
  ListCollecter rows_collecter(rows);

  for (int k = 0; k < nvars; ++k) {
    groups[k] = Rf_allocVector(TYPEOF(group_columns[k]), ncases);
    copy_most_attributes(groups[k], group_columns[k]);
  }
  groups[nvars] = rows;
  out_names[nvars] = ".rows";

  tree->make(groups, rows_collecter);

  // warn about NA in factors
  for (int k = 0; k < nvars; ++k) {
    SEXP group_values = groups[k];
    if (!Rf_isFactor(group_values)) {
      continue;
    }

    IntegerVector codes(group_values);
    const bool has_na = std::find(codes.begin(), codes.end(), NA_INTEGER) != codes.end();
    if (has_na) {
      warningcall(R_NilValue, tfm::format("Factor `%s` contains implicit NA, consider using `forcats::fct_explicit_na`", CHAR(out_names[k].get())));
    }
  }

  groups.attr("names") = out_names;
  groups.attr("row.names") = IntegerVector::create(NA_INTEGER, -ncases);
  groups.attr("class") = classes_not_grouped();

  return groups;
}