// Assemble the result of a join from pre-computed row indices, addressing
// columns by position.
//
// Output layout: the first x.ncol() slots mirror the columns of `x` (same
// positions), followed by one slot per entry of `aux_y`.
//
//   x, y          the two tables being joined
//   indices_x     rows to take for every column that is written from x / the
//                 join visitors; its size is also used as the output row count
//   indices_y     rows to take for the auxiliary y columns
//                 (presumably the same length as indices_x -- confirm at caller)
//   by_x, by_y    positions of the join columns; by_x entries are treated as
//                 1-based (1 is subtracted before indexing the output)
//   aux_x, aux_y  positions of the non-join columns; aux_x entries are treated
//                 as 1-based in the same way
//   classes       class attribute applied to the result
//
// Column names are not set here.
DataFrame subset_join(DataFrame x, DataFrame y,
                      const Index& indices_x, const Index& indices_y,
                      const IntegerVector& by_x, const IntegerVector& by_y,
                      const IntegerVector& aux_x, const IntegerVector& aux_y,
                      CharacterVector classes) {
  // One slot for each column of x plus one for each auxiliary column of y.
  List out(x.ncol() + aux_y.size());

  // Join columns: written back at the position they occupy in x.
  // The trailing boolean flags are forwarded unchanged; their meaning is
  // defined by DataFrameJoinVisitors, which is not visible in this chunk.
  DataFrameJoinVisitors join_visitors(x, y, by_x, by_y, true, false);
  const int n_by = by_x.size();
  for (int j = 0; j < n_by; ++j) {
    out[by_x[j] - 1] = join_visitors.get(j)->subset(indices_x);
  }

  // Auxiliary x columns: also written back at their original position.
  DataFrameSubsetVisitors visitors_x(x, aux_x);
  const int n_aux_x = aux_x.size();
  for (int j = 0; j < n_aux_x; ++j) {
    out[aux_x[j] - 1] = visitors_x.get(j)->subset(indices_x);
  }

  // Auxiliary y columns: appended after all x columns, in visitor order.
  DataFrameSubsetVisitors visitors_y(y, aux_y);
  const int y_offset = x.ncol();
  const int n_aux_y = visitors_y.size();
  for (int j = 0; j < n_aux_y; ++j) {
    out[y_offset + j] = visitors_y.get(j)->subset(indices_y);
  }

  // Row names first, then the class attribute (same order as before).
  const int nrows = indices_x.size();
  set_rownames(out, nrows);
  set_class(out, classes);

  return static_cast<SEXP>(out);
}
// Assemble the result of a join from pre-computed row indices, addressing
// the join columns by name and disambiguating clashing column names with
// suffixes.
//
// Output layout: every column of `x` keeps its position (join columns are
// produced by the join visitors, the others are subset from x), followed by
// the columns of `y` that are not named in `by_y`, in their original order.
//
// Naming rules, as implemented below:
//   * join columns keep their x name;
//   * a non-join x column gets `suffix_x` appended repeatedly while its name
//     equals a non-join y column name or a name already assigned to an earlier
//     output column (skipped entirely when `suffix_x` is empty);
//   * a non-join y column gets `suffix_y` appended repeatedly while its name
//     equals any x column name or a name already assigned to an earlier
//     output column (skipped entirely when `suffix_y` is empty).
//
// The grouping variables read from `x` via get_vars() are translated to the
// (possibly suffixed) output names and stored on the result via set_vars().
//
// Stops with an error when both suffixes are empty, or when a grouping
// variable of `x` is not one of its column names.
DataFrame subset_join(DataFrame x, DataFrame y,
                      const Index& indices_x, const Index& indices_y,
                      CharacterVector by_x, CharacterVector by_y,
                      const std::string& suffix_x, const std::string& suffix_y,
                      CharacterVector classes) {
  const bool use_suffix_x = !suffix_x.empty();
  const bool use_suffix_y = !suffix_y.empty();
  if (!use_suffix_x && !use_suffix_y) {
    stop("Cannot use empty string for both x and y suffixes");
  }

  // Visitors for the join columns. The trailing boolean flags are forwarded
  // unchanged; their meaning is defined by DataFrameJoinVisitors.
  DataFrameJoinVisitors join_visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), false, false);
  const int n_join = join_visitors.size();

  // Split the x columns into join columns and x-only columns.
  CharacterVector x_names = x.names();
  const int n_x = x_names.size();
  std::vector<bool> is_join_col(n_x);
  CharacterVector x_only(n_x - n_join);
  IntegerVector x_in_by = r_match(x_names, by_x);
  for (int col = 0, next = 0; col < n_x; ++col) {
    const bool matched = x_in_by[col] != NA_INTEGER;
    is_join_col[col] = matched;
    if (!matched) {
      x_only[next++] = x_names[col];
    }
  }
  DataFrameSubsetVisitors visitors_x(x, SymbolVector(x_only));
  const int n_x_only = visitors_x.size();

  // Collect the y columns that are not join columns.
  CharacterVector y_names = y.names();
  const int n_y = y_names.size();
  CharacterVector y_only(n_y - n_join);
  IntegerVector y_in_by = r_match(y_names, by_y);
  for (int col = 0, next = 0; col < n_y; ++col) {
    if (y_in_by[col] == NA_INTEGER) {
      y_only[next++] = y_names[col];
    }
  }
  DataFrameSubsetVisitors visitors_y(y, SymbolVector(y_only));
  const int n_y_only = visitors_y.size();

  // Allocate the result and its names.
  const int nrows = indices_x.size();
  const int n_out = n_join + n_x_only + n_y_only;
  List out(n_out);
  CharacterVector names(n_out);

  // ---- columns coming from x, each at its original position
  int x_only_pos = 0;
  for (int col = 0; col < n_x; ++col) {
    String col_name = x_names[col];

    if (is_join_col[col]) {
      // x_in_by holds the 1-based position of this column within by_x.
      out[col] = join_visitors.get(x_in_by[col] - 1)->subset(indices_x);
    } else {
      if (use_suffix_x) {
        // Keep appending the suffix until the name is free; the y-column
        // check runs first and the earlier-names check only if it misses.
        bool clash = true;
        while (clash) {
          clash = std::find(y_only.begin(), y_only.end(), col_name.get_sexp()) != y_only.end();
          if (!clash) {
            clash = std::find(names.begin(), names.begin() + col, col_name.get_sexp()) !=
                    names.begin() + col;
          }
          if (clash) {
            col_name += suffix_x;
          }
        }
      }
      out[col] = visitors_x.get(x_only_pos)->subset(indices_x);
      ++x_only_pos;
    }

    names[col] = col_name;
  }

  // ---- columns coming from y, appended after the n_x slots filled above
  for (int j = 0; j < n_y_only; ++j) {
    const int slot = n_x + j;
    String col_name = y_only[j];

    if (use_suffix_y) {
      bool clash = true;
      while (clash) {
        clash = std::find(x_names.begin(), x_names.end(), col_name.get_sexp()) != x_names.end();
        if (!clash) {
          clash = std::find(names.begin(), names.begin() + slot, col_name.get_sexp()) !=
                  names.begin() + slot;
        }
        if (clash) {
          col_name += suffix_y;
        }
      }
    }

    out[slot] = visitors_y.get(j)->subset(indices_y);
    names[slot] = col_name;
  }

  // Class, row names, then column names (same order as before).
  set_class(out, classes);
  set_rownames(out, nrows);
  out.names() = names;

  // Carry the grouping variables of x over, using the output column names.
  SymbolVector group_cols_x = get_vars(x);
  const int n_group_cols = group_cols_x.size();
  SymbolVector group_cols(n_group_cols);
  IntegerVector group_col_indices = group_cols_x.match_in_table(x_names);
  for (int g = 0; g < n_group_cols; ++g) {
    const int group_col_index = group_col_indices[g];
    if (group_col_index == NA_INTEGER) {
      stop("unknown group column '%s'", group_cols_x[g].get_utf8_cstring());
    }
    // x columns keep their position, so the 1-based index into the x names
    // is also the 1-based index into the output names.
    group_cols.set(g, names[group_col_index - 1]);
  }
  set_vars(out, group_cols);

  return static_cast<SEXP>(out);
}