예제 #1
0
// Assemble the result of a join when columns are addressed by position.
//
// The output list has x.ncol() + aux_y.size() slots:
//   * each join column is written to slot by_x[i] - 1,
//   * each auxiliary x column is written to slot aux_x[i] - 1,
//   * the auxiliary y columns follow, starting at slot x.ncol(), in the
//     order in which they appear in aux_y.
// The `- 1` means by_x and aux_x entries are treated as 1-based positions.
// Columns written from the x side are subset with indices_x, the y columns
// with indices_y; the number of rows is indices_x.size().
//
// Only the row names and the class attribute are set on the result; this
// function does not assign column names.
DataFrame subset_join(DataFrame x, DataFrame y,
                      const Index& indices_x, const Index& indices_y,
                      const IntegerVector& by_x, const IntegerVector& by_y,
                      const IntegerVector& aux_x, const IntegerVector& aux_y,
                      CharacterVector classes) {
  List out(x.ncol() + aux_y.size());

  // Join (key) columns: visitor j corresponds to by_x[j].
  DataFrameJoinVisitors key_visitors(x, y, by_x, by_y, true, false);
  const int n_keys = by_x.size();
  for (int j = 0; j < n_keys; ++j) {
    const int slot = by_x[j] - 1;
    out[slot] = key_visitors.get(j)->subset(indices_x);
  }

  // Remaining x columns: visitor j corresponds to aux_x[j].
  DataFrameSubsetVisitors x_rest(x, aux_x);
  const int n_aux_x = aux_x.size();
  for (int j = 0; j < n_aux_x; ++j) {
    const int slot = aux_x[j] - 1;
    out[slot] = x_rest.get(j)->subset(indices_x);
  }

  // Remaining y columns are appended after all of the x slots.
  DataFrameSubsetVisitors y_rest(y, aux_y);
  const int first_y_slot = x.ncol();
  for (int j = 0; j < y_rest.size(); ++j) {
    out[first_y_slot + j] = y_rest.get(j)->subset(indices_y);
  }

  // Attributes: row names sized by the x-side index, then the class vector.
  int nrows = indices_x.size();
  set_rownames(out, nrows);
  set_class(out, classes);

  return static_cast<SEXP>(out);
}
예제 #2
0
// Assemble the result of a join when the join keys are given by column name.
//
// Layout of the output list:
//   * one slot per column of x, in x's original column order. A column that
//     matches an entry of by_x is produced by the corresponding join
//     visitor; every other x column is produced by a subset visitor. Both
//     are subset with indices_x.
//   * then one slot per column of y that does not match an entry of by_y,
//     in y's column order, subset with indices_y.
//
// Naming:
//   * join columns keep their x name.
//   * a non-join x column has suffix_x appended (repeatedly) while its name
//     equals a non-join y column name or a name already assigned to an
//     earlier output slot. Skipped entirely when suffix_x is empty.
//   * a non-join y column has suffix_y appended (repeatedly) while its name
//     equals any x column name or a name already assigned to an earlier
//     output slot. Skipped entirely when suffix_y is empty.
//
// The class attribute, row names (indices_x.size() rows) and names are set
// on the result, and the variables returned by get_vars(x) are re-registered
// on the output via set_vars() under their output names.
//
// Calls stop() when both suffixes are empty, or when a variable returned by
// get_vars(x) is not found among the column names of x.
DataFrame subset_join(DataFrame x, DataFrame y,
                      const Index& indices_x, const Index& indices_y,
                      CharacterVector by_x, CharacterVector by_y,
                      const std::string& suffix_x, const std::string& suffix_y,
                      CharacterVector classes) {
  const bool use_suffix_x = !suffix_x.empty();
  const bool use_suffix_y = !suffix_y.empty();
  if (!use_suffix_x && !use_suffix_y) {
    stop("Cannot use empty string for both x and y suffixes");
  }

  // Visitors for the key columns; later looked up by position within by_x.
  DataFrameJoinVisitors key_visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), false, false);
  int n_keys = key_visitors.size();

  // Split the columns of x into keys and non-keys, preserving their order.
  CharacterVector x_names = x.names();
  const int n_x = x_names.size();
  std::vector<bool> is_key(n_x);
  CharacterVector x_rest(n_x - n_keys);
  IntegerVector x_key_pos = r_match(x_names, by_x);
  int n_x_rest = 0;
  for (int col = 0; col < n_x; ++col) {
    is_key[col] = (x_key_pos[col] != NA_INTEGER);
    if (!is_key[col]) {
      x_rest[n_x_rest] = x_names[col];
      ++n_x_rest;
    }
  }
  DataFrameSubsetVisitors visitors_x(x, SymbolVector(x_rest));
  int n_x_visitors = visitors_x.size();

  // Collect the columns of y that are not keys, preserving their order.
  CharacterVector y_names = y.names();
  const int n_y = y_names.size();
  CharacterVector y_rest(n_y - n_keys);
  IntegerVector y_key_pos = r_match(y_names, by_y);
  int n_y_rest = 0;
  for (int col = 0; col < n_y; ++col) {
    if (y_key_pos[col] == NA_INTEGER) {
      y_rest[n_y_rest] = y_names[col];
      ++n_y_rest;
    }
  }
  DataFrameSubsetVisitors visitors_y(y, SymbolVector(y_rest));
  int n_y_visitors = visitors_y.size();

  // Allocate the result and its names.
  int nrows = indices_x.size();
  const int n_out = n_keys + n_x_visitors + n_y_visitors;
  List out(n_out);
  CharacterVector names(n_out);

  // ---- x columns, each written to the slot it occupies in x
  int next_x_visitor = 0;
  for (int col = 0; col < n_x; ++col) {
    String col_name = x_names[col];

    if (is_key[col]) {
      // x_key_pos is a 1-based position within by_x.
      out[col] = key_visitors.get(x_key_pos[col] - 1)->subset(indices_x);
    } else {
      if (use_suffix_x) {
        // Keep appending the suffix until the name neither collides with a
        // non-key y column nor with a name already placed in names[0, col).
        for (;;) {
          const bool clashes_with_y =
            std::find(y_rest.begin(), y_rest.end(), col_name.get_sexp()) != y_rest.end();
          if (!clashes_with_y) {
            const bool already_used =
              std::find(names.begin(), names.begin() + col, col_name.get_sexp()) != names.begin() + col;
            if (!already_used) {
              break;
            }
          }
          col_name += suffix_x;
        }
      }

      out[col] = visitors_x.get(next_x_visitor)->subset(indices_x);
      ++next_x_visitor;
    }

    names[col] = col_name;
  }

  // ---- y columns, appended after the x block.
  // Every x column filled exactly one slot above, so the y block starts at n_x.
  for (int j = 0; j < n_y_visitors; ++j) {
    const int slot = n_x + j;
    String col_name = y_rest[j];

    if (use_suffix_y) {
      // Keep appending the suffix until the name neither collides with any
      // x column name nor with a name already placed in names[0, slot).
      for (;;) {
        const bool clashes_with_x =
          std::find(x_names.begin(), x_names.end(), col_name.get_sexp()) != x_names.end();
        if (!clashes_with_x) {
          const bool already_used =
            std::find(names.begin(), names.begin() + slot, col_name.get_sexp()) != names.begin() + slot;
          if (!already_used) {
            break;
          }
        }
        col_name += suffix_y;
      }
    }

    out[slot] = visitors_y.get(j)->subset(indices_y);
    names[slot] = col_name;
  }

  set_class(out, classes);
  set_rownames(out, nrows);
  out.names() = names;

  // Re-register the variables reported by get_vars(x) on the output, using
  // the names those columns ended up with (they may have been suffixed).
  SymbolVector x_groups = get_vars(x);
  int n_groups = x_groups.size();
  SymbolVector out_groups(n_groups);
  IntegerVector group_pos = x_groups.match_in_table(x_names);
  for (int g = 0; g < n_groups; ++g) {
    const int pos = group_pos[g];
    if (pos == NA_INTEGER) {
      stop("unknown group column '%s'", x_groups[g].get_utf8_cstring());
    }
    // pos is a 1-based position among x's columns, which keep their slot.
    out_groups.set(g, names[pos - 1]);
  }
  set_vars(out, out_groups);

  return static_cast<SEXP>(out);
}