// Build the result of a join from two data frames.
//
// The output holds every column of `x` taken at `indices_x`, followed by the
// columns of `y` that are not listed in `by`, taken at `indices_y`.
// Output names are obtained from the JoinColumnSuffixer, asked with ".x" for
// the left-hand columns and ".y" for the right-hand ones.
// The result gets `classes` as its class, row names for indices_x.size() rows
// and, when `x` carries one, the "vars" attribute of `x`.
DataFrame subset( DataFrame x, DataFrame y, const Index& indices_x, const Index& indices_y, CharacterVector by, CharacterVector classes ){
    // left-hand side: all columns of x
    CharacterVector x_columns = x.names() ;
    DataFrameVisitors visitors_x(x, x_columns) ;

    // right-hand side: columns of y except the `by` columns
    CharacterVector all_y_columns = y.names() ;
    CharacterVector y_columns = setdiff( all_y_columns, by ) ;
    JoinColumnSuffixer suffixer(x_columns, y_columns, by) ;
    DataFrameVisitors visitors_y(y, y_columns) ;

    const int nrows = indices_x.size() ;
    const int n_left = visitors_x.size() ;
    const int n_right = visitors_y.size() ;
    const int n_total = n_left + n_right ;

    List out(n_total) ;
    CharacterVector out_names(n_total) ;

    for( int col=0; col<n_left; col++){
        out[col] = visitors_x.get(col)->subset(indices_x) ;
        out_names[col] = suffixer.get( x_columns[col], ".x" ) ;
    }
    for( int col=0; col<n_right; col++){
        // right-hand columns are stored after the left-hand ones
        const int pos = n_left + col ;
        out[pos] = visitors_y.get(col)->subset(indices_y) ;
        out_names[pos] = suffixer.get( y_columns[col], ".y" ) ;
    }

    out.attr("class") = classes ;
    set_rownames(out, nrows) ;
    out.names() = out_names ;

    // carry over the "vars" attribute of x when it is set
    SEXP vars = x.attr( "vars" ) ;
    if( !Rf_isNull(vars) ){
        out.attr( "vars" ) = vars ;
    }
    return (SEXP)out ;
}
// Filter an ungrouped data frame.
//
// `args` holds the filter expressions and `dots` the environment of each one.
// When all expressions share one environment they are folded into a single
// call (a, b, c -> a & b & c) and evaluated once; otherwise each expression is
// evaluated in its own environment and the results are merged with
// combine_and(). The rows to keep are then extracted with subset().
SEXP filter_not_grouped( DataFrame df, List args, const DataDots& dots){
    // collect the symbols that name a column of df
    CharacterVector names = df.names() ;
    SymbolSet set ;
    for( int col=0; col<names.size(); col++){
        set.insert( Rf_install( names[col] ) ) ;
    }

    if( !dots.single_env() ){
        // one evaluation per expression, each in its own environment
        const int nargs = args.size() ;

        CallProxy first_proxy(args[0], df, dots.envir(0) ) ;
        LogicalVector test = first_proxy.eval() ;
        check_filter_result(test, df.nrows());

        for( int arg=1; arg<nargs; arg++){
            LogicalVector test2 = CallProxy(args[arg], df, dots.envir(arg) ).eval() ;
            combine_and(test, test2) ;
        }

        DataFrame res = subset( df, test, df.names(), classes_not_grouped() ) ;
        return res ;
    }

    // single environment: a, b, c -> a & b & c
    Environment env = dots.envir(0) ;
    Shield<SEXP> call( and_calls( args, set ) ) ;

    // replace the symbols that are in the data frame by vectors from the
    // data frame and evaluate the expression
    CallProxy proxy( (SEXP)call, df, env ) ;
    LogicalVector test = proxy.eval() ;
    check_filter_result(test, df.nrows());

    DataFrame res = subset( df, test, df.names(), classes_not_grouped() ) ;
    return res ;
}
// Filter an ungrouped data frame from lazily captured expressions.
//
// Single environment: all expressions are folded into one call
// (a, b, c -> a & b & c) and evaluated once.
// Several environments: each expression is evaluated in its own environment
// and merged into the running result with combine_and(); a true return value
// from combine_and() short-circuits to an empty subset.
//
// A length-one result is treated as a scalar: in the single-environment case
// TRUE returns `df` unchanged and anything else returns an empty subset.
DataFrame filter_not_grouped( DataFrame df, const LazyDots& dots){
    // collect the symbols that name a column of df
    CharacterVector names = df.names() ;
    SymbolSet set ;
    for( int col=0; col<names.size(); col++){
        set.insert( Rf_installChar( names[col] ) ) ;
    }

    if( dots.single_env() ){
        Environment env = dots[0].env() ;

        // a, b, c -> a & b & c
        Shield<SEXP> call( and_calls( dots, set, env ) ) ;

        // replace the symbols that are in the data frame by vectors from the
        // data frame and evaluate the expression
        CallProxy proxy( (SEXP)call, df, env ) ;
        LogicalVector test = check_filter_logical_result(proxy.eval()) ;

        if( test.size() != 1 ){
            check_filter_result(test, df.nrows());
            return subset(df, test, classes_not_grouped() ) ;
        }
        // scalar result: keep everything or nothing
        if( test[0] == TRUE ){
            return df ;
        }
        return empty_subset(df, df.names(), classes_not_grouped()) ;
    }

    // expressions come from different environments
    const int nargs = dots.size() ;

    Call first_call(dots[0].expr());
    CallProxy first_proxy(first_call, df, dots[0].env() ) ;
    LogicalVector test = check_filter_logical_result(first_proxy.eval()) ;
    if( test.size() != 1 ){
        check_filter_result(test, df.nrows());
    } else if( !test[0] ){
        return empty_subset(df, df.names(), classes_not_grouped() ) ;
    }

    for( int arg=1; arg<nargs; arg++){
        Rcpp::checkUserInterrupt() ;

        Call call_i( dots[arg].expr() ) ;
        CallProxy proxy(call_i, df, dots[arg].env() ) ;
        LogicalVector test_i = check_filter_logical_result(proxy.eval()) ;
        if( combine_and(test, test_i) ){
            return empty_subset(df, df.names(), classes_not_grouped() ) ;
        }
    }

    DataFrame res = subset( df, test, classes_not_grouped() ) ;
    return res ;
}
// [[Rcpp::export]] DataFrame semi_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, bool na_match) { if (by_x.size() == 0) stop("no variable to join by"); typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map; DataFrameJoinVisitors visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), false, na_match); Map map(visitors); // train the map in terms of x train_push_back(map, x.nrows()); int n_y = y.nrows(); // this will collect indices from rows in x that match rows in y std::vector<int> indices; for (int i = 0; i < n_y; i++) { // find a row in x that matches row i from y Map::iterator it = map.find(-i - 1); if (it != map.end()) { // collect the indices and remove them from the // map so that they are only found once. push_back(indices, it->second); map.erase(it); } } const DataFrame& out = subset(x, indices, x.names(), get_class(x)); strip_index(out); return out; }
// [[Rcpp::export]] SEXP distinct_impl(DataFrame df, CharacterVector vars, CharacterVector keep) { if (df.size() == 0) return df; // No vars means ungrouped data with keep_all = TRUE. if (vars.size() == 0) return df; check_valid_colnames(df); if (!vars.size()) { vars = df.names(); } DataFrameVisitors visitors(df, vars); std::vector<int> indices; VisitorSetIndexSet<DataFrameVisitors> set(visitors); int n = df.nrows(); for (int i=0; i<n; i++) { if (set.insert(i).second) { indices.push_back(i); } } return DataFrameSubsetVisitors(df, keep).subset(indices, df.attr("class")); }
// [[Rcpp::export]] DataFrame anti_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, bool na_match) { if (by_x.size() == 0) stop("no variable to join by"); typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map; DataFrameJoinVisitors visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), false, na_match); Map map(visitors); // train the map in terms of x train_push_back(map, x.nrows()); int n_y = y.nrows(); // remove the rows in x that match for (int i = 0; i < n_y; i++) { Map::iterator it = map.find(-i - 1); if (it != map.end()) map.erase(it); } // collect what's left std::vector<int> indices; for (Map::iterator it = map.begin(); it != map.end(); ++it) push_back(indices, it->second); const DataFrame& out = subset(x, indices, x.names(), get_class(x)); strip_index(out); return out; }
// [[Rcpp::export]] DataFrame semi_join_impl( DataFrame x, DataFrame y, CharacterVector by){ typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ; DataFrameJoinVisitors visitors(x, y, by) ; Map map(visitors); // train the map in terms of x train_push_back( map, x.nrows(), x.nrows() / 10) ; int n_y = y.nrows() ; // this will collect indices from rows in x that match rows in y std::vector<int> indices ; for( int i=0; i<n_y; i++){ // find a row in x that matches row i from y Map::iterator it = map.find(-i-1) ; if( it != map.end() ){ // collect the indices and remove them from the // map so that they are only found once. push_back( indices, it->second ) ; map.erase(it) ; } } return subset(x, indices, x.names(), x.attr("class") ) ; }
// Assemble the data frame returned by mutate.
//
// The output has call_proxy.nsubsets() columns: first every column of `df`
// (read back from `call_proxy` under its original name), then the entries of
// `results_names` that are not already column names of `df`.
// Each column has its NAMED field set to 2. The result gets `classes`, row
// names for df.nrows() rows and the collected column names.
SEXP structure_mutate( Proxy& call_proxy, const DataFrame& df, const CharacterVector& results_names, CharacterVector classes){
    const int n_out = call_proxy.nsubsets() ;
    List out(n_out) ;
    CharacterVector out_names(n_out) ;

    CharacterVector input_names = df.names() ;
    const int n_input = df.size() ;

    // existing columns, in their original order
    int pos = 0 ;
    while( pos < n_input ){
        out[pos] = call_proxy.get_variable(input_names[pos]) ;
        SET_NAMED( out[pos], 2 );
        out_names[pos] = input_names[pos] ;
        ++pos ;
    }

    // new columns: walk results_names until the output is full, skipping
    // names that already belong to an input column
    for( int k=0; pos<n_out; k++ ){
        String name = results_names[k] ;
        if( any( input_names.begin(), input_names.end(), name.get_sexp() ) ){
            continue ;
        }
        SEXP x = call_proxy.get_variable( name ) ;
        out[pos] = x ;
        SET_NAMED( out[pos], 2 );
        out_names[pos] = name ;
        pos++ ;
    }

    out.attr("class") = classes ;
    set_rownames( out, df.nrows() ) ;
    out.names() = out_names;

    return out ;
}
// [[Rcpp::export]] SEXP filter_impl( DataFrame df, LazyDots dots){ if( df.nrows() == 0 || Rf_isNull(df) ) { return df ; } check_valid_colnames(df) ; assert_all_white_list(df) ; if( dots.size() == 0 ) return df ; // special case if( dots.size() == 1 && TYPEOF(dots[0].expr()) == LGLSXP){ LogicalVector what = dots[0].expr() ; if( what.size() == 1 ){ if( what[0] == TRUE ){ return df ; } else { return empty_subset( df, df.names(), is<GroupedDataFrame>(df) ? classes_grouped<GroupedDataFrame>() : classes_not_grouped() ) ; } } } if( is<GroupedDataFrame>( df ) ){ return filter_grouped<GroupedDataFrame, LazyGroupedSubsets>( GroupedDataFrame(df), dots); } else if( is<RowwiseDataFrame>(df) ){ return filter_grouped<RowwiseDataFrame, LazyRowwiseSubsets>( RowwiseDataFrame(df), dots); } else { return filter_not_grouped( df, dots ) ; } }
// [[Rcpp::export]] SEXP ex10(DataFrame input){ CharacterVector names = input.names(); for(int i = 0; i<names.size(); i++){ cout << "names[" << i << "]=" << names[i] << "\n"; } return(names); }
// [[Rcpp::export]] DataFrame arrange_impl( DataFrame data, List args, DataDots dots ){ int nargs = args.size() ; List variables(nargs) ; LogicalVector ascending(nargs) ; Shelter<SEXP> __ ; for(int i=0; i<nargs; i++){ SEXP call = args[i] ; bool is_desc = TYPEOF(call) == LANGSXP && Rf_install("desc") == CAR(call) ; CallProxy call_proxy( is_desc ? CADR(call) : call, data, dots.envir(i)) ; variables[i] = __(call_proxy.eval()) ; if( Rf_length(variables[i]) != data.nrows() ){ std::stringstream s ; s << "incorrect size (" << Rf_length(variables[i]) << "), expecting :" << data.nrows() ; stop(s.str()) ; } ascending[i] = !is_desc ; } OrderVisitors o(variables,ascending, nargs) ; IntegerVector index = o.apply() ; DataFrameVisitors visitors( data, data.names() ) ; DataFrame res = visitors.subset(index, data.attr("class") ) ; return res; }
// [[Rcpp::export]] List arrange_impl(DataFrame data, QuosureList quosures) { if (data.size() == 0) return data; check_valid_colnames(data); assert_all_white_list(data); if (quosures.size() == 0 || data.nrows() == 0) return data; int nargs = quosures.size(); List variables(nargs); LogicalVector ascending(nargs); for (int i = 0; i < nargs; i++) { const NamedQuosure& quosure = quosures[i]; Shield<SEXP> call_(quosure.expr()); SEXP call = call_; bool is_desc = TYPEOF(call) == LANGSXP && Rf_install("desc") == CAR(call); CallProxy call_proxy(is_desc ? CADR(call) : call, data, quosure.env()); Shield<SEXP> v(call_proxy.eval()); if (!white_list(v)) { stop("cannot arrange column of class '%s'", get_single_class(v)); } if (Rf_inherits(v, "data.frame")) { DataFrame df(v); int nr = df.nrows(); if (nr != data.nrows()) { stop("data frame column with incompatible number of rows (%d), expecting : %d", nr, data.nrows()); } } else if (Rf_isMatrix(v)) { stop("can't arrange by a matrix"); } else { if (Rf_length(v) != data.nrows()) { stop("incorrect size (%d), expecting : %d", Rf_length(v), data.nrows()); } } variables[i] = v; ascending[i] = !is_desc; } OrderVisitors o(variables, ascending, nargs); IntegerVector index = o.apply(); DataFrameSubsetVisitors visitors(data, data.names()); List res = visitors.subset(index, get_class(data)); if (is<GroupedDataFrame>(data)) { // so that all attributes are recalculated (indices ... ) // see the lazyness feature in GroupedDataFrame // if we don't do that, we get the values of the un-arranged data // set for free from subset (#1064) res.attr("labels") = R_NilValue; copy_vars(res, data); return GroupedDataFrame(res).data(); } SET_ATTRIB(res, strip_group_attributes(res)); return res; }
// Compute the grouping index of `data` and attach it as attributes.
//
// The grouping variables come from get_vars(data). Each one must be a column
// of `data` that passes white_list() and is not a list (VECSXP); otherwise
// the function stops.
// Rows are partitioned with a ChunkIndexMap, the group labels are sorted, and
// the following attributes are set on `data`: "indices" (one entry per group,
// in label order), "group_sizes", "biggest_group_size" and "labels". The class
// is set to grouped_df / tbl_df / tbl / data.frame.
DataFrame build_index_cpp(DataFrame data) {
  SymbolVector vars(get_vars(data));
  const int nvars = vars.size();

  // validate the grouping variables
  CharacterVector names = data.names();
  IntegerVector positions = vars.match_in_table(names);
  for (int var = 0; var < nvars; ++var) {
    const int pos = positions[var];
    if (pos == NA_INTEGER) {
      stop("unknown column '%s' ", vars[var].get_utf8_cstring());
    }

    // `pos` is used as a 1-based position into `data`
    SEXP column = data[pos - 1];
    if (!white_list(column) || TYPEOF(column) == VECSXP) {
      stop(
        "cannot group column %s, of class '%s'",
        vars[var].get_utf8_cstring(),
        get_single_class(column));
    }
  }

  // partition the rows by the values of the grouping variables
  DataFrameVisitors visitors(data, vars);
  ChunkIndexMap map(visitors);
  train_push_back(map, data.nrows());

  // one label row per group, then sorted
  DataFrame labels = DataFrameSubsetVisitors(data, vars).subset(map, "data.frame");
  const int ngroups = labels.nrows();
  IntegerVector labels_order = OrderVisitors(labels).apply();
  labels = DataFrameSubsetVisitors(labels).subset(labels_order, "data.frame");

  List indices(ngroups);
  IntegerVector group_sizes = no_init(ngroups);
  int biggest_group = 0;

  // remember where each group's row indices live, in map iteration order
  ChunkIndexMap::const_iterator it = map.begin();
  std::vector<const std::vector<int>* > chunks(ngroups);
  for (int g = 0; g < ngroups; g++, ++it) {
    chunks[g] = &it->second;
  }

  // emit the groups in the order of the sorted labels
  for (int g = 0; g < ngroups; g++) {
    const std::vector<int>& chunk = *chunks[labels_order[g]];
    const int chunk_size = (int)chunk.size();

    indices[g] = chunk;
    group_sizes[g] = chunk.size();
    if (biggest_group < chunk_size) {
      biggest_group = chunk_size;
    }
  }

  data.attr("indices") = indices;
  data.attr("group_sizes") = group_sizes;
  data.attr("biggest_group_size") = biggest_group;
  data.attr("labels") = labels;
  set_class(data, CharacterVector::create("grouped_df", "tbl_df", "tbl", "data.frame"));
  return data;
}
// [[Rcpp::export]] SEXP ex10_2(DataFrame input){ BEGIN_RCPP List names = input.names(); for(int i = 0; i<names.size(); i++){ cout << "names[" << i << "]=" << as<string>(names[i]) << "\n"; } return(wrap(names)); END_RCPP }
// [[Rcpp::export]] DataFrame sort_impl( DataFrame data ){ OrderVisitors o(data) ; IntegerVector index = o.apply() ; DataFrameVisitors visitors( data, data.names() ) ; DataFrame res = visitors.subset(index, "data.frame" ) ; return res; }
// [[Rcpp::export]] SEXP ex11_2(DataFrame input){ BEGIN_RCPP List names = input.names(); map<string, NumericVector> mapObj; for(int i = 0; i<names.size(); i++){ mapObj.insert(pair<string, NumericVector>(as<string>(names[i]),input(i))); } return(wrap(mapObj)); END_RCPP }
// [[Rcpp::export]] SEXP ex10_1(DataFrame input){ BEGIN_RCPP CharacterVector names = input.names(); // vector<string> str_names = as<string>(names); // error string str_names = as<string>(names); // names must have one element. for(int i = 0; i<names.size(); i++){ cout << "names[" << i << "]=" << names[i] << "\n"; } return(wrap(str_names)); END_RCPP }
// [[Rcpp::export]] SEXP ex10_3(DataFrame input){ BEGIN_RCPP List names = input.names(); vector<string> strVec; for(int i = 0; i<names.size(); i++){ string s = as<string>(names[i]); strVec.push_back(s); } return(wrap(strVec)); END_RCPP }
// [[Rcpp::export]] SEXP ex12(DataFrame input, CharacterVector columnName, int rowIndex, double replace){ BEGIN_RCPP List names = input.names(); map<string, NumericVector> mapObj; for(int i = 0; i<names.size(); i++){ mapObj.insert(pair<string, NumericVector>(as<string>(names[i]),input(i))); } ex12helper(mapObj, as<string>(columnName), rowIndex, replace); return(wrap(mapObj)); END_RCPP }
// [[Rcpp::export]] SEXP ex13_2(DataFrame input, CharacterVector columnName, double replace){ BEGIN_RCPP List names = input.names(); List mapObj = as<List>(input); // all rows for(int i=0; i<input.nrows(); i++){ ex13helper(mapObj, as<string>(columnName), i, replace); } return(wrap(mapObj)); END_RCPP }
// Filter an ungrouped data frame with expressions that share one environment.
//
// The expressions in `args` are folded into a single call
// (a, b, c -> a & b & c), evaluated against `df` in `env`, and the resulting
// logical vector selects the rows returned by subset().
SEXP filter_not_grouped( DataFrame df, List args, Environment env){
    // a, b, c -> a & b & c
    Language combined = and_calls( args ) ;

    // replace the symbols that are in the data frame by vectors from the
    // data frame and evaluate the expression
    CallProxy evaluator( combined, df, env ) ;
    LogicalVector keep = evaluator.eval() ;

    DataFrame filtered = subset( df, keep, df.names(), classes_not_grouped() ) ;
    return filtered ;
}
// [[Rcpp::export]] DataFrame union_data_frame( DataFrame x, DataFrame y){ if( !compatible_data_frame(x,y) ) stop( "not compatible" ); typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set ; DataFrameJoinVisitors visitors(x, y, x.names() ) ; Set set(visitors); train_insert( set, x.nrows() ) ; train_insert_right( set, y.nrows() ) ; return visitors.subset( set, x.attr("class") ) ; }
// [[Rcpp::export]] SEXP ex13(DataFrame input, CharacterVector columnName, double replace){ BEGIN_RCPP List names = input.names(); map<string, NumericVector> mapObj; for(int i = 0; i<names.size(); i++){ mapObj.insert(pair<string, NumericVector>(as<string>(names[i]),input(i))); } // all rows for(int i=0; i<input.nrows(); i++){ ex13helper(wrap(mapObj), as<string>(columnName), i, replace); } return(wrap(mapObj)); END_RCPP }
// [[Rcpp::export]] DataFrame build_index_cpp( DataFrame data ){ CharacterVector vars = Rf_getAttrib( data.attr( "vars" ), R_NamesSymbol ) ; DataFrameVisitors visitors(data, vars) ; ChunkIndexMap map( visitors ) ; train_push_back( map, data.nrows() ) ; DataFrame labels = visitors.subset( map, "data.frame") ; int ngroups = labels.nrows() ; OrderVisitors order_labels( labels, vars ) ; IntegerVector orders = order_labels.apply() ; std::vector< const std::vector<int>* > chunks(ngroups) ; ChunkIndexMap::const_iterator it = map.begin() ; for( int i=0; i<ngroups; i++, ++it){ chunks[ i ] = &it->second ; } IntegerVector group_sizes = no_init( ngroups ); int biggest_group = 0 ; std::vector<int> indices ; indices.reserve( data.nrows() ); for( int i=0; i<ngroups; i++){ const std::vector<int>& chunk = *chunks[orders[i]] ; push_back( indices, chunk ) ; biggest_group = std::max( biggest_group, (int)chunk.size() ); group_sizes[i] = chunk.size() ; } DataFrameVisitors all_variables_visitors(data, data.names() ) ; data = all_variables_visitors.subset( indices, classes_grouped() ) ; // TODO: we own labels, so perhaps we can do an inplace sort, // to reuse its memory instead of creating a new data frame DataFrameVisitors labels_visitors( labels, vars) ; labels = labels_visitors.subset( orders, "data.frame" ) ; labels.attr( "vars" ) = R_NilValue ; data.attr( "group_sizes") = group_sizes ; data.attr( "biggest_group_size" ) = biggest_group ; data.attr( "labels" ) = labels ; return data ; }
// [[Rcpp::export]] IntegerVector match_data_frame( DataFrame x, DataFrame y){ if( !compatible_data_frame(x,y) ) stop( "not compatible" ); typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set ; DataFrameJoinVisitors visitors(y, x, x.names() ) ; Set set(visitors); train_insert( set, y.nrows() ) ; int n_x = x.nrows() ; IntegerVector res = no_init( n_x ); for( int i=0; i<n_x; i++) { Set::iterator it = set.find( -i-1 ); res[i] = ( it == set.end() ) ? NA_INTEGER : (*it+1) ; } return res ; }
// Column-bind the data frames held in `dots`.
//
// Every data frame must have the same number of rows as the first one;
// otherwise the function stops with a message that reports the offending row
// count and the expected one. The result is a list of all columns, in order,
// with their names, row names for that number of rows, and class
// "data.frame".
//
// NOTE(review): dots[0] is read unconditionally, so callers are presumably
// expected to pass at least one data frame; confirm against the caller.
List cbind__impl( Dots dots ){
    int n = dots.size() ;

    // first check that the number of rows is the same
    DataFrame df = dots[0] ;
    int nrows = df.nrows() ;
    int nv = df.size() ;
    for( int i=1; i<n; i++){
        DataFrame current = dots[i] ;
        if( current.nrows() != nrows ){
            // report the row count that was actually compared, and close the
            // parenthesis opened in the message
            std::stringstream ss ;
            ss << "incompatible number of rows (" << current.nrows() << ", expecting " << nrows << ")" ;
            stop( ss.str() ) ;
        }
        nv += current.size() ;
    }

    // collect columns
    List out(nv) ;
    CharacterVector out_names(nv) ;

    // `k` is the running position in the output across all data frames
    for( int i=0, k=0 ; i<n; i++){
        Rcpp::checkUserInterrupt() ;

        DataFrame current = dots[i] ;
        CharacterVector current_names = current.names() ;
        int nc = current.size() ;
        for( int j=0; j<nc; j++, k++){
            out[k] = shared_SEXP(current[j]) ;
            out_names[k] = current_names[j] ;
        }
    }

    out.names() = out_names ;
    set_rownames( out, nrows ) ;
    out.attr( "class") = "data.frame" ;
    return out ;
}
// [[Rcpp::export]] void assert_all_allow_list(const DataFrame& data) { // checking variables are on the allow list int nc = data.size(); for (int i = 0; i < nc; i++) { if (!allow_list(data[i])) { SymbolVector names = data.names(); const SymbolString& name_i = names[i]; SEXP v = data[i]; SEXP klass = Rf_getAttrib(v, R_ClassSymbol); if (!Rf_isNull(klass)) { bad_col(name_i, "is of unsupported class {type}", _["type"] = get_single_class(v)); } else { bad_col(name_i, "is of unsupported type {type}", _["type"] = Rf_type2char(TYPEOF(v))); } } } }
// [[Rcpp::export]] void assert_all_white_list(const DataFrame& data) { // checking variables are on the white list int nc = data.size(); for (int i=0; i<nc; i++) { if (!white_list(data[i])) { SymbolVector names = data.names(); const SymbolString& name_i = names[i]; SEXP v = data[i]; SEXP klass = Rf_getAttrib(v, R_ClassSymbol); if (!Rf_isNull(klass)) { stop("column '%s' has unsupported class : %s", name_i.get_utf8_cstring() , get_single_class(v)); } else { stop("column '%s' has unsupported type : %s", name_i.get_utf8_cstring() , Rf_type2char(TYPEOF(v))); } } } }
// [[Rcpp::export]] DataFrame setdiff_data_frame( DataFrame x, DataFrame y){ if( !compatible_data_frame(x,y) ) stop( "not compatible" ); typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set ; DataFrameJoinVisitors visitors(y, x, y.names() ) ; Set set(visitors); train_insert( set, y.nrows() ) ; std::vector<int> indices ; int n_x = x.nrows() ; for( int i=0; i<n_x; i++) { if( !set.count(-i-1) ){ set.insert(-i-1) ; indices.push_back(-i-1) ; } } return visitors.subset( indices, x.attr("class") ) ; }
// [[Rcpp::export]] DataFrame intersect_data_frame( DataFrame x, DataFrame y){ if( !compatible_data_frame(x,y) ) stop( "not compatible" ); typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set ; DataFrameJoinVisitors visitors(x, y, x.names() ) ; Set set(visitors); train_insert( set, x.nrows() ) ; std::vector<int> indices ; int n_y = y.nrows() ; for( int i=0; i<n_y; i++) { Set::iterator it = set.find( -i-1 ) ; if( it != set.end() ){ indices.push_back(*it) ; set.erase(it) ; } } return visitors.subset( indices, x.attr("class") ) ; }