// [[Rcpp::export]] SEXP filter_impl( DataFrame df, LazyDots dots){ if( df.nrows() == 0 || Rf_isNull(df) ) { return df ; } check_valid_colnames(df) ; assert_all_white_list(df) ; if( dots.size() == 0 ) return df ; // special case if( dots.size() == 1 && TYPEOF(dots[0].expr()) == LGLSXP){ LogicalVector what = dots[0].expr() ; if( what.size() == 1 ){ if( what[0] == TRUE ){ return df ; } else { return empty_subset( df, df.names(), is<GroupedDataFrame>(df) ? classes_grouped<GroupedDataFrame>() : classes_not_grouped() ) ; } } } if( is<GroupedDataFrame>( df ) ){ return filter_grouped<GroupedDataFrame, LazyGroupedSubsets>( GroupedDataFrame(df), dots); } else if( is<RowwiseDataFrame>(df) ){ return filter_grouped<RowwiseDataFrame, LazyRowwiseSubsets>( RowwiseDataFrame(df), dots); } else { return filter_not_grouped( df, dots ) ; } }
// [[Rcpp::export]] List arrange_impl( DataFrame data, LazyDots dots ){ if( data.size() == 0 ) return data ; check_valid_colnames(data) ; assert_all_white_list(data) ; if( dots.size() == 0 || data.nrows() == 0) return data ; int nargs = dots.size() ; List variables(nargs) ; LogicalVector ascending(nargs) ; for(int i=0; i<nargs; i++){ const Lazy& lazy = dots[i] ; Shield<SEXP> call_( lazy.expr() ) ; SEXP call = call_ ; bool is_desc = TYPEOF(call) == LANGSXP && Rf_install("desc") == CAR(call) ; CallProxy call_proxy(is_desc ? CADR(call) : call, data, lazy.env()) ; Shield<SEXP> v(call_proxy.eval()) ; if( !white_list(v) ){ stop( "cannot arrange column of class '%s'", get_single_class(v) ) ; } if( Rf_inherits(v, "data.frame" ) ){ DataFrame df(v) ; int nr = df.nrows() ; if( nr != data.nrows() ){ stop( "data frame column with incompatible number of rows (%d), expecting : %d", nr, data.nrows() ); } } else if( Rf_isMatrix(v) ) { stop( "can't arrange by a matrix" ) ; } else { if( Rf_length(v) != data.nrows() ){ stop( "incorrect size (%d), expecting : %d", Rf_length(v), data.nrows() ) ; } } variables[i] = v ; ascending[i] = !is_desc ; } OrderVisitors o(variables, ascending, nargs) ; IntegerVector index = o.apply() ; DataFrameSubsetVisitors visitors( data, data.names() ) ; List res = visitors.subset(index, data.attr("class") ) ; if( is<GroupedDataFrame>(data) ){ // so that all attributes are recalculated (indices ... ) // see the lazyness feature in GroupedDataFrame // if we don't do that, we get the values of the un-arranged data // set for free from subset (#1064) res.attr("labels") = R_NilValue ; res.attr( "vars" ) = data.attr("vars" ) ; return GroupedDataFrame(res).data() ; } SET_ATTRIB(res, strip_group_attributes(res)); return res ; }
DataFrame filter_not_grouped( DataFrame df, const LazyDots& dots){ CharacterVector names = df.names() ; SymbolSet set ; for( int i=0; i<names.size(); i++){ set.insert( Rf_installChar( names[i] ) ) ; } if( dots.single_env() ){ Environment env = dots[0].env() ; // a, b, c -> a & b & c Shield<SEXP> call( and_calls( dots, set, env ) ) ; // replace the symbols that are in the data frame by vectors from the data frame // and evaluate the expression CallProxy proxy( (SEXP)call, df, env ) ; LogicalVector test = check_filter_logical_result(proxy.eval()) ; if( test.size() == 1){ if( test[0] == TRUE ){ return df ; } else { return empty_subset(df, df.names(), classes_not_grouped()) ; } } else { check_filter_result(test, df.nrows()); return subset(df, test, classes_not_grouped() ) ; } } else { int nargs = dots.size() ; Call call(dots[0].expr()); CallProxy first_proxy(call, df, dots[0].env() ) ; LogicalVector test = check_filter_logical_result(first_proxy.eval()) ; if( test.size() == 1 ) { if( !test[0] ){ return empty_subset(df, df.names(), classes_not_grouped() ) ; } } else { check_filter_result(test, df.nrows()); } for( int i=1; i<nargs; i++){ Rcpp::checkUserInterrupt() ; Call call( dots[i].expr() ) ; CallProxy proxy(call, df, dots[i].env() ) ; LogicalVector test2 = check_filter_logical_result(proxy.eval()) ; if( combine_and(test, test2) ){ return empty_subset(df, df.names(), classes_not_grouped() ) ; } } DataFrame res = subset( df, test, classes_not_grouped() ) ; return res ; } }
DataFrame filter_grouped( const Data& gdf, const LazyDots& dots){ if( dots.single_env() ){ return filter_grouped_single_env<Data, Subsets>(gdf, dots) ; } else { return filter_grouped_multiple_env<Data, Subsets>(gdf, dots) ; } }
SEXP summarise_grouped(const DataFrame& df, const LazyDots& dots){ Data gdf(df) ; int nexpr = dots.size() ; int nvars = gdf.nvars() ; check_not_groups(dots, gdf); NamedListAccumulator<Data> accumulator ; int i=0; for( ; i<nvars; i++){ accumulator.set( PRINTNAME(gdf.symbol(i)), shared_SEXP(gdf.label(i)) ) ; } Subsets subsets(gdf) ; for( int k=0; k<nexpr; k++, i++ ){ Rcpp::checkUserInterrupt() ; const Lazy& lazy = dots[k] ; const Environment& env = lazy.env() ; Shield<SEXP> expr_(lazy.expr()) ; SEXP expr = expr_ ; boost::scoped_ptr<Result> res( get_handler( expr, subsets, env ) ); // if we could not find a direct Result // we can use a GroupedCallReducer which will callback to R if( !res ) { res.reset( new GroupedCallReducer<Data, Subsets>( lazy.expr(), subsets, env) ); } Shield<SEXP> result( res->process(gdf) ) ; accumulator.set( lazy.name(), result ); subsets.input( lazy.name(), SummarisedVariable(result) ) ; } return summarised_grouped_tbl_cpp<Data>(accumulator, gdf ); }
SEXP summarise_not_grouped(DataFrame df, const LazyDots& dots){ int nexpr = dots.size() ; if( nexpr == 0) return DataFrame() ; LazySubsets subsets( df ) ; std::vector<SEXP> results ; NamedListAccumulator<DataFrame> accumulator ; for( int i=0; i<nexpr; i++){ Rcpp::checkUserInterrupt() ; const Lazy& lazy = dots[i] ; Environment env = lazy.env() ; Shield<SEXP> expr_(lazy.expr()) ; SEXP expr = expr_ ; boost::scoped_ptr<Result> res( get_handler( expr, subsets, env ) ) ; RObject result ; if(res) { result = res->process( FullDataFrame(df) ) ; } else { result = CallProxy( lazy.expr(), subsets, env).eval() ; } if( Rf_length(result) != 1 ){ stop( "expecting result of length one, got : %d", Rf_length(result) ) ; } accumulator.set(lazy.name(), result); subsets.input( lazy.name(), result ) ; } return tbl_cpp( accumulator, 1 ) ; }
DataFrame filter_grouped( const GroupedDataFrame& gdf, const LazyDots& dots){ if( dots.single_env() ){ return filter_grouped_single_env(gdf, dots) ; } else { return filter_grouped_multiple_env(gdf, dots) ; } }
SEXP summarise_not_grouped(DataFrame df, const LazyDots& dots){ int nexpr = dots.size() ; if( nexpr == 0) return DataFrame() ; LazySubsets subsets( df ) ; std::vector<SEXP> results ; NamedListAccumulator<DataFrame> accumulator ; Rcpp::Shelter<SEXP> __ ; for( int i=0; i<nexpr; i++){ Rcpp::checkUserInterrupt() ; const Lazy& lazy = dots[i] ; Environment env = lazy.env() ; Result* res = get_handler( lazy.expr(), subsets, env ) ; SEXP result ; if(res) { result = __(res->process( FullDataFrame(df) )) ; } else { result = __(CallProxy( lazy.expr(), subsets, env).eval()) ; } delete res ; if( Rf_length(result) != 1 ){ stop( "expecting result of length one, got : %d", Rf_length(result) ) ; } accumulator.set(lazy.name(), result); subsets.input( Symbol(lazy.name()), result ) ; } return tbl_cpp( accumulator, 1 ) ; }
void check_not_groups(const LazyDots& dots, const GroupedDataFrame& gdf) { int n = dots.size(); for (int i=0; i<n; i++) { if (gdf.has_group(dots[i].name())) stop("cannot modify grouping variable"); } }
// [[Rcpp::export]] SEXP slice_impl(DataFrame df, LazyDots dots) { if (dots.size() == 0) return df; if (dots.size() != 1) stop("slice only accepts one expression"); if (is<GroupedDataFrame>(df)) { return slice_grouped(GroupedDataFrame(df), dots); } else { return slice_not_grouped(df, dots); } }
// [[Rcpp::export]] SEXP mutate_impl(DataFrame df, LazyDots dots) { if (dots.size() == 0) return df; check_valid_colnames(df); if (is<RowwiseDataFrame>(df)) { return mutate_grouped<RowwiseDataFrame, LazyRowwiseSubsets>(df, dots); } else if (is<GroupedDataFrame>(df)) { return mutate_grouped<GroupedDataFrame, LazyGroupedSubsets>(df, dots); } else { return mutate_not_grouped(df, dots); } }
SEXP and_calls( const LazyDots& dots, const SymbolSet& set, const Environment& env ){ int ncalls = dots.size() ; if( !ncalls ) { stop("incompatible input") ; } Rcpp::Armor<SEXP> res( assert_correct_filter_subcall(dots[0].expr(), set, env) ) ; SEXP and_symbol = Rf_install( "&" ) ; for( int i=1; i<ncalls; i++) res = Rcpp_lang3( and_symbol, res, assert_correct_filter_subcall(dots[i].expr(), set, env) ) ; return res ; }
DataFrame filter_grouped_multiple_env( const Data& gdf, const LazyDots& dots){ const DataFrame& data = gdf.data() ; CharacterVector names = data.names() ; SymbolSet set ; for( int i=0; i<names.size(); i++){ set.insert( Rf_installChar( names[i] ) ) ; } int nrows = data.nrows() ; LogicalVector test(nrows, TRUE); LogicalVector g_test ; for( int k=0; k<dots.size(); k++){ Rcpp::checkUserInterrupt() ; const Lazy& lazy = dots[k] ; Call call( lazy.expr() ) ; GroupedCallProxy<Data, Subsets> call_proxy( call, gdf, lazy.env() ) ; int ngroups = gdf.ngroups() ; typename Data::group_iterator git = gdf.group_begin() ; for( int i=0; i<ngroups; i++, ++git){ SlicingIndex indices = *git ; int chunk_size = indices.size() ; g_test = check_filter_logical_result(call_proxy.get( indices )); if( g_test.size() == 1 ){ if( g_test[0] != TRUE ){ for( int j=0; j<chunk_size; j++){ test[indices[j]] = FALSE ; } } } else { check_filter_result(g_test, chunk_size ) ; for( int j=0; j<chunk_size; j++){ if( g_test[j] != TRUE ){ test[ indices[j] ] = FALSE ; } } } } } DataFrame res = subset( data, test, names, classes_grouped<Data>() ) ; res.attr( "vars") = data.attr("vars") ; return res ; }
SEXP mutate_not_grouped(DataFrame df, const LazyDots& dots) { int nexpr = dots.size(); int nrows = df.nrows(); NamedListAccumulator<DataFrame> accumulator; int nvars = df.size(); if (nvars) { CharacterVector df_names = df.names(); for (int i=0; i<nvars; i++) { accumulator.set(Symbol(df_names[i]), df[i]); } } CallProxy call_proxy(df); List results(nexpr); for (int i=0; i<nexpr; i++) { Rcpp::checkUserInterrupt(); const Lazy& lazy = dots[i]; Shield<SEXP> call_(lazy.expr()); SEXP call = call_; Symbol name = lazy.name(); Environment env = lazy.env(); call_proxy.set_env(env); if (TYPEOF(call) == SYMSXP) { if (call_proxy.has_variable(call)) { results[i] = call_proxy.get_variable(PRINTNAME(call)); } else { results[i] = shared_SEXP(env.find(CHAR(PRINTNAME(call)))); } } else if (TYPEOF(call) == LANGSXP) { call_proxy.set_call(call); results[i] = call_proxy.eval(); } else if (Rf_length(call) == 1) { boost::scoped_ptr<Gatherer> gather(constant_gatherer(call, nrows)); results[i] = gather->collect(); } else if (Rf_isNull(call)) { accumulator.rm(name); continue; } else { stop("cannot handle"); } check_supported_type(results[i], name.c_str()); if (Rf_inherits(results[i], "POSIXlt")) { stop("`mutate` does not support `POSIXlt` results"); } int n_res = Rf_length(results[i]); if (n_res == nrows) { // ok } else if (n_res == 1) { // recycle boost::scoped_ptr<Gatherer> gather(constant_gatherer(results[i] , df.nrows())); results[i] = gather->collect(); } else { stop("wrong result size (%d), expected %d or 1", n_res, nrows); } call_proxy.input(name, results[i]); accumulator.set(name, results[i]); } List res = structure_mutate(accumulator, df, classes_not_grouped()); return res; }
SEXP mutate_grouped(const DataFrame& df, const LazyDots& dots) { LOG_VERBOSE << "checking zero rows"; // special 0 rows case if (df.nrows() == 0) { DataFrame res = mutate_not_grouped(df, dots); res.attr("vars") = df.attr("vars"); res.attr("class") = df.attr("class"); return Data(res).data(); } LOG_VERBOSE << "initializing proxy"; typedef GroupedCallProxy<Data, Subsets> Proxy; Data gdf(df); int nexpr = dots.size(); check_not_groups(dots, gdf); Proxy proxy(gdf); LOG_VERBOSE << "copying data to accumulator"; NamedListAccumulator<Data> accumulator; int ncolumns = df.size(); CharacterVector column_names = df.names(); for (int i=0; i<ncolumns; i++) { accumulator.set(Symbol(column_names[i]), df[i]); } LOG_VERBOSE << "processing " << nexpr << " variables"; List variables(nexpr); for (int i=0; i<nexpr; i++) { Rcpp::checkUserInterrupt(); const Lazy& lazy = dots[i]; Environment env = lazy.env(); Shield<SEXP> call_(lazy.expr()); SEXP call = call_; Symbol name = lazy.name(); proxy.set_env(env); LOG_VERBOSE << "processing " << CharacterVector(name); if (TYPEOF(call) == LANGSXP || TYPEOF(call) == SYMSXP) { proxy.set_call(call); boost::scoped_ptr<Gatherer> gather(gatherer<Data, Subsets>(proxy, gdf, name)); SEXP variable = variables[i] = gather->collect(); proxy.input(name, variable); accumulator.set(name, variable); } else if (Rf_length(call) == 1) { boost::scoped_ptr<Gatherer> gather(constant_gatherer(call, gdf.nrows())); SEXP variable = variables[i] = gather->collect(); proxy.input(name, variable); accumulator.set(name, variable); } else if (Rf_isNull(call)) { accumulator.rm(name); continue; } else { stop("cannot handle"); } } return structure_mutate(accumulator, df, df.attr("class")); }
// [[Rcpp::export]] List arrange_impl( DataFrame data, LazyDots dots ){ if( data.size() == 0 ) return data ; check_valid_colnames(data) ; assert_all_white_list(data) ; // special case arrange() with no arguments for grouped data if( dots.size() == 0 && is<GroupedDataFrame>(data) ){ GroupedDataFrame gdata(data) ; data = gdata.data() ; DataFrame labels( data.attr( "labels" ) ); OrderVisitors o(labels) ; IntegerVector index = o.apply() ; // reorganize labels = DataFrameSubsetVisitors( labels, labels.names() ).subset( index, labels.attr("class") ); ListOf<IntegerVector> indices( data.attr("indices") ) ; int ngroups = indices.size() ; List new_indices(ngroups) ; IntegerVector master_index(data.nrows()) ; for( int i=0; i<ngroups; i++){ new_indices[index[i]] = indices[i] ; } IntegerVector group_sizes = data.attr("group_sizes") ; IntegerVector new_group_sizes(ngroups); for( int i=0, k=0; i<ngroups; i++){ IntegerVector idx = new_indices[i] ; IntegerVector new_group_index = seq(k, k + idx.size() - 1 ); for( int j=0; j<idx.size(); j++, k++){ master_index[k] = idx[j] ; } new_indices[i] = new_group_index ; new_group_sizes[i] = idx.size() ; } DataFrame res = DataFrameSubsetVisitors( data, data.names() ).subset( master_index, data.attr("class" ) ) ; res.attr( "labels" ) = labels ; res.attr( "indices" ) = new_indices ; res.attr( "vars" ) = data.attr("vars" ) ; res.attr( "group_sizes" ) = new_group_sizes ; res.attr( "biggest_group_size" ) = data.attr("biggest_group_size") ; res.attr( "drop" ) = data.attr("drop") ; return res ; } if( dots.size() == 0 || data.nrows() == 0) return data ; int nargs = dots.size() ; if( is<GroupedDataFrame>(data) ){ nargs += GroupedDataFrame(data).nvars() ; } List variables(nargs) ; LogicalVector ascending(nargs) ; int k = 0 ; if( is<GroupedDataFrame>(data) ){ GroupedDataFrame gdf(data); for( ; k< gdf.nvars(); k++) { ascending[k] = true ; String s = PRINTNAME(gdf.symbol(k)); variables[k] = data[s] ; } } for(int i=0; k<nargs; i++, k++){ const Lazy& lazy = dots[i] ; Shield<SEXP> call_( lazy.expr() ) ; SEXP call = call_ ; bool is_desc = TYPEOF(call) == LANGSXP && Rf_install("desc") == CAR(call) ; CallProxy call_proxy(is_desc ? CADR(call) : call, data, lazy.env()) ; Shield<SEXP> v(call_proxy.eval()) ; if( !white_list(v) ){ stop( "cannot arrange column of class '%s'", get_single_class(v) ) ; } if( Rf_inherits(v, "data.frame" ) ){ DataFrame df(v) ; int nr = df.nrows() ; if( nr != data.nrows() ){ stop( "data frame column with incompatible number of rows (%d), expecting : %d", nr, data.nrows() ); } } else if( Rf_isMatrix(v) ) { SEXP dim = Rf_getAttrib(v, Rf_install( "dim" ) ) ; int nr = INTEGER(dim)[0] ; if( nr != data.nrows() ){ stop( "matrix column with incompatible number of rows (%d), expecting : ", nr, data.nrows() ) ; } } else { if( Rf_length(v) != data.nrows() ){ stop( "incorrect size (%d), expecting : %d", Rf_length(v), data.nrows() ) ; } } variables[k] = v ; ascending[k] = !is_desc ; } OrderVisitors o(variables, ascending, nargs) ; IntegerVector index = o.apply() ; DataFrameSubsetVisitors visitors( data, data.names() ) ; List res = visitors.subset(index, data.attr("class") ) ; if( is<GroupedDataFrame>(data) ){ // so that all attributes are recalculated (indices ... ) // see the lazyness feature in GroupedDataFrame // if we don't do that, we get the values of the un-arranged data // set for free from subset (#1064) res.attr("labels") = R_NilValue ; res.attr( "vars" ) = data.attr("vars" ) ; return GroupedDataFrame(res).data() ; } SET_ATTRIB(res, strip_group_attributes(res)); return res ; }
SEXP summarise_grouped(const DataFrame& df, const LazyDots& dots){ Data gdf(df) ; int nexpr = dots.size() ; int nvars = gdf.nvars() ; check_not_groups(dots, gdf); NamedListAccumulator<Data> accumulator ; int i=0; List results(nvars + nexpr) ; for( ; i<nvars; i++){ results[i] = shared_SEXP(gdf.label(i)) ; accumulator.set( PRINTNAME(gdf.symbol(i)), results[i] ) ; } Subsets subsets(gdf) ; for( int k=0; k<nexpr; k++, i++ ){ Rcpp::checkUserInterrupt() ; const Lazy& lazy = dots[k] ; const Environment& env = lazy.env() ; Shield<SEXP> expr_(lazy.expr()) ; SEXP expr = expr_ ; boost::scoped_ptr<Result> res( get_handler( expr, subsets, env ) ); // if we could not find a direct Result // we can use a GroupedCallReducer which will callback to R if( !res ) { res.reset( new GroupedCallReducer<Data, Subsets>( lazy.expr(), subsets, env) ); } RObject result = res->process(gdf) ; results[i] = result ; accumulator.set( lazy.name(), result ); subsets.input( lazy.name(), SummarisedVariable(result) ) ; } List out = accumulator ; copy_most_attributes( out, df) ; out.names() = accumulator.names() ; int nr = gdf.ngroups() ; set_rownames(out, nr ) ; if( gdf.nvars() > 1){ out.attr( "class" ) = classes_grouped<Data>() ; List vars = gdf.data().attr("vars") ; vars.erase( gdf.nvars() - 1) ; out.attr( "vars") = vars ; out.attr( "labels") = R_NilValue ; out.attr( "indices") = R_NilValue ; out.attr( "group_sizes") = R_NilValue ; out.attr( "biggest_group_size") = R_NilValue ; out.attr( "drop" ) = true ; } else { out.attr( "class" ) = classes_not_grouped() ; SET_ATTRIB( out, strip_group_attributes(out) ) ; } return out ; }