Пример #1
0
// [[Rcpp::export]]
SEXP filter_impl( DataFrame df, LazyDots dots){
    if( df.nrows() == 0 || Rf_isNull(df) ) {
        return df ;
    }
    check_valid_colnames(df) ;
    assert_all_white_list(df) ;

    if( dots.size() == 0 ) return df ;

    // special case
    if( dots.size() == 1 && TYPEOF(dots[0].expr()) == LGLSXP){
        LogicalVector what = dots[0].expr() ;
        if( what.size() == 1 ){
            if( what[0] == TRUE ){
                return df ;
            } else {
                return empty_subset( df, df.names(), is<GroupedDataFrame>(df) ? classes_grouped<GroupedDataFrame>() : classes_not_grouped() ) ;
            }
        }
    }
    if( is<GroupedDataFrame>( df ) ){
        return filter_grouped<GroupedDataFrame, LazyGroupedSubsets>( GroupedDataFrame(df), dots);
    } else if( is<RowwiseDataFrame>(df) ){
        return filter_grouped<RowwiseDataFrame, LazyRowwiseSubsets>( RowwiseDataFrame(df), dots);
    } else {
        return filter_not_grouped( df, dots ) ;
    }
}
Пример #2
0
// [[Rcpp::export]]
List arrange_impl( DataFrame data, LazyDots dots ){
    if( data.size() == 0 ) return data ;
    check_valid_colnames(data) ;
    assert_all_white_list(data) ;

    if( dots.size() == 0 || data.nrows() == 0) return data ;

    int nargs = dots.size() ;
    List variables(nargs) ;
    LogicalVector ascending(nargs) ;

    for(int i=0; i<nargs; i++){
        const Lazy& lazy = dots[i] ;

        Shield<SEXP> call_( lazy.expr() ) ;
        SEXP call = call_ ;
        bool is_desc = TYPEOF(call) == LANGSXP && Rf_install("desc") == CAR(call) ;

        CallProxy call_proxy(is_desc ? CADR(call) : call, data, lazy.env()) ;

        Shield<SEXP> v(call_proxy.eval()) ;
        if( !white_list(v) ){
            stop( "cannot arrange column of class '%s'", get_single_class(v) ) ;
        }

        if( Rf_inherits(v, "data.frame" ) ){
            DataFrame df(v) ;
            int nr = df.nrows() ;
            if( nr != data.nrows() ){
                stop( "data frame column with incompatible number of rows (%d), expecting : %d", nr, data.nrows() );
            }
        } else if( Rf_isMatrix(v) ) {
            stop( "can't arrange by a matrix" ) ;
        } else {
            if( Rf_length(v) != data.nrows() ){
                stop( "incorrect size (%d), expecting : %d", Rf_length(v), data.nrows() ) ;
            }
        }
        variables[i] = v ;
        ascending[i] = !is_desc ;
    }
    OrderVisitors o(variables, ascending, nargs) ;
    IntegerVector index = o.apply() ;

    DataFrameSubsetVisitors visitors( data, data.names() ) ;
    List res = visitors.subset(index, data.attr("class") ) ;

    if( is<GroupedDataFrame>(data) ){
        // so that all attributes are recalculated (indices ... )
        // see the lazyness feature in GroupedDataFrame
        // if we don't do that, we get the values of the un-arranged data
        // set for free from subset (#1064)
        res.attr("labels") = R_NilValue ;
        res.attr( "vars" )  = data.attr("vars" ) ;
        return GroupedDataFrame(res).data() ;
    }
    SET_ATTRIB(res, strip_group_attributes(res));
    return res ;
}
Пример #3
0
SEXP summarise_grouped(const DataFrame& df, const LazyDots& dots){
    Data gdf(df) ;
    
    int nexpr = dots.size() ;
    int nvars = gdf.nvars() ;
    check_not_groups(dots, gdf);
    NamedListAccumulator<Data> accumulator ;

    int i=0;
    for( ; i<nvars; i++){
        accumulator.set( PRINTNAME(gdf.symbol(i)), shared_SEXP(gdf.label(i)) ) ;
    }

    Subsets subsets(gdf) ;
    for( int k=0; k<nexpr; k++, i++ ){
        Rcpp::checkUserInterrupt() ;
        const Lazy& lazy = dots[k] ;
        const Environment& env = lazy.env() ;
        
        Shield<SEXP> expr_(lazy.expr()) ; SEXP expr = expr_ ;
        boost::scoped_ptr<Result> res( get_handler( expr, subsets, env ) );
        
        // if we could not find a direct Result
        // we can use a GroupedCallReducer which will callback to R
        if( !res ) {
            res.reset( new GroupedCallReducer<Data, Subsets>( lazy.expr(), subsets, env) );
        }
        Shield<SEXP> result( res->process(gdf) ) ;
        accumulator.set( lazy.name(), result );
        subsets.input( lazy.name(), SummarisedVariable(result) ) ;
        
    }

    return summarised_grouped_tbl_cpp<Data>(accumulator, gdf );
}
Пример #4
0
SEXP summarise_not_grouped(DataFrame df, const LazyDots& dots){
    int nexpr = dots.size() ;
    if( nexpr == 0) return DataFrame() ;
    
    LazySubsets subsets( df ) ;
    std::vector<SEXP> results ;
    NamedListAccumulator<DataFrame> accumulator ;

    for( int i=0; i<nexpr; i++){
        Rcpp::checkUserInterrupt() ;
        
        const Lazy& lazy = dots[i] ;
        Environment env = lazy.env() ;
        Shield<SEXP> expr_(lazy.expr()) ; SEXP expr = expr_ ;
        boost::scoped_ptr<Result> res( get_handler( expr, subsets, env ) ) ;
        RObject result ;
        if(res) {
            result = res->process( FullDataFrame(df) ) ;
        } else {
            result = CallProxy( lazy.expr(), subsets, env).eval() ;
        }
        if( Rf_length(result) != 1 ){
            stop( "expecting result of length one, got : %d", Rf_length(result) ) ;
        }
        accumulator.set(lazy.name(), result);
        subsets.input( lazy.name(), result ) ;
    }

    return tbl_cpp( accumulator, 1 ) ;
}
Пример #5
0
void check_not_groups(const LazyDots& dots, const GroupedDataFrame& gdf) {
  int n = dots.size();
  for (int i=0; i<n; i++) {
    if (gdf.has_group(dots[i].name()))
      stop("cannot modify grouping variable");
  }
}
Пример #6
0
SEXP summarise_not_grouped(DataFrame df, const LazyDots& dots){
    int nexpr = dots.size() ;
    if( nexpr == 0) return DataFrame() ;
    
    LazySubsets subsets( df ) ;
    std::vector<SEXP> results ;
    NamedListAccumulator<DataFrame> accumulator ;

    Rcpp::Shelter<SEXP> __ ;
    for( int i=0; i<nexpr; i++){
        Rcpp::checkUserInterrupt() ;
        
        const Lazy& lazy = dots[i] ;
        Environment env = lazy.env() ;
        Result* res = get_handler( lazy.expr(), subsets, env ) ;
        
        SEXP result ;
        if(res) {
            result = __(res->process( FullDataFrame(df) )) ;
        } else {
            result = __(CallProxy( lazy.expr(), subsets, env).eval()) ;
        }
        delete res ;
        if( Rf_length(result) != 1 ){
            stop( "expecting result of length one, got : %d", Rf_length(result) ) ;
        }
        accumulator.set(lazy.name(), result);
        subsets.input( Symbol(lazy.name()), result ) ;
    }

    return tbl_cpp( accumulator, 1 ) ;
}
Пример #7
0
// [[Rcpp::export]]
SEXP slice_impl(DataFrame df, LazyDots dots) {
  if (dots.size() == 0) return df;
  if (dots.size() != 1)
    stop("slice only accepts one expression");
  if (is<GroupedDataFrame>(df)) {
    return slice_grouped(GroupedDataFrame(df), dots);
  } else {
    return slice_not_grouped(df, dots);
  }
}
Пример #8
0
DataFrame filter_not_grouped( DataFrame df, const LazyDots& dots){
    CharacterVector names = df.names() ;
    SymbolSet set ;
    for( int i=0; i<names.size(); i++){
        set.insert( Rf_installChar( names[i] ) ) ;
    }
    if( dots.single_env() ){
        Environment env = dots[0].env() ;
        // a, b, c ->  a & b & c
        Shield<SEXP> call( and_calls( dots, set, env ) ) ;

        // replace the symbols that are in the data frame by vectors from the data frame
        // and evaluate the expression
        CallProxy proxy( (SEXP)call, df, env ) ;
        LogicalVector test = check_filter_logical_result(proxy.eval()) ;

        if( test.size() == 1){
            if( test[0] == TRUE ){
                return df ;
            } else {
                return empty_subset(df, df.names(), classes_not_grouped()) ;
            }
        } else {
            check_filter_result(test, df.nrows());
            return subset(df, test, classes_not_grouped() ) ;
        }
    } else {
        int nargs = dots.size() ;

        Call call(dots[0].expr());
        CallProxy first_proxy(call, df, dots[0].env() ) ;
        LogicalVector test = check_filter_logical_result(first_proxy.eval()) ;
        if( test.size() == 1 ) {
            if( !test[0] ){
                return empty_subset(df, df.names(), classes_not_grouped() ) ;
            }
        } else {
            check_filter_result(test, df.nrows());
        }

        for( int i=1; i<nargs; i++){
            Rcpp::checkUserInterrupt() ;

            Call call( dots[i].expr() ) ;
            CallProxy proxy(call, df, dots[i].env() ) ;
            LogicalVector test2 = check_filter_logical_result(proxy.eval()) ;
            if( combine_and(test, test2) ){
                return empty_subset(df, df.names(), classes_not_grouped() ) ;
            }
        }

        DataFrame res = subset( df, test, classes_not_grouped() ) ;
        return res ;
    }
}
Пример #9
0
// [[Rcpp::export]]
SEXP mutate_impl(DataFrame df, LazyDots dots) {
  if (dots.size() == 0) return df;
  check_valid_colnames(df);
  if (is<RowwiseDataFrame>(df)) {
    return mutate_grouped<RowwiseDataFrame, LazyRowwiseSubsets>(df, dots);
  } else if (is<GroupedDataFrame>(df)) {
    return mutate_grouped<GroupedDataFrame, LazyGroupedSubsets>(df, dots);
  } else {
    return mutate_not_grouped(df, dots);
  }
}
Пример #10
0
SEXP and_calls( const LazyDots& dots, const SymbolSet& set, const Environment& env ){
    int ncalls = dots.size() ;
    if( !ncalls ) {
        stop("incompatible input") ;
    }

    Rcpp::Armor<SEXP> res( assert_correct_filter_subcall(dots[0].expr(), set, env) ) ;
    SEXP and_symbol = Rf_install( "&" ) ;
    for( int i=1; i<ncalls; i++)
        res = Rcpp_lang3( and_symbol, res, assert_correct_filter_subcall(dots[i].expr(), set, env) ) ;
    
    return res ;
}
Пример #11
0
DataFrame filter_grouped_multiple_env( const Data& gdf, const LazyDots& dots){
    const DataFrame& data = gdf.data() ;
    CharacterVector names = data.names() ;
    SymbolSet set ;
    for( int i=0; i<names.size(); i++){
        set.insert( Rf_installChar( names[i] ) ) ;
    }

    int nrows = data.nrows() ;
    LogicalVector test(nrows, TRUE);

    LogicalVector g_test ;

    for( int k=0; k<dots.size(); k++){
        Rcpp::checkUserInterrupt() ;
        const Lazy& lazy = dots[k] ;

        Call call( lazy.expr() ) ;
        GroupedCallProxy<Data, Subsets> call_proxy( call, gdf, lazy.env() ) ;
        int ngroups = gdf.ngroups() ;
        typename Data::group_iterator git = gdf.group_begin() ;
        for( int i=0; i<ngroups; i++, ++git){
            SlicingIndex indices = *git ;
            int chunk_size = indices.size() ;

            g_test  = check_filter_logical_result(call_proxy.get( indices ));
            if( g_test.size() == 1 ){
                if( g_test[0] != TRUE ){
                    for( int j=0; j<chunk_size; j++){
                        test[indices[j]] = FALSE ;
                    }
                }
            } else {
                check_filter_result(g_test, chunk_size ) ;
                for( int j=0; j<chunk_size; j++){
                    if( g_test[j] != TRUE ){
                        test[ indices[j] ] = FALSE ;
                    }
                }
            }
        }
    }
    DataFrame res = subset( data, test, names, classes_grouped<Data>() ) ;
    res.attr( "vars") = data.attr("vars") ;

    return res ;
}
Пример #12
0
// [[Rcpp::export]]
List arrange_impl( DataFrame data, LazyDots dots ){
    if( data.size() == 0 ) return data ;
    check_valid_colnames(data) ;
    assert_all_white_list(data) ;

    // special case arrange() with no arguments for grouped data
    if( dots.size() == 0 && is<GroupedDataFrame>(data) ){
        GroupedDataFrame gdata(data) ;
        data = gdata.data() ;
        DataFrame labels( data.attr( "labels" ) );
        OrderVisitors o(labels) ;
        IntegerVector index = o.apply() ;

        // reorganize
        labels = DataFrameSubsetVisitors( labels, labels.names() ).subset( index, labels.attr("class") );

        ListOf<IntegerVector> indices( data.attr("indices") ) ;
        int ngroups = indices.size() ;
        List new_indices(ngroups) ;
        IntegerVector master_index(data.nrows()) ;
        for( int i=0; i<ngroups; i++){
            new_indices[index[i]] = indices[i] ;
        }
        IntegerVector group_sizes = data.attr("group_sizes") ;
        IntegerVector new_group_sizes(ngroups);
        for( int i=0, k=0; i<ngroups; i++){
            IntegerVector idx = new_indices[i] ;
            IntegerVector new_group_index = seq(k, k + idx.size() - 1 );
            for( int j=0; j<idx.size(); j++, k++){
                master_index[k] = idx[j] ;
            }
            new_indices[i] = new_group_index ;
            new_group_sizes[i] = idx.size() ;
        }

        DataFrame res = DataFrameSubsetVisitors( data, data.names() ).subset( master_index, data.attr("class" ) ) ;
        res.attr( "labels" )  = labels ;
        res.attr( "indices" ) = new_indices ;
        res.attr( "vars"    ) = data.attr("vars" ) ;
        res.attr( "group_sizes" ) = new_group_sizes ;
        res.attr( "biggest_group_size" ) = data.attr("biggest_group_size") ;
        res.attr( "drop" ) = data.attr("drop") ;
        return res ;
    }

    if( dots.size() == 0 || data.nrows() == 0) return data ;

    int nargs = dots.size() ;
    if( is<GroupedDataFrame>(data) ){
        nargs += GroupedDataFrame(data).nvars() ;
    }
    List variables(nargs) ;
    LogicalVector ascending(nargs) ;

    int k = 0 ;
    if( is<GroupedDataFrame>(data) ){
        GroupedDataFrame gdf(data);
        for( ; k< gdf.nvars(); k++) {
            ascending[k] = true ;

            String s = PRINTNAME(gdf.symbol(k));
            variables[k] = data[s] ;
        }
    }

    for(int i=0; k<nargs; i++, k++){
        const Lazy& lazy = dots[i] ;

        Shield<SEXP> call_( lazy.expr() ) ;
        SEXP call = call_ ;
        bool is_desc = TYPEOF(call) == LANGSXP && Rf_install("desc") == CAR(call) ;

        CallProxy call_proxy(is_desc ? CADR(call) : call, data, lazy.env()) ;

        Shield<SEXP> v(call_proxy.eval()) ;
        if( !white_list(v) ){
            stop( "cannot arrange column of class '%s'", get_single_class(v) ) ;
        }

        if( Rf_inherits(v, "data.frame" ) ){
            DataFrame df(v) ;
            int nr = df.nrows() ;
            if( nr != data.nrows() ){
                stop( "data frame column with incompatible number of rows (%d), expecting : %d", nr, data.nrows() );
            }
        } else if( Rf_isMatrix(v) ) {
            SEXP dim = Rf_getAttrib(v, Rf_install( "dim" ) ) ;
            int nr = INTEGER(dim)[0] ;
            if( nr != data.nrows() ){
                stop( "matrix column with incompatible number of rows (%d), expecting : ", nr, data.nrows() ) ;
            }
        } else {
            if( Rf_length(v) != data.nrows() ){
                stop( "incorrect size (%d), expecting : %d", Rf_length(v), data.nrows() ) ;
            }
        }
        variables[k] = v ;
        ascending[k] = !is_desc ;
    }
    OrderVisitors o(variables, ascending, nargs) ;
    IntegerVector index = o.apply() ;

    DataFrameSubsetVisitors visitors( data, data.names() ) ;
    List res = visitors.subset(index, data.attr("class") ) ;

    if( is<GroupedDataFrame>(data) ){
        // so that all attributes are recalculated (indices ... )
        // see the lazyness feature in GroupedDataFrame
        // if we don't do that, we get the values of the un-arranged data
        // set for free from subset (#1064)
        res.attr("labels") = R_NilValue ;
        res.attr( "vars" )  = data.attr("vars" ) ;
        return GroupedDataFrame(res).data() ;
    }
    SET_ATTRIB(res, strip_group_attributes(res));
    return res ;
}
Пример #13
0
SEXP mutate_not_grouped(DataFrame df, const LazyDots& dots) {
  int nexpr = dots.size();
  int nrows = df.nrows();

  NamedListAccumulator<DataFrame> accumulator;
  int nvars = df.size();
  if (nvars) {
    CharacterVector df_names = df.names();
    for (int i=0; i<nvars; i++) {
      accumulator.set(Symbol(df_names[i]), df[i]);
    }
  }

  CallProxy call_proxy(df);
  List results(nexpr);

  for (int i=0; i<nexpr; i++) {
    Rcpp::checkUserInterrupt();
    const Lazy& lazy = dots[i];

    Shield<SEXP> call_(lazy.expr());
    SEXP call = call_;
    Symbol name = lazy.name();
    Environment env = lazy.env();
    call_proxy.set_env(env);

    if (TYPEOF(call) == SYMSXP) {
      if (call_proxy.has_variable(call)) {
        results[i] = call_proxy.get_variable(PRINTNAME(call));
      } else {
        results[i] = shared_SEXP(env.find(CHAR(PRINTNAME(call))));
      }
    } else if (TYPEOF(call) == LANGSXP) {
      call_proxy.set_call(call);
      results[i] = call_proxy.eval();
    } else if (Rf_length(call) == 1) {
      boost::scoped_ptr<Gatherer> gather(constant_gatherer(call, nrows));
      results[i] = gather->collect();
    } else if (Rf_isNull(call)) {
      accumulator.rm(name);
      continue;
    } else {
      stop("cannot handle");
    }

    check_supported_type(results[i], name.c_str());

    if (Rf_inherits(results[i], "POSIXlt")) {
      stop("`mutate` does not support `POSIXlt` results");
    }
    int n_res = Rf_length(results[i]);
    if (n_res == nrows) {
      // ok
    } else if (n_res == 1) {
      // recycle
      boost::scoped_ptr<Gatherer> gather(constant_gatherer(results[i] , df.nrows()));
      results[i] = gather->collect();
    } else {
      stop("wrong result size (%d), expected %d or 1", n_res, nrows);
    }

    call_proxy.input(name, results[i]);
    accumulator.set(name, results[i]);
  }
  List res = structure_mutate(accumulator, df, classes_not_grouped());

  return res;
}
Пример #14
0
SEXP mutate_grouped(const DataFrame& df, const LazyDots& dots) {
  LOG_VERBOSE << "checking zero rows";

  // special 0 rows case
  if (df.nrows() == 0) {
    DataFrame res = mutate_not_grouped(df, dots);
    res.attr("vars") = df.attr("vars");
    res.attr("class") = df.attr("class");
    return Data(res).data();
  }

  LOG_VERBOSE << "initializing proxy";

  typedef GroupedCallProxy<Data, Subsets> Proxy;
  Data gdf(df);
  int nexpr = dots.size();
  check_not_groups(dots, gdf);

  Proxy proxy(gdf);

  LOG_VERBOSE << "copying data to accumulator";

  NamedListAccumulator<Data> accumulator;
  int ncolumns = df.size();
  CharacterVector column_names = df.names();
  for (int i=0; i<ncolumns; i++) {
    accumulator.set(Symbol(column_names[i]), df[i]);
  }

  LOG_VERBOSE << "processing " << nexpr << " variables";

  List variables(nexpr);
  for (int i=0; i<nexpr; i++) {
    Rcpp::checkUserInterrupt();
    const Lazy& lazy = dots[i];

    Environment env = lazy.env();
    Shield<SEXP> call_(lazy.expr());
    SEXP call = call_;
    Symbol name = lazy.name();
    proxy.set_env(env);

    LOG_VERBOSE << "processing " << CharacterVector(name);

    if (TYPEOF(call) == LANGSXP || TYPEOF(call) == SYMSXP) {
      proxy.set_call(call);
      boost::scoped_ptr<Gatherer> gather(gatherer<Data, Subsets>(proxy, gdf, name));
      SEXP variable = variables[i] = gather->collect();
      proxy.input(name, variable);
      accumulator.set(name, variable);
    } else if (Rf_length(call) == 1) {
      boost::scoped_ptr<Gatherer> gather(constant_gatherer(call, gdf.nrows()));
      SEXP variable = variables[i] = gather->collect();
      proxy.input(name, variable);
      accumulator.set(name, variable);
    } else if (Rf_isNull(call)) {
      accumulator.rm(name);
      continue;
    } else {
      stop("cannot handle");
    }
  }

  return structure_mutate(accumulator, df, df.attr("class"));
}
Пример #15
0
SEXP summarise_grouped(const DataFrame& df, const LazyDots& dots){
    Data gdf(df) ;

    int nexpr = dots.size() ;
    int nvars = gdf.nvars() ;
    check_not_groups(dots, gdf);
    NamedListAccumulator<Data> accumulator ;

    int i=0;
    List results(nvars + nexpr) ;
    for( ; i<nvars; i++){
        results[i] = shared_SEXP(gdf.label(i)) ;
        accumulator.set( PRINTNAME(gdf.symbol(i)), results[i] ) ;
    }

    Subsets subsets(gdf) ;
    for( int k=0; k<nexpr; k++, i++ ){
        Rcpp::checkUserInterrupt() ;
        const Lazy& lazy = dots[k] ;
        const Environment& env = lazy.env() ;

        Shield<SEXP> expr_(lazy.expr()) ; SEXP expr = expr_ ;
        boost::scoped_ptr<Result> res( get_handler( expr, subsets, env ) );
        
        // if we could not find a direct Result
        // we can use a GroupedCallReducer which will callback to R
        if( !res ) {
            res.reset( new GroupedCallReducer<Data, Subsets>( lazy.expr(), subsets, env) );
        }
        RObject result = res->process(gdf)  ;
        results[i] = result ;
        accumulator.set( lazy.name(), result );
        subsets.input( lazy.name(), SummarisedVariable(result) ) ;

    }

    List out = accumulator ;
    copy_most_attributes( out, df) ;
    out.names() = accumulator.names() ;

    int nr = gdf.ngroups() ;
    set_rownames(out, nr ) ;

    if( gdf.nvars() > 1){
        out.attr( "class" ) = classes_grouped<Data>()  ;
        List vars = gdf.data().attr("vars") ;
        vars.erase( gdf.nvars() - 1) ;
        out.attr( "vars") = vars ;
        out.attr( "labels") = R_NilValue ;
        out.attr( "indices") = R_NilValue ;
        out.attr( "group_sizes") = R_NilValue ;
        out.attr( "biggest_group_size") = R_NilValue ;

        out.attr( "drop" ) = true ;
    } else {
        out.attr( "class" ) = classes_not_grouped()  ;
        SET_ATTRIB( out, strip_group_attributes(out) ) ;
    }

    return out ;
}