Example #1
0
// Subsets the data frame held by `gdf` with the logical vector `test`,
// copies the "vars" attribute from the source data onto the result, calls
// strip_index() on it, and returns the data of a `Data` built from the result.
inline DataFrame grouped_subset( const Data& gdf, const LogicalVector& test, const CharacterVector& names, CharacterVector classes){
  DataFrame source = gdf.data() ;
  DataFrame filtered = subset( source, test, names, classes) ;
  filtered.attr("vars") = source.attr("vars") ;
  strip_index(filtered);
  return Data(filtered).data() ;
}
Example #2
0
// Computes the grouping metadata of `data` from its grouping variables
// (read through get_vars) and returns `data` classed as a grouped_df.
//
// Stops with an error when a grouping variable does not match a column of
// `data`, or when the matched column fails white_list() or is a list (VECSXP).
// On success the attributes "indices", "group_sizes", "biggest_group_size"
// and "labels" are set; groups follow the order given by OrderVisitors on
// the labels.
DataFrame build_index_cpp(DataFrame data) {
  SymbolVector group_vars(get_vars(data));
  const int n_group_vars = group_vars.size();

  CharacterVector column_names = data.names();
  IntegerVector positions = group_vars.match_in_table(column_names);

  // Validate every grouping variable before building anything.
  for (int k = 0; k < n_group_vars; ++k) {
    const int position = positions[k];
    if (position == NA_INTEGER) {
      stop("unknown column '%s' ", group_vars[k].get_utf8_cstring());
    }

    // the matched position is shifted by one to index into `data`
    SEXP column = data[position - 1];

    const bool groupable = white_list(column) && TYPEOF(column) != VECSXP;
    if (!groupable) {
      stop(
        "cannot group column %s, of class '%s'",
        group_vars[k].get_utf8_cstring(),
        get_single_class(column));
    }
  }

  DataFrameVisitors visitors(data, group_vars);
  ChunkIndexMap group_map(visitors);

  train_push_back(group_map, data.nrows());

  DataFrame labels = DataFrameSubsetVisitors(data, group_vars).subset(group_map, "data.frame");
  const int ngroups = labels.nrows();
  IntegerVector sorted_order = OrderVisitors(labels).apply();

  labels = DataFrameSubsetVisitors(labels).subset(sorted_order, "data.frame");

  // Remember where each group's row chunk lives, in map iteration order,
  // so that chunks can be picked up by position below.
  std::vector<const std::vector<int>* > chunk_ptrs(ngroups);
  ChunkIndexMap::const_iterator entry = group_map.begin();
  for (int g = 0; g < ngroups; ++g, ++entry) {
    chunk_ptrs[g] = &entry->second;
  }

  // Emit the chunks in the order of the sorted labels.
  List indices(ngroups);
  IntegerVector group_sizes = no_init(ngroups);
  int biggest_group = 0;
  for (int g = 0; g < ngroups; ++g) {
    const std::vector<int>& rows = *chunk_ptrs[sorted_order[g]];
    const int n_rows = static_cast<int>(rows.size());
    indices[g] = rows;
    group_sizes[g] = n_rows;
    biggest_group = std::max(biggest_group, n_rows);
  }

  data.attr("indices") = indices;
  data.attr("group_sizes") = group_sizes;
  data.attr("biggest_group_size") = biggest_group;
  data.attr("labels") = labels;
  set_class(data, CharacterVector::create("grouped_df", "tbl_df", "tbl", "data.frame"));
  return data;
}
Example #3
0
// Reorders the rows of `data` according to the expressions in `dots`.
//
// Each expression is evaluated against `data` in its own environment; an
// expression of the form desc(x) sorts by x in descending order. The input is
// returned unchanged when it has no columns, no rows, or when `dots` is empty.
// Stops with an error when an expression evaluates to a value rejected by
// white_list(), to a matrix, or to something whose size differs from the
// number of rows of `data`.
// [[Rcpp::export]]
List arrange_impl( DataFrame data, LazyDots dots ){
    // nothing to sort when there are no columns
    if( data.size() == 0 ) return data ;
    check_valid_colnames(data) ;
    assert_all_white_list(data) ;

    if( dots.size() == 0 || data.nrows() == 0) return data ;

    int nargs = dots.size() ;
    List variables(nargs) ;
    LogicalVector ascending(nargs) ;

    for(int i=0; i<nargs; i++){
        const Lazy& lazy = dots[i] ;

        // keep the expression protected while it is inspected and evaluated
        Shield<SEXP> call_( lazy.expr() ) ;
        SEXP call = call_ ;
        // a call whose head is the symbol `desc` requests descending order
        bool is_desc = TYPEOF(call) == LANGSXP && Rf_install("desc") == CAR(call) ;

        // for desc(x) only the argument x is evaluated
        CallProxy call_proxy(is_desc ? CADR(call) : call, data, lazy.env()) ;

        Shield<SEXP> v(call_proxy.eval()) ;
        if( !white_list(v) ){
            stop( "cannot arrange column of class '%s'", get_single_class(v) ) ;
        }

        // the sort key must have one entry per row of the data
        if( Rf_inherits(v, "data.frame" ) ){
            DataFrame df(v) ;
            int nr = df.nrows() ;
            if( nr != data.nrows() ){
                stop( "data frame column with incompatible number of rows (%d), expecting : %d", nr, data.nrows() );
            }
        } else if( Rf_isMatrix(v) ) {
            stop( "can't arrange by a matrix" ) ;
        } else {
            if( Rf_length(v) != data.nrows() ){
                stop( "incorrect size (%d), expecting : %d", Rf_length(v), data.nrows() ) ;
            }
        }
        variables[i] = v ;
        ascending[i] = !is_desc ;
    }
    // compute the row permutation from all sort keys
    OrderVisitors o(variables, ascending, nargs) ;
    IntegerVector index = o.apply() ;

    DataFrameSubsetVisitors visitors( data, data.names() ) ;
    List res = visitors.subset(index, data.attr("class") ) ;

    if( is<GroupedDataFrame>(data) ){
        // so that all attributes are recalculated (indices ... )
        // see the laziness feature in GroupedDataFrame
        // if we don't do that, we get the values of the un-arranged data
        // set for free from subset (#1064)
        res.attr("labels") = R_NilValue ;
        res.attr( "vars" )  = data.attr("vars" ) ;
        return GroupedDataFrame(res).data() ;
    }
    // ungrouped input: replace the attributes with the result of strip_group_attributes()
    SET_ATTRIB(res, strip_group_attributes(res));
    return res ;
}
Example #4
0
// Converts the accumulated columns into a list with class `classes`, row
// names set for df.nrows() rows, and the "vars", "labels", "index" and
// "indices" attributes copied from `df` (in that order).
SEXP structure_mutate( const NamedListAccumulator<SEXP>& accumulator, const DataFrame& df, CharacterVector classes){
    List out = accumulator ;
    out.attr("class") = classes ;
    set_rownames( out, df.nrows() ) ;

    // attributes carried over verbatim from the input data frame
    static const char* const carried[] = { "vars", "labels", "index", "indices" } ;
    const int ncarried = sizeof(carried) / sizeof(carried[0]) ;
    for( int a=0; a<ncarried; ++a){
        out.attr( carried[a] ) = df.attr( carried[a] ) ;
    }

    return out ;
}
Example #5
0
// Filters a grouped data frame when all filter expressions share the
// environment `env`.
//
// The expressions in `args` are combined into one call by and_calls(), the
// call is evaluated once per group, and each group's logical result is
// written to the rows of that group before the data is subset. The result
// keeps the "vars" attribute of the input.
DataFrame filter_grouped_single_env( const GroupedDataFrame& gdf, const List& args, const Environment& env){
    const DataFrame& data = gdf.data() ;
    CharacterVector names = data.names() ;

    // symbols of every column of the data, handed to and_calls()
    SymbolSet column_symbols ;
    const int ncols = names.size() ;
    for( int c=0; c<ncols; ++c){
        column_symbols.insert( Rf_install( names[c] ) ) ;
    }

    // a, b, c ->  a & b & c
    Call predicate( and_calls( args, column_symbols ) ) ;

    // every slot is written in the group loop below, hence no initialisation
    LogicalVector keep = no_init( data.nrows() ) ;

    LogicalVector group_result ;
    GroupedCallProxy call_proxy( predicate, gdf, env ) ;

    const int ngroups = gdf.ngroups() ;
    GroupedDataFrame::group_iterator git = gdf.group_begin() ;
    for( int g=0; g<ngroups; ++g, ++git){
        SlicingIndex rows = *git ;
        const int group_size = rows.size() ;

        group_result = call_proxy.get( rows ) ;
        check_filter_result( group_result, group_size ) ;
        for( int r=0; r<group_size; ++r){
            keep[ rows[r] ] = group_result[r] ;
        }
    }

    DataFrame res = subset( data, keep, names, classes_grouped() ) ;
    res.attr( "vars" ) = data.attr( "vars" ) ;

    return res ;
}
Example #6
0
// Version of the grouped filter used when the contributions to ... come from
// several environments.
//
// Each expression in `args` is evaluated per group in its own environment
// (dots.envir(k)); a row is kept only if every expression yields TRUE for it.
// The result keeps the "vars" attribute of the input.
DataFrame filter_grouped_multiple_env( const GroupedDataFrame& gdf, const List& args, const DataDots& dots){
    const DataFrame& data = gdf.data() ;
    CharacterVector names = data.names() ;

    int nrows = data.nrows() ;
    // start with every row kept; expressions can only switch rows off
    LogicalVector test(nrows, TRUE);

    LogicalVector g_test ;

    for( int k=0; k<args.size(); k++){
        Call call( (SEXP)args[k] ) ;
        GroupedCallProxy call_proxy( call, gdf, dots.envir(k) ) ;
        int ngroups = gdf.ngroups() ;
        GroupedDataFrame::group_iterator git = gdf.group_begin() ;
        for( int i=0; i<ngroups; i++, ++git){
            SlicingIndex indices = *git ;
            int chunk_size = indices.size() ;

            g_test  = call_proxy.get( indices );
            check_filter_result(g_test, chunk_size ) ;
            for( int j=0; j<chunk_size; j++){
                // accumulate the conjunction of all expressions for this row
                test[ indices[j] ] = test[ indices[j] ] & g_test[j] ;
            }
        }
    }
    DataFrame res = subset( data, test, names, classes_grouped() ) ;
    res.attr( "vars")   = data.attr("vars") ;

    return res ;
}
Example #7
0
// Assembles the result of a join: all columns of `x` taken at `indices_x`,
// followed by the columns of `y` that are not in `by`, taken at `indices_y`.
//
// Column names go through JoinColumnSuffixer with the ".x" / ".y" suffixes.
// The result gets class `classes`, row names for indices_x.size() rows, and
// the "vars" attribute of `x` when that attribute is not NULL.
DataFrame subset( DataFrame x, DataFrame y, const Index& indices_x, const Index& indices_y, CharacterVector by, CharacterVector classes ){
    CharacterVector x_columns = x.names() ;
    DataFrameVisitors visitors_x(x, x_columns) ;

    // only the non-key columns of y are appended
    CharacterVector all_y_columns = y.names() ;
    CharacterVector y_columns = setdiff( all_y_columns, by ) ;
    JoinColumnSuffixer suffixer(x_columns, y_columns, by) ;

    DataFrameVisitors visitors_y(y, y_columns) ;

    const int nv_x = visitors_x.size() ;
    const int nv_y = visitors_y.size() ;
    List out(nv_x+nv_y);
    CharacterVector out_names(nv_x+nv_y) ;

    // `slot` is the running position in the output
    int slot = 0 ;
    for( int i=0; i<nv_x; ++i, ++slot){
        out[slot] = visitors_x.get(i)->subset(indices_x) ;
        out_names[slot] = suffixer.get( x_columns[i], ".x" ) ;
    }
    for( int j=0; j<nv_y; ++j, ++slot){
        out[slot] = visitors_y.get(j)->subset(indices_y) ;
        out_names[slot] = suffixer.get( y_columns[j], ".y" ) ;
    }

    out.attr("class") = classes ;
    const int nrows = indices_x.size() ;
    set_rownames(out, nrows) ;
    out.names() = out_names ;

    SEXP group_vars = x.attr( "vars" ) ;
    if( !Rf_isNull(group_vars) ){
        out.attr( "vars" ) = group_vars ;
    }

    return (SEXP)out ;
}
Example #8
0
// Returns a shallow copy of `df` whose attributes are those produced by
// strip_group_attributes(df), with the object bit copied from `df` and the
// class set to plain "data.frame".
// [[Rcpp::export]]
DataFrame as_regular_df(DataFrame df){
  DataFrame plain = shallow_copy(df) ;

  // replace the attribute list wholesale, then restore the object bit
  SET_ATTRIB(plain, strip_group_attributes(df)) ;
  SET_OBJECT(plain, OBJECT(df)) ;

  plain.attr("class") = CharacterVector::create("data.frame") ;
  return plain ;
}
Example #9
0
// Right join of `x` and `y` on the columns in `by`: every row of `y` appears
// in the result, paired with each matching row of `x`, or with a missing row
// of `x` (index -1) when nothing matches.
// [[Rcpp::export]]
DataFrame right_join_impl( DataFrame x, DataFrame y, CharacterVector by){
    typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ;
    DataFrameJoinVisitors visitors(x, y, by) ;
    Map x_rows(visitors);

    // the map is trained over the x.nrows() rows of x
    train_push_back( x_rows, x.nrows(), x.nrows() / 10 ) ;

    std::vector<int> indices_x ;
    std::vector<int> indices_y ;

    const int n_y = y.nrows() ;
    for( int row=0; row<n_y; ++row){
        // row `row` of y is looked up through the negative key -row-1
        Map::iterator match = x_rows.find(-row-1) ;
        if( match == x_rows.end() ){
            indices_x.push_back(-1) ; // mark NA
            indices_y.push_back(row) ;
            continue ;
        }
        // one output row per matching row of x
        push_back( indices_x,      match->second ) ;
        push_back( indices_y, row, match->second.size() ) ;
    }
    return subset( x, y, indices_x, indices_y, by, x.attr( "class" ) ) ;
}
Example #10
0
// Reorders the rows of `data` by the expressions in `args`, each evaluated
// against `data` in the environment dots.envir(i). An expression of the form
// desc(x) sorts by x in descending order. Stops with an error when an
// expression evaluates to something whose length differs from data.nrows().
// [[Rcpp::export]]
DataFrame arrange_impl( DataFrame data, List args, DataDots dots ){
    const int nargs = args.size() ;
    List variables(nargs) ;
    LogicalVector ascending(nargs) ;
    Shelter<SEXP> shelter ;

    for(int i=0; i<nargs; i++){
        SEXP expr = args[i] ;
        // a call whose head is the symbol `desc` requests descending order
        const bool descending = TYPEOF(expr) == LANGSXP && Rf_install("desc") == CAR(expr) ;
        SEXP sort_expr = descending ? CADR(expr) : expr ;

        CallProxy call_proxy( sort_expr, data, dots.envir(i)) ;
        variables[i] = shelter(call_proxy.eval()) ;

        const int len = Rf_length(variables[i]) ;
        if( len != data.nrows() ){
            std::stringstream msg ;
            msg << "incorrect size (" << len << "), expecting :" << data.nrows() ;
            stop(msg.str()) ;
        }
        ascending[i] = !descending ;
    }

    // row permutation derived from all sort keys
    OrderVisitors order(variables,ascending, nargs) ;
    IntegerVector index = order.apply() ;

    DataFrameVisitors visitors( data, data.names() ) ;
    DataFrame sorted = visitors.subset(index, data.attr("class") ) ;
    return sorted;
}
Example #11
0
// Semi join: returns the rows of `x` that have a match in `y` on the columns
// in `by`. Each group of matching x rows is collected at most once.
// [[Rcpp::export]]
DataFrame semi_join_impl( DataFrame x, DataFrame y, CharacterVector by){
    typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ;
    DataFrameJoinVisitors visitors(x, y, by) ;
    Map x_rows(visitors);

    // train the map in terms of x
    train_push_back( x_rows, x.nrows(), x.nrows() / 10) ;

    // indices of the rows of x that match some row of y
    std::vector<int> indices ;

    const int n_y = y.nrows() ;
    for( int row=0; row<n_y; ++row){
        // row `row` of y is looked up through the negative key -row-1
        Map::iterator match = x_rows.find(-row-1) ;
        if( match == x_rows.end() ) continue ;

        // collect the matching x rows, then drop the entry so that a later
        // y row with the same key does not collect them again
        push_back( indices, match->second ) ;
        x_rows.erase(match) ;
    }

    return subset(x, indices, x.names(), x.attr("class") ) ;
}
Example #12
0
// Returns the rows of `df` that are distinct with respect to the columns in
// `vars`, keeping the first occurrence of each combination. The result holds
// the columns named in `keep` and the class of `df`.
//
// `df` is returned unchanged when it has no columns or when `vars` is empty.
// [[Rcpp::export]]
SEXP distinct_impl(DataFrame df, CharacterVector vars, CharacterVector keep) {
  if (df.size() == 0)
    return df;

  // No vars means ungrouped data with keep_all = TRUE.
  if (vars.size() == 0)
    return df;

  // `vars` is known to be non-empty from here on.
  check_valid_colnames(df);
  DataFrameVisitors visitors(df, vars);

  std::vector<int> indices;
  VisitorSetIndexSet<DataFrameVisitors> set(visitors);

  int n = df.nrows();
  for (int i=0; i<n; i++) {
    // insert() reports whether row i was new to the set
    if (set.insert(i).second) {
      indices.push_back(i);
    }
  }

  return DataFrameSubsetVisitors(df, keep).subset(indices, df.attr("class"));
}
Example #13
0
// Filters a grouped data frame: the expressions in `args` are combined into
// a single call by and_calls(), evaluated once per group in `env`, and the
// per-group logical results decide which rows are kept. The result keeps the
// "vars" attribute of the input.
DataFrame filter_grouped( const GroupedDataFrame& gdf, List args, Environment env){
    // a, b, c ->  a & b & c
    Language predicate = and_calls( args ) ;

    const DataFrame& data = gdf.data() ;
    // every slot is written in the group loop below, hence no initialisation
    LogicalVector keep = no_init( data.nrows() ) ;

    LogicalVector group_result ;
    GroupedCallProxy call_proxy( predicate, gdf, env ) ;

    const int ngroups = gdf.ngroups() ;
    GroupedDataFrame::group_iterator git = gdf.group_begin() ;
    for( int g=0; g<ngroups; ++g, ++git){
        SlicingIndex rows = *git ;
        group_result = call_proxy.get( rows ) ;

        const int group_size = rows.size() ;
        for( int r=0; r<group_size; ++r){
            keep[ rows[r] ] = group_result[r] ;
        }
    }

    DataFrame res = subset( data, keep, data.names(), classes_grouped() ) ;
    res.attr( "vars" ) = data.attr( "vars" ) ;

    return res ;
}
Example #14
0
// Rebuilds `data` so that its rows are laid out group by group, in the order
// of the sorted group labels, and attaches the attributes "group_sizes",
// "biggest_group_size" and "labels". The grouping variables are the names of
// the "vars" attribute of `data`.
// [[Rcpp::export]]
DataFrame build_index_cpp( DataFrame data ){
    CharacterVector vars = Rf_getAttrib( data.attr( "vars" ), R_NamesSymbol ) ;

    DataFrameVisitors group_visitors(data, vars) ;
    ChunkIndexMap group_map( group_visitors ) ;
    train_push_back( group_map, data.nrows() ) ;

    DataFrame labels = group_visitors.subset( group_map, "data.frame") ;
    const int ngroups = labels.nrows() ;

    OrderVisitors label_sorter( labels, vars ) ;
    IntegerVector sorted = label_sorter.apply() ;

    // remember each group's row chunk in map iteration order
    std::vector< const std::vector<int>* > chunk_ptrs(ngroups) ;
    ChunkIndexMap::const_iterator entry = group_map.begin() ;
    for( int g=0; g<ngroups; ++g, ++entry){
        chunk_ptrs[g] = &entry->second ;
    }

    // concatenate the chunks in the order of the sorted labels
    IntegerVector group_sizes = no_init( ngroups );
    int biggest_group = 0 ;
    std::vector<int> row_order ;
    row_order.reserve( data.nrows() );
    for( int g=0; g<ngroups; ++g){
        const std::vector<int>& rows = *chunk_ptrs[ sorted[g] ] ;
        const int n_rows = static_cast<int>( rows.size() ) ;
        push_back( row_order, rows ) ;
        biggest_group = std::max( biggest_group, n_rows );
        group_sizes[g] = n_rows ;
    }

    DataFrameVisitors all_columns(data, data.names() ) ;
    data = all_columns.subset( row_order, classes_grouped() ) ;

    // TODO: we own labels, so perhaps we can do an inplace sort,
    //       to reuse its memory instead of creating a new data frame
    DataFrameVisitors labels_visitors( labels, vars) ;

    labels = labels_visitors.subset( sorted, "data.frame" ) ;
    labels.attr( "vars" ) = R_NilValue ;

    data.attr( "group_sizes") = group_sizes ;
    data.attr( "biggest_group_size" ) = biggest_group ;
    data.attr( "labels" ) = labels ;
    return data ;
}
Example #15
0
// Evaluates each expression of `args` group-wise over `gdf` in `env`, feeds
// every collected column back into the call proxy under its name from
// args.names(), and returns the structured result with the "vars", "labels"
// and "index" attributes of the input data.
SEXP mutate_grouped(GroupedDataFrame gdf, List args, Environment env){
    const DataFrame& df = gdf.data() ;

    const int nexpr = args.size() ;
    CharacterVector results_names = args.names() ;

    GroupedCallProxy proxy(gdf, env) ;
    Shelter<SEXP> shelter ;

    for( int k=0; k<nexpr; ++k){
        proxy.set_call( args[k] );
        boost::scoped_ptr<Gatherer> gather( gatherer( proxy, gdf ) );
        // the collected column is registered with the shelter before use
        SEXP column = shelter( gather->collect() ) ;
        proxy.input( results_names[k], column ) ;
    }

    DataFrame res = structure_mutate( proxy, df, results_names, classes_grouped() ) ;
    res.attr( "vars" )   = df.attr( "vars" ) ;
    res.attr( "labels" ) = df.attr( "labels" ) ;
    res.attr( "index" )  = df.attr( "index" ) ;

    return res ;
}
Example #16
0
// Union of the rows of `x` and `y`. Stops with "not compatible" when
// compatible_data_frame(x, y) is false. The result has the class of `x`.
// [[Rcpp::export]]
DataFrame union_data_frame( DataFrame x, DataFrame y){
    if( !compatible_data_frame(x,y) ){
        stop( "not compatible" );
    }

    typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set ;
    DataFrameJoinVisitors visitors(x, y, x.names() ) ;
    Set distinct_rows(visitors);

    // rows of x first, then rows of y
    train_insert( distinct_rows, x.nrows() ) ;
    train_insert_right( distinct_rows, y.nrows() ) ;

    return visitors.subset( distinct_rows, x.attr("class") ) ;
}
Example #17
0
/**
 * - creates water reservoir from given vectors of variables and options
 *
 * Each variable named in var_names is read from the column of `reser` with
 * that name, or filled with zeros (one per row) when the column is absent.
 * A column "I", when present, replaces the inflow variable. The constructor
 * also reads the "minutes" column and the "area", "plant_cover" (optional,
 * defaults to 0) and "eas" attributes of `reser`.
 */
wateres::wateres(
  DataFrame reser, vector<double> storage, bool throw_exceed, double volume) : storage(storage),
  throw_exceed(throw_exceed), volume(volume)
{
  const unsigned n_rows = reser.nrows();
  const vector<string> columns = as<vector<string> >(reser.attr("names"));
  var.resize(var_count);

  for (unsigned v = 0; v < var_count; v++) {
    const bool present = find(columns.begin(), columns.end(), var_names[v]) != columns.end();
    if (present)
      var[v] = as<vector<double> >(reser[var_names[v]]);
    else
      var[v].resize(n_rows, 0);
  }

  //custom inflow from another reservoir instead of natural inflow
  const bool custom_inflow = find(columns.begin(), columns.end(), "I") != columns.end();
  if (custom_inflow)
    var[INFLOW] = as<vector<double> >(reser["I"]);

  this->minutes = as<vector<unsigned> >(reser["minutes"]);
  area = as<double>(reser.attr("area"));

  // lookup table: plant cover values and the coefficient paired with each
  const double cover_knots[5] = { 0, 0.1, 0.3, 0.5, 0.75 };
  plant_covers.assign(cover_knots, cover_knots + 5);
  const double coeff_knots[5] = { 1, 1.03, 1.08, 1.14, 1.22 };
  plant_coeffs.assign(coeff_knots, coeff_knots + 5);

  Rcpp::Nullable<double> cover_attr = as<Rcpp::Nullable<double> >(reser.attr("plant_cover"));
  plant_cover = cover_attr.isNotNull() ? as<double>(cover_attr) : 0;
  plant_coeff = interpolate_linear(plant_covers, plant_coeffs, plant_cover);

  eas = as<DataFrame>(reser.attr("eas"));
  transfer_add = true;
}
Example #18
0
    // Builds a data frame with one row per entry of
    // resultsData.queryStemOrder: column "qTF" holds the value stored in
    // resultsData.queryStems for that term, column "qIndex" the value stored
    // in resultsData.queryStemIndex. The terms themselves become the row names.
    SEXP getQueryStats(){
        vector<int> termFreqs;
        vector<int> termIndexes;
        for(int pos=0; pos < resultsData.queryStemOrder.size(); ++pos ){
            string stem = resultsData.queryStemOrder.at(pos);
            termFreqs.push_back(resultsData.queryStems[stem]);
            termIndexes.push_back(resultsData.queryStemIndex[stem]);
        }

        DataFrame stats = DataFrame::create( Named("qTF")= termFreqs,
                                Named("qIndex") = termIndexes);
        stats.attr("row.names") = resultsData.queryStemOrder;
        return stats;
    }
Example #19
0
// Returns the distinct rows of `df`, keeping the first occurrence of each
// row. The result has the class of `df`.
// [[Rcpp::export]]
SEXP distinct_impl( DataFrame df ){
    DataFrameVisitors visitors(df) ;
    VisitorSetIndexSet<DataFrameVisitors> seen(visitors) ;

    std::vector<int> first_rows ;
    const int nrows = df.nrows() ;
    for( int row=0; row<nrows; ++row){
        // insert() reports whether this row was new to the set
        const bool is_new = seen.insert(row).second ;
        if( is_new ) first_rows.push_back(row) ;
    }
    return visitors.subset(first_rows, df.attr("class") );
}
Example #20
0
    // Builds a data frame of per-term statistics with the columns
    // "DocFreq" (resultsData.dfVector), "IDF" and "cTF"
    // (resultsData.ctfVector); `terms` become the row names.
    //
    // IDF is computed element-wise as
    //   log((documentCount + 1) / (DocFreq + 0.5)).
    SEXP getTermStats(){
        arma::vec idf = arma::log((environment.documentCount() + 1) /
                (resultsData.dfVector + 0.5));
        DataFrame d = DataFrame::create(Named("DocFreq")=resultsData.dfVector,
                                        Named("IDF")=idf,
                                        Named("cTF")=resultsData.ctfVector);
        d.attr("row.names") = terms;
        return d;
    }
Example #21
0
// Reports the OpenMP configuration as a one-row data frame (row name
// "OpenMP") with the columns:
//   available - whether the reported maximum thread count is positive
//   max       - omp_get_max_threads() when built with OpenMP, otherwise 0
//   threads   - the current value of `openmp_threads`
// [[Rcpp::export]]
DataFrame CPP_get_openmp_threads() {
#ifdef _OPENMP
  const int max_threads = omp_get_max_threads();
#else
  const int max_threads = 0;
#endif
  const bool available = max_threads > 0;
  const int num_threads = openmp_threads;

  DataFrame res =
    DataFrame::create(_["available"] = available,
                      _["max"] = max_threads,
                      _["threads"] = num_threads);
  res.attr("row.names") = "OpenMP";
  return res;
}
// Get a row from a data.frame with rowname ----------------------------------------------------------
// Returns row `n` (0-based) of `df` as a character vector of length
// ncol + 1: the row name first, followed by the value of each column in that
// row, each column being converted to a character vector.
//
// Stops with an error when `n` is not a valid row index.
// [[Rcpp::export]]
CharacterVector getRowFromDfWithRowname(DataFrame df, int n){
  const int ncols = df.size();  // number of columns in df

  CharacterVector rnames = df.attr("row.names");  // get row names.

  // reject indices that would read past the end of the row names or columns
  if (n < 0 || n >= df.nrows() || n >= rnames.size()) {
    stop("row index out of bounds");
  }

  // allocate the result once: one slot for the row name plus one per column
  CharacterVector out(ncols + 1);
  out[0] = rnames[n];

  // copy the value of the nth row from every column
  for (int i = 0; i < ncols; i++) {  // counting starts from 0 in c++
    CharacterVector df_column = df[i];
    out[i + 1] = df_column[n];
  }
  return out;
}
Example #23
0
// Grouped filter used when the expressions in `dots` come from several
// environments.
//
// Each expression is evaluated per group in its own environment. A row is
// kept only if every expression yields TRUE for it; a length-one result
// applies to the whole group. The result keeps the "vars" attribute of the
// input.
DataFrame filter_grouped_multiple_env( const Data& gdf, const LazyDots& dots){
    const DataFrame& data = gdf.data() ;
    CharacterVector names = data.names() ;

    int nrows = data.nrows() ;
    // start with every row kept; expressions can only switch rows off
    LogicalVector test(nrows, TRUE);

    LogicalVector g_test ;

    for( int k=0; k<dots.size(); k++){
        Rcpp::checkUserInterrupt() ;
        const Lazy& lazy = dots[k] ;

        Call call( lazy.expr() ) ;
        GroupedCallProxy<Data, Subsets> call_proxy( call, gdf, lazy.env() ) ;
        int ngroups = gdf.ngroups() ;
        typename Data::group_iterator git = gdf.group_begin() ;
        for( int i=0; i<ngroups; i++, ++git){
            SlicingIndex indices = *git ;
            int chunk_size = indices.size() ;

            g_test  = check_filter_logical_result(call_proxy.get( indices ));
            if( g_test.size() == 1 ){
                // a single value decides for the whole group
                if( g_test[0] != TRUE ){
                    for( int j=0; j<chunk_size; j++){
                        test[indices[j]] = FALSE ;
                    }
                }
            } else {
                check_filter_result(g_test, chunk_size ) ;
                for( int j=0; j<chunk_size; j++){
                    // anything other than TRUE (FALSE or NA) drops the row
                    if( g_test[j] != TRUE ){
                        test[ indices[j] ] = FALSE ;
                    }
                }
            }
        }
    }
    DataFrame res = subset( data, test, names, classes_grouped<Data>() ) ;
    res.attr( "vars") = data.attr("vars") ;

    return res ;
}
Example #24
0
// Grouped filter used when all expressions in `dots` share one environment
// (taken from dots[0]).
//
// The expressions are combined into one call by and_calls() and evaluated
// once per group. A length-one result is applied to the whole group;
// otherwise every row whose result is not TRUE is dropped. The result keeps
// the "vars" attribute of the input.
DataFrame filter_grouped_single_env( const Data& gdf, const LazyDots& dots){
    typedef GroupedCallProxy<Data, Subsets> Proxy ;
    Environment env = dots[0].env() ;

    const DataFrame& data = gdf.data() ;
    CharacterVector names = data.names() ;

    // symbols of every column of the data, handed to and_calls()
    SymbolSet column_symbols ;
    const int ncols = names.size() ;
    for( int c=0; c<ncols; ++c){
        column_symbols.insert( Rf_installChar( names[c] ) ) ;
    }

    // a, b, c ->  a & b & c
    Call predicate( and_calls( dots, column_symbols, env ) ) ;

    int nrows = data.nrows() ;
    LogicalVector keep(nrows, TRUE);

    LogicalVector group_result ;
    Proxy call_proxy( predicate, gdf, env ) ;

    const int ngroups = gdf.ngroups() ;
    typename Data::group_iterator git = gdf.group_begin() ;
    for( int g=0; g<ngroups; ++g, ++git){
        SlicingIndex rows = *git ;
        const int group_size = rows.size() ;

        group_result = check_filter_logical_result( call_proxy.get( rows ) ) ;
        if( group_result.size() != 1 ){
            check_filter_result( group_result, group_size ) ;
            for( int r=0; r<group_size; ++r){
                if( group_result[r] != TRUE ) keep[ rows[r] ] = FALSE ;
            }
        } else {
            // a single value is applied to every row of the group
            const int recycled = group_result[0] == TRUE ;
            for( int r=0; r<group_size; ++r){
                keep[ rows[r] ] = recycled ;
            }
        }
    }

    DataFrame res = subset( data, keep, names, classes_grouped<Data>() ) ;
    res.attr( "vars" ) = data.attr( "vars" ) ;

    return res ;
}
Example #25
0
// Converts the accumulated columns into a list with class `classes`, row
// names set for df.nrows() rows, the grouping variables copied through
// copy_vars(), and the attributes "labels", "index", "indices", "drop",
// "group_sizes" and "biggest_group_size" copied from `df` (in that order).
SEXP structure_mutate(const NamedListAccumulator<Data>& accumulator, const DataFrame& df, CharacterVector classes) {
  List out = accumulator;
  set_class(out, classes);
  set_rownames(out, df.nrows());
  copy_vars(out, df);

  // attributes carried over verbatim from the input data frame
  static const char* const carried[] = {
    "labels", "index", "indices", "drop", "group_sizes", "biggest_group_size"
  };
  const int ncarried = sizeof(carried) / sizeof(carried[0]);
  for (int a = 0; a < ncarried; ++a) {
    out.attr(carried[a]) = df.attr(carried[a]);
  }

  return out;
}
Example #26
0
// Set difference: the distinct rows of `x` that do not occur in `y`. Stops
// with "not compatible" when compatible_data_frame(x, y) is false. The
// result has the class of `x`.
// [[Rcpp::export]]
DataFrame setdiff_data_frame( DataFrame x, DataFrame y){
    if( !compatible_data_frame(x,y) ){
        stop( "not compatible" );
    }

    // the visitors are built as (y, x): rows of y are inserted with
    // train_insert, rows of x are addressed through the negative key -row-1
    typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set ;
    DataFrameJoinVisitors visitors(y, x, y.names() ) ;
    Set seen(visitors);

    train_insert( seen, y.nrows() ) ;

    std::vector<int> indices ;

    const int n_x = x.nrows() ;
    for( int row=0; row<n_x; ++row) {
        const int key = -row-1 ;
        if( seen.count(key) ) continue ;

        // first time this row is met: keep it, and remember it so that a
        // later duplicate within x is skipped
        seen.insert(key) ;
        indices.push_back(key) ;
    }

    return visitors.subset( indices, x.attr("class") ) ;
}
Example #27
0
// Set intersection: the rows of `x` that also occur in `y`, each reported
// once. Stops with "not compatible" when compatible_data_frame(x, y) is
// false. The result has the class of `x`.
// [[Rcpp::export]]
DataFrame intersect_data_frame( DataFrame x, DataFrame y){
    if( !compatible_data_frame(x,y) ){
        stop( "not compatible" );
    }

    typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set ;
    DataFrameJoinVisitors visitors(x, y, x.names() ) ;
    Set x_rows(visitors);

    train_insert( x_rows, x.nrows() ) ;

    std::vector<int> indices ;
    const int n_y = y.nrows() ;
    for( int row=0; row<n_y; ++row) {
        // row `row` of y is looked up through the negative key -row-1
        Set::iterator match = x_rows.find( -row-1 ) ;
        if( match == x_rows.end() ) continue ;

        // record the stored row, then drop it so it is reported only once
        indices.push_back(*match) ;
        x_rows.erase(match) ;
    }

    return visitors.subset( indices, x.attr("class") ) ;
}
Example #28
0
// Reorders the rows of `data` by the expressions in `args`, each evaluated
// with Rf_eval in `env`. An expression of the form desc(x) sorts by x in
// descending order. The result has the class of `data`.
// [[Rcpp::export]]
DataFrame arrange_impl( DataFrame data, List args, Environment env ){
    const int nargs = args.size() ;
    List variables(nargs) ;
    LogicalVector ascending(nargs) ;

    for(int i=0; i<nargs; i++){
        SEXP expr = args[i] ;
        // a call whose head is the symbol `desc` requests descending order;
        // in that case only its first argument is evaluated
        const bool descending = TYPEOF(expr) == LANGSXP && CAR(expr) == Rf_install("desc") ;
        variables[i] = Rf_eval( descending ? CAR(CDR(expr)) : expr, env ) ;
        ascending[i] = !descending ;
    }

    // row permutation derived from all sort keys
    OrderVisitors order(variables,ascending, nargs) ;
    IntegerVector index = order.apply() ;

    DataFrameVisitors visitors( data, data.names() ) ;
    DataFrame sorted = visitors.subset(index, data.attr("class") ) ;
    return sorted;
}
Example #29
0
// Anti join: returns the rows of `x` that have no match in `y` on the
// columns in `by`. The result has the class of `x`.
// [[Rcpp::export]]
DataFrame anti_join_impl( DataFrame x, DataFrame y, CharacterVector by){
    typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ;
    DataFrameJoinVisitors visitors(x, y, by) ;
    Map x_rows(visitors);

    // train the map in terms of x
    train_push_back( x_rows, x.nrows(), x.nrows() / 10 ) ;

    // drop every entry of x that is matched by some row of y
    const int n_y = y.nrows() ;
    for( int row=0; row<n_y; ++row){
        Map::iterator match = x_rows.find(-row-1) ;
        if( match != x_rows.end() ){
            x_rows.erase(match) ;
        }
    }

    // collect what's left
    std::vector<int> indices ;
    Map::iterator remaining = x_rows.begin() ;
    while( remaining != x_rows.end() ){
        push_back( indices, remaining->second ) ;
        ++remaining ;
    }

    return subset(x, indices, x.names(), x.attr( "class" ) ) ;
}
Example #30
0
// Computes the grouping metadata of `data` from the symbols stored in its
// "vars" attribute and returns `data` classed as a grouped_df, with the
// attributes "indices", "group_sizes", "biggest_group_size" and "labels" set.
// Groups appear in the iteration order of the index map.
// [[Rcpp::export]]
DataFrame build_index_cpp( DataFrame data ){
    ListOf<Symbol> symbols( data.attr( "vars" ) ) ;

    // names of the grouping variables, taken from the symbols
    const int nsymbols = symbols.size() ;
    CharacterVector vars(nsymbols) ;
    for( int s=0; s<nsymbols; ++s){
        vars[s] = PRINTNAME(symbols[s]) ;
    }

    DataFrameVisitors visitors(data, vars) ;
    ChunkIndexMap group_map( visitors ) ;

    // checking 10 times for interrupts
    train_push_back( group_map, data.nrows(), data.nrows() / 10 ) ;

    DataFrame labels = visitors.subset( group_map, "data.frame") ;
    const int ngroups = labels.nrows() ;

    List indices(ngroups) ;
    IntegerVector group_sizes = no_init( ngroups );
    int biggest_group = 0 ;

    ChunkIndexMap::const_iterator entry = group_map.begin() ;
    for( int g=0; g<ngroups; ++g, ++entry){
        const std::vector<int>& rows = entry->second ;
        const int n_rows = static_cast<int>( rows.size() ) ;
        indices[g] = rows ;
        group_sizes[g] = n_rows ;
        biggest_group = std::max( biggest_group, n_rows );
    }

    data.attr( "indices" ) = indices ;
    data.attr( "group_sizes") = group_sizes ;
    data.attr( "biggest_group_size" ) = biggest_group ;
    data.attr( "labels" ) = labels ;
    data.attr( "class" ) = CharacterVector::create("grouped_df", "tbl_df", "tbl", "data.frame") ;
    return data ;
}