// version of grouped filter when contributions to ... come from several environment DataFrame filter_grouped_multiple_env( const GroupedDataFrame& gdf, const List& args, const DataDots& dots){ const DataFrame& data = gdf.data() ; CharacterVector names = data.names() ; SymbolSet set ; for( int i=0; i<names.size(); i++){ set.insert( Rf_install( names[i] ) ) ; } int nrows = data.nrows() ; LogicalVector test(nrows, TRUE); LogicalVector g_test ; for( int k=0; k<args.size(); k++){ Call call( (SEXP)args[k] ) ; GroupedCallProxy call_proxy( call, gdf, dots.envir(k) ) ; int ngroups = gdf.ngroups() ; GroupedDataFrame::group_iterator git = gdf.group_begin() ; for( int i=0; i<ngroups; i++, ++git){ SlicingIndex indices = *git ; int chunk_size = indices.size() ; g_test = call_proxy.get( indices ); check_filter_result(g_test, chunk_size ) ; for( int j=0; j<chunk_size; j++){ test[ indices[j] ] = test[ indices[j] ] & g_test[j] ; } } } DataFrame res = subset( data, test, names, classes_grouped() ) ; res.attr( "vars") = data.attr("vars") ; return res ; }
// Filters a grouped data frame with all expressions evaluated in one
// environment. The expressions are merged into a single call by and_calls(),
// which is then evaluated once per group to fill a row mask.
DataFrame filter_grouped(const GroupedDataFrame& gdf, List args, Environment env) {
  // a, b, c -> a & b & c
  Language call = and_calls(args);

  const DataFrame& data = gdf.data();
  int nrows = data.nrows();

  // Allocated with no_init: only the entries written in the loop below are
  // assigned by this function.
  LogicalVector test = no_init(nrows);
  LogicalVector g_test;

  GroupedCallProxy call_proxy(call, gdf, env);

  int ngroups = gdf.ngroups();
  GroupedDataFrame::group_iterator git = gdf.group_begin();
  for (int g = 0; g < ngroups; g++, ++git) {
    SlicingIndex indices = *git;
    g_test = call_proxy.get(indices);

    // Scatter the group's result back to the rows it was computed from.
    int chunk_size = indices.size();
    for (int j = 0; j < chunk_size; j++) {
      const int row = indices[j];
      test[row] = g_test[j];
    }
  }

  DataFrame res = subset(data, test, data.names(), classes_grouped());
  // Carry the "vars" attribute of the input over to the result.
  res.attr("vars") = data.attr("vars");
  return res;
}
// Grouped filter for the case where every expression in args shares the
// environment env. The expressions are merged into one call by and_calls(),
// evaluated per group, and each group's result is validated against the group
// size before being written into the row mask.
DataFrame filter_grouped_single_env(const GroupedDataFrame& gdf, const List& args, const Environment& env) {
  const DataFrame& data = gdf.data();
  CharacterVector names = data.names();

  // Symbols for every column name, handed to and_calls() below.
  SymbolSet set;
  for (int col = 0; col < names.size(); col++) {
    set.insert(Rf_install(names[col]));
  }

  // a, b, c -> a & b & c
  Call call(and_calls(args, set));

  int nrows = data.nrows();
  // Allocated with no_init: only the entries written in the loop below are
  // assigned by this function.
  LogicalVector test = no_init(nrows);
  LogicalVector g_test;

  GroupedCallProxy call_proxy(call, gdf, env);

  int ngroups = gdf.ngroups();
  GroupedDataFrame::group_iterator git = gdf.group_begin();
  for (int g = 0; g < ngroups; g++, ++git) {
    SlicingIndex indices = *git;
    int chunk_size = indices.size();

    g_test = call_proxy.get(indices);
    check_filter_result(g_test, chunk_size);

    // Scatter the group's result back to the rows it was computed from.
    for (int j = 0; j < chunk_size; j++) {
      const int row = indices[j];
      test[row] = g_test[j];
    }
  }

  DataFrame res = subset(data, test, names, classes_grouped());
  // Carry the "vars" attribute of the input over to the result.
  res.attr("vars") = data.attr("vars");
  return res;
}
// [[Rcpp::export]] IntegerVector grouped_indices_grouped_df_impl(GroupedDataFrame gdf) { int n=gdf.nrows(); IntegerVector res = no_init(n); int ngroups = gdf.ngroups(); GroupedDataFrameIndexIterator it = gdf.group_begin(); for (int i=0; i<ngroups; i++, ++it) { SlicingIndex index = *it; int n_index = index.size(); for (int j=0; j<n_index; j++) { res[ index[j] ] = i + 1; } } return res; }
// version of grouped filter when contributions to ... come from several environment DataFrame filter_grouped_multiple_env( const GroupedDataFrame& gdf, const LazyDots& dots){ const DataFrame& data = gdf.data() ; CharacterVector names = data.names() ; SymbolSet set ; for( int i=0; i<names.size(); i++){ set.insert( Rf_install( names[i] ) ) ; } int nrows = data.nrows() ; LogicalVector test(nrows, TRUE); LogicalVector g_test ; for( int k=0; k<dots.size(); k++){ Rcpp::checkUserInterrupt() ; const Lazy& lazy = dots[k] ; Call call( lazy.expr() ) ; GroupedCallProxy<GroupedDataFrame> call_proxy( call, gdf, lazy.env() ) ; int ngroups = gdf.ngroups() ; GroupedDataFrame::group_iterator git = gdf.group_begin() ; for( int i=0; i<ngroups; i++, ++git){ SlicingIndex indices = *git ; int chunk_size = indices.size() ; g_test = check_filter_logical_result(call_proxy.get( indices )); if( g_test.size() == 1 ){ if( g_test[0] != TRUE ){ for( int j=0; j<chunk_size; j++){ test[indices[j]] = FALSE ; } } } else { check_filter_result(g_test, chunk_size ) ; for( int j=0; j<chunk_size; j++){ if( g_test[j] != TRUE ){ test[ indices[j] ] = FALSE ; } } } } } DataFrame res = subset( data, test, names, classes_grouped<GroupedDataFrame>() ) ; res.attr( "vars") = data.attr("vars") ; return res ; }
// Grouped filter for lazy expressions that all share one environment, taken
// from the first element of dots. The expressions are merged into a single
// call by and_calls() and evaluated once per group. A length-one result is
// applied to every row of the group; otherwise rows whose result is not TRUE
// are rejected.
DataFrame filter_grouped_single_env(const GroupedDataFrame& gdf, const LazyDots& dots) {
  typedef GroupedCallProxy<GroupedDataFrame, LazyGroupedSubsets> Proxy;

  Environment env = dots[0].env();

  const DataFrame& data = gdf.data();
  CharacterVector names = data.names();

  // Symbols for every column name, handed to and_calls() below.
  SymbolSet set;
  for (int col = 0; col < names.size(); col++) {
    set.insert(Rf_install(names[col]));
  }

  // a, b, c -> a & b & c
  Call call(and_calls(dots, set, env));

  int nrows = data.nrows();
  LogicalVector test(nrows, TRUE);
  LogicalVector g_test;

  Proxy call_proxy(call, gdf, env);

  int ngroups = gdf.ngroups();
  GroupedDataFrame::group_iterator git = gdf.group_begin();
  for (int g = 0; g < ngroups; g++, ++git) {
    SlicingIndex indices = *git;
    int chunk_size = indices.size();

    g_test = check_filter_logical_result(call_proxy.get(indices));

    if (g_test.size() == 1) {
      // Scalar result: write the same keep/reject flag to the whole group.
      int val = (g_test[0] == TRUE);
      for (int j = 0; j < chunk_size; j++) {
        test[indices[j]] = val;
      }
      continue;
    }

    // Vector result: validated against the group size, then only the
    // non-TRUE entries touch the mask.
    check_filter_result(g_test, chunk_size);
    for (int j = 0; j < chunk_size; j++) {
      if (g_test[j] != TRUE) {
        test[indices[j]] = FALSE;
      }
    }
  }

  DataFrame res = subset(data, test, names, classes_grouped<GroupedDataFrame>());
  // Carry the "vars" attribute of the input over to the result.
  res.attr("vars") = data.attr("vars");
  return res;
}
// Slices a grouped data frame: the single lazy expression in dots is evaluated
// once per group and its integer result is read as 1-based positions within
// that group.
//
//  - If CountIndices reports positive indexing, the listed positions are kept
//    in the order given. Positions outside [1, group size] (including 0 and
//    NA) are skipped.
//  - Otherwise, if CountIndices reports any negative indices, every row of the
//    group is kept except those named by an in-range negative index. Values
//    that do not name a row of the group (0, NA, out-of-range) are ignored.
//  - Otherwise the group contributes no rows.
//
// The selected rows are materialised with subset(), the grouping variables are
// copied from the input, and the result is passed through GroupedDataFrame
// before being returned.
SEXP slice_grouped(GroupedDataFrame gdf, const LazyDots& dots) {
  typedef GroupedCallProxy<GroupedDataFrame, LazyGroupedSubsets> Proxy;

  const DataFrame& data = gdf.data();
  const Lazy& lazy = dots[0];
  Environment env = lazy.env();
  SymbolVector names = data.names();

  // we already checked that we have only one expression
  Call call(lazy.expr());

  // Row positions (into data) selected so far, across all groups.
  std::vector<int> indx;
  indx.reserve(1000);

  IntegerVector g_test;
  Proxy call_proxy(call, gdf, env);

  int ngroups = gdf.ngroups();
  GroupedDataFrame::group_iterator git = gdf.group_begin();
  for (int i = 0; i < ngroups; i++, ++git) {
    const SlicingIndex& indices = *git;
    int nr = indices.size();

    g_test = check_filter_integer_result(call_proxy.get(indices));
    CountIndices counter(indices.size(), g_test);
    const int ntest = g_test.size();

    if (counter.is_positive()) {
      // positive indexing
      for (int j = 0; j < ntest; j++) {
        const int pos = g_test[j];
        // Only positions that address a row of this group are usable. The
        // lower bound matters: 0, stray negatives and NA_INTEGER would
        // otherwise index before the start of the group.
        if (pos >= 1 && pos <= nr) {
          indx.push_back(indices[pos - 1]);
        }
      }
    } else if (counter.get_n_negative() != 0) {
      // negative indexing
      std::set<int> drop;
      for (int j = 0; j < ntest; j++) {
        const int idx = g_test[j];
        // Record only drops that name an existing row. Anything else must
        // not influence which rows are kept, and NA_INTEGER must never be
        // negated.
        if (idx != NA_INTEGER && idx < 0 && idx >= -nr) {
          drop.insert(-idx);
        }
      }

      // Keep every row of the group that was not asked to be dropped,
      // preserving the group's row order.
      for (int pos = 1; pos <= nr; ++pos) {
        if (drop.find(pos) == drop.end()) {
          indx.push_back(indices[pos - 1]);
        }
      }
    }
  }

  DataFrame res = subset(data, indx, names, classes_grouped<GroupedDataFrame>());
  set_vars(res, get_vars(data));
  strip_index(res);

  return GroupedDataFrame(res).data();
}
//[[Rcpp::export]] DataFrame complement_impl(GroupedDataFrame gdf, DataFrame genome) { genome_map_t chrom_sizes = makeChromSizes(genome) ; DataFrame df = gdf.data() ; IntegerVector starts = df["start"] ; IntegerVector ends = df["end"] ; CharacterVector chroms = df["chrom"] ; std::vector<std::string> chroms_out ; std::vector<int> starts_out ; std::vector<int> ends_out ; int ngroups = gdf.ngroups() ; GroupedDataFrame::group_iterator git = gdf.group_begin() ; for (int i = 0; i < ngroups; ++i, ++git) { SlicingIndex indices = *git ; int ni = indices.size() ; int start, end ; int last_end = 1 ; // get chrom from first index auto chrom = as<std::string>(chroms[indices[0]]) ; for (int j = 0; j < ni; ++j) { start = starts[indices[j]] ; end = ends[indices[j]] ; if (j == 0) { if (start == 1) { last_end = end ; continue ; } else { chroms_out.push_back(chrom) ; starts_out.push_back(1) ; ends_out.push_back(start) ; } } else { chroms_out.push_back(chrom) ; starts_out.push_back(last_end) ; ends_out.push_back(start) ; } last_end = end; } auto chrom_size = chrom_sizes[chrom] ; if (last_end < chrom_size) { chroms_out.push_back(chrom) ; starts_out.push_back(last_end) ; ends_out.push_back(chrom_size) ; } } return DataFrame::create(_("chrom") = chroms_out, _("start") = starts_out, _("end") = ends_out, _("stringsAsFactors") = false) ; }