// Filters a grouped data frame when all filtering expressions share one
// environment.
//
// The expressions in `args` are folded into a single call by and_calls(); that
// call is evaluated once per group in `env`, and every group's logical result
// is copied onto the rows belonging to that group. The rows are then extracted
// with subset() and the "vars" attribute of the input data is carried over.
DataFrame filter_grouped_single_env( const GroupedDataFrame& gdf, const List& args, const Environment& env){
    const DataFrame& data = gdf.data() ;
    CharacterVector names = data.names() ;

    // symbols of the data columns, handed to and_calls()
    SymbolSet column_symbols ;
    for( int col=0; col<names.size(); col++){
        column_symbols.insert( Rf_install( names[col] ) ) ;
    }

    // a, b, c ->  a & b & c
    Call combined( and_calls( args, column_symbols ) ) ;

    int nrows = data.nrows() ;
    // not initialised up front: the loop below writes one value for every
    // index handed out by the group iterator
    LogicalVector keep = no_init(nrows);
    LogicalVector group_keep ;

    GroupedCallProxy evaluator( combined, gdf, env ) ;

    int ngroups = gdf.ngroups() ;
    GroupedDataFrame::group_iterator group_it = gdf.group_begin() ;
    for( int g=0; g<ngroups; g++, ++group_it){
        SlicingIndex rows = *group_it ;
        int group_size = rows.size() ;

        group_keep = evaluator.get( rows );
        check_filter_result( group_keep, group_size ) ;

        // scatter the group-local result back to whole-data positions
        for( int pos=0; pos<group_size; pos++){
            keep[ rows[pos] ] = group_keep[pos] ;
        }
    }

    DataFrame res = subset( data, keep, names, classes_grouped() ) ;
    res.attr( "vars") = data.attr("vars") ;
    return res ;
}
// version of grouped filter when contributions to ... come from several environment DataFrame filter_grouped_multiple_env( const GroupedDataFrame& gdf, const List& args, const DataDots& dots){ const DataFrame& data = gdf.data() ; CharacterVector names = data.names() ; SymbolSet set ; for( int i=0; i<names.size(); i++){ set.insert( Rf_install( names[i] ) ) ; } int nrows = data.nrows() ; LogicalVector test(nrows, TRUE); LogicalVector g_test ; for( int k=0; k<args.size(); k++){ Call call( (SEXP)args[k] ) ; GroupedCallProxy call_proxy( call, gdf, dots.envir(k) ) ; int ngroups = gdf.ngroups() ; GroupedDataFrame::group_iterator git = gdf.group_begin() ; for( int i=0; i<ngroups; i++, ++git){ SlicingIndex indices = *git ; int chunk_size = indices.size() ; g_test = call_proxy.get( indices ); check_filter_result(g_test, chunk_size ) ; for( int j=0; j<chunk_size; j++){ test[ indices[j] ] = test[ indices[j] ] & g_test[j] ; } } } DataFrame res = subset( data, test, names, classes_grouped() ) ; res.attr( "vars") = data.attr("vars") ; return res ; }
//[[Rcpp::export]] DataFrame reldist_impl(GroupedDataFrame x, GroupedDataFrame y) { std::vector<float> rel_distances ; std::vector<int> indices_x ; DataFrame df_x = x.data() ; PairedGroupApply(x, y, reldist_grouped, std::ref(indices_x), std::ref(rel_distances)); DataFrame subset_x = DataFrameSubsetVisitors(df_x, names(df_x)).subset(indices_x, "data.frame"); auto ncol_x = subset_x.size() ; CharacterVector names(ncol_x + 1) ; CharacterVector names_x = subset_x.attr("names") ; List out(ncol_x + 1) ; // x names, data for( int i=0; i<ncol_x; i++) { names[i] = names_x[i] ; out[i] = subset_x[i] ; } out[ncol_x] = rel_distances ; names[ncol_x] = "reldist" ; out.attr("names") = names ; out.attr("class") = classes_not_grouped() ; auto nrows = subset_x.nrows() ; set_rownames(out, nrows) ; return out ; }
// Summarises a grouped data frame, one output column per expression in `args`.
//
// The grouping variables are registered first (their labels, keyed by the
// print name of the grouping symbol), then every expression args[k] is
// processed in its own environment dots.envir(k). Each result is stored in the
// accumulator under the matching name from args.names() and is also made
// available to the following expressions through `subsets`.
//
// check_not_groups() is called with the result names and the grouped data
// before any work is done.
SEXP summarise_grouped(const GroupedDataFrame& gdf, List args, const DataDots& dots){
    int nexpr = args.size() ;
    int nvars = gdf.nvars() ;
    CharacterVector results_names = args.names() ;
    check_not_groups(results_names, gdf);

    NamedListAccumulator<SEXP> accumulator ;
    for( int i=0; i<nvars; i++){
        SET_NAMED(gdf.label(i), 2) ;
        accumulator.set( PRINTNAME(gdf.symbol(i)), gdf.label(i) ) ;
    }

    LazyGroupedSubsets subsets(gdf) ;
    Shelter<SEXP> __ ;
    for( int k=0; k<nexpr; k++ ){
        Environment env = dots.envir(k) ;

        // Owned by a scoped_ptr so the handler is released even when
        // process() or one of the calls below throws.
        boost::scoped_ptr<Result> res( get_handler( args[k], subsets, env ) ) ;

        // if we could not find a direct Result
        // we can use a GroupedCalledReducer which will callback to R
        if( !res ) res.reset( new GroupedCalledReducer( args[k], subsets, env) ) ;

        SEXP result = __( res->process(gdf) ) ;
        SEXP name = results_names[k] ;
        accumulator.set( name, result );
        subsets.input( Symbol(name), SummarisedVariable(result) ) ;
    }

    return summarised_grouped_tbl_cpp(accumulator, gdf );
}
// Summarises a grouped data frame, evaluating every expression in `env`.
//
// The output list starts with the labels of the grouping variables (named
// after the grouping symbols), followed by one entry per expression in `args`
// named after args.names(). The list, its names and the grouped data are
// handed to summarised_grouped_tbl_cpp() to build the result.
SEXP summarise_grouped(const GroupedDataFrame& gdf, List args, Environment env){
    int nexpr = args.size() ;
    int nvars = gdf.nvars() ;
    CharacterVector results_names = args.names() ;

    List out(nexpr + nvars) ;
    CharacterVector names(nexpr + nvars) ;

    // grouping variables occupy the first nvars slots
    int i=0;
    for( ; i<nvars; i++){
        out[i] = gdf.label(i) ;
        SET_NAMED(out[i], 2) ;
        names[i] = CHAR(PRINTNAME(gdf.symbol(i))) ;
    }

    LazyGroupedSubsets subsets(gdf) ;
    for( int k=0; k<nexpr; k++, i++ ){
        // Owned by a scoped_ptr so the handler is released even when
        // process() throws.
        boost::scoped_ptr<Result> res( get_handler( args[k], subsets ) ) ;

        // if we could not find a direct Result
        // we can use a GroupedCalledReducer which will callback to R
        if( !res ) res.reset( new GroupedCalledReducer( args[k], gdf, env) ) ;

        out[i] = res->process(gdf) ;
        names[i] = results_names[k] ;
    }

    return summarised_grouped_tbl_cpp(out, names, gdf );
}
// Filters a grouped data frame with the expressions in `args`, all evaluated
// in `env`.
//
// The expressions are folded into one call by and_calls(), the call is
// evaluated once per group, and each group's logical result is copied onto the
// rows of that group. The rows are then extracted with subset() and the "vars"
// attribute of the input data is carried over.
//
// NOTE(review): the length of the per-group result is not validated against
// the group size before it is read, unlike the other grouped filters that call
// check_filter_result().
DataFrame filter_grouped( const GroupedDataFrame& gdf, List args, Environment env){
    // a, b, c ->  a & b & c
    Language combined = and_calls( args ) ;

    const DataFrame& data = gdf.data() ;
    int nrows = data.nrows() ;

    // not initialised up front: the loop below writes one value for every
    // index handed out by the group iterator
    LogicalVector keep = no_init(nrows);
    LogicalVector group_keep ;

    GroupedCallProxy evaluator( combined, gdf, env ) ;

    int ngroups = gdf.ngroups() ;
    GroupedDataFrame::group_iterator group_it = gdf.group_begin() ;
    for( int g=0; g<ngroups; g++, ++group_it){
        SlicingIndex rows = *group_it ;
        group_keep = evaluator.get( rows );

        int group_size = rows.size() ;
        for( int pos=0; pos<group_size; pos++){
            keep[ rows[pos] ] = group_keep[pos] ;
        }
    }

    DataFrame res = subset( data, keep, data.names(), classes_grouped() ) ;
    res.attr( "vars") = data.attr("vars") ;
    return res ;
}
// version of grouped filter when contributions to ... come from several environment DataFrame filter_grouped_multiple_env( const GroupedDataFrame& gdf, const LazyDots& dots){ const DataFrame& data = gdf.data() ; CharacterVector names = data.names() ; SymbolSet set ; for( int i=0; i<names.size(); i++){ set.insert( Rf_install( names[i] ) ) ; } int nrows = data.nrows() ; LogicalVector test(nrows, TRUE); LogicalVector g_test ; for( int k=0; k<dots.size(); k++){ Rcpp::checkUserInterrupt() ; const Lazy& lazy = dots[k] ; Call call( lazy.expr() ) ; GroupedCallProxy<GroupedDataFrame> call_proxy( call, gdf, lazy.env() ) ; int ngroups = gdf.ngroups() ; GroupedDataFrame::group_iterator git = gdf.group_begin() ; for( int i=0; i<ngroups; i++, ++git){ SlicingIndex indices = *git ; int chunk_size = indices.size() ; g_test = check_filter_logical_result(call_proxy.get( indices )); if( g_test.size() == 1 ){ if( g_test[0] != TRUE ){ for( int j=0; j<chunk_size; j++){ test[indices[j]] = FALSE ; } } } else { check_filter_result(g_test, chunk_size ) ; for( int j=0; j<chunk_size; j++){ if( g_test[j] != TRUE ){ test[ indices[j] ] = FALSE ; } } } } } DataFrame res = subset( data, test, names, classes_grouped<GroupedDataFrame>() ) ; res.attr( "vars") = data.attr("vars") ; return res ; }
// Filters a grouped data frame when all lazy expressions share one
// environment (taken from the first element of `dots`).
//
// The expressions are folded into a single call by and_calls() and evaluated
// once per group. A length-one result is applied to every row of the group;
// any other result is validated against the group size and deselects the rows
// whose value is not TRUE.
DataFrame filter_grouped_single_env( const GroupedDataFrame& gdf, const LazyDots& dots){
    typedef GroupedCallProxy<GroupedDataFrame, LazyGroupedSubsets> Proxy ;

    Environment env = dots[0].env() ;

    const DataFrame& data = gdf.data() ;
    CharacterVector names = data.names() ;

    // symbols of the data columns, handed to and_calls()
    SymbolSet column_symbols ;
    for( int col=0; col<names.size(); col++){
        column_symbols.insert( Rf_install( names[col] ) ) ;
    }

    // a, b, c ->  a & b & c
    Call combined( and_calls( dots, column_symbols, env ) ) ;

    int nrows = data.nrows() ;
    LogicalVector keep(nrows, TRUE);
    LogicalVector group_keep ;

    Proxy evaluator( combined, gdf, env ) ;

    int ngroups = gdf.ngroups() ;
    GroupedDataFrame::group_iterator group_it = gdf.group_begin() ;
    for( int g=0; g<ngroups; g++, ++group_it){
        SlicingIndex rows = *group_it ;
        int group_size = rows.size() ;

        group_keep = check_filter_logical_result( evaluator.get( rows ) ) ;

        if( group_keep.size() != 1 ){
            // one value per row of the group
            check_filter_result( group_keep, group_size ) ;
            for( int pos=0; pos<group_size; pos++){
                if( group_keep[pos] != TRUE ) keep[ rows[pos] ] = FALSE ;
            }
        } else {
            // a single value decides for the whole group
            int whole_group = group_keep[0] == TRUE ;
            for( int pos=0; pos<group_size; pos++){
                keep[ rows[pos] ] = whole_group ;
            }
        }
    }

    DataFrame res = subset( data, keep, names, classes_grouped<GroupedDataFrame>() ) ;
    res.attr( "vars") = data.attr("vars") ;
    return res ;
}
// Adds or replaces columns of a grouped data frame, evaluating every
// expression of `args` in `env`.
//
// Each expression is installed on the call proxy, gathered across the groups,
// and the collected column is fed back into the proxy under the matching name
// from args.names(). The result is assembled by structure_mutate() and the
// "vars", "labels" and "index" attributes of the input data are carried over.
SEXP mutate_grouped(GroupedDataFrame gdf, List args, Environment env){
    const DataFrame& df = gdf.data() ;

    int nexpr = args.size() ;
    CharacterVector results_names = args.names() ;

    GroupedCallProxy proxy(gdf, env) ;
    Shelter<SEXP> shelter ;

    for( int k=0; k<nexpr; k++){
        proxy.set_call( args[k] );

        // the gatherer is released at the end of each iteration
        boost::scoped_ptr<Gatherer> gather( gatherer( proxy, gdf ) );
        SEXP column = shelter( gather->collect() ) ;
        proxy.input( results_names[k], column ) ;
    }

    DataFrame res = structure_mutate( proxy, df, results_names, classes_grouped() ) ;

    // carry the grouping metadata over from the input
    res.attr( "vars")    = df.attr("vars") ;
    res.attr( "labels" ) = df.attr("labels" );
    res.attr( "index")   = df.attr("index") ;
    return res ;
}
// Adds or replaces columns of a grouped data frame, evaluating expression
// args[i] in its own environment dots.envir(i).
//
// The accumulator is seeded with the existing columns of the data. Each
// expression is then handled according to its type:
//   * symbol    - taken from the proxy when it knows the variable, otherwise
//                 looked up in the environment and replicated (as a constant
//                 when it has length one, through replicator() otherwise);
//   * call      - evaluated per group and gathered;
//   * length 1  - expanded with constant_gatherer();
//   * otherwise - stop("cannot handle").
// Every new column is fed back into the proxy and stored in the accumulator
// under the matching name from args.names().
//
// check_not_groups() is called with the result names and the grouped data
// before any work is done.
//
// All helper objects are owned by boost::scoped_ptr so they are released even
// when collect() throws.
SEXP mutate_grouped(GroupedDataFrame gdf, List args, const DataDots& dots){
    const DataFrame& df = gdf.data() ;
    int nexpr = args.size() ;
    CharacterVector results_names = args.names() ;
    check_not_groups(results_names, gdf);

    Environment env = dots.envir(0) ;
    GroupedCallProxy proxy(gdf, env) ;
    Shelter<SEXP> __ ;

    // start from the existing columns
    NamedListAccumulator<SEXP> accumulator ;
    int ncolumns = df.size() ;
    CharacterVector column_names = df.names() ;
    for( int i=0; i<ncolumns; i++){
        accumulator.set( column_names[i], df[i] ) ;
    }

    for( int i=0; i<nexpr; i++){
        env = dots.envir(i) ;
        proxy.set_env( env ) ;
        SEXP call = args[i] ;
        SEXP name = results_names[i] ;
        SEXP variable = R_NilValue ;

        if( TYPEOF(call) == SYMSXP ){
            if(proxy.has_variable(call)){
                variable = proxy.get_variable( PRINTNAME(call) ) ;
            } else {
                SEXP v = env.find(CHAR(PRINTNAME(call))) ;
                if( Rf_isNull(v) ){
                    std::stringstream s ;
                    s << "unknown variable: " << CHAR(PRINTNAME(call)) ;
                    stop(s.str());
                } else if( Rf_length(v) == 1){
                    boost::scoped_ptr<Replicator> rep( constant_replicator(v, gdf.nrows() ) );
                    variable = __( rep->collect() );
                } else {
                    boost::scoped_ptr<Replicator> rep( replicator(v, gdf) ) ;
                    variable = __( rep->collect() );
                }
            }
        } else if(TYPEOF(call) == LANGSXP){
            proxy.set_call( call );
            boost::scoped_ptr<Gatherer> gather( gatherer( proxy, gdf ) ) ;
            variable = __( gather->collect() ) ;
        } else if(Rf_length(call) == 1) {
            boost::scoped_ptr<Gatherer> gather( constant_gatherer( call, gdf.nrows() ) );
            variable = __( gather->collect() ) ;
        } else {
            stop( "cannot handle" ) ;
        }

        proxy.input( name, variable ) ;
        accumulator.set( name, variable) ;
    }

    return structure_mutate(accumulator, df, classes_grouped() );
}
//[[Rcpp::export]] DataFrame intersect_impl(GroupedDataFrame x, GroupedDataFrame y, const std::string& suffix_x = ".x", const std::string& suffix_y = ".y") { // indices for subsetting std::vector<int> indices_x ; std::vector<int> indices_y ; // overlap sizes std::vector<int> overlap_sizes ; auto data_x = x.data() ; auto data_y = y.data() ; // set up interval trees for each chromosome and apply intersect_group GroupApply(x, y, intersect_group, std::ref(indices_x), std::ref(indices_y), std::ref(overlap_sizes)); DataFrame subset_x = DataFrameSubsetVisitors(data_x, names(data_x)).subset(indices_x, "data.frame"); DataFrame subset_y = DataFrameSubsetVisitors(data_y, names(data_y)).subset(indices_y, "data.frame"); auto ncol_x = subset_x.size() ; auto ncol_y = subset_y.size() ; CharacterVector names(ncol_x + ncol_y) ; CharacterVector names_x = subset_x.attr("names") ; CharacterVector names_y = subset_y.attr("names") ; // replacing y chrom with overlap, same number of cols List out(ncol_x + ncol_y) ; // x names, data for (int i = 0; i < ncol_x; i++) { auto name_x = as<std::string>(names_x[i]) ; if (name_x != "chrom") { name_x += suffix_x ; } names[i] = name_x ; out[i] = subset_x[i] ; } // y names, data for (int i = 0; i < ncol_y; i++) { auto name_y = as<std::string>(names_y[i]) ; if (name_y == "chrom") continue ; name_y += suffix_y ; names[i + ncol_x - 1] = name_y ; out[i + ncol_x - 1] = subset_y[i] ; } // overlaps out[ncol_x + ncol_y - 1] = overlap_sizes ; names[ncol_x + ncol_y - 1] = ".overlap" ; out.attr("names") = names ; out.attr("class") = classes_not_grouped() ; auto nrows = subset_x.nrows() ; set_rownames(out, nrows) ; return out ; }
// Selects rows by position within each group of a grouped data frame.
//
// The single lazy expression in `dots` is evaluated once per group and must
// yield integer positions (checked by check_filter_integer_result()).
// CountIndices classifies the positions of a group:
//   * positive indexing - the rows at the given 1-based positions are kept,
//     in the order given;
//   * negative indexing - the rows at the given positions are dropped and the
//     remaining rows of the group are kept in their original order.
//
// In both modes, positions that are NA, zero, or outside the group (beyond
// `nr` in absolute value, or of the wrong sign for the mode) are ignored, so
// no position can address a row outside the group.
//
// The selected rows are extracted with subset(), the grouping variables are
// restored with set_vars(), the stale index is stripped, and the data is
// returned through a fresh GroupedDataFrame.
SEXP slice_grouped(GroupedDataFrame gdf, const LazyDots& dots) {
  typedef GroupedCallProxy<GroupedDataFrame, LazyGroupedSubsets> Proxy;

  const DataFrame& data = gdf.data();
  const Lazy& lazy = dots[0];
  Environment env = lazy.env();
  SymbolVector names = data.names();

  // we already checked that we have only one expression
  Call call(lazy.expr());

  std::vector<int> indx;
  indx.reserve(1000);

  IntegerVector g_test;
  Proxy call_proxy(call, gdf, env);

  int ngroups = gdf.ngroups();
  GroupedDataFrame::group_iterator git = gdf.group_begin();
  for (int i=0; i<ngroups; i++, ++git) {
    const SlicingIndex& indices = *git;
    int nr = indices.size();
    g_test = check_filter_integer_result(call_proxy.get(indices));
    CountIndices counter(indices.size(), g_test);

    if (counter.is_positive()) {
      // positive indexing
      int ntest = g_test.size();
      for (int j=0; j<ntest; j++) {
        const int pos = g_test[j];
        // only 1..nr addresses a row of this group; this also rejects NA,
        // zero and stray negative values
        if (pos != NA_INTEGER && pos >= 1 && pos <= nr) {
          indx.push_back(indices[pos-1]);
        }
      }
    } else if (counter.get_n_negative() != 0) {
      // negative indexing: collect the (sorted, unique) positions to drop
      std::set<int> drop;
      int n = g_test.size();
      for (int j=0; j<n; j++) {
        const int value = g_test[j];
        if (value == NA_INTEGER) continue;
        const int pos = -value;
        // positions outside 1..nr do not refer to any row and drop nothing
        if (pos >= 1 && pos <= nr) drop.insert(pos);
      }

      // walk the group once, skipping the positions listed in `drop`
      std::set<int>::const_iterator drop_it = drop.begin();
      for (int j=0; j<nr; j++) {
        if (drop_it != drop.end() && *drop_it == j + 1) {
          ++drop_it;
          continue;
        }
        indx.push_back(indices[j]);
      }
    }
  }

  DataFrame res = subset(data, indx, names, classes_grouped<GroupedDataFrame>());
  set_vars(res, get_vars(data));
  strip_index(res);

  return GroupedDataFrame(res).data();
}
//[[Rcpp::export]] DataFrame complement_impl(GroupedDataFrame gdf, DataFrame genome) { genome_map_t chrom_sizes = makeChromSizes(genome) ; DataFrame df = gdf.data() ; IntegerVector starts = df["start"] ; IntegerVector ends = df["end"] ; CharacterVector chroms = df["chrom"] ; std::vector<std::string> chroms_out ; std::vector<int> starts_out ; std::vector<int> ends_out ; int ngroups = gdf.ngroups() ; GroupedDataFrame::group_iterator git = gdf.group_begin() ; for (int i = 0; i < ngroups; ++i, ++git) { SlicingIndex indices = *git ; int ni = indices.size() ; int start, end ; int last_end = 1 ; // get chrom from first index auto chrom = as<std::string>(chroms[indices[0]]) ; for (int j = 0; j < ni; ++j) { start = starts[indices[j]] ; end = ends[indices[j]] ; if (j == 0) { if (start == 1) { last_end = end ; continue ; } else { chroms_out.push_back(chrom) ; starts_out.push_back(1) ; ends_out.push_back(start) ; } } else { chroms_out.push_back(chrom) ; starts_out.push_back(last_end) ; ends_out.push_back(start) ; } last_end = end; } auto chrom_size = chrom_sizes[chrom] ; if (last_end < chrom_size) { chroms_out.push_back(chrom) ; starts_out.push_back(last_end) ; ends_out.push_back(chrom_size) ; } } return DataFrame::create(_("chrom") = chroms_out, _("start") = starts_out, _("end") = ends_out, _("stringsAsFactors") = false) ; }