Example #1
0
DataFrame filter_grouped_single_env( const GroupedDataFrame& gdf, const List& args, const Environment& env){
    const DataFrame& data = gdf.data() ;
    CharacterVector names = data.names() ;
    SymbolSet set ;
    for( int i=0; i<names.size(); i++){
        set.insert( Rf_install( names[i] ) ) ;
    }

    // a, b, c ->  a & b & c
    Call call( and_calls( args, set ) ) ;

    int nrows = data.nrows() ;
    LogicalVector test = no_init(nrows);

    LogicalVector g_test ;
    GroupedCallProxy call_proxy( call, gdf, env ) ;

    int ngroups = gdf.ngroups() ;
    GroupedDataFrame::group_iterator git = gdf.group_begin() ;
    for( int i=0; i<ngroups; i++, ++git){
        SlicingIndex indices = *git ;
        int chunk_size = indices.size() ;

        g_test  = call_proxy.get( indices );
        check_filter_result(g_test, chunk_size ) ;
        for( int j=0; j<chunk_size; j++){
            test[ indices[j] ] = g_test[j] ;
        }
    }

    DataFrame res = subset( data, test, names, classes_grouped() ) ;
    res.attr( "vars")   = data.attr("vars") ;

    return res ;
}
Example #2
0
// version of grouped filter when contributions to ... come from several environment
DataFrame filter_grouped_multiple_env( const GroupedDataFrame& gdf, const List& args, const DataDots& dots){
    const DataFrame& data = gdf.data() ;
    CharacterVector names = data.names() ;
    SymbolSet set ;
    for( int i=0; i<names.size(); i++){
        set.insert( Rf_install( names[i] ) ) ;
    }

    int nrows = data.nrows() ;
    LogicalVector test(nrows, TRUE);

    LogicalVector g_test ;

    for( int k=0; k<args.size(); k++){
        Call call( (SEXP)args[k] ) ;
        GroupedCallProxy call_proxy( call, gdf, dots.envir(k) ) ;
        int ngroups = gdf.ngroups() ;
        GroupedDataFrame::group_iterator git = gdf.group_begin() ;
        for( int i=0; i<ngroups; i++, ++git){
            SlicingIndex indices = *git ;
            int chunk_size = indices.size() ;

            g_test  = call_proxy.get( indices );
            check_filter_result(g_test, chunk_size ) ;
            for( int j=0; j<chunk_size; j++){
                test[ indices[j] ] = test[ indices[j] ] & g_test[j] ;
            }
        }
    }
    DataFrame res = subset( data, test, names, classes_grouped() ) ;
    res.attr( "vars")   = data.attr("vars") ;

    return res ;
}
Example #3
0
//[[Rcpp::export]]
DataFrame reldist_impl(GroupedDataFrame x, GroupedDataFrame y) {
  
  std::vector<float> rel_distances ; 
  std::vector<int> indices_x ;
  
  DataFrame df_x = x.data() ;
  PairedGroupApply(x, y, reldist_grouped, std::ref(indices_x), std::ref(rel_distances)); 
  
  DataFrame subset_x = DataFrameSubsetVisitors(df_x, names(df_x)).subset(indices_x, "data.frame");
  
  auto ncol_x = subset_x.size() ;
  
  CharacterVector names(ncol_x + 1) ;
  CharacterVector names_x = subset_x.attr("names") ;
  
  List out(ncol_x + 1) ;
  
  // x names, data
  for( int i=0; i<ncol_x; i++) {
    names[i] = names_x[i] ;
    out[i] = subset_x[i] ;
  }
  out[ncol_x] = rel_distances ;
  names[ncol_x] = "reldist" ;
  
  out.attr("names") = names ; 
  out.attr("class") = classes_not_grouped() ;
  auto nrows = subset_x.nrows() ; 
  set_rownames(out, nrows) ;
  
  return out ; 
  
}
Example #4
0
SEXP summarise_grouped(const GroupedDataFrame& gdf, List args, const DataDots& dots){
    DataFrame df = gdf.data() ;

    int nexpr = args.size() ;
    int nvars = gdf.nvars() ;
    CharacterVector results_names = args.names() ;
    check_not_groups(results_names, gdf);
    NamedListAccumulator<SEXP> accumulator ;

    int i=0;
    for( ; i<nvars; i++){
        SET_NAMED(gdf.label(i), 2) ;
        accumulator.set( PRINTNAME(gdf.symbol(i)), gdf.label(i) ) ;
    }

    LazyGroupedSubsets subsets(gdf) ;
    Shelter<SEXP> __ ;
    for( int k=0; k<nexpr; k++, i++ ){
        Environment env = dots.envir(k) ;

        Result* res = get_handler( args[k], subsets, env ) ;

        // if we could not find a direct Result
        // we can use a GroupedCalledReducer which will callback to R
        if( !res ) res = new GroupedCalledReducer( args[k], subsets, env) ;

        SEXP result = __( res->process(gdf) ) ;
        SEXP name = results_names[k] ;
        accumulator.set( name, result );
        subsets.input( Symbol(name), SummarisedVariable(result) ) ;
        delete res;
    }

    return summarised_grouped_tbl_cpp(accumulator, gdf );
}
Example #5
0
SEXP summarise_grouped(const GroupedDataFrame& gdf, List args, Environment env){
    DataFrame df = gdf.data() ;
    
    int nexpr = args.size() ;
    int nvars = gdf.nvars() ;
    CharacterVector results_names = args.names() ;
    List out(nexpr + nvars) ;
    CharacterVector names(nexpr + nvars) ;
    
    int i=0; 
    for( ; i<nvars; i++){
        out[i]      = gdf.label(i) ;
        SET_NAMED(out[i], 2) ;
        names[i]    = CHAR(PRINTNAME(gdf.symbol(i))) ;
    }
    
    LazyGroupedSubsets subsets(gdf) ;
    for( int k=0; k<nexpr; k++, i++ ){
        Result* res( get_handler( args[k], subsets ) ) ;
        
        // if we could not find a direct Result 
        // we can use a GroupedCalledReducer which will callback to R
        if( !res ) res = new GroupedCalledReducer( args[k], gdf, env) ;
        
        out[i] = res->process(gdf) ;
        names[i] = results_names[k] ;
        delete res;
    }
    
    return summarised_grouped_tbl_cpp(out, names, gdf );
}
Example #6
0
DataFrame filter_grouped( const GroupedDataFrame& gdf, List args, Environment env){
    // a, b, c ->  a & b & c
    Language call = and_calls( args ) ;
    
    const DataFrame& data = gdf.data() ;
    int nrows = data.nrows() ;
    LogicalVector test = no_init(nrows);
    
    LogicalVector g_test ;
    GroupedCallProxy call_proxy( call, gdf, env ) ;
    
    int ngroups = gdf.ngroups() ;
    GroupedDataFrame::group_iterator git = gdf.group_begin() ;
    for( int i=0; i<ngroups; i++, ++git){
        SlicingIndex indices = *git ;
        g_test  = call_proxy.get( indices );
        
        int chunk_size = indices.size() ;
        for( int j=0; j<chunk_size; j++){
            test[ indices[j] ] = g_test[j] ;  
        }
    }
    DataFrame res = subset( data, test, data.names(), classes_grouped() ) ;
    res.attr( "vars")   = data.attr("vars") ;
            
    return res ;
}
Example #7
0
// version of grouped filter when contributions to ... come from several environment
DataFrame filter_grouped_multiple_env( const GroupedDataFrame& gdf, const LazyDots& dots){
    const DataFrame& data = gdf.data() ;
    CharacterVector names = data.names() ;
    SymbolSet set ;
    for( int i=0; i<names.size(); i++){
        set.insert( Rf_install( names[i] ) ) ;
    }

    int nrows = data.nrows() ;
    LogicalVector test(nrows, TRUE);

    LogicalVector g_test ;

    for( int k=0; k<dots.size(); k++){
        Rcpp::checkUserInterrupt() ;
        const Lazy& lazy = dots[k] ;
        
        Call call( lazy.expr() ) ;
        GroupedCallProxy<GroupedDataFrame> call_proxy( call, gdf, lazy.env() ) ;
        int ngroups = gdf.ngroups() ;
        GroupedDataFrame::group_iterator git = gdf.group_begin() ;
        for( int i=0; i<ngroups; i++, ++git){
            SlicingIndex indices = *git ;
            int chunk_size = indices.size() ;

            g_test  = check_filter_logical_result(call_proxy.get( indices ));
            if( g_test.size() == 1 ){
                if( g_test[0] != TRUE ){
                    for( int j=0; j<chunk_size; j++){
                        test[indices[j]] = FALSE ;    
                    }
                }
            } else {
                check_filter_result(g_test, chunk_size ) ;
                for( int j=0; j<chunk_size; j++){
                    if( g_test[j] != TRUE ){
                        test[ indices[j] ] = FALSE ;
                    }
                }
            }
        }
    }
    DataFrame res = subset( data, test, names, classes_grouped<GroupedDataFrame>() ) ;
    res.attr( "vars") = data.attr("vars") ;

    return res ;
}
Example #8
0
DataFrame filter_grouped_single_env( const GroupedDataFrame& gdf, const LazyDots& dots){
    typedef GroupedCallProxy<GroupedDataFrame, LazyGroupedSubsets> Proxy ; 
    Environment env = dots[0].env() ;
    
    const DataFrame& data = gdf.data() ;
    CharacterVector names = data.names() ;
    SymbolSet set ;
    for( int i=0; i<names.size(); i++){
        set.insert( Rf_install( names[i] ) ) ;
    }

    // a, b, c ->  a & b & c
    Call call( and_calls( dots, set, env ) ) ;

    int nrows = data.nrows() ;
    LogicalVector test(nrows, TRUE);

    LogicalVector g_test ;
    Proxy call_proxy( call, gdf, env ) ;

    int ngroups = gdf.ngroups() ;
    GroupedDataFrame::group_iterator git = gdf.group_begin() ;
    for( int i=0; i<ngroups; i++, ++git){
        SlicingIndex indices = *git ;
        int chunk_size = indices.size() ;
        
        g_test = check_filter_logical_result( call_proxy.get( indices ) ) ;
        if( g_test.size() == 1 ){
            int val = g_test[0] == TRUE ;
            for( int j=0; j<chunk_size; j++){
                test[ indices[j] ] = val ;
            }
        } else {
            check_filter_result(g_test, chunk_size ) ;
            for( int j=0; j<chunk_size; j++){
                if( g_test[j] != TRUE ) test[ indices[j] ] = FALSE ;
            }
        }
    }
    DataFrame res = subset( data, test, names, classes_grouped<GroupedDataFrame>() ) ;
    res.attr( "vars")   = data.attr("vars") ;

    return res ;
}
Example #9
0
SEXP mutate_grouped(GroupedDataFrame gdf, List args, Environment env){
    const DataFrame& df = gdf.data() ;
    
    int nexpr = args.size() ;
    CharacterVector results_names = args.names() ;
    
    GroupedCallProxy proxy(gdf, env) ;
    Shelter<SEXP> __ ;
    
    for( int i=0; i<nexpr; i++){
        proxy.set_call( args[i] );
        boost::scoped_ptr<Gatherer> gather( gatherer( proxy, gdf ) );
        proxy.input( results_names[i], __( gather->collect() ) ) ;
    }
    
    DataFrame res = structure_mutate( proxy, df, results_names, classes_grouped() ) ;
    res.attr( "vars")    = df.attr("vars") ;
    res.attr( "labels" ) = df.attr("labels" );
    res.attr( "index")   = df.attr("index") ;
    
    return res ;
}
Example #10
0
SEXP mutate_grouped(GroupedDataFrame gdf, List args, const DataDots& dots){
    const DataFrame& df = gdf.data() ;
    int nexpr = args.size() ;
    CharacterVector results_names = args.names() ;
    check_not_groups(results_names, gdf);

    Environment env = dots.envir(0) ;
    GroupedCallProxy proxy(gdf, env) ;
    Shelter<SEXP> __ ;

    NamedListAccumulator<SEXP> accumulator ;
    int ncolumns = df.size() ;
    CharacterVector column_names = df.names() ;
    for( int i=0; i<ncolumns; i++){
        accumulator.set( column_names[i], df[i] ) ;
    }

    for( int i=0; i<nexpr; i++){
        env = dots.envir(i) ;
        proxy.set_env( env ) ;
        SEXP call = args[i] ;
        SEXP name = results_names[i] ;
        SEXP variable = R_NilValue ;
        if( TYPEOF(call) == SYMSXP ){
            if(proxy.has_variable(call)){
                variable = proxy.get_variable( PRINTNAME(call) ) ;
            } else {
                SEXP v = env.find(CHAR(PRINTNAME(call))) ;
                if( Rf_isNull(v) ){
                    std::stringstream s ;
                    s << "unknown variable: " << CHAR(PRINTNAME(call)) ;
                    stop(s.str());
                } else if( Rf_length(v) == 1){
                    Replicator* rep = constant_replicator(v, gdf.nrows() );
                    variable = __( rep->collect() );
                    delete rep ;
                } else {
                    Replicator* rep = replicator(v, gdf) ;
                    variable = __( rep->collect() );
                    delete rep ;
                }
            }

        } else if(TYPEOF(call) == LANGSXP){
            proxy.set_call( call );
            Gatherer* gather = gatherer( proxy, gdf ) ;
            variable = __( gather->collect() ) ;
            delete gather ;
        } else if(Rf_length(call) == 1) {
            boost::scoped_ptr<Gatherer> gather( constant_gatherer( call, gdf.nrows() ) );
            variable = __( gather->collect() ) ;
        } else {
            stop( "cannot handle" ) ;
        }

        proxy.input( name, variable ) ;
        accumulator.set( name, variable) ;
    }

    return structure_mutate(accumulator, df, classes_grouped() );
}
Example #11
0
//[[Rcpp::export]]
DataFrame intersect_impl(GroupedDataFrame x, GroupedDataFrame y,
                         const std::string& suffix_x = ".x",
                         const std::string& suffix_y = ".y") {

  // indices for subsetting
  std::vector<int> indices_x ;
  std::vector<int> indices_y ;

  // overlap sizes
  std::vector<int> overlap_sizes ;

  auto data_x = x.data() ;
  auto data_y = y.data() ;

  // set up interval trees for each chromosome and apply intersect_group
  GroupApply(x, y, intersect_group, std::ref(indices_x), std::ref(indices_y), std::ref(overlap_sizes));

  DataFrame subset_x = DataFrameSubsetVisitors(data_x, names(data_x)).subset(indices_x, "data.frame");
  DataFrame subset_y = DataFrameSubsetVisitors(data_y, names(data_y)).subset(indices_y, "data.frame");

  auto ncol_x = subset_x.size() ;
  auto ncol_y = subset_y.size() ;

  CharacterVector names(ncol_x + ncol_y) ;
  CharacterVector names_x = subset_x.attr("names") ;
  CharacterVector names_y = subset_y.attr("names") ;

  // replacing y chrom with overlap, same number of cols
  List out(ncol_x + ncol_y) ;

  // x names, data
  for (int i = 0; i < ncol_x; i++) {
    auto name_x = as<std::string>(names_x[i]) ;
    if (name_x != "chrom") {
      name_x += suffix_x ;
    }
    names[i] = name_x ;
    out[i] = subset_x[i] ;
  }

  // y names, data
  for (int i = 0; i < ncol_y; i++) {
    auto name_y = as<std::string>(names_y[i]) ;

    if (name_y == "chrom") continue ;

    name_y += suffix_y ;

    names[i + ncol_x - 1] = name_y ;
    out[i + ncol_x - 1] = subset_y[i] ;
  }

  // overlaps
  out[ncol_x + ncol_y - 1] = overlap_sizes ;
  names[ncol_x + ncol_y - 1] = ".overlap" ;

  out.attr("names") = names ;
  out.attr("class") = classes_not_grouped() ;
  auto nrows = subset_x.nrows() ;
  set_rownames(out, nrows) ;

  return out ;

}
Example #12
0
SEXP slice_grouped(GroupedDataFrame gdf, const LazyDots& dots) {
  typedef GroupedCallProxy<GroupedDataFrame, LazyGroupedSubsets> Proxy;

  const DataFrame& data = gdf.data();
  const Lazy& lazy = dots[0];
  Environment env = lazy.env();
  SymbolVector names = data.names();

  // we already checked that we have only one expression
  Call call(lazy.expr());

  std::vector<int> indx;
  indx.reserve(1000);

  IntegerVector g_test;
  Proxy call_proxy(call, gdf, env);

  int ngroups = gdf.ngroups();
  GroupedDataFrame::group_iterator git = gdf.group_begin();
  for (int i=0; i<ngroups; i++, ++git) {
    const SlicingIndex& indices = *git;
    int nr = indices.size();
    g_test = check_filter_integer_result(call_proxy.get(indices));
    CountIndices counter(indices.size(), g_test);

    if (counter.is_positive()) {
      // positive indexing
      int ntest = g_test.size();
      for (int j=0; j<ntest; j++) {
        if (!(g_test[j] > nr || g_test[j] == NA_INTEGER)) {
          indx.push_back(indices[g_test[j]-1]);
        }
      }
    } else if (counter.get_n_negative() != 0) {
      // negative indexing
      std::set<int> drop;
      int n = g_test.size();
      for (int j=0; j<n; j++) {
        if (g_test[j] != NA_INTEGER)
          drop.insert(-g_test[j]);
      }
      int n_drop = drop.size();
      std::set<int>::const_iterator drop_it = drop.begin();

      int k = 0, j = 0;
      while (drop_it != drop.end()) {
        int next_drop = *drop_it - 1;
        while (j < next_drop) {
          indx.push_back(indices[j++]);
          k++;
        }
        j++;
        ++drop_it;
      }
      while (k < nr - n_drop) {
        indx.push_back(indices[j++]);
        k++;
      }

    }
  }
  DataFrame res = subset(data, indx, names, classes_grouped<GroupedDataFrame>());
  set_vars(res, get_vars(data));
  strip_index(res);

  return GroupedDataFrame(res).data();

}
Example #13
0
//[[Rcpp::export]]
DataFrame complement_impl(GroupedDataFrame gdf, DataFrame genome) {

  genome_map_t chrom_sizes = makeChromSizes(genome) ;

  DataFrame df = gdf.data() ;

  IntegerVector starts = df["start"] ;
  IntegerVector ends = df["end"] ;
  CharacterVector chroms = df["chrom"] ;

  std::vector<std::string> chroms_out ;
  std::vector<int> starts_out ;
  std::vector<int> ends_out ;

  int ngroups = gdf.ngroups() ;
  GroupedDataFrame::group_iterator git = gdf.group_begin() ;
  for (int i = 0; i < ngroups; ++i, ++git) {

    SlicingIndex indices = *git ;
    int ni = indices.size() ;

    int start, end ;
    int last_end = 1 ;

    // get chrom from first index
    auto chrom = as<std::string>(chroms[indices[0]]) ;

    for (int j = 0; j < ni; ++j) {

      start = starts[indices[j]] ;
      end = ends[indices[j]] ;

      if (j == 0) {
        if (start == 1) {
          last_end = end ;
          continue ;
        } else {
          chroms_out.push_back(chrom) ;
          starts_out.push_back(1) ;
          ends_out.push_back(start) ;
        }
      } else {
        chroms_out.push_back(chrom) ;
        starts_out.push_back(last_end) ;
        ends_out.push_back(start) ;
      }

      last_end = end;
    }

    auto chrom_size = chrom_sizes[chrom] ;

    if (last_end < chrom_size) {
      chroms_out.push_back(chrom) ;
      starts_out.push_back(last_end) ;
      ends_out.push_back(chrom_size) ;
    }
  }

  return DataFrame::create(_("chrom") = chroms_out,
                           _("start") = starts_out,
                           _("end") = ends_out,
                           _("stringsAsFactors") = false) ;
}