Example #1
0
// [[Rcpp::export]]
DataFrame inner_join_impl(DataFrame x, DataFrame y,
                          IntegerVector by_x, IntegerVector by_y,
                          IntegerVector aux_x, IntegerVector aux_y,
                          bool na_match) {
  check_by(by_x);

  typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map;
  DataFrameJoinVisitors visitors(x, y, by_x, by_y, false, na_match);
  Map map(visitors);

  int n_x = x.nrows(), n_y = y.nrows();

  std::vector<int> indices_x;
  std::vector<int> indices_y;

  train_push_back_right(map, n_y);

  for (int i = 0; i < n_x; i++) {
    Map::iterator it = map.find(i);
    if (it != map.end()) {
      push_back_right(indices_y, it->second);
      push_back(indices_x, i, it->second.size());
    }
  }

  return subset_join(x, y,
                     indices_x, indices_y,
                     by_x, by_y,
                     aux_x, aux_y,
                     get_class(x)
                    );
}
Example #2
0
// [[Rcpp::export]]
DataFrame right_join_impl(DataFrame x, DataFrame y,
                          IntegerVector by_x, IntegerVector by_y,
                          IntegerVector aux_x, IntegerVector aux_y,
                          bool na_match) {
  check_by(by_x);

  typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map;
  DataFrameJoinVisitors visitors(x, y, by_x, by_y, false, na_match);
  Map map(visitors);

  // train the map in terms of x
  train_push_back(map, x.nrows());

  std::vector<int> indices_x;
  std::vector<int> indices_y;

  int n_y = y.nrows();
  for (int i = 0; i < n_y; i++) {
    // find a row in y that matches row i in x
    Map::iterator it = map.find(-i - 1);
    if (it != map.end()) {
      push_back(indices_x,  it->second);
      push_back(indices_y, i, it->second.size());
    } else {
      indices_x.push_back(-i - 1); // point to the i-th row in the right table
      indices_y.push_back(i);
    }
  }
  return subset_join(x, y,
                     indices_x, indices_y,
                     by_x, by_y,
                     aux_x, aux_y,
                     get_class(x)
                    );
}
Example #3
0
  void RandomTree::findProximity(DataFrame & data, std::vector<unsigned int> & proximity)
  {
    std::vector<unsigned int> proxVec;

    unsigned int dSize = data.getNumDataVectors();
    proxVec.resize(dSize);

    //Find out which node each vector is classified as 
    for(unsigned int i = 0; i < dSize; i++)
    {
      std::string resultClass;
      unsigned int nodeId = classifyDataVector(data.getDataVector(i), resultClass);

      proxVec[i] = nodeId;
    }

    for(unsigned int j = 0; j < dSize; j++)
    {
      unsigned int tempId = proxVec[j];
      for(unsigned int k = j; k < dSize;j++)
      {
        if(proxVec[k] == tempId)
        {
          proximity[j * dSize + k] += 1;
        }
      }
    }
  }
Example #4
0
inline DataFrame grouped_subset( const Data& gdf, const LogicalVector& test, const CharacterVector& names, CharacterVector classes){
  DataFrame data = gdf.data() ;
  DataFrame res = subset( data, test, names, classes) ;
  res.attr("vars")   = data.attr("vars") ;
  strip_index(res);
  return Data(res).data() ;
}
  void DataFrameDiscretizer::discretize(DataFrame& df, TgsProgress* progress)
  {
    _df = &df;

    for (unsigned int i = 0; i < df.getNumFactors(); i++)
    {
      if (progress)
      {
        progress->setProgress((double)i / (double)df.getNumFactors());
      }
      if (_df->isNominal(i) == false)
      {
        if (_df->getNullTreatment(i) == DataFrame::NullAsMissingValue)
        {
          // replace nulls with random sampling of data (imputation), otherwise nulls get put
          // into their own category.
          _replaceNulls(i);
        } 
        _discretizeColumn(i);
      }
    }
    if (progress)
    {
      progress->setProgress(1.0);
    }
  }
Example #6
0
// [[Rcpp::export]]
DataFrame inner_join_impl(DataFrame x, DataFrame y,
                          CharacterVector by_x, CharacterVector by_y,
                          std::string& suffix_x, std::string& suffix_y,
                          bool na_match) {
  if (by_x.size() == 0) stop("no variable to join by");
  typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map;
  DataFrameJoinVisitors visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), true, na_match);
  Map map(visitors);

  int n_x = x.nrows(), n_y = y.nrows();

  std::vector<int> indices_x;
  std::vector<int> indices_y;

  train_push_back_right(map, n_y);

  for (int i = 0; i < n_x; i++) {
    Map::iterator it = map.find(i);
    if (it != map.end()) {
      push_back_right(indices_y, it->second);
      push_back(indices_x, i, it->second.size());
    }
  }

  return subset_join(x, y,
                     indices_x, indices_y,
                     by_x, by_y,
                     suffix_x, suffix_y,
                     get_class(x)
                    );
}
Example #7
0
// [[Rcpp::export]]
DataFrame anti_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, bool na_match) {
  if (by_x.size() == 0) stop("no variable to join by");
  typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map;
  DataFrameJoinVisitors visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), false, na_match);
  Map map(visitors);

  // train the map in terms of x
  train_push_back(map, x.nrows());

  int n_y = y.nrows();
  // remove the rows in x that match
  for (int i = 0; i < n_y; i++) {
    Map::iterator it = map.find(-i - 1);
    if (it != map.end())
      map.erase(it);
  }

  // collect what's left
  std::vector<int> indices;
  for (Map::iterator it = map.begin(); it != map.end(); ++it)
    push_back(indices, it->second);

  const DataFrame& out = subset(x, indices, x.names(), get_class(x));
  strip_index(out);
  return out;
}
Example #8
0
// [[Rcpp::export]]
DataFrame right_join_impl(DataFrame x, DataFrame y,
                          CharacterVector by_x, CharacterVector by_y,
                          std::string& suffix_x, std::string& suffix_y,
                          bool na_match) {
  if (by_x.size() == 0) stop("no variable to join by");
  typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map;
  DataFrameJoinVisitors visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), true, na_match);
  Map map(visitors);

  // train the map in terms of x
  train_push_back(map, x.nrows());

  std::vector<int> indices_x;
  std::vector<int> indices_y;

  int n_y = y.nrows();
  for (int i = 0; i < n_y; i++) {
    // find a row in y that matches row i in x
    Map::iterator it = map.find(-i - 1);
    if (it != map.end()) {
      push_back(indices_x,  it->second);
      push_back(indices_y, i, it->second.size());
    } else {
      indices_x.push_back(-i - 1); // point to the i-th row in the right table
      indices_y.push_back(i);
    }
  }
  return subset_join(x, y,
                     indices_x, indices_y,
                     by_x, by_y,
                     suffix_x, suffix_y,
                     get_class(x)
                    );
}
Example #9
0
// [[Rcpp::export]]
DataFrame semi_join_impl(DataFrame x, DataFrame y, CharacterVector by_x, CharacterVector by_y, bool na_match) {
  if (by_x.size() == 0) stop("no variable to join by");
  typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map;
  DataFrameJoinVisitors visitors(x, y, SymbolVector(by_x), SymbolVector(by_y), false, na_match);
  Map map(visitors);

  // train the map in terms of x
  train_push_back(map, x.nrows());

  int n_y = y.nrows();
  // this will collect indices from rows in x that match rows in y
  std::vector<int> indices;
  for (int i = 0; i < n_y; i++) {
    // find a row in x that matches row i from y
    Map::iterator it = map.find(-i - 1);

    if (it != map.end()) {
      // collect the indices and remove them from the
      // map so that they are only found once.
      push_back(indices, it->second);

      map.erase(it);

    }
  }

  const DataFrame& out = subset(x, indices, x.names(), get_class(x));
  strip_index(out);
  return out;
}
Example #10
0
// [[Rcpp::export]]
SEXP filter_impl( DataFrame df, LazyDots dots){
    if( df.nrows() == 0 || Rf_isNull(df) ) {
        return df ;
    }
    check_valid_colnames(df) ;
    assert_all_white_list(df) ;

    if( dots.size() == 0 ) return df ;

    // special case
    if( dots.size() == 1 && TYPEOF(dots[0].expr()) == LGLSXP){
        LogicalVector what = dots[0].expr() ;
        if( what.size() == 1 ){
            if( what[0] == TRUE ){
                return df ;
            } else {
                return empty_subset( df, df.names(), is<GroupedDataFrame>(df) ? classes_grouped<GroupedDataFrame>() : classes_not_grouped() ) ;
            }
        }
    }
    if( is<GroupedDataFrame>( df ) ){
        return filter_grouped<GroupedDataFrame, LazyGroupedSubsets>( GroupedDataFrame(df), dots);
    } else if( is<RowwiseDataFrame>(df) ){
        return filter_grouped<RowwiseDataFrame, LazyRowwiseSubsets>( RowwiseDataFrame(df), dots);
    } else {
        return filter_not_grouped( df, dots ) ;
    }
}
Example #11
0
// update the device's data
void Device::update()
{
    // read current traffic
    DataFrame dataFrame = m_devReader.getNewDataFrame();

    if(dataFrame.isValid())
    {
        /* Depending on the CPU architecture and the OS interface
         * used for reading the device statistics, the counts can
         * overflow. We monitor the overflows and fix them.
         */
        fixOverflows(dataFrame, m_dataFrameOld);

        m_deviceStatistics.insertDataFrame(dataFrame);

        m_deviceGraphIn.update(m_deviceStatistics.getDataInPerSecond());
        m_deviceGraphOut.update(m_deviceStatistics.getDataOutPerSecond());

        m_dataFrameOld = dataFrame;
    }
    else
    {
        m_deviceStatistics.reset();
        m_deviceGraphIn.resetTrafficData();
        m_deviceGraphOut.resetTrafficData();
    }
}
Example #12
0
SEXP filter_not_grouped( DataFrame df, List args, const DataDots& dots){
    CharacterVector names = df.names() ;
    SymbolSet set ;
    for( int i=0; i<names.size(); i++){
        set.insert( Rf_install( names[i] ) ) ;
    }

    if( dots.single_env() ){
        Environment env = dots.envir(0) ;
        // a, b, c ->  a & b & c
        Shield<SEXP> call( and_calls( args, set ) ) ;

        // replace the symbols that are in the data frame by vectors from the data frame
        // and evaluate the expression
        CallProxy proxy( (SEXP)call, df, env ) ;
        LogicalVector test = proxy.eval() ;
        check_filter_result(test, df.nrows());
        DataFrame res = subset( df, test, df.names(), classes_not_grouped() ) ;
        return res ;
    } else {
        int nargs = args.size() ;
        CallProxy first_proxy(args[0], df, dots.envir(0) ) ;
        LogicalVector test = first_proxy.eval() ;
        check_filter_result(test, df.nrows());

        for( int i=1; i<nargs; i++){
            LogicalVector test2 = CallProxy(args[i], df, dots.envir(i) ).eval() ;
            combine_and(test, test2) ;
        }

        DataFrame res = subset( df, test, df.names(), classes_not_grouped() ) ;
        return res ;
    }
}
Example #13
0
DataFrame filter_grouped_single_env( const GroupedDataFrame& gdf, const List& args, const Environment& env){
    const DataFrame& data = gdf.data() ;
    CharacterVector names = data.names() ;
    SymbolSet set ;
    for( int i=0; i<names.size(); i++){
        set.insert( Rf_install( names[i] ) ) ;
    }

    // a, b, c ->  a & b & c
    Call call( and_calls( args, set ) ) ;

    int nrows = data.nrows() ;
    LogicalVector test = no_init(nrows);

    LogicalVector g_test ;
    GroupedCallProxy call_proxy( call, gdf, env ) ;

    int ngroups = gdf.ngroups() ;
    GroupedDataFrame::group_iterator git = gdf.group_begin() ;
    for( int i=0; i<ngroups; i++, ++git){
        SlicingIndex indices = *git ;
        int chunk_size = indices.size() ;

        g_test  = call_proxy.get( indices );
        check_filter_result(g_test, chunk_size ) ;
        for( int j=0; j<chunk_size; j++){
            test[ indices[j] ] = g_test[j] ;
        }
    }

    DataFrame res = subset( data, test, names, classes_grouped() ) ;
    res.attr( "vars")   = data.attr("vars") ;

    return res ;
}
Example #14
0
// [[Rcpp::export]]
DataFrame semi_join_impl( DataFrame x, DataFrame y, CharacterVector by){
    typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ;
    DataFrameJoinVisitors visitors(x, y, by) ;
    Map map(visitors);

    // train the map in terms of x
    train_push_back( map, x.nrows(), x.nrows() / 10) ;

    int n_y = y.nrows() ;
    // this will collect indices from rows in x that match rows in y
    std::vector<int> indices ;
    for( int i=0; i<n_y; i++){
        // find a row in x that matches row i from y
        Map::iterator it = map.find(-i-1) ;

        if( it != map.end() ){
            // collect the indices and remove them from the
            // map so that they are only found once.
            push_back( indices, it->second ) ;

            map.erase(it) ;

        }
    }

    return subset(x, indices, x.names(), x.attr("class") ) ;
}
Example #15
0
// [[Rcpp::export]]
DataFrame right_join_impl( DataFrame x, DataFrame y, CharacterVector by){
    typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map ;
    DataFrameJoinVisitors visitors(x, y, by) ;
    Map map(visitors);

    // train the map in terms of y
    train_push_back( map, x.nrows(), x.nrows() / 10 ) ;

    std::vector<int> indices_x ;
    std::vector<int> indices_y ;

    int n_y = y.nrows() ;
    for( int i=0; i<n_y; i++){
        // find a row in y that matches row i in x
        Map::iterator it = map.find(-i-1) ;
        if( it != map.end() ){
            push_back( indices_x,    it->second ) ;
            push_back( indices_y, i, it->second.size() ) ;
        } else {
            indices_x.push_back(-1) ; // mark NA
            indices_y.push_back(i) ;
        }
    }
    return subset( x, y, indices_x, indices_y, by, x.attr( "class" ) ) ;
}
Example #16
0
DataFrame filter_grouped( const GroupedDataFrame& gdf, List args, Environment env){
    // a, b, c ->  a & b & c
    Language call = and_calls( args ) ;
    
    const DataFrame& data = gdf.data() ;
    int nrows = data.nrows() ;
    LogicalVector test = no_init(nrows);
    
    LogicalVector g_test ;
    GroupedCallProxy call_proxy( call, gdf, env ) ;
    
    int ngroups = gdf.ngroups() ;
    GroupedDataFrame::group_iterator git = gdf.group_begin() ;
    for( int i=0; i<ngroups; i++, ++git){
        SlicingIndex indices = *git ;
        g_test  = call_proxy.get( indices );
        
        int chunk_size = indices.size() ;
        for( int j=0; j<chunk_size; j++){
            test[ indices[j] ] = g_test[j] ;  
        }
    }
    DataFrame res = subset( data, test, data.names(), classes_grouped() ) ;
    res.attr( "vars")   = data.attr("vars") ;
            
    return res ;
}
Example #17
0
DataFrame subset( DataFrame x, DataFrame y, const Index& indices_x, const Index& indices_y, CharacterVector by, CharacterVector classes ){
    CharacterVector x_columns = x.names() ;
    DataFrameVisitors visitors_x(x, x_columns) ;

    CharacterVector all_y_columns = y.names() ;
    CharacterVector y_columns = setdiff( all_y_columns, by ) ;
    JoinColumnSuffixer suffixer(x_columns, y_columns, by) ;

    DataFrameVisitors visitors_y(y, y_columns) ;

    int nrows = indices_x.size() ;
    int nv_x = visitors_x.size(), nv_y = visitors_y.size() ;
    List out(nv_x+nv_y);
    CharacterVector names(nv_x+nv_y) ;
    int k=0;
    for( ; k<nv_x; k++){
       out[k] = visitors_x.get(k)->subset(indices_x) ;
       names[k] = suffixer.get( x_columns[k], ".x" ) ;
    }
    for( int i=0; i<nv_y; i++, k++){
       out[k] = visitors_y.get(i)->subset(indices_y) ;
       names[k] = suffixer.get(y_columns[i], ".y" ) ;
    }
    out.attr("class") = classes ;
    set_rownames(out, nrows) ;
    out.names() = names ;

    SEXP vars = x.attr( "vars" ) ;
    if( !Rf_isNull(vars) )
        out.attr( "vars" ) = vars ;

    return (SEXP)out ;
}
  double SymmetricUncertaintyCalculator::_calculateEntropy(const DataFrame& df, int factorIndex)
  {
    typedef HashMap<int, int> ClassCounts;
    ClassCounts cc;
    
    for(unsigned int i = 0; i < df.getNumDataVectors(); i++)
    {
      double v = df.getDataElement(i, factorIndex);
      // null values are not supported Use the DataFrameDiscretizer to "fix" nulls
      if (DataFrame::isNull(v) == true)
      {
        throw Tgs::Exception("Null values are not supported by SymmetricUncertaintyCalculator");
      }
      cc[(int)(v + .5)]++;
    }

    double sum = 0.0;
    double totalSize = df.getNumDataVectors();
    for (ClassCounts::const_iterator classIt = cc.begin(); classIt != cc.end(); classIt++)
    {
      double count = classIt->second;
      sum += count / totalSize * log2(count / totalSize);
    }

    return -sum;
  }
Example #19
0
// [[Rcpp::export]]
DataFrame arrange_impl( DataFrame data, List args, DataDots dots ){
    int nargs = args.size() ;
    List variables(nargs) ;
    LogicalVector ascending(nargs) ;
    Shelter<SEXP> __ ;

    for(int i=0; i<nargs; i++){
        SEXP call = args[i] ;
        bool is_desc = TYPEOF(call) == LANGSXP && Rf_install("desc") == CAR(call) ;

        CallProxy call_proxy( is_desc ? CADR(call) : call, data, dots.envir(i)) ;
        variables[i] = __(call_proxy.eval()) ;
        if( Rf_length(variables[i]) != data.nrows() ){
            std::stringstream s ;
            s << "incorrect size ("
              << Rf_length(variables[i])
              << "), expecting :"
              << data.nrows() ;
            stop(s.str()) ;
        }
        ascending[i] = !is_desc ;
    }
    OrderVisitors o(variables,ascending, nargs) ;
    IntegerVector index = o.apply() ;

    DataFrameVisitors visitors( data, data.names() ) ;
    DataFrame res = visitors.subset(index, data.attr("class") ) ;
    return res;
}
Example #20
0
// [[Rcpp::export]]
SEXP distinct_impl(DataFrame df, CharacterVector vars, CharacterVector keep) {
  if (df.size() == 0)
    return df;

  // No vars means ungrouped data with keep_all = TRUE.
  if (vars.size() == 0)
    return df;

  check_valid_colnames(df);
  if (!vars.size()) {
    vars = df.names();
  }
  DataFrameVisitors visitors(df, vars);

  std::vector<int> indices;
  VisitorSetIndexSet<DataFrameVisitors> set(visitors);

  int n = df.nrows();
  for (int i=0; i<n; i++) {
    if (set.insert(i).second) {
      indices.push_back(i);
    }
  }

  return DataFrameSubsetVisitors(df, keep).subset(indices, df.attr("class"));
}
Example #21
0
SEXP structure_mutate( Proxy& call_proxy, const DataFrame& df, const CharacterVector& results_names, CharacterVector classes){
    int n = call_proxy.nsubsets() ;
    
    List out(n) ;
    CharacterVector names(n) ;
    
    CharacterVector input_names = df.names() ;
    int ncolumns = df.size() ;
    int i=0 ;
    for( ; i<ncolumns; i++){
        out[i] = call_proxy.get_variable(input_names[i]) ;
        SET_NAMED( out[i], 2 );
        names[i] = input_names[i] ;
    }
    for( int k=0; i<n; k++ ){
        String name = results_names[k] ;
        
        if( ! any( input_names.begin(), input_names.end(), name.get_sexp() ) ){
            SEXP x   = call_proxy.get_variable( name ) ; 
            out[i]   = x ;
            SET_NAMED( out[i], 2 );
            names[i] = name ;
            i++ ;
        }
    }
    
    
    out.attr("class") = classes ;
    set_rownames( out, df.nrows() ) ;
    out.names() = names;
    
    return out ;    
}
Example #22
0
// version of grouped filter when contributions to ... come from several environment
DataFrame filter_grouped_multiple_env( const GroupedDataFrame& gdf, const List& args, const DataDots& dots){
    const DataFrame& data = gdf.data() ;
    CharacterVector names = data.names() ;
    SymbolSet set ;
    for( int i=0; i<names.size(); i++){
        set.insert( Rf_install( names[i] ) ) ;
    }

    int nrows = data.nrows() ;
    LogicalVector test(nrows, TRUE);

    LogicalVector g_test ;

    for( int k=0; k<args.size(); k++){
        Call call( (SEXP)args[k] ) ;
        GroupedCallProxy call_proxy( call, gdf, dots.envir(k) ) ;
        int ngroups = gdf.ngroups() ;
        GroupedDataFrame::group_iterator git = gdf.group_begin() ;
        for( int i=0; i<ngroups; i++, ++git){
            SlicingIndex indices = *git ;
            int chunk_size = indices.size() ;

            g_test  = call_proxy.get( indices );
            check_filter_result(g_test, chunk_size ) ;
            for( int j=0; j<chunk_size; j++){
                test[ indices[j] ] = test[ indices[j] ] & g_test[j] ;
            }
        }
    }
    DataFrame res = subset( data, test, names, classes_grouped() ) ;
    res.attr( "vars")   = data.attr("vars") ;

    return res ;
}
Example #23
0
// [[Rcpp::export]]
DataFrame as_regular_df(DataFrame df){
  DataFrame copy = shallow_copy(df) ;
  SET_ATTRIB(copy, strip_group_attributes(df)) ;
  SET_OBJECT(copy, OBJECT(df)) ;
  copy.attr("class") = CharacterVector::create("data.frame") ;
  return copy ;
}
// [[Rcpp::export]]
SEXP ex13_2(DataFrame input, CharacterVector columnName, double replace){
  BEGIN_RCPP
  List names = input.names();
  List mapObj = as<List>(input);
  // all rows
  for(int i=0; i<input.nrows(); i++){
    ex13helper(mapObj, as<string>(columnName), i, replace);
  }
  return(wrap(mapObj));
  END_RCPP
}
Example #25
0
SEXP structure_mutate( const NamedListAccumulator<SEXP>& accumulator, const DataFrame& df, CharacterVector classes){
    List res = accumulator ;
    res.attr("class") = classes ;
    set_rownames( res, df.nrows() ) ;
    res.attr( "vars")     = df.attr("vars") ;
    res.attr( "labels" )  = df.attr("labels" );
    res.attr( "index")    = df.attr("index") ;
    res.attr( "indices" ) = df.attr("indices" ) ;

    return res ;
}
Example #26
0
// [[Rcpp::export]]
IntegerVector grouped_indices_impl(DataFrame data, ListOf<Symbol> symbols) {
  int nsymbols = symbols.size();
  if (nsymbols == 0)
    return rep(1, data.nrows());
  CharacterVector vars(nsymbols);
  for (int i=0; i<nsymbols; i++) {
    vars[i] = PRINTNAME(symbols[i]);

    const char* name = vars[i];
    SEXP v;
    try {
      v = data[name];
    } catch (...) {
      stop("unknown column '%s'", name);
    }
    if (!white_list(v) || TYPEOF(v) == VECSXP) {
      stop("cannot group column %s, of class '%s'", name, get_single_class(v));
    }
  }

  DataFrameVisitors visitors(data, vars);
  ChunkIndexMap map(visitors);
  int n = data.nrows();
  train_push_back(map, n);

  DataFrame labels = DataFrameSubsetVisitors(data, vars).subset(map, "data.frame");
  IntegerVector labels_order = OrderVisitors(labels).apply();

  labels = DataFrameSubsetVisitors(labels).subset(labels_order, "data.frame");

  int ngroups = map.size();

  IntegerVector res = no_init(n);

  std::vector<const std::vector<int>* > chunks(ngroups);
  ChunkIndexMap::const_iterator it = map.begin();
  for (int i=0; i<ngroups; i++, ++it) {
    chunks[i] = &it->second;
  }

  for (int i=0; i<ngroups; i++) {
    int idx = labels_order[i];
    const std::vector<int>& v = *chunks[idx];

    int n_index = v.size();
    for (int j=0; j<n_index; j++) {
      res[ v[j] ] = i+1;
    }
  }

  return res;
}
Example #27
0
// [[Rcpp::export]]
DataFrame full_join_impl(DataFrame x, DataFrame y,
                         IntegerVector by_x, IntegerVector by_y,
                         IntegerVector aux_x, IntegerVector aux_y,
                         bool na_match) {
  check_by(by_x);

  typedef VisitorSetIndexMap<DataFrameJoinVisitors, std::vector<int> > Map;
  DataFrameJoinVisitors visitors(y, x, by_y, by_x, false, na_match);
  Map map(visitors);

  // train the map in terms of y
  train_push_back(map, y.nrows());

  std::vector<int> indices_x;
  std::vector<int> indices_y;

  int n_x = x.nrows(), n_y = y.nrows();

  // get both the matches and the rows from left but not right
  for (int i = 0; i < n_x; i++) {
    // find a row in y that matches row i in x
    Map::iterator it = map.find(-i - 1);
    if (it != map.end()) {
      push_back(indices_y,  it->second);
      push_back(indices_x, i, it->second.size());
    } else {
      indices_y.push_back(-1); // mark NA
      indices_x.push_back(i);
    }
  }

  // train a new map in terms of x this time
  DataFrameJoinVisitors visitors2(x, y, by_x, by_y, false, na_match);
  Map map2(visitors2);
  train_push_back(map2, x.nrows());

  for (int i = 0; i < n_y; i++) {
    // try to find row in x that matches this row of y
    Map::iterator it = map2.find(-i - 1);
    if (it == map2.end()) {
      indices_x.push_back(-i - 1);
      indices_y.push_back(i);
    }
  }

  return subset_join(x, y,
                     indices_x, indices_y,
                     by_x, by_y,
                     aux_x, aux_y,
                     get_class(x)
                    );
}
  double InformationGainCalculator::calculateInformationGain(const DataFrame& df1, 
    int factorIndex1, const DataFrame& df2, int factorIndex2)
  {
    assert(df1.isNominal(factorIndex1));
    assert(df2.isNominal(factorIndex2));

    double hy = _calculateEntropy(df1, factorIndex1);
    double hyx = _calculateConditionalEntropy(df1, factorIndex1, df2, factorIndex2);

    double gain = hy - hyx;

    return gain;
  }  
Example #29
0
// [[Rcpp::export]]
DataFrame union_data_frame( DataFrame x, DataFrame y){
    if( !compatible_data_frame(x,y) )
        stop( "not compatible" );

    typedef VisitorSetIndexSet<DataFrameJoinVisitors> Set ;
    DataFrameJoinVisitors visitors(x, y, x.names() ) ;
    Set set(visitors);

    train_insert( set, x.nrows() ) ;
    train_insert_right( set, y.nrows() ) ;

    return visitors.subset( set, x.attr("class") ) ;
}
Example #30
0
// [[Rcpp::export]]
SEXP distinct_impl( DataFrame df ){
    DataFrameVisitors visitors(df) ;
    
    std::vector<int> indices ;
    VisitorSetIndexSet<DataFrameVisitors> set(visitors) ;
    
    int n = df.nrows() ;
    for( int i=0; i<n; i++){
        if( set.insert(i).second ){
            indices.push_back(i) ;    
        }
    }
    return visitors.subset(indices, df.attr("class") ); 
}