예제 #1
0
SEXP hashed_model_matrix(RObject tf, DataFrameLike data, unsigned long hash_size, bool transpose, S4 retval, bool keep_hashing_mapping, bool is_xi, bool progress) {
  if (hash_size > 4294967296) throw std::invalid_argument("hash_size is too big!");
  NameClassMapping reference_class(get_class(data));
  Environment e(Environment::base_env().new_child(wrap(true)));
  std::shared_ptr<HashFunction> pHF(NULL), pBHF(NULL);
  if (keep_hashing_mapping) {
    pHF.reset(new MurmurHash3LogHashFunction(wrap(e), MURMURHASH3_H_SEED));
  } else {
    pHF.reset(new MurmurHash3HashFunction(MURMURHASH3_H_SEED));
  }
  if (is_xi) pBHF.reset(new MurmurHash3HashFunction(MURMURHASH3_XI_SEED));
  else pBHF.reset(new NullHashFunction);
  ConvertersVec converters(get_converters(reference_class, tf, data, pHF.get(), pBHF.get(), hash_size));
  #ifdef NOISY_DEBUG
  Rprintf("The size of convertres is %d\n", converters.size());
  #endif
  std::vector<int> ivec, pvec(1, 0);
  std::vector<double> xvec;
  bool is_intercept = as<bool>(tf.attr("intercept"));
  #ifdef NOISY_DEBUG
  Rprintf("nrow(data): %d length(converters): %d\n", data.nrows(), converters.size());
  #endif
  std::shared_ptr<boost::progress_display> pd(NULL);
  if (transpose) {
    if (progress) pd.reset(new boost::progress_display(data.nrows(), Rcpp::Rcout));
    for(auto i = 0;i < data.nrows();i++) {
      if (progress) ++(*pd);
      if (is_intercept) {
        ivec.push_back(0);
        xvec.push_back(1.0);
      }
      for(auto j = converters.begin();j != converters.end();j++) {
        pVectorConverter& p(*j);
        const std::vector<uint32_t>& i_origin(p->get_feature(i));
        const std::vector<double>& x_origin(p->get_value(i));
        #ifdef NOISY_DEBUG
        std::for_each(i_origin.begin(), i_origin.end(), [&hash_size](uint32_t hashed_value) {
          Rprintf("(%zu module %d = %d),", hashed_value, hash_size, hashed_value % hash_size);
        });
        Rprintf("\n");
        #endif
        std::for_each(i_origin.begin(), i_origin.end(), [&ivec, &xvec, &hash_size](uint32_t hashed_value) {
          ivec.push_back(hashed_value);
        });
        xvec.insert(xvec.end(), x_origin.begin(), x_origin.end());
      }
      pvec.push_back(ivec.size());
    }
  }
  else {
    if (progress) pd.reset(new boost::progress_display(data.nrows(), Rcpp::Rcout));
    std::map< uint32_t, std::pair< std::vector<int>, std::vector<double> > > cache;
    if (is_intercept) {
      std::pair< std::vector<int>, std::vector<double> >& k(cache[0]);
      k.first.resize(data.nrows());
      for(int i = 0;i < data.nrows();i++) {
        k.first[i] = i;
      }
      k.second.resize(data.nrows(), 1.0);
    }
    for(auto i = 0;i < data.nrows();i++) {
      if (progress) ++(*pd);
      for(auto j = converters.begin();j != converters.end();j++) {
        pVectorConverter& p(*j);
        const std::vector<uint32_t>& i_origin(p->get_feature(i));
        const std::vector<double>& x_origin(p->get_value(i));
        auto x_value = x_origin.begin();
        std::for_each(i_origin.begin(), i_origin.end(), [&cache, &hash_size, &x_value, &i](uint32_t hashed_value) {
          std::pair< std::vector<int>, std::vector<double> >& k(cache[hashed_value]);
          k.first.push_back(i);
          k.second.push_back(*(x_value++));
        });
      }
    }
    int pvec_value = ivec.size();
    for(auto i = cache.begin();i != cache.end();i++) {
      while(pvec.size() <= i->first) pvec.push_back(pvec_value);
      ivec.insert(ivec.end(), i->second.first.begin(), i->second.first.end());
      {
        std::vector<int> tmp;
        i->second.first.swap(tmp);
      }
      xvec.insert(xvec.end(), i->second.second.begin(), i->second.second.end());
      {
        std::vector<double> tmp;
        i->second.second.swap(tmp);
      }
      pvec_value = ivec.size();
    }
    pvec.resize(hash_size + 1, pvec_value);
  }
  retval.slot("i") = wrap(ivec);
  retval.slot("p") = wrap(pvec);
  retval.slot("x") = wrap(xvec);
  IntegerVector dim(2);
  if (transpose) {
    dim[0] = hash_size;
    dim[1] = pvec.size() - 1;
    retval.slot("Dim") = dim;
  }
  else {
    dim[0] = data.nrows();
    dim[1] = hash_size;
    retval.slot("Dim") = dim;
  }
  {
    List dimnames(2);
    dimnames[0] = CharacterVector(0);
    dimnames[1] = CharacterVector(0);
    retval.slot("Dimnames") = dimnames;
  }
  retval.slot("factors") = List();
  {
    CharacterVector key(e.ls(true));
    std::for_each(key.begin(), key.end(), [&e, &hash_size](const char* s) {
      uint32_t *p = (uint32_t*) INTEGER(e[s]);
      p[0] = p[0] % hash_size;
    });
  }
  retval.attr("mapping") = e;
  return retval;
}