SEXP hashed_model_matrix(RObject tf, DataFrameLike data, unsigned long hash_size, bool transpose, S4 retval, bool keep_hashing_mapping, bool is_xi, bool progress) { if (hash_size > 4294967296) throw std::invalid_argument("hash_size is too big!"); NameClassMapping reference_class(get_class(data)); Environment e(Environment::base_env().new_child(wrap(true))); std::shared_ptr<HashFunction> pHF(NULL), pBHF(NULL); if (keep_hashing_mapping) { pHF.reset(new MurmurHash3LogHashFunction(wrap(e), MURMURHASH3_H_SEED)); } else { pHF.reset(new MurmurHash3HashFunction(MURMURHASH3_H_SEED)); } if (is_xi) pBHF.reset(new MurmurHash3HashFunction(MURMURHASH3_XI_SEED)); else pBHF.reset(new NullHashFunction); ConvertersVec converters(get_converters(reference_class, tf, data, pHF.get(), pBHF.get(), hash_size)); #ifdef NOISY_DEBUG Rprintf("The size of convertres is %d\n", converters.size()); #endif std::vector<int> ivec, pvec(1, 0); std::vector<double> xvec; bool is_intercept = as<bool>(tf.attr("intercept")); #ifdef NOISY_DEBUG Rprintf("nrow(data): %d length(converters): %d\n", data.nrows(), converters.size()); #endif std::shared_ptr<boost::progress_display> pd(NULL); if (transpose) { if (progress) pd.reset(new boost::progress_display(data.nrows(), Rcpp::Rcout)); for(auto i = 0;i < data.nrows();i++) { if (progress) ++(*pd); if (is_intercept) { ivec.push_back(0); xvec.push_back(1.0); } for(auto j = converters.begin();j != converters.end();j++) { pVectorConverter& p(*j); const std::vector<uint32_t>& i_origin(p->get_feature(i)); const std::vector<double>& x_origin(p->get_value(i)); #ifdef NOISY_DEBUG std::for_each(i_origin.begin(), i_origin.end(), [&hash_size](uint32_t hashed_value) { Rprintf("(%zu module %d = %d),", hashed_value, hash_size, hashed_value % hash_size); }); Rprintf("\n"); #endif std::for_each(i_origin.begin(), i_origin.end(), [&ivec, &xvec, &hash_size](uint32_t hashed_value) { ivec.push_back(hashed_value); }); xvec.insert(xvec.end(), x_origin.begin(), x_origin.end()); } pvec.push_back(ivec.size()); } } else { if (progress) pd.reset(new boost::progress_display(data.nrows(), Rcpp::Rcout)); std::map< uint32_t, std::pair< std::vector<int>, std::vector<double> > > cache; if (is_intercept) { std::pair< std::vector<int>, std::vector<double> >& k(cache[0]); k.first.resize(data.nrows()); for(int i = 0;i < data.nrows();i++) { k.first[i] = i; } k.second.resize(data.nrows(), 1.0); } for(auto i = 0;i < data.nrows();i++) { if (progress) ++(*pd); for(auto j = converters.begin();j != converters.end();j++) { pVectorConverter& p(*j); const std::vector<uint32_t>& i_origin(p->get_feature(i)); const std::vector<double>& x_origin(p->get_value(i)); auto x_value = x_origin.begin(); std::for_each(i_origin.begin(), i_origin.end(), [&cache, &hash_size, &x_value, &i](uint32_t hashed_value) { std::pair< std::vector<int>, std::vector<double> >& k(cache[hashed_value]); k.first.push_back(i); k.second.push_back(*(x_value++)); }); } } int pvec_value = ivec.size(); for(auto i = cache.begin();i != cache.end();i++) { while(pvec.size() <= i->first) pvec.push_back(pvec_value); ivec.insert(ivec.end(), i->second.first.begin(), i->second.first.end()); { std::vector<int> tmp; i->second.first.swap(tmp); } xvec.insert(xvec.end(), i->second.second.begin(), i->second.second.end()); { std::vector<double> tmp; i->second.second.swap(tmp); } pvec_value = ivec.size(); } pvec.resize(hash_size + 1, pvec_value); } retval.slot("i") = wrap(ivec); retval.slot("p") = wrap(pvec); retval.slot("x") = wrap(xvec); IntegerVector dim(2); if (transpose) { dim[0] = hash_size; dim[1] = pvec.size() - 1; retval.slot("Dim") = dim; } else { dim[0] = data.nrows(); dim[1] = hash_size; retval.slot("Dim") = dim; } { List dimnames(2); dimnames[0] = CharacterVector(0); dimnames[1] = CharacterVector(0); retval.slot("Dimnames") = dimnames; } retval.slot("factors") = List(); { CharacterVector key(e.ls(true)); std::for_each(key.begin(), key.end(), [&e, &hash_size](const char* s) { uint32_t *p = (uint32_t*) INTEGER(e[s]); p[0] = p[0] % hash_size; }); } retval.attr("mapping") = e; return retval; }