/// Feeds every feature string of one line into the counting trie.
///
/// `textfeats` is a list of feature groups (one inner vector per group); each
/// feature string in each group is passed to `trie.update()` with a value of 1.
/// Presumably `update()` adds that value to the count already stored for the
/// key, so repeated features accumulate -- confirm against trie_t.
///
/// @param trie       Trie receiving the updates.
/// @param textfeats  Feature strings grouped in inner vectors.
/// @return Always `true`.
bool LccrFeaturizer::AccumulateFeatureCountFromLine(
    trie_t &trie, const std::vector<std::vector<std::string>> &textfeats) {
  for (auto group = textfeats.begin(); group != textfeats.end(); ++group) {
    for (auto item = group->begin(); item != group->end(); ++item) {
      // The key is passed with an explicit length, so the terminating NUL of
      // c_str() is not relied upon.
      trie.update(item->c_str(), item->size(), 1);
    }
  }
  return true;
}
/// Copies the frequent features of `raw` into `filtered`, assigning each an id.
///
/// Walks all keys of `raw` with `begin()`/`next()` until
/// `trie_t::CEDAR_NO_PATH` is returned. Every key whose stored value is at
/// least `cutoff` is reconstructed with `suffix()` and written to `filtered`
/// with its value set to `filtered.num_keys()` as sampled just before the
/// write. When `filtered` starts empty this presumably yields consecutive ids
/// 0, 1, 2, ... -- confirm against trie_t.
///
/// @param raw       Trie holding feature -> count.
/// @param filtered  Trie receiving feature -> id for the surviving features.
/// @param cutoff    Minimum count (inclusive) a feature needs to be kept.
void LccrFeaturizer::FilterFeatureWithCount(trie_t &raw, trie_t &filtered,
                                            int cutoff) {
  // Scratch space for the reconstructed key. MAXFEATLEN is only the initial
  // size: features are inserted into `raw` without a length check, so the
  // buffer is grown on demand instead of assuming every key fits.
  std::vector<char> buffer(static_cast<size_t>(LccrFeaturizer::MAXFEATLEN),
                           '\0');

  size_t from(0), p(0);
  for (int featcnt = raw.begin(from, p); featcnt != trie_t::CEDAR_NO_PATH;
       featcnt = raw.next(from, p)) {
    if (featcnt < cutoff) {
      continue;  // too rare, drop it
    }

    // suffix() is handed a key length of `p`; cedar's implementation also
    // stores a terminating NUL at index `p`, hence p + 1 bytes are required.
    if (buffer.size() < p + 1) {
      buffer.resize(p + 1, '\0');
    }
    raw.suffix(buffer.data(), p, from);

    // Sample the key count before writing so the value is not affected by
    // the write itself.
    int key = filtered.num_keys();
    filtered.update(buffer.data(), p, 0) = key;
  }
}
void makeIndex (trie_t & trie, props_t & props, bool & abort, Config const & cfg) { try { props_t tmp_props; tmp_props.reserve(1024); std::map<tstring, int, icasecompare> tmp_propmap; for (SearchLocationInfo const & info : cfg.m_locations) { if (abort) return; SearchDirectory( info.m_dir_path, info.m_includes, info.m_excludes, info.m_recursive, info.m_follow_symlinks , TEXT("") , [] (tstring const & fname, tstring const & cmp) { return true; } , [&trie, &tmp_props, &tmp_propmap] (tstring const & fname, tstring const & fpath) { std::map<tstring, int>::iterator it = tmp_propmap.find(fname); if (it == tmp_propmap.end()) { tstring fname_lwr = fname; boost::algorithm::to_lower(fname_lwr); tmp_props.push_back(Props(fname_lwr, fpath)); trie_t::result_type const id = static_cast<trie_t::result_type>(tmp_props.size() - 1); tmp_propmap[fname_lwr] = id; //dbg_printf("insert: fname=%s fpath=%s idx=%i\n", fname_lwr.c_str(), fpath.c_str(), id); trie.update(fname_lwr.c_str(), fname_lwr.length(), id); } else { tmp_props[it->second].m_fpath.push_back(fpath); //dbg_printf("update: fname=%s fpath=%s idx=%i\n", fname.c_str(), fpath.c_str(), it->second); } } , abort); } props = tmp_props; } catch (std::regex_error const & e) { dbg_printf("Exception caught: %s", e.what()); } //printf("keys: %ld\n", trie.num_keys ()); //printf("size: %ld\n", trie.size ()); }