/**
 * Copy every key of `raw` whose stored count is at least `cutoff` into
 * `filtered`.
 *
 * Each kept key is stored in `filtered` with the value `filtered.num_keys()`
 * sampled immediately before the key is inserted, so the values form a running
 * index over the kept keys.
 *
 * @param raw       Trie walked with begin()/next(); the value returned for
 *                  each key is compared against `cutoff`.
 * @param filtered  Trie receiving the surviving keys.
 * @param cutoff    Minimum value a key must carry in `raw` to be kept.
 */
void LccrFeaturizer::FilterFeatureWithCount(trie_t &raw, trie_t &filtered, int cutoff) {
  // Scratch space for the reconstructed key. It starts at MAXFEATLEN bytes and
  // grows on demand: keys are inserted elsewhere without a length limit, so a
  // fixed-size buffer could be overrun by a long feature string.
  std::vector<char> buffer(static_cast<size_t>(LccrFeaturizer::MAXFEATLEN), '\0');

  size_t from(0), p(0);
  for (int featcnt = raw.begin(from, p); featcnt != trie_t::CEDAR_NO_PATH;
       featcnt = raw.next(from, p)) {
    if (featcnt < cutoff) {
      continue;
    }

    // suffix() is handed a key length of p; reserve one extra byte for the
    // terminating NUL that cedar's suffix() appends after the key bytes.
    if (buffer.size() < p + 1) {
      buffer.resize(p + 1, '\0');
    }
    raw.suffix(buffer.data(), p, from);

    // Read num_keys() BEFORE update(): the insertion changes the key count.
    const int key = filtered.num_keys();
    filtered.update(buffer.data(), p, 0) = key;
  }
}
/**
 * Feed every feature string of one line into `trie`.
 *
 * `textfeats` is a list of feature lists; for each individual feature string
 * the trie is updated with the string's bytes and an update value of 1.
 *
 * @param trie       Trie that receives the updates.
 * @param textfeats  Nested feature strings to record.
 * @return Always true.
 */
bool LccrFeaturizer::AccumulateFeatureCountFromLine(
    trie_t &trie, const std::vector<std::vector<std::string>> &textfeats) {
  for (auto group = textfeats.begin(); group != textfeats.end(); ++group) {
    for (auto feature = group->begin(); feature != group->end(); ++feature) {
      trie.update(feature->c_str(), feature->size(), 1);
    }
  }
  return true;
}
// Saves the trie `t` to the file named by `fpath`.
// Returns true when t.save() reports 0, false for any other return value.
bool saveIndex (trie_t const & t, tstring const & fpath)
{
  return t.save(fpath.c_str()) == 0;
}
float calc_entropy(const string& word , const trie_result_t& res , trie_t& entro_trie , const uint32_t total_freq) { char suffix[256]; hash_t rlt_hash; trie_result_list rlts; size_t rlts_len = entro_trie.commonPrefixPredict(word.c_str(), rlts, NUM_RESULT); float entropy = 0.0; if(rlts_len > 1) { int entropy_freq = 0; // Ignore itself trie_result_list::iterator it = rlts.begin(); for(it++; it != rlts.end(); it++) { assert(it->length < 250); entro_trie.suffix(suffix, it->length, it->id); string tmp_s(suffix); string tmp(tmp_s.begin(), tmp_s.begin()+2); //fprintf(glog.fd, "%s %s %s %d\n",word.c_str(), suffix, tmp.c_str(), it->value); hash_t::iterator it_map = rlt_hash.find(tmp); if (it_map == rlt_hash.end()) { rlt_hash[tmp] = it->value; } else { it_map->second += it->value; } entropy_freq += it->value; } for(hash_t::iterator map_it = rlt_hash.begin(); map_it != rlt_hash.end(); map_it++) { float p = static_cast<float>(map_it->second) / entropy_freq; entropy -= p * log(p); //fprintf(glog.fd, "entropy %s\t%d\n", map_it->first.c_str(), map_it->second); } } else { entropy = static_cast<float>(res.value)/20.0; } return entropy; }
void makeIndex (trie_t & trie, props_t & props, bool & abort, Config const & cfg) { try { props_t tmp_props; tmp_props.reserve(1024); std::map<tstring, int, icasecompare> tmp_propmap; for (SearchLocationInfo const & info : cfg.m_locations) { if (abort) return; SearchDirectory( info.m_dir_path, info.m_includes, info.m_excludes, info.m_recursive, info.m_follow_symlinks , TEXT("") , [] (tstring const & fname, tstring const & cmp) { return true; } , [&trie, &tmp_props, &tmp_propmap] (tstring const & fname, tstring const & fpath) { std::map<tstring, int>::iterator it = tmp_propmap.find(fname); if (it == tmp_propmap.end()) { tstring fname_lwr = fname; boost::algorithm::to_lower(fname_lwr); tmp_props.push_back(Props(fname_lwr, fpath)); trie_t::result_type const id = static_cast<trie_t::result_type>(tmp_props.size() - 1); tmp_propmap[fname_lwr] = id; //dbg_printf("insert: fname=%s fpath=%s idx=%i\n", fname_lwr.c_str(), fpath.c_str(), id); trie.update(fname_lwr.c_str(), fname_lwr.length(), id); } else { tmp_props[it->second].m_fpath.push_back(fpath); //dbg_printf("update: fname=%s fpath=%s idx=%i\n", fname.c_str(), fpath.c_str(), it->second); } } , abort); } props = tmp_props; } catch (std::regex_error const & e) { dbg_printf("Exception caught: %s", e.what()); } //printf("keys: %ld\n", trie.num_keys ()); //printf("size: %ld\n", trie.size ()); }
// Shared aliases and helpers for the swap test phases that follow:
//   mapped_t / trie_t / value_t / key_t / map_t - the trie<int> under test, its
//     value and key types, and a std::map with the same key/mapped types.
//   kt(k)    - converts a mapped_t to a key_t via boost::lexical_cast.
//   mv(x)    - builds the pair (kt(x), x).
//   vt(k, v) - builds the pair (kt(k), v).
// NOTE(review): mv and vt use a [&] capture-default although the only name they
// reference is the static `kt`; if these declarations sit at namespace scope a
// capture-default is not permitted there - confirm the enclosing scope.
//
// Phase 71 (ut_swapf_empty_to_empty): two default-constructed tries are checked
// to be empty, exchanged with std::swap, and checked to be empty again.
// The #endif closes a conditional opened before this fragment; the body of
// phase 72 (ut_swapf_pop_to_empty) continues beyond it.
using mapped_t = int; using trie_t = trie<mapped_t>; using value_t = trie_t::value_type; using key_t = trie_t::key_type; using map_t = std::map<key_t, mapped_t>; static auto kt = [](mapped_t k)->key_t { return boost::lexical_cast<key_t>(k); }; static auto mv = [&](mapped_t x)->value_t { return std::make_pair(kt(x), x); }; static auto vt = [&](mapped_t k, mapped_t v)->value_t { return std::make_pair(kt(k), v); }; /** Test trie<T>::swap() */ BEGIN_PHASE(71, ut_swapf_empty_to_empty ) { trie_t a, b; BOOST_CHECK( a.empty() ); BOOST_CHECK( b.empty() ); std::swap(a, b); BOOST_CHECK( a.empty() ); BOOST_CHECK( b.empty() ); } END_PHASE() #endif #if TEST_PHASE >= 72 BEGIN_PHASE(72, ut_swapf_pop_to_empty ) {
// Opens the file named by `fpath` into the trie `t`.
// Returns true when t.open() reports 0, false for any other return value.
bool loadIndex (trie_t & t, tstring const & fpath)
{
  if (t.open(fpath.c_str()) != 0)
  {
    return false;
  }
  return true;
}