/**
 * Copy the sufficiently frequent entries of one trie into another.
 *
 * Walks every entry of `raw` (begin()/next() traversal). For each entry whose
 * stored value is at least `cutoff`, the key text is rebuilt with suffix() and
 * inserted into `filtered`; the value stored for it is the number of keys
 * `filtered` held immediately before the insertion.
 *
 * @param raw       trie to read from; its stored values are compared to cutoff
 * @param filtered  trie receiving the surviving keys
 * @param cutoff    minimum stored value an entry needs in order to be kept
 *
 * NOTE(review): the scratch buffer holds MAXFEATLEN bytes and the key length is
 * not checked against it here - presumably every key in `raw` is shorter than
 * MAXFEATLEN; confirm against the code that fills `raw`.
 */
void LccrFeaturizer::FilterFeatureWithCount(trie_t &raw, trie_t &filtered, int cutoff) {
  namespace sig_detail = boost::signals2::detail;
  typedef sig_detail::auto_buffer<char, sig_detail::store_n_bytes<LccrFeaturizer::MAXFEATLEN> > key_buffer_t;

  // Scratch space into which suffix() writes the key of the current entry.
  key_buffer_t key_buf(LccrFeaturizer::MAXFEATLEN, '\0');

  // Traversal cursor shared by begin()/next()/suffix(): node position and key length.
  size_t node = 0;
  size_t key_len = 0;

  int count = raw.begin(node, key_len);
  while (count != trie_t::CEDAR_NO_PATH) {
    if (count >= cutoff) {
      raw.suffix(key_buf.data(), key_len, node);
      // Read the key count BEFORE inserting so the new entry gets the old size as its id.
      const int next_id = filtered.num_keys();
      filtered.update(key_buf.data(), key_len, 0) = next_id;
    }
    count = raw.next(node, key_len);
  }
}
float calc_entropy(const string& word , const trie_result_t& res , trie_t& entro_trie , const uint32_t total_freq) { char suffix[256]; hash_t rlt_hash; trie_result_list rlts; size_t rlts_len = entro_trie.commonPrefixPredict(word.c_str(), rlts, NUM_RESULT); float entropy = 0.0; if(rlts_len > 1) { int entropy_freq = 0; // Ignore itself trie_result_list::iterator it = rlts.begin(); for(it++; it != rlts.end(); it++) { assert(it->length < 250); entro_trie.suffix(suffix, it->length, it->id); string tmp_s(suffix); string tmp(tmp_s.begin(), tmp_s.begin()+2); //fprintf(glog.fd, "%s %s %s %d\n",word.c_str(), suffix, tmp.c_str(), it->value); hash_t::iterator it_map = rlt_hash.find(tmp); if (it_map == rlt_hash.end()) { rlt_hash[tmp] = it->value; } else { it_map->second += it->value; } entropy_freq += it->value; } for(hash_t::iterator map_it = rlt_hash.begin(); map_it != rlt_hash.end(); map_it++) { float p = static_cast<float>(map_it->second) / entropy_freq; entropy -= p * log(p); //fprintf(glog.fd, "entropy %s\t%d\n", map_it->first.c_str(), map_it->second); } } else { entropy = static_cast<float>(res.value)/20.0; } return entropy; }