예제 #1
0
// Walks every entry of `raw` and copies into `filtered` each key whose
// traversal result (the count returned by begin()/next()) is at least
// `cutoff`. A copied key is stored with the value `filtered.num_keys()` had
// immediately before the insertion, so surviving features get consecutive
// ids in traversal order (assuming `filtered` does not already contain the
// key -- TODO confirm against callers).
//
// raw       trie that is traversed; only read through begin/next/suffix here.
// filtered  trie that receives the surviving keys.
// cutoff    minimum count an entry needs in order to be copied.
void LccrFeaturizer::FilterFeatureWithCount(trie_t &raw, trie_t &filtered,
                                            int cutoff) {
  namespace sig = boost::signals2::detail;
  using KeyBuffer =
      sig::auto_buffer<char, sig::store_n_bytes<LccrFeaturizer::MAXFEATLEN>>;

  // Scratch space that receives the text of the key currently visited.
  KeyBuffer keybuf(LccrFeaturizer::MAXFEATLEN, '\0');

  // Traversal cursor: node position and length of the current key.
  size_t node(0), keylen(0);

  int count = raw.begin(node, keylen);
  while (count != trie_t::CEDAR_NO_PATH) {
    if (count >= cutoff) {
      // Recover the key text for the current node into the scratch buffer.
      raw.suffix(keybuf.data(), keylen, node);
      // Read the id before inserting: the insertion itself may change
      // num_keys().
      const int next_id = filtered.num_keys();
      filtered.update(keybuf.data(), keylen, 0) = next_id;
    }
    count = raw.next(node, keylen);
  }
}
예제 #2
0
		float calc_entropy(const string& word
				, const trie_result_t& res
				, trie_t& entro_trie
				, const uint32_t total_freq)
		{
			char suffix[256];
			hash_t		rlt_hash;
			trie_result_list rlts;
			size_t rlts_len = entro_trie.commonPrefixPredict(word.c_str(), rlts, NUM_RESULT);
			float entropy = 0.0;
			if(rlts_len > 1) {
				int entropy_freq = 0;
				// Ignore itself
				trie_result_list::iterator it = rlts.begin();
				for(it++; it != rlts.end(); it++) {
					assert(it->length < 250);
					entro_trie.suffix(suffix, it->length, it->id);
					string tmp_s(suffix);
					string tmp(tmp_s.begin(), tmp_s.begin()+2);
					//fprintf(glog.fd, "%s %s %s %d\n",word.c_str(), suffix, tmp.c_str(), it->value);

					hash_t::iterator it_map = rlt_hash.find(tmp);
					if (it_map == rlt_hash.end()) {
						rlt_hash[tmp] = it->value;
					} else {
						it_map->second += it->value;
					}

					entropy_freq += it->value;
				}
				for(hash_t::iterator map_it = rlt_hash.begin();
						map_it != rlt_hash.end();
						map_it++) {
					float p = static_cast<float>(map_it->second) / entropy_freq;
					entropy -= p * log(p);
					//fprintf(glog.fd, "entropy %s\t%d\n", map_it->first.c_str(), map_it->second);
				}
			} else {
				entropy = static_cast<float>(res.value)/20.0;
			}

			return entropy;
		}