Example #1
0
/// Copies every key of `raw` whose stored count reaches `cutoff` into
/// `filtered`.
///
/// Each key written to `filtered` is given, as its value, the number of keys
/// `filtered` held immediately before that write (so an initially empty
/// `filtered` ends up with ids 0, 1, 2, ... in traversal order).
///
/// @param raw       trie traversed with begin()/next(); values are compared
///                  against `cutoff`.
/// @param filtered  trie receiving the surviving keys.
/// @param cutoff    minimum count (inclusive) a key needs to be kept.
void LccrFeaturizer::FilterFeatureWithCount(trie_t &raw, trie_t &filtered,
                                            int cutoff) {
  size_t from(0), p(0);
  // Scratch space for the key text. It starts at MAXFEATLEN bytes but grows on
  // demand: nothing visible here bounds the length of the keys held in `raw`,
  // so a fixed-size buffer could be overrun by suffix().
  std::vector<char> buffer(LccrFeaturizer::MAXFEATLEN, '\0');
  for (int featcnt = raw.begin(from, p); featcnt != trie_t::CEDAR_NO_PATH;
       featcnt = raw.next(from, p)) {
    if (featcnt < cutoff) {
      continue;
    }
    // suffix() is asked for p characters; one extra byte is reserved because
    // cedar's suffix() presumably also writes a trailing NUL -- TODO confirm.
    if (buffer.size() < p + 1) {
      buffer.resize(p + 1, '\0');
    }
    raw.suffix(buffer.data(), p, from);
    // Read the id before the update so it reflects the key count prior to
    // this write.
    const int key = filtered.num_keys();
    filtered.update(buffer.data(), p, 0) = key;
  }
}
Example #2
0
/// Adds 1 to the count stored in `trie` for every feature string in
/// `textfeats`.
///
/// @param trie       trie updated in place, keyed by the feature text.
/// @param textfeats  outer vector of feature lists; every inner string is
///                   counted once per occurrence.
/// @return always true.
bool LccrFeaturizer::AccumulateFeatureCountFromLine(
    trie_t &trie, const std::vector<std::vector<std::string>> &textfeats) {
  for (const std::vector<std::string> &posfeat : textfeats) {
    for (const std::string &feat : posfeat) {
      // NOTE(review): cedar's update() is understood to reject a zero-length
      // key by terminating the process -- confirm against the bundled
      // cedar.h. An empty feature carries no information, so it is skipped
      // rather than handed to the trie.
      if (feat.empty()) {
        continue;
      }
      trie.update(feat.c_str(), feat.size(), 1);
    }
  }
  return true;
}
Example #3
0
/**
 * Writes trie `t` to the file named by `fpath`.
 *
 * @return true when t.save() reports status 0, false for any other status.
 */
bool saveIndex (trie_t const & t, tstring const & fpath)
{
	// Any non-zero status from save() is treated as an error.
	bool const saved = (t.save(fpath.c_str()) == 0);
	return saved;
}
Example #4
0
		float calc_entropy(const string& word
				, const trie_result_t& res
				, trie_t& entro_trie
				, const uint32_t total_freq)
		{
			char suffix[256];
			hash_t		rlt_hash;
			trie_result_list rlts;
			size_t rlts_len = entro_trie.commonPrefixPredict(word.c_str(), rlts, NUM_RESULT);
			float entropy = 0.0;
			if(rlts_len > 1) {
				int entropy_freq = 0;
				// Ignore itself
				trie_result_list::iterator it = rlts.begin();
				for(it++; it != rlts.end(); it++) {
					assert(it->length < 250);
					entro_trie.suffix(suffix, it->length, it->id);
					string tmp_s(suffix);
					string tmp(tmp_s.begin(), tmp_s.begin()+2);
					//fprintf(glog.fd, "%s %s %s %d\n",word.c_str(), suffix, tmp.c_str(), it->value);

					hash_t::iterator it_map = rlt_hash.find(tmp);
					if (it_map == rlt_hash.end()) {
						rlt_hash[tmp] = it->value;
					} else {
						it_map->second += it->value;
					}

					entropy_freq += it->value;
				}
				for(hash_t::iterator map_it = rlt_hash.begin();
						map_it != rlt_hash.end();
						map_it++) {
					float p = static_cast<float>(map_it->second) / entropy_freq;
					entropy -= p * log(p);
					//fprintf(glog.fd, "entropy %s\t%d\n", map_it->first.c_str(), map_it->second);
				}
			} else {
				entropy = static_cast<float>(res.value)/20.0;
			}

			return entropy;
		}
Example #5
0
/**
 * Builds the file-name index for every search location in `cfg`.
 *
 * For each file reported by SearchDirectory() the lower-cased file name is
 * used as the trie key and maps to an index into the property table; further
 * files with the same name (compared through `icasecompare`) only append their
 * path to the existing entry.
 *
 * @param trie   updated incrementally while directories are scanned.
 * @param props  assigned only after all locations were processed.
 * @param abort  checked before each location and also handed to
 *               SearchDirectory(); when it is set at the start of a location
 *               the function returns immediately.
 * @param cfg    supplies the list of locations (`m_locations`).
 *
 * Note: on the early return and on a caught std::regex_error, `props` keeps
 * its previous content while `trie` may already hold new entries.
 */
void makeIndex (trie_t & trie, props_t & props, bool & abort, Config const & cfg)
{
	try
	{
		props_t tmp_props;
		tmp_props.reserve(1024);
		// file name -> index into tmp_props
		std::map<tstring, int, icasecompare> tmp_propmap;
		for (SearchLocationInfo const & info : cfg.m_locations)
		{
			if (abort)
				return;

			SearchDirectory(
				  info.m_dir_path, info.m_includes, info.m_excludes, info.m_recursive, info.m_follow_symlinks
				, TEXT("")
				, [] (tstring const & fname, tstring const & cmp) { return true; }
				, [&trie, &tmp_props, &tmp_propmap] (tstring const & fname, tstring const & fpath)
					 {
						// `auto` keeps the iterator type in sync with the map's
						// comparator; naming std::map<tstring, int>::iterator here
						// only compiles where iterators ignore the comparator.
						auto it = tmp_propmap.find(fname);
						if (it == tmp_propmap.end())
						{
							// First file with this name: create its entry and key.
							tstring fname_lwr = fname;
							boost::algorithm::to_lower(fname_lwr);
							tmp_props.push_back(Props(fname_lwr, fpath));
							trie_t::result_type const id = static_cast<trie_t::result_type>(tmp_props.size() - 1);
							tmp_propmap[fname_lwr] = id;
							//dbg_printf("insert: fname=%s fpath=%s idx=%i\n", fname_lwr.c_str(), fpath.c_str(), id);
							trie.update(fname_lwr.c_str(), fname_lwr.length(), id);
						}
						else
						{
							// Name already indexed: remember the additional path.
							tmp_props[it->second].m_fpath.push_back(fpath);
							//dbg_printf("update: fname=%s fpath=%s idx=%i\n", fname.c_str(), fpath.c_str(), it->second);
						}
					 }
				, abort);
		}
		props = tmp_props;
	}
	catch (std::regex_error const & e)
	{
		dbg_printf("Exception caught: %s", e.what());
	}

	//printf("keys: %ld\n", trie.num_keys ());
	//printf("size: %ld\n", trie.size ());
}
// Type aliases shared by the trie unit tests.
using mapped_t = int;
using trie_t = trie<mapped_t>;
using value_t = trie_t::value_type;
using key_t = trie_t::key_type;
using map_t = std::map<key_t, mapped_t>;


// Helpers for building test data. The lambdas take no captures: `kt` has
// static storage duration and can be named directly, and a capture-default
// such as [&] is not permitted on a lambda outside block scope.

// Converts a mapped value into a key through boost::lexical_cast.
static auto kt = [](mapped_t k)->key_t { return boost::lexical_cast<key_t>(k); };
// Builds the entry whose key is derived from its own value.
static auto mv = [](mapped_t x)->value_t { return std::make_pair(kt(x), x); };
// Builds the entry with the key derived from `k` and the value `v`.
static auto vt = [](mapped_t k, mapped_t v)->value_t { return std::make_pair(kt(k), v); };



/** Test trie<T>::swap() */
BEGIN_PHASE(71, ut_swapf_empty_to_empty ) {
	// Two default-constructed tries.
	trie_t lhs;
	trie_t rhs;

	// Precondition: both report empty.
	BOOST_CHECK( lhs.empty() );
	BOOST_CHECK( rhs.empty() );

	// Exchange them through std::swap.
	std::swap(lhs, rhs);

	// Swapping two empty tries must leave both empty.
	BOOST_CHECK( lhs.empty() );
	BOOST_CHECK( rhs.empty() );
} END_PHASE()
#endif



#if TEST_PHASE >= 72
BEGIN_PHASE(72, ut_swapf_pop_to_empty ) {
Example #7
0
/**
 * Opens the index file named by `fpath` into trie `t`.
 *
 * @return true when t.open() reports status 0, false for any other status.
 */
bool loadIndex (trie_t & t, tstring const & fpath)
{
	if (t.open(fpath.c_str()) != 0)
	{
		// Any non-zero status counts as a failed load.
		return false;
	}
	return true;
}