예제 #1
0
/**
 * Reads one line of UCS-4 text from a binary stream and appends it to ustr.
 *
 * Characters are consumed as raw 4-byte units copied straight into a ucs4_t,
 * i.e. in the byte order of the host (assumes sizeof(ucs4_t) >= 4 -- TODO
 * confirm against the typedef). Reading stops after a '\n' unit, which is
 * consumed but not stored, or when the stream cannot supply a complete unit.
 *
 * @param is    source stream, expected to be opened in binary mode
 * @param ustr  receives the characters of the line; existing content is kept
 *              and the new characters are appended
 */
void ucs4getline(std::istream& is, ucs4string& ustr)
{
	while (!is.eof())
	{
		ucs4_t ch = 0;

		// A failed or short read (fewer than 4 bytes left) must end the line:
		// ch would otherwise hold a partially overwritten value that is not a
		// real character from the input.
		if (!is.read(reinterpret_cast<char *>(&ch), 4))
			break;

		if (ch == ucs4_t('\n'))
			break;
		ustr.push_back(ch);
	}
}
예제 #2
0
/**
 * Recursively enumerates every way of cutting to_seg into pieces and records
 * the candidate with the smallest accumulated rate reciprocal in data.
 *
 * For each cut position the prefix is appended to base (separated by a space)
 * and the remainder is segmented recursively. After all cuts were explored,
 * to_seg taken as a single piece is evaluated as well.
 *
 * @param base     pieces chosen so far, each preceded by a space
 * @param base_rr  product of GetRateReciprocal() over the pieces in base
 * @param to_seg   text that still has to be segmented
 * @param data     best result so far; m_rr and m_res are overwritten when the
 *                 current candidate's value is not greater than data.m_rr
 */
void Segmentation::DoSegment(ucs4string base, double base_rr, ucs4string to_seg, SegmentData& data)
{
	// Explore every split "head | tail" with a non-empty head and tail first.
	const size_t total_len = to_seg.size();
	size_t cut = 1;
	while (cut < total_len)
	{
		const ucs4string head = to_seg.substr(0, cut);
		const ucs4string tail = to_seg.substr(cut);
		DoSegment(base + ucs4_t(' ') + head, base_rr * GetRateReciprocal(head), tail, data);
		++cut;
	}

	// Then evaluate the unsplit remainder; on a tie it replaces the stored result.
	const double candidate_rr = base_rr * GetRateReciprocal(to_seg);
	if (candidate_rr > data.m_rr)
		return;

	data.m_res = base + ucs4_t(' ') + to_seg;
	data.m_rr = candidate_rr;
}
예제 #3
0
void Segmentation::LoadDict(std::istream& dict_is)
{
	unsigned int sum = 0;
	unsigned int word_max_len = 0;
	typedef std::vector<ucs4string> UStrVec;
	while (!dict_is.eof())
	{
		ucs4string s;
		ucs4getline(dict_is, s);

		UStrVec usvec;
		ustrslipt(s, ucs4_t(' '), usvec);

		if (usvec.size() != 2)
			continue;

		ucs4string& word = usvec[0];
		ucs4_t cnt = ustrtou(usvec[1]);

		m_dict_map[word] = cnt;
		sum += cnt;

		if (word_max_len < word.size())
			word_max_len = word.size();
	}
	m_word_sum = sum * 1.0;
	m_max_rr = pow(m_word_sum, word_max_len+1.0);
}
예제 #4
0
    // Converts a NUL-terminated wide string to UCS-4 by widening each wchar_t
    // unit to ucs4_t one-for-one; the unnamed narrow-string argument is ignored.
    //
    // The result lives in inter_str, which is cleared and refilled on every
    // call, so the returned pointer refers to that buffer's current content.
    static ucs4_t const *pick(char const *, wchar_t const *cstr)
    {
        inter_str.clear();
        for (wchar_t const *cur = cstr; *cur != 0; ++cur)
            inter_str.push_back(ucs4_t(*cur));

        return inter_str.c_str();
    }
예제 #5
0
int main(int argc, char *argv[])
{
	if (argc != 3)
	{
		std::cerr << "Usage:" << argv[0] << " <ucs4-phrase-file> <ucs4-dictionary>" << std::endl;
		exit(1);
	}

	std::ifstream fin(argv[1], std::ios::binary);
	if (!fin.is_open())
	{
		std::cerr << "Failed to open " << argv[1] << " for read." << std::endl;
		exit(2);
	}
	std::ofstream fout(argv[2], std::ios::binary);
	if (!fout.is_open())
	{
		std::cerr << "Failed to open " << argv[2] << " for write." << std::endl;
		exit(3);
	}

	DictionaryGenerator dg;

	while (!fin.eof())
	{
		ucs4string s;

		ucs4getline(fin, s);

		dg.SetmentPhrase(s);
	}

	DictMap::const_iterator it = dg.GetDictionary().begin();
	DictMap::const_iterator itend = dg.GetDictionary().end();
	for (; it!=itend; ++it)
	{
		const ucs4string ustr=it->first;
		unsigned int cnt=it->second;
		if (cnt==1 && ustr.size()!=1)
			continue;
		ucs4putstr(fout, ustr);
		ucs4putstr(fout, stdtoustr(" "));
		ucs4putstr(fout, utoustr(cnt));
		ucs4putch(fout, ucs4_t('\n'));
	}

	return 0;
}
예제 #6
0
파일: utils.cpp 프로젝트: hltj/wxMEdit
// Converts text made of hexadecimal digit pairs into raw bytes.
//
// cs:  receives the decoded bytes; they are appended, and only when the whole
//      input is valid (cs is left untouched on failure).
// ucs: input code points. Characters for which u_isUWhiteSpace() is true are
//      skipped between byte pairs, but not between the two digits of a pair.
//
// Returns true when the input held at least one byte and consisted solely of
// complete pairs of ASCII hex digits (0-9, a-f, A-F) plus permitted whitespace;
// returns false otherwise.
bool GetRawBytesFromHexUnicodeText(std::vector<char>& cs, const std::vector<ucs4_t>& ucs)
{
	std::vector<int> tmp_hex;

	BOOST_FOREACH(ucs4_t u, ucs)
	{
		if (u_isUWhiteSpace(u) && tmp_hex.size()%2==0)
			continue;

		// Classify with explicit ASCII ranges rather than isxdigit(): passing a
		// code point outside the unsigned char range to isxdigit() is undefined.
		int hex;
		if (u >= ucs4_t('0') && u <= ucs4_t('9'))
		{
			hex = int(u - ucs4_t('0'));
		}
		else
		{
			// Setting bit 0x20 maps 'A'-'F' onto 'a'-'f'; no other code point
			// can land in that range this way.
			const ucs4_t lower = u | 0x20;
			if (lower < ucs4_t('a') || lower > ucs4_t('f'))
				return false;
			hex = int(lower - ucs4_t('a')) + 10;
		}
		tmp_hex.push_back(hex);
	}

	// Reject empty input and a dangling half byte.
	if (tmp_hex.empty() || tmp_hex.size() % 2 != 0)
		return false;

	for (size_t i=0; i<tmp_hex.size(); i+=2)
		cs.push_back(char((tmp_hex[i]<<4) | tmp_hex[i+1]));

	return true;
}