// Reads one line of UCS-4 text from `is` and appends its characters to `ustr`.
//
// Characters are read as raw 4-byte units copied directly into a ucs4_t.
// Reading stops at the first '\n' unit (which is consumed but not stored),
// at end of input, or on any read failure. `ustr` is appended to, not cleared.
void ucs4getline(std::istream& is, ucs4string& ustr)
{
    while (!is.eof()) {
        ucs4_t ch = ucs4_t('\n');
        // read() fails when fewer than 4 bytes are available. In that case
        // `ch` may have been partially overwritten, so it must not be stored:
        // stop here instead of appending a garbage character built from a
        // truncated trailing unit.
        if (!is.read(reinterpret_cast<char *>(&ch), 4))
            break;
        if (ch == ucs4_t('\n'))
            break;
        ustr.push_back(ch);
    }
}
// Recursively enumerates every way of cutting `to_seg` into consecutive
// pieces and records the best candidate in `data`.
//
//   base     - segmentation text accumulated so far (pieces joined by ' ')
//   base_rr  - product of GetRateReciprocal() over the pieces already in `base`
//   to_seg   - the part of the input that still has to be segmented
//   data     - running best result: m_rr holds the smallest product seen,
//              m_res the matching segmentation text
//
// A candidate replaces the stored result when its product is not greater
// than data.m_rr. Every piece, including the first, is preceded by a ' '.
void Segmentation::DoSegment(ucs4string base, double base_rr, ucs4string to_seg, SegmentData& data)
{
    // First explore all splits where a proper prefix of `to_seg` becomes the
    // next piece and the remainder is segmented recursively.
    const size_t total_len = to_seg.size();
    for (size_t cut = 1; cut < total_len; ++cut) {
        ucs4string head = to_seg.substr(0, cut);
        ucs4string extended_base = base + ucs4_t(' ') + head;
        double extended_rr = base_rr * GetRateReciprocal(head);
        DoSegment(extended_base, extended_rr, to_seg.substr(cut), data);
    }

    // Then consider taking the whole of `to_seg` as a single final piece.
    double whole_rr = base_rr * GetRateReciprocal(to_seg);
    if (whole_rr > data.m_rr)
        return;

    data.m_rr = whole_rr;
    data.m_res = base + ucs4_t(' ') + to_seg;
}
void Segmentation::LoadDict(std::istream& dict_is) { unsigned int sum = 0; unsigned int word_max_len = 0; typedef std::vector<ucs4string> UStrVec; while (!dict_is.eof()) { ucs4string s; ucs4getline(dict_is, s); UStrVec usvec; ustrslipt(s, ucs4_t(' '), usvec); if (usvec.size() != 2) continue; ucs4string& word = usvec[0]; ucs4_t cnt = ustrtou(usvec[1]); m_dict_map[word] = cnt; sum += cnt; if (word_max_len < word.size()) word_max_len = word.size(); } m_word_sum = sum * 1.0; m_max_rr = pow(m_word_sum, word_max_len+1.0); }
// Converts the NUL-terminated wide string `cstr` into UCS-4 and returns a
// pointer to the converted, NUL-terminated text.
//
// The first (unnamed, narrow-string) argument is ignored. The result is
// stored in `inter_str`, which is declared outside this function: the
// returned pointer refers to that buffer and its contents are replaced by
// the next call. `cstr` must not be null.
static ucs4_t const *pick(char const *, wchar_t const *cstr)
{
    inter_str.clear();
    for (wchar_t const *cursor = cstr; *cursor != 0; ++cursor)
        inter_str.push_back(ucs4_t(*cursor));
    return inter_str.c_str();
}
int main(int argc, char *argv[]) { if (argc != 3) { std::cerr << "Usage:" << argv[0] << " <ucs4-phrase-file> <ucs4-dictionary>" << std::endl; exit(1); } std::ifstream fin(argv[1], std::ios::binary); if (!fin.is_open()) { std::cerr << "Failed to open " << argv[1] << " for read." << std::endl; exit(2); } std::ofstream fout(argv[2], std::ios::binary); if (!fout.is_open()) { std::cerr << "Failed to open " << argv[2] << " for write." << std::endl; exit(3); } DictionaryGenerator dg; while (!fin.eof()) { ucs4string s; ucs4getline(fin, s); dg.SetmentPhrase(s); } DictMap::const_iterator it = dg.GetDictionary().begin(); DictMap::const_iterator itend = dg.GetDictionary().end(); for (; it!=itend; ++it) { const ucs4string ustr=it->first; unsigned int cnt=it->second; if (cnt==1 && ustr.size()!=1) continue; ucs4putstr(fout, ustr); ucs4putstr(fout, stdtoustr(" ")); ucs4putstr(fout, utoustr(cnt)); ucs4putch(fout, ucs4_t('\n')); } return 0; }
// Decodes hexadecimal text given as Unicode code points into raw bytes.
//
//   cs  - output; the decoded bytes are appended here only on success
//   ucs - input code points: ASCII hex digits (0-9, a-f, A-F), two per byte
//
// Characters for which u_isUWhiteSpace() is true are skipped, but only
// between complete bytes; whitespace splitting the two digits of a byte is
// rejected. Returns false, leaving `cs` untouched, if any other character is
// encountered, if no digits are present, or if the digit count is odd.
bool GetRawBytesFromHexUnicodeText(std::vector<char>& cs, const std::vector<ucs4_t>& ucs)
{
    std::vector<int> tmp_hex;
    BOOST_FOREACH(ucs4_t u, ucs) {
        if (u_isUWhiteSpace(u) && tmp_hex.size()%2==0)
            continue;

        // Classify with explicit ASCII range checks. isxdigit() must not be
        // used here: its argument has to be representable as unsigned char
        // (or be EOF), and `u` can be an arbitrary code point, which makes
        // the call undefined behaviour.
        int hex;
        if (u >= ucs4_t('0') && u <= ucs4_t('9'))
            hex = int(u - ucs4_t('0'));
        else if (u >= ucs4_t('a') && u <= ucs4_t('f'))
            hex = int(u - ucs4_t('a')) + 10;
        else if (u >= ucs4_t('A') && u <= ucs4_t('F'))
            hex = int(u - ucs4_t('A')) + 10;
        else
            return false;
        tmp_hex.push_back(hex);
    }

    if (tmp_hex.empty() || tmp_hex.size() % 2 != 0)
        return false;

    // Combine each pair of nibbles (high first) into one byte.
    for (size_t i=0; i<tmp_hex.size(); i+=2)
        cs.push_back(char((tmp_hex[i]<<4) | tmp_hex[i+1]));
    return true;
}