bool NE::open(const Param &param) {
  close();
  if (action_mode() == PARSING_MODE) {
    // In parsing mode, load the CRF++ model given by the "ne-model" parameter.
    const std::string filename = param.get<std::string>("ne-model");
    std::vector<const char*> argv;
    argv.push_back(param.program_name());
    argv.push_back("-m");
    argv.push_back(filename.c_str());
    model_ = crfpp_model_new(argv.size(),
                             const_cast<char **>(&argv[0]));
    // CHECK_FALSE(tagger_) << crfpp_strerror(tagger_);
    // CHECK_FALSE(crfpp_ysize(tagger_) >= 2);
    // CHECK_FALSE(crfpp_xsize(tagger_) == 3);
    // for (size_t i = 0; i < crfpp_ysize(tagger_); ++i) {
    //   const char *p = crfpp_yname(tagger_, i);
    //   CHECK_FALSE(p && (p[0] == 'B' || p[0] == 'I' || p[0] == 'O'));
    // }
  }

  // Feature prefixes (in UTF-8) of morphemes that are candidates for
  // NE composition, one per dictionary flavor.
  // "名詞,数," ("noun,number,")
  ne_composite_ipa_ = "\xE5\x90\x8D\xE8\xA9\x9E,\xE6\x95\xB0,";
  // "名詞,数詞" ("noun,numeral")
  ne_composite_juman_ = "\xE5\x90\x8D\xE8\xA9\x9E,\xE6\x95\xB0\xE8\xA9\x9E,";
  // "名詞,数詞" ("noun,numeral")
  ne_composite_unidic_ = "\xE5\x90\x8D\xE8\xA9\x9E,\xE6\x95\xB0\xE8\xA9\x9E,";

  // Convert the UTF-8 prefixes to the charset the tagger runs with.
  Iconv iconv;
  iconv.open(UTF8, charset());
  CHECK_DIE(iconv.convert(&ne_composite_ipa_));
  CHECK_DIE(iconv.convert(&ne_composite_juman_));
  CHECK_DIE(iconv.convert(&ne_composite_unidic_));
  CHECK_FALSE(!ne_composite_ipa_.empty());
  CHECK_FALSE(!ne_composite_juman_.empty());
  CHECK_FALSE(!ne_composite_unidic_.empty());

  return true;
}
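// A minimal sketch (not part of the original source) of how the
// ne_composite_* prefixes initialized above would typically be consumed:
// a morpheme counts as a composition candidate when its feature string
// starts with the prefix of the active dictionary.  The helper name below
// is hypothetical.
static bool has_composite_prefix(const std::string &feature,
                                 const std::string &prefix) {
  // True when `feature` begins with `prefix`, e.g. "名詞,数," for IPADIC.
  return feature.compare(0, prefix.size(), prefix) == 0;
}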
bool Dictionary::compile(const Param &param,
                         const std::vector<std::string> &dics,
                         const char *matrix_file,
                         const char *matrix_bin_file,
                         const char *left_id_file,
                         const char *right_id_file,
                         const char *rewrite_file,
                         const char *pos_id_file,
                         const char *output) {
  Connector matrix;
  scoped_ptr<DictionaryRewriter> rewrite(0);
  scoped_ptr<POSIDGenerator> posid(0);
  scoped_ptr<ContextID> cid(0);
  scoped_ptr<Writer> writer(0);
  scoped_ptr<StringBuffer> os(0);
  Node node;

  std::vector<std::pair<std::string, Token*> > dic;
  size_t offset = 0;
  unsigned int lexsize = 0;
  std::string w, feature, ufeature, lfeature, rfeature, fbuf, key;
  int lid, rid, cost;

  const std::string from = param.get<std::string>("dictionary-charset");
  const std::string to = param.get<std::string>("charset");
  const bool wakati = param.get<bool>("wakati");
  const int type = param.get<int>("type");
  const std::string node_format = param.get<std::string>("node-format");

  // for backward compatibility
  std::string config_charset = param.get<std::string>("config-charset");
  if (config_charset.empty()) config_charset = from;

  CHECK_DIE(!from.empty()) << "input dictionary charset is empty";
  CHECK_DIE(!to.empty())   << "output dictionary charset is empty";

  Iconv iconv;
  CHECK_DIE(iconv.open(from.c_str(), to.c_str()))
      << "iconv_open() failed with from=" << from << " to=" << to;

  Iconv config_iconv;
  CHECK_DIE(config_iconv.open(config_charset.c_str(), from.c_str()))
      << "iconv_open() failed with from=" << config_charset << " to=" << from;

  if (!node_format.empty()) {
    writer.reset(new Writer);
    os.reset(new StringBuffer);
    memset(&node, 0, sizeof(node));
  }

  if (!matrix.openText(matrix_file) && !matrix.open(matrix_bin_file)) {
    matrix.set_left_size(1);
    matrix.set_right_size(1);
  }

  posid.reset(new POSIDGenerator);
  posid->open(pos_id_file, &config_iconv);

  std::istringstream iss(UNK_DEF_DEFAULT);

  for (size_t i = 0; i < dics.size(); ++i) {
    std::ifstream ifs(dics[i].c_str());
    std::istream *is = &ifs;
    if (!ifs) {
      if (type == MECAB_UNK_DIC) {
        std::cerr << dics[i]
                  << " is not found. minimum setting is used." << std::endl;
        is = &iss;
      } else {
        CHECK_DIE(ifs) << "no such file or directory: " << dics[i];
      }
    }

    std::cout << "reading " << dics[i] << " ... ";

    char line[BUF_SIZE];
    size_t num = 0;

    while (is->getline(line, sizeof(line))) {
      char *col[8];
      const size_t n = tokenizeCSV(line, col, 5);
      CHECK_DIE(n == 5) << "format error: " << line;

      w = col[0];
      lid = std::atoi(col[1]);
      rid = std::atoi(col[2]);
      cost = std::atoi(col[3]);
      feature = col[4];
      int pid = posid->id(feature.c_str());

      // If left/right context ids are not given, rewrite the feature and
      // look them up in the context-id files.
      if (lid < 0 || rid < 0) {
        if (!rewrite.get()) {
          rewrite.reset(new DictionaryRewriter);
          rewrite->open(rewrite_file, &config_iconv);
        }
        CHECK_DIE(rewrite->rewrite(feature, &ufeature, &lfeature, &rfeature))
            << "rewrite failed: " << feature;
        if (!cid.get()) {
          cid.reset(new ContextID);
          cid->open(left_id_file, right_id_file, &config_iconv);
          CHECK_DIE(cid->left_size() == matrix.left_size() &&
                    cid->right_size() == matrix.right_size())
              << "Context ID files (" << left_id_file << " or "
              << right_id_file << ") may be broken";
        }
        lid = cid->lid(lfeature.c_str());
        rid = cid->rid(rfeature.c_str());
      }

      CHECK_DIE(lid >= 0 && rid >= 0 && matrix.is_valid(lid, rid))
          << "invalid ids are found lid=" << lid << " rid=" << rid;

      if (w.empty()) {
        std::cerr << "empty word is found, discard this line" << std::endl;
        continue;
      }

      if (!iconv.convert(&feature)) {
        std::cerr << "iconv conversion failed. skip this entry" << std::endl;
        continue;
      }

      if (type != MECAB_UNK_DIC && !iconv.convert(&w)) {
        std::cerr << "iconv conversion failed. skip this entry" << std::endl;
        continue;
      }

      // Expand the feature through the user-defined node format, if any.
      if (!node_format.empty()) {
        node.surface = w.c_str();
        node.feature = feature.c_str();
        node.length  = w.size();
        node.rlength = w.size();
        node.posid   = pid;
        node.stat    = MECAB_NOR_NODE;
        CHECK_DIE(os.get());
        CHECK_DIE(writer.get());
        os->clear();
        CHECK_DIE(writer->writeNode(&*os, node_format.c_str(), w.c_str(), &node))
            << "conversion error: " << feature << " with " << node_format;
        *os << '\0';
        feature = os->str();
      }

      key.clear();
      if (!wakati) key = feature + '\0';

      Token *token = new Token;
      token->lcAttr   = lid;
      token->rcAttr   = rid;
      token->posid    = pid;
      token->wcost    = cost;
      token->feature  = offset;
      token->compound = 0;
      dic.push_back(std::make_pair(w, token));

      // append to output buffer
      if (!wakati) fbuf.append(key.data(), key.size());
      offset += key.size();

      ++num;
      ++lexsize;
    }

    std::cout << num << std::endl;
  }

  if (wakati) fbuf.append("\0", 1);

  std::sort(dic.begin(), dic.end());

  // Group entries sharing the same key: each double-array value packs the
  // number of tokens for that key into its lower 8 bits and the index of the
  // first such token into the upper bits.
  size_t bsize = 0;
  size_t idx = 0;
  std::string prev;
  std::vector<const char *> str;
  std::vector<size_t> len;
  std::vector<Darts::DoubleArray::result_type> val;

  for (size_t i = 0; i < dic.size(); ++i) {
    if (i != 0 && prev != dic[i].first) {
      str.push_back(dic[idx].first.c_str());
      len.push_back(dic[idx].first.size());
      val.push_back(bsize + (idx << 8));
      bsize = 1;
      idx = i;
    } else {
      ++bsize;
    }
    prev = dic[i].first;
  }
  str.push_back(dic[idx].first.c_str());
  len.push_back(dic[idx].first.size());
  val.push_back(bsize + (idx << 8));

  CHECK_DIE(str.size() == len.size());
  CHECK_DIE(str.size() == val.size());

  Darts::DoubleArray da;
  CHECK_DIE(da.build(str.size(), const_cast<char **>(&str[0]),
                     &len[0], &val[0], &progress_bar_darts) == 0)
      << "unknown error in building double-array";

  // Serialize the tokens in dictionary order.
  std::string tbuf;
  for (size_t i = 0; i < dic.size(); ++i) {
    tbuf.append(reinterpret_cast<const char*>(dic[i].second), sizeof(Token));
    delete dic[i].second;
  }
  dic.clear();

  // needs to be 8byte(64bit) aligned
  while (tbuf.size() % 8 != 0) {
    Token dummy;
    memset(&dummy, 0, sizeof(Token));
    tbuf.append(reinterpret_cast<const char*>(&dummy), sizeof(Token));
  }

  unsigned int dummy = 0;
  unsigned int lsize = matrix.left_size();
  unsigned int rsize = matrix.right_size();
  unsigned int dsize = da.unit_size() * da.size();
  unsigned int tsize = tbuf.size();
  unsigned int fsize = fbuf.size();
  unsigned int version = DIC_VERSION;

  char charset[32];
  std::fill(charset, charset + sizeof(charset), '\0');
  std::strncpy(charset, to.c_str(), 31);

  std::ofstream bofs(output, std::ios::binary | std::ios::out);
  CHECK_DIE(bofs) << "permission denied: " << output;

  unsigned int magic = 0;

  // needs to be 64bit aligned
  // 10*32 = 64*5
  bofs.write(reinterpret_cast<const char *>(&magic),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&version), sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&type),    sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&lexsize), sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&lsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&rsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&dsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&tsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&fsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&dummy),   sizeof(unsigned int));

  // 32 * 8 = 64 * 4
  bofs.write(reinterpret_cast<const char *>(charset), sizeof(charset));

  bofs.write(reinterpret_cast<const char*>(da.array()),
             da.unit_size() * da.size());
  bofs.write(tbuf.data(), tbuf.size());
  bofs.write(fbuf.data(), fbuf.size());

  // save magic id
  magic = static_cast<unsigned int>(bofs.tellp());
  magic ^= DictionaryMagicID;
  bofs.seekp(0);
  bofs.write(reinterpret_cast<const char *>(&magic), sizeof(unsigned int));

  bofs.close();

  return true;
}
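// A minimal sketch (not part of the original source) of the on-disk layout
// produced by Dictionary::compile() above, written as a plain struct for
// illustration; the struct and field names are hypothetical.  The header is
// followed by the double-array image (dsize bytes), the token array
// (tsize bytes) and the feature-string pool (fsize bytes).
struct DictionaryHeaderSketch {  // 10 * 4 + 32 = 72 bytes, 64-bit aligned
  unsigned int magic;            // (total file size) ^ DictionaryMagicID,
                                 // patched in afterwards via seekp(0)
  unsigned int version;          // DIC_VERSION
  unsigned int type;             // dictionary type, e.g. MECAB_UNK_DIC
  unsigned int lexsize;          // number of lexical entries
  unsigned int lsize;            // left-context size  (matrix.left_size())
  unsigned int rsize;            // right-context size (matrix.right_size())
  unsigned int dsize;            // double-array size in bytes
  unsigned int tsize;            // token array size in bytes
  unsigned int fsize;            // feature pool size in bytes
  unsigned int dummy;            // padding to keep the header 64-bit aligned
  char charset[32];              // output charset, NUL-padded
};
// A loader can therefore sanity-check a dictionary by verifying that
//   header.magic ^ DictionaryMagicID == size of the file in bytes.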