// Locates the MeCab resource file, loads it into |param|, and then loads the
// dictionary resource file (DICRC) found in the configured "dicdir".
//
// The resource file is looked up in this order; the first hit wins:
//   1. the "rcfile" value already stored in |param|
//   2. $HOME/.mecabrc, if it can be opened              (HAVE_GETENV only)
//   3. the MECABRC environment variable                  (HAVE_GETENV only)
//   4. native Windows only: MECABRC read through the wide-character API,
//      the "mecabrc" value under HKLM\software\mecab, the same value under
//      HKCU\software\mecab, and finally a file named "mecabrc" in the
//      directory of the module identified by DllInstance
//   5. MECAB_DEFAULT_RC
//
// "$(rcpath)" inside "dicdir" is replaced by the directory of the resource
// file, and the expanded "dicdir" is written back into |param|.
//
// Returns false when either the resource file or DICRC cannot be loaded.
bool load_dictionary_resource(Param *param) {
  //debug
  // NOTE(review): trace output on stdout; kept as found.
  std::cout << "[" << __FILE__ << ":" << __LINE__ << "]: "
            << "load_dictionary_resource(Param *param)" << std::endl;
  ///
  std::string rcfile = param->get<std::string>("rcfile");

#ifdef HAVE_GETENV
  if (rcfile.empty()) {
    //debug
    std::cout << "[" << __FILE__ << ":" << __LINE__ << "]: "
              << "rcfile.empty()" << std::endl;
    ///
    const char *homedir = getenv("HOME");
    if (homedir) {
      const std::string s = MeCab::create_filename(std::string(homedir),
                                                   ".mecabrc");
      // Only use ~/.mecabrc when it actually exists and is readable.
      std::ifstream ifs(WPATH(s.c_str()));
      if (ifs) {
        rcfile = s;
      }
    }
  }

  if (rcfile.empty()) {
    const char *rcenv = getenv("MECABRC");
    if (rcenv) {
      rcfile = rcenv;
    }
  }
#endif

#if defined (HAVE_GETENV) && defined(_WIN32) && !defined(__CYGWIN__)
  if (rcfile.empty()) {
    scoped_fixed_array<wchar_t, BUF_SIZE> buf;
    const DWORD len = ::GetEnvironmentVariableW(L"MECABRC",
                                                buf.get(),
                                                buf.size());
    // len >= buf.size() means the value did not fit; 0 means "not set".
    if (len < buf.size() && len > 0) {
      rcfile = WideToUtf8(buf.get());
    }
  }
#endif

#if defined(_WIN32) && !defined(__CYGWIN__)
  scoped_fixed_array<wchar_t, BUF_SIZE> v;

  // Registry lookup: machine-wide setting first, then the per-user one.
  const HKEY roots[] = { HKEY_LOCAL_MACHINE, HKEY_CURRENT_USER };
  for (size_t i = 0;
       i < sizeof(roots) / sizeof(roots[0]) && rcfile.empty(); ++i) {
    HKEY hKey;
    if (::RegOpenKeyExW(roots[i], L"software\\mecab", 0, KEY_READ, &hKey)
        != ERROR_SUCCESS) {
      continue;  // key is absent: nothing to query, nothing to close
    }

    DWORD vt = 0;
    // The byte count is an in/out parameter, so it is reset for every query.
    // One element is held back so the result can always be terminated:
    // REG_SZ data is not guaranteed to carry its trailing NUL.
    DWORD size = static_cast<DWORD>((v.size() - 1) * sizeof(v[0]));
    const LONG status =
        ::RegQueryValueExW(hKey, L"mecabrc", 0, &vt,
                           reinterpret_cast<BYTE *>(v.get()), &size);
    ::RegCloseKey(hKey);

    if (status == ERROR_SUCCESS && vt == REG_SZ) {
      v.get()[size / sizeof(v[0])] = L'\0';
      rcfile = WideToUtf8(v.get());
    }
  }

  // Last resort on Windows: a "mecabrc" file next to the module.
  if (rcfile.empty()) {
    // GetModuleFileNameW takes the buffer size in characters, not bytes.
    const DWORD len = ::GetModuleFileNameW(DllInstance, v.get(),
                                           static_cast<DWORD>(v.size()));
    // len == buffer size means the path was truncated and is unusable.
    if (len != 0 && len < v.size()) {
      scoped_fixed_array<wchar_t, _MAX_DRIVE> drive;
      scoped_fixed_array<wchar_t, _MAX_DIR> dir;
      _wsplitpath(v.get(), drive.get(), dir.get(), NULL, NULL);
      const std::wstring path =
          std::wstring(drive.get()) + std::wstring(dir.get()) + L"mecabrc";
      // (DWORD)-1 is the "invalid attributes" result, i.e. no such file.
      if (::GetFileAttributesW(path.c_str()) != static_cast<DWORD>(-1)) {
        rcfile = WideToUtf8(path);
      }
    }
  }
#endif

  if (rcfile.empty()) {
    rcfile = MECAB_DEFAULT_RC;
  }

  if (!param->load(rcfile.c_str())) {
    return false;
  }

  std::string dicdir = param->get<std::string>("dicdir");
  if (dicdir.empty()) {
    dicdir = ".";  // current
  }

  // After this call |rcfile| holds the directory part only, which is what
  // "$(rcpath)" stands for.
  remove_filename(&rcfile);
  replace_string(&dicdir, "$(rcpath)", rcfile);
  param->set<std::string>("dicdir", dicdir, true);

  dicdir = create_filename(dicdir, DICRC);
  if (!param->load(dicdir.c_str())) {
    return false;
  }

  return true;
}
bool Encoder::learn(const char *templfile, const char *trainfile, const char *modelfile, bool textmodelfile, size_t maxitr, size_t freq, double eta, double C, unsigned short thread_num, unsigned short shrinking_size, int algorithm) { std::cout << COPYRIGHT << std::endl; CHECK_FALSE(eta > 0.0) << "eta must be > 0.0"; CHECK_FALSE(C >= 0.0) << "C must be >= 0.0"; CHECK_FALSE(shrinking_size >= 1) << "shrinking-size must be >= 1"; CHECK_FALSE(thread_num > 0) << "thread must be > 0"; #ifndef CRFPP_USE_THREAD CHECK_FALSE(thread_num == 1) << "This architecture doesn't support multi-thrading"; #endif if (algorithm == MIRA && thread_num > 1) { std::cerr << "MIRA doesn't support multi-thrading. use thread_num=1" << std::endl; } EncoderFeatureIndex feature_index; Allocator allocator(thread_num); std::vector<TaggerImpl* > x; x.reserve(max_line_nums); //adjust vector.capacity() to accelerate operation:push_back() std::cout.setf(std::ios::fixed, std::ios::floatfield); std::cout.precision(5); #define WHAT_ERROR(msg) do { \ for (std::vector<TaggerImpl *>::iterator it = x.begin(); \ it != x.end(); ++it) \ delete *it; \ std::cerr << msg << std::endl; \ return false; } while (0) CHECK_FALSE(feature_index.open(templfile, trainfile)) << feature_index.what(); { progress_timer pg; std::ifstream ifs(WPATH(trainfile)); CHECK_FALSE(ifs) << "cannot open: " << trainfile; std::cout << "reading training data: " << std::flush; size_t line = 0; while (ifs) { TaggerImpl *_x = new TaggerImpl(); _x->open(&feature_index, &allocator); if (!_x->read(&ifs) || !_x->shrink()) { WHAT_ERROR(_x->what()); } if (!_x->empty()) { x.push_back(_x); } else { delete _x; continue; } _x->set_thread_id(line % thread_num); if (++line % 10000 == 0) { std::cout << line << ".. 
" << std::endl << std::flush; } } ifs.close(); std::cout << "\nDone!"; } feature_index.shrink(freq, &allocator); std::vector <double> alpha(feature_index.size()); // parameter std::fill(alpha.begin(), alpha.end(), 0.0); feature_index.set_alpha(&alpha[0]); std::cout << "Number of sentences: " << x.size() << std::endl; std::cout << "Number of features: " << feature_index.size() << std::endl; std::cout << "Number of thread(s): " << thread_num << std::endl; std::cout << "Freq: " << freq << std::endl; std::cout << "eta: " << eta << std::endl; std::cout << "C: " << C << std::endl; std::cout << "shrinking size: " << shrinking_size << std::endl; progress_timer pg; switch (algorithm) { case MIRA: if (!runMIRA(x, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num)) { WHAT_ERROR("MIRA execute error"); } break; case CRF_L2: if (!runCRF(x, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num, false)) { WHAT_ERROR("CRF_L2 execute error"); } break; case CRF_L1: if (!runCRF(x, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num, true)) { WHAT_ERROR("CRF_L1 execute error"); } break; } for (std::vector<TaggerImpl *>::iterator it = x.begin(); it != x.end(); ++it) { delete *it; } if (!feature_index.save(modelfile, textmodelfile)) { WHAT_ERROR(feature_index.what()); } std::cout << "\nDone!"; return true; }
bool EncoderFeatureIndex::save(const char *filename, bool textmodelfile) { std::vector<char *> key; std::vector<int> val; std::string y_str; for (size_t i = 0; i < y_.size(); ++i) { y_str += y_[i]; y_str += '\0'; } std::string templ_str; for (size_t i = 0; i < unigram_templs_.size(); ++i) { templ_str += unigram_templs_[i]; templ_str += '\0'; } for (size_t i = 0; i < bigram_templs_.size(); ++i) { templ_str += bigram_templs_[i]; templ_str += '\0'; } while ((y_str.size() + templ_str.size()) % 4 != 0) { templ_str += '\0'; } for (std::map<std::string, std::pair<int, unsigned int> >::iterator it = dic_.begin(); it != dic_.end(); ++it) { key.push_back(const_cast<char *>(it->first.c_str())); val.push_back(it->second.first); } Darts::DoubleArray da; CHECK_FALSE(da.build(key.size(), &key[0], 0, &val[0]) == 0) << "cannot build double-array"; std::ofstream bofs; bofs.open(WPATH(filename), OUTPUT_MODE); CHECK_FALSE(bofs) << "open failed: " << filename; unsigned int version_ = version; bofs.write(reinterpret_cast<char *>(&version_), sizeof(unsigned int)); int type = 0; bofs.write(reinterpret_cast<char *>(&type), sizeof(type)); bofs.write(reinterpret_cast<char *>(&cost_factor_), sizeof(cost_factor_)); bofs.write(reinterpret_cast<char *>(&maxid_), sizeof(maxid_)); if (max_xsize_ > 0) { xsize_ = std::min(xsize_, max_xsize_); } bofs.write(reinterpret_cast<char *>(&xsize_), sizeof(xsize_)); unsigned int dsize = da.unit_size() * da.size(); bofs.write(reinterpret_cast<char *>(&dsize), sizeof(dsize)); unsigned int size = y_str.size(); bofs.write(reinterpret_cast<char *>(&size), sizeof(size)); bofs.write(const_cast<char *>(y_str.data()), y_str.size()); size = templ_str.size(); bofs.write(reinterpret_cast<char *>(&size), sizeof(size)); bofs.write(const_cast<char *>(templ_str.data()), templ_str.size()); bofs.write(reinterpret_cast<const char *>(da.array()), dsize); for (size_t i = 0; i < maxid_; ++i) { float alpha = static_cast<float>(alpha_[i]); bofs.write(reinterpret_cast<char 
*>(&alpha), sizeof(alpha)); } bofs.close(); if (textmodelfile) { std::string filename2 = filename; filename2 += ".txt"; std::ofstream tofs(WPATH(filename2.c_str())); CHECK_FALSE(tofs) << " no such file or directory: " << filename2; // header tofs << "version: " << version_ << std::endl; tofs << "cost-factor: " << cost_factor_ << std::endl; tofs << "maxid: " << maxid_ << std::endl; tofs << "xsize: " << xsize_ << std::endl; tofs << std::endl; // y for (size_t i = 0; i < y_.size(); ++i) { tofs << y_[i] << std::endl; } tofs << std::endl; // template for (size_t i = 0; i < unigram_templs_.size(); ++i) { tofs << unigram_templs_[i] << std::endl; } for (size_t i = 0; i < bigram_templs_.size(); ++i) { tofs << bigram_templs_[i] << std::endl; } tofs << std::endl; // dic for (std::map<std::string, std::pair<int, unsigned int> >::iterator it = dic_.begin(); it != dic_.end(); ++it) { tofs << it->second.first << " " << it->first << std::endl; } tofs << std::endl; tofs.setf(std::ios::fixed, std::ios::floatfield); tofs.precision(16); for (size_t i = 0; i < maxid_; ++i) { tofs << alpha_[i] << std::endl; } } return true; }
bool EncoderFeatureIndex::convert(const char *text_filename, const char *binary_filename) { std::ifstream ifs(WPATH(text_filename)); y_.clear(); dic_.clear(); unigram_templs_.clear(); bigram_templs_.clear(); xsize_ = 0; maxid_ = 0; CHECK_FALSE(ifs) << "open failed: " << text_filename; scoped_fixed_array<char, 8192> line; char *column[8]; // read header while (true) { CHECK_FALSE(ifs.getline(line.get(), line.size())) << " format error: " << text_filename; if (std::strlen(line.get()) == 0) { break; } const size_t size = tokenize(line.get(), "\t ", column, 2); CHECK_FALSE(size == 2) << "format error: " << text_filename; if (std::strcmp(column[0], "xsize:") == 0) { xsize_ = std::atoi(column[1]); } if (std::strcmp(column[0], "maxid:") == 0) { maxid_ = std::atoi(column[1]); } } CHECK_FALSE(maxid_ > 0) << "maxid is not defined: " << text_filename; CHECK_FALSE(xsize_ > 0) << "xsize is not defined: " << text_filename; while (true) { CHECK_FALSE(ifs.getline(line.get(), line.size())) << "format error: " << text_filename; if (std::strlen(line.get()) == 0) { break; } y_.push_back(line.get()); } while (true) { CHECK_FALSE(ifs.getline(line.get(), line.size())) << "format error: " << text_filename; if (std::strlen(line.get()) == 0) { break; } if (line[0] == 'U') { unigram_templs_.push_back(line.get()); } else if (line[0] == 'B') { bigram_templs_.push_back(line.get()); } else { CHECK_FALSE(true) << "unknown type: " << line.get() << " " << text_filename; } } while (true) { CHECK_FALSE(ifs.getline(line.get(), line.size())) << "format error: " << text_filename; if (std::strlen(line.get()) == 0) { break; } const size_t size = tokenize(line.get(), "\t ", column, 2); CHECK_FALSE(size == 2) << "format error: " << text_filename; dic_.insert(std::make_pair (std::string(column[1]), std::make_pair(std::atoi(column[0]), static_cast<unsigned int>(1)))); } std::vector<double> alpha; while (ifs.getline(line.get(), line.size())) { alpha.push_back(std::atof(line.get())); } alpha_ = &alpha[0]; 
CHECK_FALSE(alpha.size() == maxid_) << " file is broken: " << text_filename; return save(binary_filename, false); }
bool CharProperty::compile(const char *cfile, const char *ufile, const char *ofile) { scoped_fixed_array<char, BUF_SIZE> line; scoped_fixed_array<char *, 512> col; size_t id = 0; std::vector<Range> range; std::map<std::string, CharInfo> category; std::vector<std::string> category_ary; std::ifstream ifs(WPATH(cfile)); std::istringstream iss(CHAR_PROPERTY_DEF_DEFAULT); std::istream *is = &ifs; if (!ifs) { std::cerr << cfile << " is not found. minimum setting is used" << std::endl; is = &iss; } while (is->getline(line.get(), line.size())) { if (std::strlen(line.get()) == 0 || line[0] == '#') { continue; } const size_t size = tokenize2(line.get(), "\t ", col.get(), col.size()); CHECK_DIE(size >= 2) << "format error: " << line.get(); // 0xFFFF..0xFFFF hoge hoge hgoe # if (std::strncmp(col[0], "0x", 2) == 0) { std::string low = col[0]; std::string high; size_t pos = low.find(".."); if (pos != std::string::npos) { high = low.substr(pos + 2, low.size() - pos - 2); low = low.substr(0, pos); } else { high = low; } Range r; r.low = atohex(low.c_str()); r.high = atohex(high.c_str()); CHECK_DIE(r.low >= 0 && r.low < 0xffff && r.high >= 0 && r.high < 0xffff && r.low <= r.high) << "range error: low=" << r.low << " high=" << r.high; for (size_t i = 1; i < size; ++i) { if (col[i][0] == '#') { break; // skip comments } CHECK_DIE(category.find(std::string(col[i])) != category.end()) << "category [" << col[i] << "] is undefined"; r.c.push_back(col[i]); } range.push_back(r); } else { CHECK_DIE(size >= 4) << "format error: " << line.get(); std::string key = col[0]; CHECK_DIE(category.find(key) == category.end()) << "category " << key << " is already defined"; CharInfo c; std::memset(&c, 0, sizeof(c)); c.invoke = std::atoi(col[1]); c.group = std::atoi(col[2]); c.length = std::atoi(col[3]); c.default_type = id++; category.insert(std::pair<std::string, CharInfo>(key, c)); category_ary.push_back(key); } } CHECK_DIE(category.size() < 18) << "too many categories(>= 18)"; 
CHECK_DIE(category.find("DEFAULT") != category.end()) << "category [DEFAULT] is undefined"; CHECK_DIE(category.find("SPACE") != category.end()) << "category [SPACE] is undefined"; std::istringstream iss2(UNK_DEF_DEFAULT); std::ifstream ifs2(WPATH(ufile)); std::istream *is2 = &ifs2; if (!ifs2) { std::cerr << ufile << " is not found. minimum setting is used." << std::endl; is2 = &iss2; } std::set<std::string> unk; while (is2->getline(line.get(), line.size())) { const size_t n = tokenizeCSV(line.get(), col.get(), 2); CHECK_DIE(n >= 1) << "format error: " << line.get(); const std::string key = col[0]; CHECK_DIE(category.find(key) != category.end()) << "category [" << key << "] is undefined in " << cfile; unk.insert(key); } for (std::map<std::string, CharInfo>::const_iterator it = category.begin(); it != category.end(); ++it) { CHECK_DIE(unk.find(it->first) != unk.end()) << "category [" << it->first << "] is undefined in " << ufile; } std::vector<CharInfo> table(0xffff); { std::vector<std::string> tmp; tmp.push_back("DEFAULT"); const CharInfo c = encode(tmp, &category); std::fill(table.begin(), table.end(), c); } for (std::vector<Range>::const_iterator it = range.begin(); it != range.end(); ++it) { const CharInfo c = encode(it->c, &category); for (int i = it->low; i <= it->high; ++i) { table[i] = c; } } // output binary table { std::ofstream ofs(WPATH(ofile), std::ios::binary|std::ios::out); CHECK_DIE(ofs) << "permission denied: " << ofile; unsigned int size = static_cast<unsigned int>(category.size()); ofs.write(reinterpret_cast<const char*>(&size), sizeof(size)); for (std::vector<std::string>::const_iterator it = category_ary.begin(); it != category_ary.end(); ++it) { char buf[32]; std::fill(buf, buf + sizeof(buf), '\0'); std::strncpy(buf, it->c_str(), sizeof(buf) - 1); ofs.write(reinterpret_cast<const char*>(buf), sizeof(buf)); } ofs.write(reinterpret_cast<const char*>(&table[0]), sizeof(CharInfo) * table.size()); ofs.close(); } return true; }
static bool eval(int argc, char **argv) { static const MeCab::Option long_options[] = { { "level", 'l', "0 -1", "STR", "set level of evaluations" }, { "output", 'o', 0, "FILE", "set the output file name" }, { "version", 'v', 0, 0, "show the version and exit" }, { "help", 'h', 0, 0, "show this help and exit." }, { 0, 0, 0, 0 } }; MeCab::Param param; param.open(argc, argv, long_options); if (!param.open(argc, argv, long_options)) { std::cout << param.what() << "\n\n" << COPYRIGHT << "\ntry '--help' for more information." << std::endl; #if 1 /* for Open JTalk */ return false; #else return -1; #endif } if (!param.help_version()) return 0; const std::vector<std::string> &files = param.rest_args(); if (files.size() < 2) { std::cout << "Usage: " << param.program_name() << " output answer" << std::endl; #if 1 /* for Open JTalk */ return false; #else return -1; #endif } std::string output = param.get<std::string>("output"); if (output.empty()) output = "-"; MeCab::ostream_wrapper ofs(output.c_str()); CHECK_DIE(*ofs) << "no such file or directory: " << output; const std::string system = files[0]; const std::string answer = files[1]; const std::string level_str = param.get<std::string>("level"); std::ifstream ifs1(WPATH(files[0].c_str())); std::ifstream ifs2(WPATH(files[1].c_str())); CHECK_DIE(ifs1) << "no such file or directory: " << files[0].c_str(); CHECK_DIE(ifs2) << "no such file or directory: " << files[0].c_str(); CHECK_DIE(!level_str.empty()) << "level_str is NULL"; std::vector<int> level; parseLevel(level_str.c_str(), &level); CHECK_DIE(level.size()) << "level_str is empty: " << level_str; std::vector<size_t> result_tbl(level.size()); std::fill(result_tbl.begin(), result_tbl.end(), 0); size_t prec = 0; size_t recall = 0; std::vector<std::vector<std::string> > r1; std::vector<std::vector<std::string> > r2; while (true) { if (!read(&ifs1, &r1, level) || !read(&ifs2, &r2, level)) break; size_t i1 = 0; size_t i2 = 0; size_t p1 = 0; size_t p2 = 0; while (i1 < r1.size() && 
i2 < r2.size()) { if (p1 == p2) { for (size_t i = 0; i < result_tbl.size(); ++i) { if (r1[i1][i] == r2[i2][i]) { result_tbl[i]++; } } p1 += r1[i1][0].size(); p2 += r2[i2][0].size(); ++i1; ++i2; ++prec; ++recall; } else if (p1 < p2) { p1 += r1[i1][0].size(); ++i1; ++prec; } else { p2 += r2[i2][0].size(); ++i2; ++recall; } } while (i1 < r1.size()) { ++prec; ++i1; } while (i2 < r2.size()) { ++recall; ++i2; } } *ofs << " precision recall F" << std::endl; for (size_t i = 0; i < result_tbl.size(); ++i) { if (level[i] == -1) { *ofs << "LEVEL ALL: "; } else { *ofs << "LEVEL " << level[i] << ": "; } printeval(&*ofs, result_tbl[i], prec, recall); } return true; }