bool RewritePattern::rewrite(size_t size, const char **input, std::string *output) const { if (spat_.size() > size) return false; for (size_t i = 0; i < spat_.size(); ++i) { if (!match_rewrite_pattern(spat_[i].c_str(), input[i])) return false; } output->clear(); for (size_t i = 0; i < dpat_.size(); ++i) { std::string elm; const char *begin = dpat_[i].c_str(); const char *end = begin + dpat_[i].size(); for (const char *p = begin; p < end; ++p) { if (*p == '$') { ++p; size_t n = 0; for (; p < end; ++p) { switch (*p) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': n = 10 * n + (*p - '0'); break; default: goto NEXT; } } NEXT: CHECK_DIE(n > 0 && (n - 1) < size) << " out of range: [" << dpat_[i] << "] " << n; elm += input[n - 1]; if (p < end) elm += *p; } else { elm += *p; } } CHECK_DIE(escape_csv_element(&elm)); *output += elm; if (i + 1 != dpat_.size()) *output += ","; } return true; }
static void gendic(const char* ifile, const char* ofile, const CharProperty &property, DictionaryRewriter *rewrite, const ContextID &cid, DecoderFeatureIndex *fi, bool unk, int factor, int default_cost) { std::ifstream ifs(ifile); CHECK_DIE(ifs) << "no such file or directory: " << ifile; std::ofstream ofs(ofile); CHECK_DIE(ofs) << "permission denied: " << ofile; std::string w, feature, ufeature, lfeature, rfeature; int cost, lid, rid; std::cout << "emitting " << ofile << " ... " << std::flush; LearnerPath path; LearnerNode rnode; LearnerNode lnode; rnode.stat = lnode.stat = MECAB_NOR_NODE; rnode.rpath = &path; lnode.lpath = &path; path.lnode = &lnode; path.rnode = &rnode; char line[BUF_SIZE]; char *col[8]; size_t num = 0; while (ifs.getline(line, sizeof(line))) { const size_t n = tokenizeCSV(line, col, 5); CHECK_DIE(n == 5) << "format error: " << line; w = std::string(col[0]); lid = std::atoi(col[1]); rid = std::atoi(col[2]); cost = std::atoi(col[3]); feature = std::string(col[4]); rewrite->rewrite2(feature, &ufeature, &lfeature, &rfeature); lid = cid.lid(lfeature.c_str()); rid = cid.rid(rfeature.c_str()); CHECK_DIE(lid > 0) << "CID is not found for " << lfeature; CHECK_DIE(rid > 0) << "CID is not found for " << rfeature; if (unk) { int c = property.id(w.c_str()); CHECK_DIE(c >= 0) << "unknown property [" << w << "]"; path.rnode->char_type = (unsigned char)c; } else { size_t mblen; CharInfo cinfo = property.getCharInfo(w.c_str(), w.c_str() + w.size(), &mblen); path.rnode->char_type = cinfo.default_type; } fi->buildUnigramFeature(&path, ufeature.c_str()); fi->calcCost(&rnode); CHECK_DIE(escape_csv_element(&w)) << "invalid character found: " << w; ofs << w << ',' << lid << ',' << rid << ',' << tocost(rnode.wcost, factor, default_cost) << ',' << feature << std::endl; ++num; } std::cout << num << std::endl; }