// Expands every bigram feature template over the left/right context
// features of |path| and stores the resulting feature ids into
// path->fvector.
//
// Template meta characters:
//   %L<idx> : idx-th CSV column of the left-context feature
//   %R<idx> : idx-th CSV column of the right-context feature
//   %l      : the whole left feature string as-is
//   %r      : the whole right feature string as-is
// A template referencing an unavailable column is skipped entirely
// (the "goto NEXT" path).
bool FeatureIndex::buildBigramFeature(LearnerPath *path,
                                      const char *rfeature,
                                      const char *lfeature) {
  char rbuf[BUFSIZE];
  char lbuf[BUFSIZE];
  char *R[POSSIZE];
  char *L[POSSIZE];
  feature_.clear();
  // NOTE(review): |lbuf| is filled from |rfeature| and |rbuf| from
  // |lfeature|.  The apparent swap reflects the caller's left/right-node
  // orientation — confirm against call sites before "fixing" it.
  // NOTE(review): strncpy() does not NUL-terminate when the source is
  // >= BUFSIZE; callers are assumed to pass shorter feature strings.
  std::strncpy(lbuf, rfeature, BUFSIZE);
  std::strncpy(rbuf, lfeature, BUFSIZE);
  size_t lsize = tokenizeCSV(lbuf, L, POSSIZE);
  size_t rsize = tokenizeCSV(rbuf, R, POSSIZE);
  // Expand each bigram template into one concrete feature string.
  for (std::vector<const char*>::const_iterator it = bigram_templs_.begin();
       it != bigram_templs_.end(); ++it) {
    const char *p = *it;
    os_.clear();
    for (; *p; p++) {
      switch (*p) {
        default: os_ << *p; break;
        case '\\': os_ << getEscapedChar(*++p); break;
        case '%': {
          switch (*++p) {
            case 'L': {
              // getIndex() advances |p| past the "[idx]" suffix and
              // returns 0 when the requested column is unavailable.
              const char *r = getIndex(const_cast<char **>(&p), L, lsize);
              if (!r) goto NEXT;
              os_ << r;
            } break;
            case 'R': {
              const char *r = getIndex(const_cast<char **>(&p), R, rsize);
              if (!r) goto NEXT;
              os_ << r;
            } break;
            case 'l': os_ << lfeature; break;  // use lfeature as it is
            case 'r': os_ << rfeature; break;  // use rfeature as it is
            default: CHECK_FALSE(false) << "unkonwn meta char: " << *p;
          }
        }
      }
    }
    os_ << '\0';
    ADDB(os_.str());
 NEXT: continue;
  }
  COPY_FEATURE(path->fvector);
  return true;
}
// Parses the source and destination rewrite patterns (CSV) into the
// spat_/dpat_ token vectors.
// Returns true when both patterns produced at least one token.
bool RewritePattern::set_pattern(const char *src, const char *dst) {
  char buf[BUF_SIZE];
  spat_.clear();
  dpat_.clear();
  // BUG FIX: strncpy() does not NUL-terminate when the source is
  // >= sizeof(buf); force termination so tokenizeCSV() cannot scan
  // past the end of |buf|.
  std::strncpy(buf, src, sizeof(buf));
  buf[sizeof(buf) - 1] = '\0';
  tokenizeCSV(buf, back_inserter(spat_), 512);
  std::strncpy(buf, dst, sizeof(buf));
  buf[sizeof(buf) - 1] = '\0';
  tokenizeCSV(buf, back_inserter(dpat_), 512);
  return (spat_.size() && dpat_.size());
}
static bool read(std::istream *is, std::vector<std::vector<std::string> > *r, const std::vector<int> &level) { if (!*is) return false; char buf[BUF_SIZE]; char *col[2]; char *cvs[BUF_SIZE]; r->clear(); while (is->getline(buf, sizeof(buf))) { if (std::strcmp(buf, "EOS") == 0) break; CHECK_DIE(tokenize(buf, "\t", col, 2) == 2) << "format error"; cvs[0] = col[0]; size_t n = tokenizeCSV(col[1], cvs + 1, sizeof(cvs) - 1); std::vector<std::string> tmp; for (size_t i = 0; i < level.size(); ++i) { size_t m = level[i] < 0 ? n : level[i]; CHECK_DIE(m <= n) << " out of range " << level[i]; std::string output; for (size_t j = 0; j <= m; ++j) { output += cvs[j]; if (j != 0) output += "\t"; } tmp.push_back(output); } r->push_back(tmp); } return true; }
// Maps a CSV feature string to its POS id through the rewrite rules.
// Returns -1 when no rewrite rule matches.
int POSIDGenerator::id(const char *feature) const {
  char buf[BUF_SIZE];
  char *col[BUF_SIZE];
  CHECK_DIE(std::strlen(feature) < sizeof(buf) - 1) << "too long feature";
  // Source is verified shorter than the buffer, so strncpy() NUL-pads.
  std::strncpy(buf, feature, sizeof(buf) - 1);
  // BUG FIX: tokenizeCSV() takes an ELEMENT count, but sizeof(col) is the
  // byte size of the pointer array (8x the slot count on LP64), which
  // overstated the capacity and made the overflow check below vacuous.
  const size_t n = tokenizeCSV(buf, col, sizeof(col) / sizeof(col[0]));
  CHECK_DIE(n < sizeof(col) / sizeof(col[0])) << "too long CSV entities";
  std::string tmp;
  if (!rewrite_.rewrite(n, const_cast<const char **>(col), &tmp)) {
    return -1;
  }
  return std::atoi(tmp.c_str());
}
// without cache bool DictionaryRewriter::rewrite(const std::string &feature, std::string *ufeature, std::string *lfeature, std::string *rfeature) const { char buf[BUF_SIZE]; char *col[BUF_SIZE]; CHECK_DIE(feature.size() < sizeof(buf) - 1) << "too long feature"; std::strncpy(buf, feature.c_str(), sizeof(buf) - 1); size_t n = tokenizeCSV(buf, col, sizeof(col)); CHECK_DIE(n < sizeof(col)) << "too long CSV entities"; return (unigram_rewrite_.rewrite(n, const_cast<const char **>(col), ufeature) && left_rewrite_.rewrite(n, const_cast<const char **>(col), lfeature) && right_rewrite_.rewrite(n, const_cast<const char **>(col), rfeature)); }
// Expands every unigram feature template against the CSV columns of
// |ufeature| and stores the resulting feature ids into
// path->rnode->fvector.
//
// Template meta characters:
//   %F<idx> : idx-th CSV column of the unigram feature (the whole
//             template is skipped when the column is unavailable)
//   %t      : character type of the right node
//   %u      : the whole unigram feature string as-is
bool FeatureIndex::buildUnigramFeature(LearnerPath *path,
                                       const char *ufeature) {
  char ubuf[BUFSIZE];
  char *F[POSSIZE];
  feature_.clear();
  // NOTE(review): strncpy() does not NUL-terminate when the source is
  // >= BUFSIZE; callers are assumed to pass shorter feature strings.
  std::strncpy(ubuf, ufeature, BUFSIZE);
  size_t usize = tokenizeCSV(ubuf, F, POSSIZE);
  for (std::vector<const char*>::const_iterator it = unigram_templs_.begin();
       it != unigram_templs_.end(); ++it) {
    const char *p = *it;
    os_.clear();
    for (; *p; p++) {
      switch (*p) {
        default: os_ << *p; break;
        case '\\': os_ << getEscapedChar(*++p); break;
        case '%': {
          switch (*++p) {
            case 'F': {
              // getIndex() advances |p| past "[idx]" and returns 0 when
              // the requested column is unavailable.
              const char *r = getIndex(const_cast<char **>(&p), F, usize);
              if (!r) goto NEXT;  // column missing: drop this template
              os_ << r;
            } break;
            case 't': os_ << (size_t)path->rnode->char_type; break;
            case 'u': os_ << ufeature; break;
            default: CHECK_FALSE(false) << "unkonwn meta char: " << *p;
          }
        }
      }
    }
    os_ << '\0';
    ADDB(os_.str());
 NEXT: continue;
  }
  COPY_FEATURE(path->rnode->fvector);
  return true;
}
static void gencid(const char *filename, DictionaryRewriter *rewrite, ContextID *cid) { std::ifstream ifs(filename); CHECK_DIE(ifs) << "no such file or directory: " << filename; char line[BUF_SIZE]; std::cout << "reading " << filename << " ... " << std::flush; size_t num = 0; std::string feature, ufeature, lfeature, rfeature; char *col[8]; while (ifs.getline(line, sizeof(line))) { const size_t n = tokenizeCSV(line, col, 5); CHECK_DIE(n == 5) << "format error: " << line; feature = col[4]; rewrite->rewrite2(feature, &ufeature, &lfeature, &rfeature); cid->add(lfeature.c_str(), rfeature.c_str()); ++num; } std::cout << num << std::endl; ifs.close(); }
static bool read(std::istream *is, std::vector<std::vector<std::string> > *r, const std::vector<int> &level) { if (!*is) { return false; } char *col[2]; scoped_fixed_array<char, BUF_SIZE> buf; scoped_fixed_array<char *, BUF_SIZE> csv; r->clear(); while (is->getline(buf.get(), buf.size())) { if (std::strcmp(buf.get(), "EOS") == 0) { break; } CHECK_DIE(tokenize(buf.get(), "\t", col, 2) == 2) << "format error"; csv[0] = col[0]; size_t n = tokenizeCSV(col[1], csv.get() + 1, csv.size() - 1); std::vector<std::string> tmp; for (size_t i = 0; i < level.size(); ++i) { size_t m = level[i] < 0 ? n : level[i]; CHECK_DIE(m <= n) << " out of range " << level[i]; std::string output; for (size_t j = 0; j <= m; ++j) { output += csv[j]; if (j != 0) { output += "\t"; } } tmp.push_back(output); } r->push_back(tmp); } return true; }
// Opens all tokenizer resources from the dicrc parameters: the
// unknown-word dictionary, the character property table, the system
// dictionary, and any user dictionaries listed in "userdic"; then rebuilds
// the DictionaryInfo list, caches per-category unknown-word tokens, and
// loads misc. settings (bos-feature, unk-feature, max-grouping-size).
// Returns false (via CHECK_FALSE) on any failure.
bool Tokenizer<N, P>::open(const Param &param) {
  close();
  const std::string prefix = param.template get<std::string>("dicdir");
  CHECK_FALSE(unkdic_.open(create_filename
                           (prefix, UNK_DIC_FILE).c_str())) << unkdic_.what();
  CHECK_FALSE(property_.open(param)) << property_.what();
  // System dictionary (type 0) is mandatory.
  Dictionary *sysdic = new Dictionary;
  CHECK_FALSE(sysdic->open
              (create_filename(prefix, SYS_DIC_FILE).c_str()))
      << sysdic->what();
  CHECK_FALSE(sysdic->type() == 0)
      << "not a system dictionary: " << prefix;
  property_.set_charset(sysdic->charset());
  dic_.push_back(sysdic);
  // Optional user dictionaries: "userdic" is a CSV list of file names.
  const std::string userdic = param.template get<std::string>("userdic");
  if (!userdic.empty()) {
    scoped_fixed_array<char, BUF_SIZE> buf;
    scoped_fixed_array<char *, BUF_SIZE> dicfile;
    // NOTE(review): strncpy() leaves |buf| unterminated when the userdic
    // string is >= buf.size() — confirm the parameter length is bounded
    // upstream.
    std::strncpy(buf.get(), userdic.c_str(), buf.size());
    const size_t n = tokenizeCSV(buf.get(), dicfile.get(), dicfile.size());
    for (size_t i = 0; i < n; ++i) {
      Dictionary *d = new Dictionary;
      CHECK_FALSE(d->open(dicfile[i])) << d->what();
      CHECK_FALSE(d->type() == 1)
          << "not a user dictionary: " << dicfile[i];
      // User dictionaries must share charset and id spaces with the
      // system dictionary.
      CHECK_FALSE(sysdic->isCompatible(*d))
          << "incompatible dictionary: " << dicfile[i];
      dic_.push_back(d);
    }
  }
  // Rebuild the DictionaryInfo linked list; iterating in reverse makes
  // the list head describe dic_[0] (the system dictionary).
  dictionary_info_ = 0;
  dictionary_info_freelist_.free();
  for (int i = static_cast<int>(dic_.size() - 1); i >= 0; --i) {
    DictionaryInfo *d = dictionary_info_freelist_.alloc();
    d->next = dictionary_info_;
    d->filename = dic_[i]->filename();
    d->charset = dic_[i]->charset();
    d->size = dic_[i]->size();
    d->lsize = dic_[i]->lsize();
    d->rsize = dic_[i]->rsize();
    d->type = dic_[i]->type();
    d->version = dic_[i]->version();
    dictionary_info_ = d;
  }
  // Cache the unknown-word token range for every character category.
  unk_tokens_.clear();
  for (size_t i = 0; i < property_.size(); ++i) {
    const char *key = property_.name(i);
    const Dictionary::result_type n = unkdic_.exactMatchSearch(key);
    CHECK_FALSE(n.value != -1) << "cannot find UNK category: " << key;
    const Token *token = unkdic_.token(n);
    size_t size = unkdic_.token_size(n);
    unk_tokens_.push_back(std::make_pair(token, size));
  }
  space_ = property_.getCharInfo(0x20);  // ad-hoc
  bos_feature_.reset_string(param.template get<std::string>("bos-feature"));
  const std::string tmp = param.template get<std::string>("unk-feature");
  unk_feature_.reset(0);
  if (!tmp.empty()) {
    unk_feature_.reset_string(tmp);
  }
  CHECK_FALSE(*bos_feature_ != '\0')
      << "bos-feature is undefined in dicrc";
  max_grouping_size_ = param.template get<size_t>("max-grouping-size");
  if (max_grouping_size_ == 0) {
    max_grouping_size_ = DEFAULT_MAX_GROUPING_SIZE;
  }
  return true;
}
// Compiles char.def (|cfile|) and unk.def (|ufile|) into the binary
// character-property table |ofile|.  Falls back to built-in minimal
// definitions when either input file is missing.
bool CharProperty::compile(const char *cfile,
                           const char *ufile,
                           const char *ofile) {
  scoped_fixed_array<char, BUF_SIZE> line;
  scoped_fixed_array<char *, 512> col;
  size_t id = 0;
  std::vector<Range> range;
  std::map<std::string, CharInfo> category;
  std::vector<std::string> category_ary;
  std::ifstream ifs(WPATH(cfile));
  std::istringstream iss(CHAR_PROPERTY_DEF_DEFAULT);
  std::istream *is = &ifs;
  if (!ifs) {
    std::cerr << cfile
              << " is not found. minimum setting is used" << std::endl;
    is = &iss;
  }
  while (is->getline(line.get(), line.size())) {
    if (std::strlen(line.get()) == 0 || line[0] == '#') {
      continue;  // skip blank lines and comments
    }
    const size_t size = tokenize2(line.get(), "\t ", col.get(), col.size());
    CHECK_DIE(size >= 2) << "format error: " << line.get();
    // 0xFFFF..0xFFFF hoge hoge hgoe #
    if (std::strncmp(col[0], "0x", 2) == 0) {
      // Code-point range line: "0xLOW[..0xHIGH] CAT1 CAT2 ...".
      std::string low = col[0];
      std::string high;
      size_t pos = low.find("..");
      if (pos != std::string::npos) {
        high = low.substr(pos + 2, low.size() - pos - 2);
        low = low.substr(0, pos);
      } else {
        high = low;  // single code point
      }
      Range r;
      r.low = atohex(low.c_str());
      r.high = atohex(high.c_str());
      CHECK_DIE(r.low >= 0 && r.low < 0xffff &&
                r.high >= 0 && r.high < 0xffff &&
                r.low <= r.high)
          << "range error: low=" << r.low << " high=" << r.high;
      // Remaining columns are category names; a range may only reference
      // categories declared on earlier lines.
      for (size_t i = 1; i < size; ++i) {
        if (col[i][0] == '#') {
          break;  // skip comments
        }
        CHECK_DIE(category.find(std::string(col[i])) != category.end())
            << "category [" << col[i] << "] is undefined";
        r.c.push_back(col[i]);
      }
      range.push_back(r);
    } else {
      // Category definition line: "NAME invoke group length".
      CHECK_DIE(size >= 4) << "format error: " << line.get();
      std::string key = col[0];
      CHECK_DIE(category.find(key) == category.end())
          << "category " << key << " is already defined";
      CharInfo c;
      std::memset(&c, 0, sizeof(c));
      c.invoke = std::atoi(col[1]);
      c.group = std::atoi(col[2]);
      c.length = std::atoi(col[3]);
      c.default_type = id++;  // ids assigned in declaration order
      category.insert(std::pair<std::string, CharInfo>(key, c));
      category_ary.push_back(key);
    }
  }
  // The binary format packs category membership into a fixed-width field.
  CHECK_DIE(category.size() < 18) << "too many categories(>= 18)";
  CHECK_DIE(category.find("DEFAULT") != category.end())
      << "category [DEFAULT] is undefined";
  CHECK_DIE(category.find("SPACE") != category.end())
      << "category [SPACE] is undefined";
  // unk.def: every category must have an unknown-word entry, and every
  // unk.def entry must name a known category.
  std::istringstream iss2(UNK_DEF_DEFAULT);
  std::ifstream ifs2(WPATH(ufile));
  std::istream *is2 = &ifs2;
  if (!ifs2) {
    std::cerr << ufile
              << " is not found. minimum setting is used." << std::endl;
    is2 = &iss2;
  }
  std::set<std::string> unk;
  while (is2->getline(line.get(), line.size())) {
    const size_t n = tokenizeCSV(line.get(), col.get(), 2);
    CHECK_DIE(n >= 1) << "format error: " << line.get();
    const std::string key = col[0];
    CHECK_DIE(category.find(key) != category.end())
        << "category [" << key << "] is undefined in " << cfile;
    unk.insert(key);
  }
  for (std::map<std::string, CharInfo>::const_iterator it = category.begin();
       it != category.end(); ++it) {
    CHECK_DIE(unk.find(it->first) != unk.end())
        << "category [" << it->first << "] is undefined in " << ufile;
  }
  // Build the per-code-point table, defaulting every slot to DEFAULT and
  // then overwriting the declared ranges.
  std::vector<CharInfo> table(0xffff);
  {
    std::vector<std::string> tmp;
    tmp.push_back("DEFAULT");
    const CharInfo c = encode(tmp, &category);
    std::fill(table.begin(), table.end(), c);
  }
  for (std::vector<Range>::const_iterator it = range.begin();
       it != range.end(); ++it) {
    const CharInfo c = encode(it->c, &category);
    for (int i = it->low; i <= it->high; ++i) {
      table[i] = c;
    }
  }
  // output binary table
  {
    std::ofstream ofs(WPATH(ofile), std::ios::binary|std::ios::out);
    CHECK_DIE(ofs) << "permission denied: " << ofile;
    unsigned int size = static_cast<unsigned int>(category.size());
    ofs.write(reinterpret_cast<const char*>(&size), sizeof(size));
    // Category names are stored as fixed 32-byte NUL-padded records.
    for (std::vector<std::string>::const_iterator it = category_ary.begin();
         it != category_ary.end(); ++it) {
      char buf[32];
      std::fill(buf, buf + sizeof(buf), '\0');
      std::strncpy(buf, it->c_str(), sizeof(buf) - 1);
      ofs.write(reinterpret_cast<const char*>(buf), sizeof(buf));
    }
    ofs.write(reinterpret_cast<const char*>(&table[0]),
              sizeof(CharInfo) * table.size());
    ofs.close();
  }
  return true;
}
bool Writer::writeNode(Lattice *lattice, const char *p, const Node *node, StringBuffer *os) const { scoped_fixed_array<char, BUF_SIZE> buf; scoped_fixed_array<char *, 64> ptr; size_t psize = 0; for (; *p; p++) { switch (*p) { default: *os << *p; break; case '\\': *os << getEscapedChar(*++p); break; case '%': { // macros switch (*++p) { default: { const std::string error = "unknown meta char: " + *p; lattice->set_what(error.c_str()); return false; } // input sentence case 'S': os->write(lattice->sentence(), lattice->size()); break; // sentence length case 'L': *os << lattice->size(); break; // morph case 'm': os->write(node->surface, node->length); break; case 'M': os->write(reinterpret_cast<const char *> (node->surface - node->rlength + node->length), node->rlength); break; case 'h': *os << node->posid; break; // Part-Of-Speech ID case '%': *os << '%'; break; // % case 'c': *os << static_cast<int>(node->wcost); break; // word cost case 'H': *os << node->feature; break; case 't': *os << static_cast<unsigned int>(node->char_type); break; case 's': *os << static_cast<unsigned int>(node->stat); break; case 'P': *os << node->prob; break; case 'p': { switch (*++p) { default: lattice->set_what("[iseSCwcnblLh] is required after %p"); return false; case 'i': *os << node->id; break; // node id case 'S': os->write(reinterpret_cast<const char*> (node->surface - node->rlength + node->length), node->rlength - node->length); break; // space // start position case 's': *os << static_cast<int>( node->surface - lattice->sentence()); break; // end position case 'e': *os << static_cast<int> (node->surface - lattice->sentence() + node->length); break; // connection cost case 'C': *os << node->cost - node->prev->cost - node->wcost; break; case 'w': *os << node->wcost; break; // word cost case 'c': *os << node->cost; break; // best cost case 'n': *os << (node->cost - node->prev->cost); break; // node cost // * if best path, otherwise ' ' case 'b': *os << (node->isbest ? 
'*' : ' '); break; case 'P': *os << node->prob; break; case 'A': *os << node->alpha; break; case 'B': *os << node->beta; break; case 'l': *os << node->length; break; // length of morph // length of morph including the spaces case 'L': *os << node->rlength; break; case 'h': { // Hidden Layer ID switch (*++p) { default: lattice->set_what("lr is required after %ph"); return false; case 'l': *os << node->lcAttr; break; // current case 'r': *os << node->rcAttr; break; // prev } } break; case 'p': { char mode = *++p; char sep = *++p; if (sep == '\\') { sep = getEscapedChar(*++p); } if (!node->lpath) { lattice->set_what("no path information is available"); return false; } for (Path *path = node->lpath; path; path = path->lnext) { if (path != node->lpath) *os << sep; switch (mode) { case 'i': *os << path->lnode->id; break; case 'c': *os << path->cost; break; case 'P': *os << path->prob; break; default: lattice->set_what("[icP] is required after %pp"); return false; } } } break; } } break; case 'F': case 'f': { if (node->feature[0] == '\0') { lattice->set_what("no feature information available"); return false; } if (!psize) { strncpy_s(buf.get(), sizeof(buf.get()), node->feature, buf.size()); psize = tokenizeCSV(buf.get(), ptr.get(), ptr.size()); } // separator char separator = '\t'; // default separator if (*p == 'F') { // change separator if (*++p == '\\') { separator = getEscapedChar(*++p); } else { separator = *p; } } if (*++p !='[') { lattice->set_what("cannot find '['"); return false; } size_t n = 0; bool sep = false; bool isfil = false; p++; for (;; ++p) { switch (*p) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': n = 10 * n +(*p - '0'); break; case ',': case ']': if (n >= psize) { lattice->set_what("given index is out of range"); return false; } isfil = (ptr[n][0] != '*'); if (isfil) { if (sep) { *os << separator; } *os << ptr[n]; } if (*p == ']') { goto last; } sep = isfil; n = 0; break; default: 
lattice->set_what("cannot find ']'"); return false; } } } last: break; } // end switch } break; // end case '%' } // end switch } return true; }
// Reads a 5-column CSV dictionary |ifile| and emits |ofile| with context
// ids and word costs recomputed from the trained model: features are
// rewritten, lid/rid are looked up in |cid|, and wcost is predicted via
// |fi| (scaled by |factor|, with |default_cost| as fallback in tocost()).
// When |unk| is true, the surface column holds a character-category name
// instead of a word.
static void gendic(const char* ifile, const char* ofile,
                   const CharProperty &property,
                   DictionaryRewriter *rewrite,
                   const ContextID &cid,
                   DecoderFeatureIndex *fi,
                   bool unk, int factor, int default_cost) {
  std::ifstream ifs(ifile);
  CHECK_DIE(ifs) << "no such file or directory: " << ifile;
  std::ofstream ofs(ofile);
  CHECK_DIE(ofs) << "permission denied: " << ofile;
  std::string w, feature, ufeature, lfeature, rfeature;
  int cost, lid, rid;
  std::cout << "emitting " << ofile << " ... " << std::flush;
  // Minimal single-path lattice reused to score each entry.
  LearnerPath path;
  LearnerNode rnode;
  LearnerNode lnode;
  rnode.stat = lnode.stat = MECAB_NOR_NODE;
  rnode.rpath = &path;
  lnode.lpath = &path;
  path.lnode = &lnode;
  path.rnode = &rnode;
  char line[BUF_SIZE];
  char *col[8];
  size_t num = 0;
  while (ifs.getline(line, sizeof(line))) {
    const size_t n = tokenizeCSV(line, col, 5);
    CHECK_DIE(n == 5) << "format error: " << line;
    w = std::string(col[0]);
    lid = std::atoi(col[1]);
    rid = std::atoi(col[2]);
    // NOTE(review): |cost| is parsed but never used afterwards — the
    // word cost is recomputed from the model below.
    cost = std::atoi(col[3]);
    feature = std::string(col[4]);
    // Re-derive context ids from the rewritten features; the lid/rid
    // stored in the source CSV are overwritten.
    rewrite->rewrite2(feature, &ufeature, &lfeature, &rfeature);
    lid = cid.lid(lfeature.c_str());
    rid = cid.rid(rfeature.c_str());
    CHECK_DIE(lid > 0) << "CID is not found for " << lfeature;
    CHECK_DIE(rid > 0) << "CID is not found for " << rfeature;
    if (unk) {
      // Unknown-word entry: the surface is a category name (e.g. KANJI).
      int c = property.id(w.c_str());
      CHECK_DIE(c >= 0) << "unknown property [" << w << "]";
      path.rnode->char_type = (unsigned char)c;
    } else {
      size_t mblen;
      CharInfo cinfo = property.getCharInfo(w.c_str(),
                                            w.c_str() + w.size(),
                                            &mblen);
      path.rnode->char_type = cinfo.default_type;
    }
    fi->buildUnigramFeature(&path, ufeature.c_str());
    fi->calcCost(&rnode);
    CHECK_DIE(escape_csv_element(&w)) << "invalid character found: " << w;
    ofs << w << ',' << lid << ',' << rid << ','
        << tocost(rnode.wcost, factor, default_cost)
        << ',' << feature << std::endl;
    ++num;
  }
  std::cout << num << std::endl;
}
// Compiles a set of CSV dictionaries into one binary MeCab dictionary:
// reads every entry, resolves POS/context ids (deriving them through the
// rewrite rules when lid/rid are negative), optionally re-renders the
// feature with |node_format|, then builds the Darts double-array over the
// sorted surfaces and writes the binary image (header, charset, da,
// tokens, features) with a trailing magic checksum patched into slot 0.
bool Dictionary::compile(const Param &param,
                         const std::vector<std::string> &dics,
                         const char *matrix_file,
                         const char *matrix_bin_file,
                         const char *left_id_file,
                         const char *right_id_file,
                         const char *rewrite_file,
                         const char *pos_id_file,
                         const char *output) {
  Connector matrix;
  scoped_ptr<DictionaryRewriter> rewrite(0);
  scoped_ptr<POSIDGenerator> posid(0);
  scoped_ptr<ContextID> cid(0);
  scoped_ptr<Writer> writer(0);
  scoped_ptr<StringBuffer> os(0);
  Node node;
  std::vector<std::pair<std::string, Token*> > dic;
  size_t offset = 0;    // running byte offset into the feature buffer
  unsigned int lexsize = 0;
  std::string w, feature, ufeature, lfeature, rfeature, fbuf, key;
  int lid, rid, cost;
  const std::string from = param.get<std::string>("dictionary-charset");
  const std::string to = param.get<std::string>("charset");
  const bool wakati = param.get<bool>("wakati");
  const int type = param.get<int>("type");
  const std::string node_format = param.get<std::string>("node-format");
  // for backward compatibility
  std::string config_charset = param.get<std::string>("config-charset");
  if (config_charset.empty()) config_charset = from;
  CHECK_DIE(!from.empty()) << "input dictionary charset is empty";
  CHECK_DIE(!to.empty()) << "output dictionary charset is empty";
  Iconv iconv;
  CHECK_DIE(iconv.open(from.c_str(), to.c_str()))
      << "iconv_open() failed with from=" << from << " to=" << to;
  Iconv config_iconv;
  CHECK_DIE(config_iconv.open(config_charset.c_str(), from.c_str()))
      << "iconv_open() failed with from=" << config_charset
      << " to=" << from;
  if (!node_format.empty()) {
    writer.reset(new Writer);
    os.reset(new StringBuffer);
    memset(&node, 0, sizeof(node));
  }
  // Fall back to a trivial 1x1 matrix when neither matrix file opens.
  if (!matrix.openText(matrix_file) && !matrix.open(matrix_bin_file)) {
    matrix.set_left_size(1);
    matrix.set_right_size(1);
  }
  posid.reset(new POSIDGenerator);
  posid->open(pos_id_file, &config_iconv);
  std::istringstream iss(UNK_DEF_DEFAULT);
  for (size_t i = 0; i < dics.size(); ++i) {
    std::ifstream ifs(dics[i].c_str());
    std::istream *is = &ifs;
    if (!ifs) {
      // Only the unknown-word dictionary may fall back to the built-in
      // default definitions.
      if (type == MECAB_UNK_DIC) {
        std::cerr << dics[i]
                  << " is not found. minimum setting is used." << std::endl;
        is = &iss;
      } else {
        CHECK_DIE(ifs) << "no such file or directory: " << dics[i];
      }
    }
    std::cout << "reading " << dics[i] << " ... ";
    char line[BUF_SIZE];
    size_t num = 0;
    while (is->getline(line, sizeof(line))) {
      char *col[8];
      const size_t n = tokenizeCSV(line, col, 5);
      CHECK_DIE(n == 5) << "format error: " << line;
      w = col[0];
      lid = std::atoi(col[1]);
      rid = std::atoi(col[2]);
      cost = std::atoi(col[3]);
      feature = col[4];
      int pid = posid->id(feature.c_str());
      if (lid < 0 || rid < 0) {
        // Negative ids mean "derive from the feature": lazily open the
        // rewrite rules and left/right context-id tables on first use.
        if (!rewrite.get()) {
          rewrite.reset(new DictionaryRewriter);
          rewrite->open(rewrite_file, &config_iconv);
        }
        CHECK_DIE(rewrite->rewrite(feature, &ufeature,
                                   &lfeature, &rfeature))
            << "rewrite failed: " << feature;
        if (!cid.get()) {
          cid.reset(new ContextID);
          cid->open(left_id_file, right_id_file, &config_iconv);
          CHECK_DIE(cid->left_size() == matrix.left_size() &&
                    cid->right_size() == matrix.right_size())
              << "Context ID files("
              << left_id_file << " or " << right_id_file
              << " may be broken";
        }
        lid = cid->lid(lfeature.c_str());
        rid = cid->rid(rfeature.c_str());
      }
      CHECK_DIE(lid >= 0 && rid >= 0 && matrix.is_valid(lid, rid))
          << "invalid ids are found lid=" << lid << " rid=" << rid;
      if (w.empty()) {
        std::cerr << "empty word is found, discard this line" << std::endl;
        continue;
      }
      if (!iconv.convert(&feature)) {
        std::cerr << "iconv conversion failed. skip this entry"
                  << std::endl;
        continue;
      }
      if (type != MECAB_UNK_DIC && !iconv.convert(&w)) {
        std::cerr << "iconv conversion failed. skip this entry"
                  << std::endl;
        continue;
      }
      // Optionally re-render the feature through the node formatter.
      if (!node_format.empty()) {
        node.surface = w.c_str();
        node.feature = feature.c_str();
        node.length = w.size();
        node.rlength = w.size();
        node.posid = pid;
        node.stat = MECAB_NOR_NODE;
        CHECK_DIE(os.get());
        CHECK_DIE(writer.get());
        os->clear();
        CHECK_DIE(writer->writeNode(&*os,
                                    node_format.c_str(),
                                    w.c_str(), &node))
            << "conversion error: " << feature << " with " << node_format;
        *os << '\0';
        feature = os->str();
      }
      key.clear();
      if (!wakati) key = feature + '\0';
      Token* token = new Token;
      token->lcAttr = lid;
      token->rcAttr = rid;
      token->posid = pid;
      token->wcost = cost;
      token->feature = offset;
      token->compound = 0;
      // NOTE(review): make_pair with explicit template arguments binds
      // rvalue-reference parameters under C++11 and fails to compile with
      // the lvalue |w|; std::make_pair(w, token) is the portable spelling
      // — confirm the project's language level before changing.
      dic.push_back(std::make_pair<std::string, Token*>(w, token));
      // append to output buffer
      if (!wakati) fbuf.append(key.data(), key.size());
      offset += key.size();
      ++num;
      ++lexsize;
    }
    std::cout << num << std::endl;
  }
  if (wakati) fbuf.append("\0", 1);
  std::sort(dic.begin(), dic.end());
  // Group identical surfaces; each Darts value packs
  // (first token index << 8) | token count.
  size_t bsize = 0;
  size_t idx = 0;
  std::string prev;
  std::vector<const char *> str;
  std::vector<size_t> len;
  std::vector<Darts::DoubleArray::result_type> val;
  for (size_t i = 0; i < dic.size(); ++i) {
    if (i != 0 && prev != dic[i].first) {
      str.push_back(dic[idx].first.c_str());
      len.push_back(dic[idx].first.size());
      val.push_back(bsize + (idx << 8));
      bsize = 1;
      idx = i;
    } else {
      ++bsize;
    }
    prev = dic[i].first;
  }
  // Flush the final group.
  str.push_back(dic[idx].first.c_str());
  len.push_back(dic[idx].first.size());
  val.push_back(bsize + (idx << 8));
  CHECK_DIE(str.size() == len.size());
  CHECK_DIE(str.size() == val.size());
  Darts::DoubleArray da;
  CHECK_DIE(da.build(str.size(), const_cast<char **>(&str[0]),
                     &len[0], &val[0], &progress_bar_darts) == 0)
      << "unkown error in building double-array";
  // Serialize the tokens in sorted order and release them.
  std::string tbuf;
  for (size_t i = 0; i < dic.size(); ++i) {
    tbuf.append(reinterpret_cast<const char*>(dic[i].second),
                sizeof(Token));
    delete dic[i].second;
  }
  dic.clear();
  // needs to be 8byte(64bit) aligned
  while (tbuf.size() % 8 != 0) {
    Token dummy;
    memset(&dummy, 0, sizeof(Token));
    tbuf.append(reinterpret_cast<const char*>(&dummy), sizeof(Token));
  }
  unsigned int dummy = 0;
  unsigned int lsize = matrix.left_size();
  unsigned int rsize = matrix.right_size();
  unsigned int dsize = da.unit_size() * da.size();
  unsigned int tsize = tbuf.size();
  unsigned int fsize = fbuf.size();
  unsigned int version = DIC_VERSION;
  char charset[32];
  std::fill(charset, charset + sizeof(charset), '\0');
  std::strncpy(charset, to.c_str(), 31);
  std::ofstream bofs(output, std::ios::binary|std::ios::out);
  CHECK_DIE(bofs) << "permission denied: " << output;
  unsigned int magic = 0;  // placeholder; patched after writing the body
  // needs to be 64bit aligned
  // 10*32 = 64*5
  bofs.write(reinterpret_cast<const char *>(&magic),
             sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&version),
             sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&type),
             sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&lexsize),
             sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&lsize),
             sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&rsize),
             sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&dsize),
             sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&tsize),
             sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&fsize),
             sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&dummy),
             sizeof(unsigned int));
  // 32 * 8 = 64 * 4
  bofs.write(reinterpret_cast<const char *>(charset), sizeof(charset));
  bofs.write(reinterpret_cast<const char*>(da.array()),
             da.unit_size() * da.size());
  bofs.write(const_cast<const char *>(tbuf.data()), tbuf.size());
  bofs.write(const_cast<const char *>(fbuf.data()), fbuf.size());
  // save magic id
  magic = static_cast<unsigned int>(bofs.tellp());
  magic ^= DictionaryMagicID;
  bofs.seekp(0);
  bofs.write(reinterpret_cast<const char *>(&magic),
             sizeof(unsigned int));
  bofs.close();
  return true;
}