// Interactive lookup tool for a double-array index.
//
// Loads the index named by the last command-line argument and then, for each
// line read from stdin, prints every stored key that is a prefix of that line
// as "value:length" pairs, or "not found" when there is none.
//
// Returns 0 after stdin is exhausted, -1 on a usage error or when the index
// cannot be opened.
int main(int argc, char **argv) {
  if (argc < 2) {
    std::cerr << "Usage: " << argv[0] << " Index" << std::endl;
    return -1;
  }

  Darts::DoubleArray da;
  std::string index = argv[argc-1];
  if (da.open(index.c_str())) {  // a non-zero return is treated as failure
    std::cerr << "Error: cannot open " << index << std::endl;
    return -1;
  }

  // Capacity of the result buffer, in ELEMENTS. The search API takes an
  // element count, not a byte count, so sizeof(result_pair) must not be used.
  const size_t kMaxResults = 1024;
  Darts::DoubleArray::result_pair_type result_pair[kMaxResults];

  // Read whole lines into a std::string so that an over-long line does not
  // put the stream into a failed state and silently end the session.
  std::string key;
  while (std::getline(std::cin, key)) {
    const size_t num =
        da.commonPrefixSearch(key.c_str(), result_pair, kMaxResults);
    if (num == 0) {
      std::cout << key << ": not found" << std::endl;
    } else {
      std::cout << key << ": found, num=" << num << " ";
      // NOTE(review): the search appears to report the total number of
      // matches even when fewer were stored; never read past the buffer.
      const size_t stored = num < kMaxResults ? num : kMaxResults;
      for (size_t i = 0; i < stored; ++i) {
        std::cout << " " << result_pair[i].value
                  << ":" << result_pair[i].length;
      }
      std::cout << std::endl;
    }
  }

  return 0;
}
// Builds a double-array index from a list of keys (one key per line).
//
// Usage: <prog> File Index
//   File  - input key list, or "-" to read the keys from stdin
//   Index - path of the index file to write
//
// Returns 0 on success, -1 on a usage error, unreadable or empty input, or
// when building/saving the index fails.
int main(int argc, char **argv) {
  if (argc < 3) {
    std::cerr << "Usage: " << argv[0] << " File Index" << std::endl;
    return -1;
  }

  const std::string file  = argv[argc-2];
  const std::string index = argv[argc-1];

  // The file stream lives on the stack so it is released on every return
  // path; `is` merely selects between it and stdin.
  std::ifstream ifs;
  std::istream *is = &std::cin;
  if (file != "-") {
    ifs.open(file.c_str());
    is = &ifs;
  }

  if (!*is) {
    std::cerr << "Cannot Open: " << file << std::endl;
    return -1;
  }

  // Own the key text in std::string objects; no manual new[]/delete[].
  std::vector<std::string> lines;
  std::string line;
  while (std::getline(*is, line)) {
    lines.push_back(line);
  }
  if (ifs.is_open()) ifs.close();

  // Taking &key[0] of an empty vector is undefined, and an empty index is
  // of no use, so reject empty input explicitly.
  if (lines.empty()) {
    std::cerr << "Error: no keys found in " << file << std::endl;
    return -1;
  }

  // `lines` is not modified from here on, so the c_str() pointers stay valid.
  std::vector<const char *> key;
  key.reserve(lines.size());
  for (size_t i = 0; i < lines.size(); ++i) {
    key.push_back(lines[i].c_str());
  }

  Darts::DoubleArray da;
  if (da.build(key.size(), &key[0], 0, 0, &progress_bar) != 0
      || da.save(index.c_str()) != 0) {
    std::cerr << "Error: cannot build double array  " << file << std::endl;
    return -1;
  }

  std::cout << "Done!, Compression Ratio: "
            << 100.0 * da.nonzero_size() / da.size() << " %" << std::endl;

  return 0;
}
bool FeatureIndex::convert(const char* txtfile, const char *binfile) { std::ifstream ifs(txtfile); CHECK_DIE(ifs) << "no such file or directory: " << txtfile; char buf[BUF_SIZE]; char *column[4]; std::map<std::string, double> dic; while (ifs.getline(buf, sizeof(buf))) { CHECK_DIE(tokenize2(buf, "\t", column, 2) == 2) << "format error: " << buf; dic.insert(std::make_pair<std::string, double> (std::string(column[1]), atof(column[0]) )); } std::ofstream ofs(binfile, std::ios::out | std::ios::binary); CHECK_DIE(ofs) << "permission denied: " << binfile; std::vector<char *> key; unsigned int size = static_cast<unsigned int>(dic.size()); ofs.write(reinterpret_cast<const char*>(&size), sizeof(unsigned int)); for (std::map<std::string, double>::const_iterator it = dic.begin(); it != dic.end(); ++it) { key.push_back(const_cast<char*>(it->first.c_str())); ofs.write(reinterpret_cast<const char*>(&it->second), sizeof(double)); } Darts::DoubleArray da; CHECK_DIE(da.build(key.size(), &key[0], 0, 0, 0) == 0) << "unkown error in building double array: " << binfile; ofs.write(reinterpret_cast<const char*>(da.array()), da.unit_size() * da.size()); return true; }
bool EncoderFeatureIndex::save(const char *filename, bool textmodelfile) { std::vector<char *> key; std::vector<int> val; std::string y_str; for (size_t i = 0; i < y_.size(); ++i) { y_str += y_[i]; y_str += '\0'; } std::string templ_str; for (size_t i = 0; i < unigram_templs_.size(); ++i) { templ_str += unigram_templs_[i]; templ_str += '\0'; } for (size_t i = 0; i < bigram_templs_.size(); ++i) { templ_str += bigram_templs_[i]; templ_str += '\0'; } while ((y_str.size() + templ_str.size()) % 4 != 0) { templ_str += '\0'; } for (std::map<std::string, std::pair<int, unsigned int> >::iterator it = dic_.begin(); it != dic_.end(); ++it) { key.push_back(const_cast<char *>(it->first.c_str())); val.push_back(it->second.first); } Darts::DoubleArray da; CHECK_FALSE(da.build(key.size(), &key[0], 0, &val[0]) == 0) << "cannot build double-array"; std::ofstream bofs; bofs.open(WPATH(filename), OUTPUT_MODE); CHECK_FALSE(bofs) << "open failed: " << filename; unsigned int version_ = version; bofs.write(reinterpret_cast<char *>(&version_), sizeof(unsigned int)); int type = 0; bofs.write(reinterpret_cast<char *>(&type), sizeof(type)); bofs.write(reinterpret_cast<char *>(&cost_factor_), sizeof(cost_factor_)); bofs.write(reinterpret_cast<char *>(&maxid_), sizeof(maxid_)); if (max_xsize_ > 0) { xsize_ = std::min(xsize_, max_xsize_); } bofs.write(reinterpret_cast<char *>(&xsize_), sizeof(xsize_)); unsigned int dsize = da.unit_size() * da.size(); bofs.write(reinterpret_cast<char *>(&dsize), sizeof(dsize)); unsigned int size = y_str.size(); bofs.write(reinterpret_cast<char *>(&size), sizeof(size)); bofs.write(const_cast<char *>(y_str.data()), y_str.size()); size = templ_str.size(); bofs.write(reinterpret_cast<char *>(&size), sizeof(size)); bofs.write(const_cast<char *>(templ_str.data()), templ_str.size()); bofs.write(reinterpret_cast<const char *>(da.array()), dsize); for (size_t i = 0; i < maxid_; ++i) { float alpha = static_cast<float>(alpha_[i]); bofs.write(reinterpret_cast<char 
*>(&alpha), sizeof(alpha)); } bofs.close(); if (textmodelfile) { std::string filename2 = filename; filename2 += ".txt"; std::ofstream tofs(WPATH(filename2.c_str())); CHECK_FALSE(tofs) << " no such file or directory: " << filename2; // header tofs << "version: " << version_ << std::endl; tofs << "cost-factor: " << cost_factor_ << std::endl; tofs << "maxid: " << maxid_ << std::endl; tofs << "xsize: " << xsize_ << std::endl; tofs << std::endl; // y for (size_t i = 0; i < y_.size(); ++i) { tofs << y_[i] << std::endl; } tofs << std::endl; // template for (size_t i = 0; i < unigram_templs_.size(); ++i) { tofs << unigram_templs_[i] << std::endl; } for (size_t i = 0; i < bigram_templs_.size(); ++i) { tofs << bigram_templs_[i] << std::endl; } tofs << std::endl; // dic for (std::map<std::string, std::pair<int, unsigned int> >::iterator it = dic_.begin(); it != dic_.end(); ++it) { tofs << it->second.first << " " << it->first << std::endl; } tofs << std::endl; tofs.setf(std::ios::fixed, std::ios::floatfield); tofs.precision(16); for (size_t i = 0; i < maxid_; ++i) { tofs << alpha_[i] << std::endl; } } return true; }
int main (int argc, char **argv) { std::string file = ""; std::string index = ""; std::string ofile = ""; extern char *optarg; int opt; while ((opt = getopt(argc, argv, "i:o:O:")) != -1) { switch(opt) { case 'i': file = std::string (optarg); break; case 'o': index = std::string (optarg); break; case 'O': ofile = std::string (optarg); break; default: std::cout << "Usage: " << argv[0] << OPT << std::endl; return -1; } } if (file.empty () || index.empty ()) { std::cout << "Usage: " << argv[0] << OPT << std::endl; return -1; } std::istream *is; if (file == "-") is = &std::cin; else is = new std::ifstream (file.c_str()); if (! *is) { std::cerr << "Cannot Open: " << file << std::endl; return -1; } std::vector <Darts::DoubleArray::key_type *> ary; std::vector <std::pair<const char *, double> > ary2; std::vector <double> alpha; std::map<std::string, double> rules; char buf[8192]; char *column[2]; double bias = 0.0; double alpha_sum = 0.0; double l1_norm = 0.0; double l2_norm = 0.0; while (is->getline (buf, 8192)) { if (buf[strlen(buf) - 1] == '\r') { buf[strlen(buf) - 1] = '\0'; } //cout << "\nline:" << no_cr_line; //cout.flush(); if (2 != tokenize (buf, "\t ", column, 2)) { std::cerr << "FATAL: Format Error: " << buf << std::endl; return -1; } // Ignore rules containing only 1 character. 
//if (strlen(column[1]) <= 1) continue; double a = atof (column[0]); bias -= a; alpha_sum += std::abs (a); rules[column[1]] += 2 * a; } bias /= alpha_sum; //bias = 0; l1_norm = alpha_sum; for (std::map<std::string, double>::iterator it = rules.begin(); it != rules.end(); ++it) { double a = it->second / alpha_sum; l2_norm += pow(it->second, 2); ary2.push_back (std::make_pair <const char*, double>(it->first.c_str(), a)); ary.push_back ((Darts::DoubleArray::key_type *)it->first.c_str()); alpha.push_back (a); } l2_norm = pow(l2_norm, 0.5); std::cout << "Total: " << alpha.size() << " rule(s)" << std::endl; std::cout << "l1_norm: " << l1_norm << ", l2_norm: " << l2_norm << std::endl; if (ary.empty()) { std::cerr << "FATAL: no feature is added" << std::endl; return -1; } if (file != "-") delete is; Darts::DoubleArray da; if (da.build (ary.size(), &ary[0], 0, 0, 0) != 0) { std::cerr << "Error: cannot build double array " << file << std::endl; return -1; } std::ofstream ofs (index.c_str(), std::ios::binary|std::ios::out); if (!ofs) { std::cerr << "Error: cannot open " << index << std::endl; return -1; } unsigned int s = da.size() * da.unit_size(); ofs.write ((char *)&s, sizeof (unsigned)); ofs.write ((char *)da.array (), s); ofs.write ((char *)&bias, sizeof (double)); ofs.write ((char *)&alpha[0], sizeof (double) * alpha.size()); ofs.close (); if (! ary2.empty() && ! ofile.empty()) { std::ofstream ofs2 (ofile.c_str()); if (! ofs2) { std::cerr << "Cannot Open: " << ofile << std::endl; return -1; } ofs2.precision (24); ofs2 << bias << std::endl; std::sort (ary2.begin(), ary2.end(), pair_2nd_cmp <const char*, double>()); for (unsigned int i = 0; i < ary2.size (); ++i) ofs2 << ary2[i].second << " " << ary2[i].first << std::endl; } return 0; }
bool Dictionary::compile(const Param ¶m, const std::vector<std::string> &dics, const char *matrix_file, const char *matrix_bin_file, const char *left_id_file, const char *right_id_file, const char *rewrite_file, const char *pos_id_file, const char *output) { Connector matrix; scoped_ptr<DictionaryRewriter> rewrite(0); scoped_ptr<POSIDGenerator> posid(0); scoped_ptr<ContextID> cid(0); scoped_ptr<Writer> writer(0); scoped_ptr<StringBuffer> os(0); Node node; std::vector<std::pair<std::string, Token*> > dic; size_t offset = 0; unsigned int lexsize = 0; std::string w, feature, ufeature, lfeature, rfeature, fbuf, key; int lid, rid, cost; const std::string from = param.get<std::string>("dictionary-charset"); const std::string to = param.get<std::string>("charset"); const bool wakati = param.get<bool>("wakati"); const int type = param.get<int>("type"); const std::string node_format = param.get<std::string>("node-format"); // for backward compatibility std::string config_charset = param.get<std::string>("config-charset"); if (config_charset.empty()) config_charset = from; CHECK_DIE(!from.empty()) << "input dictionary charset is empty"; CHECK_DIE(!to.empty()) << "output dictionary charset is empty"; Iconv iconv; CHECK_DIE(iconv.open(from.c_str(), to.c_str())) << "iconv_open() failed with from=" << from << " to=" << to; Iconv config_iconv; CHECK_DIE(config_iconv.open(config_charset.c_str(), from.c_str())) << "iconv_open() failed with from=" << config_charset << " to=" << from; if (!node_format.empty()) { writer.reset(new Writer); os.reset(new StringBuffer); memset(&node, 0, sizeof(node)); } if (!matrix.openText(matrix_file) && !matrix.open(matrix_bin_file)) { matrix.set_left_size(1); matrix.set_right_size(1); } posid.reset(new POSIDGenerator); posid->open(pos_id_file, &config_iconv); std::istringstream iss(UNK_DEF_DEFAULT); for (size_t i = 0; i < dics.size(); ++i) { std::ifstream ifs(dics[i].c_str()); std::istream *is = &ifs; if (!ifs) { if (type == MECAB_UNK_DIC) { std::cerr 
<< dics[i] << " is not found. minimum setting is used." << std::endl; is = &iss; } else { CHECK_DIE(ifs) << "no such file or directory: " << dics[i]; } } std::cout << "reading " << dics[i] << " ... "; char line[BUF_SIZE]; size_t num = 0; while (is->getline(line, sizeof(line))) { char *col[8]; const size_t n = tokenizeCSV(line, col, 5); CHECK_DIE(n == 5) << "format error: " << line; w = col[0]; lid = std::atoi(col[1]); rid = std::atoi(col[2]); cost = std::atoi(col[3]); feature = col[4]; int pid = posid->id(feature.c_str()); if (lid < 0 || rid < 0) { if (!rewrite.get()) { rewrite.reset(new DictionaryRewriter); rewrite->open(rewrite_file, &config_iconv); } CHECK_DIE(rewrite->rewrite(feature, &ufeature, &lfeature, &rfeature)) << "rewrite failed: " << feature; if (!cid.get()) { cid.reset(new ContextID); cid->open(left_id_file, right_id_file, &config_iconv); CHECK_DIE(cid->left_size() == matrix.left_size() && cid->right_size() == matrix.right_size()) << "Context ID files(" << left_id_file << " or " << right_id_file << " may be broken"; } lid = cid->lid(lfeature.c_str()); rid = cid->rid(rfeature.c_str()); } CHECK_DIE(lid >= 0 && rid >= 0 && matrix.is_valid(lid, rid)) << "invalid ids are found lid=" << lid << " rid=" << rid; if (w.empty()) { std::cerr << "empty word is found, discard this line" << std::endl; continue; } if (!iconv.convert(&feature)) { std::cerr << "iconv conversion failed. skip this entry" << std::endl; continue; } if (type != MECAB_UNK_DIC && !iconv.convert(&w)) { std::cerr << "iconv conversion failed. 
skip this entry" << std::endl; continue; } if (!node_format.empty()) { node.surface = w.c_str(); node.feature = feature.c_str(); node.length = w.size(); node.rlength = w.size(); node.posid = pid; node.stat = MECAB_NOR_NODE; CHECK_DIE(os.get()); CHECK_DIE(writer.get()); os->clear(); CHECK_DIE(writer->writeNode(&*os, node_format.c_str(), w.c_str(), &node)) << "conversion error: " << feature << " with " << node_format; *os << '\0'; feature = os->str(); } key.clear(); if (!wakati) key = feature + '\0'; Token* token = new Token; token->lcAttr = lid; token->rcAttr = rid; token->posid = pid; token->wcost = cost; token->feature = offset; token->compound = 0; dic.push_back(std::make_pair<std::string, Token*>(w, token)); // append to output buffer if (!wakati) fbuf.append(key.data(), key.size()); offset += key.size(); ++num; ++lexsize; } std::cout << num << std::endl; } if (wakati) fbuf.append("\0", 1); std::sort(dic.begin(), dic.end()); size_t bsize = 0; size_t idx = 0; std::string prev; std::vector<const char *> str; std::vector<size_t> len; std::vector<Darts::DoubleArray::result_type> val; for (size_t i = 0; i < dic.size(); ++i) { if (i != 0 && prev != dic[i].first) { str.push_back(dic[idx].first.c_str()); len.push_back(dic[idx].first.size()); val.push_back(bsize +(idx << 8)); bsize = 1; idx = i; } else { ++bsize; } prev = dic[i].first; } str.push_back(dic[idx].first.c_str()); len.push_back(dic[idx].first.size()); val.push_back(bsize +(idx << 8)); CHECK_DIE(str.size() == len.size()); CHECK_DIE(str.size() == val.size()); Darts::DoubleArray da; CHECK_DIE(da.build(str.size(), const_cast<char **>(&str[0]), &len[0], &val[0], &progress_bar_darts) == 0) << "unkown error in building double-array"; std::string tbuf; for (size_t i = 0; i < dic.size(); ++i) { tbuf.append(reinterpret_cast<const char*>(dic[i].second), sizeof(Token)); delete dic[i].second; } dic.clear(); // needs to be 8byte(64bit) aligned while (tbuf.size() % 8 != 0) { Token dummy; memset(&dummy, 0, sizeof(Token)); 
tbuf.append(reinterpret_cast<const char*>(&dummy), sizeof(Token)); } unsigned int dummy = 0; unsigned int lsize = matrix.left_size(); unsigned int rsize = matrix.right_size(); unsigned int dsize = da.unit_size() * da.size(); unsigned int tsize = tbuf.size(); unsigned int fsize = fbuf.size(); unsigned int version = DIC_VERSION; char charset[32]; std::fill(charset, charset + sizeof(charset), '\0'); std::strncpy(charset, to.c_str(), 31); std::ofstream bofs(output, std::ios::binary|std::ios::out); CHECK_DIE(bofs) << "permission denied: " << output; unsigned int magic = 0; // needs to be 64bit aligned // 10*32 = 64*5 bofs.write(reinterpret_cast<const char *>(&magic), sizeof(unsigned int)); bofs.write(reinterpret_cast<const char *>(&version), sizeof(unsigned int)); bofs.write(reinterpret_cast<const char *>(&type), sizeof(unsigned int)); bofs.write(reinterpret_cast<const char *>(&lexsize), sizeof(unsigned int)); bofs.write(reinterpret_cast<const char *>(&lsize), sizeof(unsigned int)); bofs.write(reinterpret_cast<const char *>(&rsize), sizeof(unsigned int)); bofs.write(reinterpret_cast<const char *>(&dsize), sizeof(unsigned int)); bofs.write(reinterpret_cast<const char *>(&tsize), sizeof(unsigned int)); bofs.write(reinterpret_cast<const char *>(&fsize), sizeof(unsigned int)); bofs.write(reinterpret_cast<const char *>(&dummy), sizeof(unsigned int)); // 32 * 8 = 64 * 4 bofs.write(reinterpret_cast<const char *>(charset), sizeof(charset)); bofs.write(reinterpret_cast<const char*>(da.array()), da.unit_size() * da.size()); bofs.write(const_cast<const char *>(tbuf.data()), tbuf.size()); bofs.write(const_cast<const char *>(fbuf.data()), fbuf.size()); // save magic id magic = static_cast<unsigned int>(bofs.tellp()); magic ^= DictionaryMagicID; bofs.seekp(0); bofs.write(reinterpret_cast<const char *>(&magic), sizeof(unsigned int)); bofs.close(); return true; }