int main(int argc, char** argv) { utils::mpi_world mpi_world(argc, argv); const int mpi_rank = MPI::COMM_WORLD.Get_rank(); const int mpi_size = MPI::COMM_WORLD.Get_size(); try { if (getoptions(argc, argv) != 0) return 1; if (! temporary_dir.empty()) ::setenv("TMPDIR_SPEC", temporary_dir.string().data(), 1); if (ngram_file.empty() || ! boost::filesystem::exists(ngram_file)) throw std::runtime_error("no ngram file?"); if (output_file.empty()) throw std::runtime_error("no output file?"); if (ngram_file == output_file) throw std::runtime_error("dump to the same directory?"); ngram_type ngram(debug); ngram.open_shard(ngram_file, mpi_rank); if (static_cast<int>(ngram.index.size()) != mpi_size) throw std::runtime_error("MPI universe size do not match with ngram shard size"); utils::resource start; ngram_quantize(ngram); utils::resource end; if (debug && mpi_rank == 0) std::cerr << "quantize language model" << " cpu time: " << end.cpu_time() - start.cpu_time() << " user time: " << end.user_time() - start.user_time() << std::endl; if (mpi_rank == 0) ngram.write_prepare(output_file); MPI::COMM_WORLD.Barrier(); ngram.write_shard(output_file, mpi_rank); } catch (std::exception& err) { std::cerr << "error: " << err.what() << std::endl; MPI::COMM_WORLD.Abort(1); return 1; } return 0; }
// Entry point of the n-gram statistics tool.
//
// Opens the model `ngram_file` and writes to `output_file`: the model order,
// the number of n-grams for every order from 1 up to the model order, and one
// dump() line for each of the index, pointer, vocab, logprob, backoff and
// logbound statistics.
//
// Returns 0 on success and 1 when option parsing reports a non-zero status or
// a std::exception is caught (its message is printed on stderr).
int main(int argc, char** argv)
{
  try {
    if (getoptions(argc, argv) != 0)
      return 1;

    // setenv() requires a NUL-terminated string: use c_str(), since data() is
    // only guaranteed to be terminated from C++11 onwards.
    if (! temporary_dir.empty())
      ::setenv("TMPDIR_SPEC", temporary_dir.string().c_str(), 1);

    expgram::NGram ngram(ngram_file, shards, debug);

    utils::compress_ostream os(output_file);

    os << "ngram order: " << ngram.index.order() << '\n';
    for (int order = 1; order <= ngram.index.order(); ++ order)
      os << order << "-gram: " << std::setw(16) << ngram.index.ngram_size(order) << '\n';

    dump(os, "index",    ngram.stat_index()) << '\n';
    dump(os, "pointer",  ngram.stat_pointer()) << '\n';
    dump(os, "vocab",    ngram.stat_vocab()) << '\n';
    dump(os, "logprob",  ngram.stat_logprob()) << '\n';
    dump(os, "backoff",  ngram.stat_backoff()) << '\n';
    dump(os, "logbound", ngram.stat_logbound()) << '\n';
  }
  catch (const std::exception& err) {
    std::cerr << "error: " << err.what() << std::endl;
    return 1;
  }
  return 0;
}
void NGram::open(const path_type& path) { typedef utils::repository repository_type; clear(); if (path.empty()) throw std::runtime_error("no ngram?"); else if (! boost::filesystem::exists(path)) throw std::runtime_error("no ngram? " + path.string()); repository_type rep(path, repository_type::read); index.open(rep.path("index")); if (boost::filesystem::exists(rep.path("logprob"))) open_shards(rep.path("logprob"), logprobs); if (boost::filesystem::exists(rep.path("backoff"))) open_shards(rep.path("backoff"), backoffs); if (boost::filesystem::exists(rep.path("logbound"))) open_shards(rep.path("logbound"), logbounds); repository_type::const_iterator siter = rep.find("smooth"); if (siter == rep.end()) throw std::runtime_error("no smoothing parameter...?"); smooth = utils::lexical_cast<logprob_type>(siter->second); if (debug) std::cerr << "ngram: " << path << " # of shards: " << index.size() << " smooth: " << smooth << std::endl; }
int main(int argc, char ** argv) { try { options(argc, argv); cicada::optimize::LineSearch::value_min = value_lower; cicada::optimize::LineSearch::value_max = value_upper; if (scorer_list) { std::cout << cicada::eval::Scorer::lists(); return 0; } if (regularize_l1 && regularize_l2) throw std::runtime_error("you cannot use both of L1 and L2..."); if (regularize_l1 || regularize_l2) { if (C <= 0.0) throw std::runtime_error("the scaling for L1/L2 must be positive"); } if (weight_normalize_l1 && weight_normalize_l2) throw std::runtime_error("you cannot use both of L1 and L2 for weight normalization..."); threads = utils::bithack::max(threads, 1); // read reference set scorer_document_type scorers(scorer_name); read_refset(refset_files, scorers); const size_t scorers_size = scorers.size(); if (iterative && tstset_files.size() > 1) { scorer_document_type scorers_iterative(scorer_name); scorers_iterative.resize(scorers.size() * tstset_files.size()); for (size_t i = 0; i != tstset_files.size(); ++ i) std::copy(scorers.begin(), scorers.end(), scorers_iterative.begin() + scorers.size() * i); scorers.swap(scorers_iterative); } if (debug) std::cerr << "# of references: " << scorers.size() << std::endl; if (debug) std::cerr << "reading kbests" << std::endl; hypothesis_map_type kbests(scorers.size()); read_tstset(tstset_files, kbests, scorers_size); initialize_score(kbests, scorers); // collect initial weights weight_set_collection_type weights; if (! feature_weights_files.empty()) { for (path_set_type::const_iterator fiter = feature_weights_files.begin(); fiter != feature_weights_files.end(); ++ fiter) { if (*fiter != "-" && ! boost::filesystem::exists(*fiter)) throw std::runtime_error("no file? 
" + fiter->string()); utils::compress_istream is(*fiter); weights.push_back(weight_set_type()); is >> weights.back(); } if (initial_average && weights.size() > 1) { weight_set_type weight; weight_set_collection_type::const_iterator witer_end = weights.end(); for (weight_set_collection_type::const_iterator witer = weights.begin(); witer != witer_end; ++ witer) weight += *witer; weight *= (1.0 / weights.size()); weights.push_back(weight); } std::set<weight_set_type, std::less<weight_set_type>, std::allocator<weight_set_type> > uniques; uniques.insert(weights.begin(), weights.end()); weights.clear(); weights.insert(weights.end(), uniques.begin(), uniques.end()); } else { weights.push_back(weight_set_type()); // all one weight... for (feature_type::id_type id = 0; id < feature_type::allocated(); ++ id) if (! feature_type(id).empty()) weights.back()[feature_type(id)] = 1.0; } // collect lower/upper bounds weight_set_type bound_lower; weight_set_type bound_upper; if (! bound_lower_file.empty()) { if (bound_lower_file == "-" || boost::filesystem::exists(bound_lower_file)) { typedef cicada::FeatureVector<double> feature_vector_type; feature_vector_type bounds; utils::compress_istream is(bound_lower_file); is >> bounds; bound_lower.allocate(cicada::optimize::LineSearch::value_min); for (feature_vector_type::const_iterator biter = bounds.begin(); biter != bounds.end(); ++ biter) bound_lower[biter->first] = biter->second; } else throw std::runtime_error("no lower-bound file?" + bound_lower_file.string()); }
int main(int argc, char ** argv) { utils::mpi_world mpi_world(argc, argv); const int mpi_rank = MPI::COMM_WORLD.Get_rank(); const int mpi_size = MPI::COMM_WORLD.Get_size(); try { options(argc, argv); cicada::optimize::LineSearch::value_min = value_lower; cicada::optimize::LineSearch::value_max = value_upper; if (scorer_list) { std::cout << cicada::eval::Scorer::lists(); return 0; } if (int(yield_sentence) + yield_alignment + yield_span > 1) throw std::runtime_error("specify either sentence|alignment|span yield"); if (int(yield_sentence) + yield_alignment + yield_span == 0) yield_sentence = true; if (weights_file.empty() || ! boost::filesystem::exists(weights_file)) throw std::runtime_error("no weight file? " + weights_file.string()); if (direction_name.empty()) throw std::runtime_error("no direction?"); // read reference set scorer_document_type scorers(scorer_name); read_refset(refset_files, scorers); if (mpi_rank == 0 && debug) std::cerr << "# of references: " << scorers.size() << std::endl; // read test set if (mpi_rank == 0 && debug) std::cerr << "reading hypergraphs" << std::endl; hypergraph_set_type graphs(scorers.size()); read_tstset(tstset_files, graphs); weight_set_type weights; { utils::compress_istream is(weights_file, 1024 * 1024); is >> weights; } weight_set_type direction; direction[direction_name] = 1.0; segment_document_type segments(graphs.size()); compute_envelope(scorers, graphs, weights, direction, segments); if (mpi_rank == 0) { line_search_type line_search(debug); utils::compress_ostream os(output_file, 1024 * 1024); line_search(segments, value_lower, value_upper, OutputIterator(os, weights[direction_name])); } } catch (const std::exception& err) { std::cerr << "error: " << err.what() << std::endl; return 1; } return 0; }
int main(int argc, char** argv) { try { options(argc, argv); if (int(leaf_mode) + treebank_mode > 1) throw std::runtime_error("multiple output options specified: leaf/treebank(default: treebank)"); if (int(leaf_mode) + treebank_mode == 0) treebank_mode = true; typedef boost::spirit::istream_iterator iter_type; const bool flush_output = (output_file == "-" || (boost::filesystem::exists(output_file) && ! boost::filesystem::is_regular_file(output_file))); penntreebank_grammar<iter_type> grammar; treebank_type parsed; sentence_type sent; pos_set_type pos; std::string line; utils::compress_istream is(input_file, 1024 * 1024); utils::compress_ostream os(output_file, 1024 * 1024); std::auto_ptr<utils::compress_istream> ms; if (! pos_file.empty()) { if (! boost::filesystem::exists(pos_file)) throw std::runtime_error("no map file: " + pos_file.string()); ms.reset(new utils::compress_istream(pos_file, 1024 * 1024)); } is.unsetf(std::ios::skipws); iter_type iter(is); iter_type iter_end; while (iter != iter_end) { parsed.clear(); if (! boost::spirit::qi::phrase_parse(iter, iter_end, grammar, boost::spirit::standard::space, parsed)) { std::string buffer; for (int i = 0; i != 64 && iter != iter_end; ++ i, ++iter) buffer += *iter; throw std::runtime_error("parsing failed: " + buffer); } if (! root_symbol.empty()) parsed.cat_ = root_symbol; else if (parsed.cat_.empty()) parsed.cat_ = "ROOT"; if (validate) if (! treebank_validate(parsed)) throw std::runtime_error("invalid tree"); if (ms.get()) { typedef boost::tokenizer<utils::space_separator, utils::piece::const_iterator, utils::piece> tokenizer_type; if (! 
utils::getline(*ms, line)) throw std::runtime_error("# of lines do not match with POS"); utils::piece line_piece(line); tokenizer_type tokenizer(line_piece); pos.clear(); pos.insert(pos.end(), tokenizer.begin(), tokenizer.end()); pos_set_type::iterator iter = pos.begin(); transform_pos(parsed, iter, pos.end()); if (iter != pos.end()) throw std::runtime_error("too long POS sequence?"); } if (remove_none) transform_remove_none(parsed); if (normalize) transform_normalize(parsed); if (remove_cycle) transform_cycle(parsed); if (unescape_terminal) transform_unescape(parsed); if (leaf_mode) { sent.clear(); transform_leaf(parsed, sent); if (! sent.empty()) { std::copy(sent.begin(), sent.end() - 1, std::ostream_iterator<std::string>(os, " ")); os << sent.back(); } } else if (treebank_mode) { if (parsed.antecedents_.empty()) os << "(())"; else treebank_output(parsed, os); } os << '\n'; if (flush_output) os << std::flush; } } catch (const std::exception& err) { std::cerr << "error: " << err.what() << std::endl; return 1; } return 0; }
int main(int argc, char ** argv) { try { options(argc, argv); feature_list_type features_confidence; feature_list_type features_count; if (! confidence_feature_file.empty()) { if (confidence_feature_file != "-" && ! boost::filesystem::exists(confidence_feature_file)) throw std::runtime_error("no confidence feature file? " + confidence_feature_file.string()); utils::compress_istream is(confidence_feature_file); std::string feature; while (is >> feature) features_confidence.push_back(feature); } if (! count_feature_file.empty()) { if (count_feature_file != "-" && ! boost::filesystem::exists(count_feature_file)) throw std::runtime_error("no count feature file? " + count_feature_file.string()); utils::compress_istream is(count_feature_file); std::string feature; while (is >> feature) features_count.push_back(feature); } const bool flush_output = (output_file == "-" || (boost::filesystem::exists(output_file) && ! boost::filesystem::is_regular_file(output_file))); hypergraph_type merged; hypergraph_type hypergraph; cicada::Feature feature_confidence(confidence); cicada::Feature feature_count(count); if (input_files.empty()) input_files.push_back("-"); if (multiple_mode) { namespace qi = boost::spirit::qi; namespace standard = boost::spirit::standard; // forest ||| forest ||| forest utils::compress_ostream os(output_file, 1024 * 1024 * (! flush_output)); for (path_set_type::const_iterator iter = input_files.begin(); iter != input_files.end(); ++ iter) { utils::compress_istream is(input_files.front(), 1024 * 1024); std::string line; while (utils::getline(is, line)) { int rank = 1; int id = 0; merged.clear(); hypergraph.clear(); std::string::const_iterator iter = line.begin(); std::string::const_iterator end = line.end(); for (/**/; iter != end; ++ id, ++ rank) { if (id != 0) if (! qi::phrase_parse(iter, end, "|||", standard::space)) break; if (! hypergraph.assign(iter, end)) throw std::runtime_error("invalid hypergraph format"); if (! 
hypergraph.is_valid()) continue; const double conf = 1.0 / (1.0 + rank); feature_set_type features; if (! features_confidence.empty()) { if (id >= static_cast<int>(features_confidence.size())) throw std::runtime_error("# of confidence features do not match"); features[features_confidence[id]] = conf; } if (! features_count.empty()) { if (id >= static_cast<int>(features_count.size())) throw std::runtime_error("# of count features do not match"); features[features_count[id]] = count_weight; } if (! feature_confidence.empty()) features[feature_confidence] = conf; if (! feature_count.empty()) features[feature_count] = count_weight; if (! features.empty()) { hypergraph_type::edge_set_type::iterator eiter_end = hypergraph.edges.end(); for (hypergraph_type::edge_set_type::iterator eiter = hypergraph.edges.begin(); eiter != eiter_end; ++ eiter) eiter->features += features; } merged.unite(hypergraph); } os << merged << '\n'; } } } else if (input_files.size() == 1) { utils::compress_istream is(input_files.front(), 1024 * 1024); std::string line; int rank = 1; int id = 0; for (/**/; utils::getline(is, line); ++ id, ++ rank) { std::string::const_iterator iter = line.begin(); std::string::const_iterator end = line.end(); if (! hypergraph.assign(iter, end)) throw std::runtime_error("invalid hypergraph format"); if (! hypergraph.is_valid()) continue; const double conf = 1.0 / (1.0 + rank); feature_set_type features; if (! features_confidence.empty()) { if (id >= static_cast<int>(features_confidence.size())) throw std::runtime_error("# of confidence features do not match"); features[features_confidence[id]] = conf; } if (! features_count.empty()) { if (id >= static_cast<int>(features_count.size())) throw std::runtime_error("# of count features do not match"); features[features_count[id]] = count_weight; } if (! feature_confidence.empty()) features[feature_confidence] = conf; if (! feature_count.empty()) features[feature_count] = count_weight; if (! 
features.empty()) { hypergraph_type::edge_set_type::iterator eiter_end = hypergraph.edges.end(); for (hypergraph_type::edge_set_type::iterator eiter = hypergraph.edges.begin(); eiter != eiter_end; ++ eiter) eiter->features += features; } merged.unite(hypergraph); } utils::compress_ostream os(output_file, 1024 * 1024 * (! flush_output)); os << merged << '\n'; } else { // we will handle multiple files! if (! features_confidence.empty()) if (input_files.size() != features_confidence.size()) throw std::runtime_error("input file do not match with # of confidence feature"); if (! features_count.empty()) if (input_files.size() != features_count.size()) throw std::runtime_error("input file do not match with # of count feature"); typedef std::vector<std::istream*, std::allocator<std::istream*> > istream_set_type; istream_set_type istreams(input_files.size()); for (size_t i = 0; i != input_files.size(); ++ i) istreams[i] = new utils::compress_istream(input_files[i], 1024 * 1024); utils::compress_ostream os(output_file, 1024 * 1024 * (! flush_output)); std::string line; for (;;) { int rank = 1; merged.clear(); hypergraph.clear(); size_t num_failed = 0; for (size_t id = 0; id != istreams.size(); ++ id, ++ rank) { if (utils::getline(*istreams[id], line)) { std::string::const_iterator iter = line.begin(); std::string::const_iterator end = line.end(); if (! hypergraph.assign(iter, end)) throw std::runtime_error("invalid hypergraph format"); if (! hypergraph.is_valid()) continue; const double conf = 1.0 / (1.0 + rank); feature_set_type features; if (! features_confidence.empty()) features[features_confidence[id]] = conf; if (! features_count.empty()) features[features_count[id]] = count_weight; if (! feature_confidence.empty()) features[feature_confidence] = conf; if (! feature_count.empty()) features[feature_count] = count_weight; if (! 
features.empty()) { hypergraph_type::edge_set_type::iterator eiter_end = hypergraph.edges.end(); for (hypergraph_type::edge_set_type::iterator eiter = hypergraph.edges.begin(); eiter != eiter_end; ++ eiter) eiter->features += features; } merged.unite(hypergraph); } else ++ num_failed; } if (num_failed) { if (num_failed != istreams.size()) throw std::runtime_error("# of lines do not match"); break; } os << merged << '\n'; } for (size_t i = 0; i != istreams.size(); ++ i) delete istreams[i]; } } catch (const std::exception& err) { std::cerr << "error: " << err.what() << std::endl; return 1; } return 0; }
int main(int argc, char** argv) { try { options(argc, argv); if (grammar_list) { std::cout << grammar_type::lists(); return 0; } if (tree_grammar_list) { std::cout << tree_grammar_type::lists(); return 0; } threads = utils::bithack::max(1, threads); // read grammars... grammar_type grammar(grammar_files.begin(), grammar_files.end()); if (debug) std::cerr << "grammar: " << grammar.size() << std::endl; tree_grammar_type tree_grammar(tree_grammar_files.begin(), tree_grammar_files.end()); if (debug) std::cerr << "tree grammar: " << tree_grammar.size() << std::endl; typedef Task task_type; typedef std::vector<task_type, std::allocator<task_type> > task_set_type; task_type::queue_type queue(threads); task_set_type tasks(threads, task_type(queue, tree_grammar, grammar)); boost::thread_group workers; for (int i = 0; i != threads; ++ i) workers.add_thread(new boost::thread(boost::ref(tasks[i]))); utils::compress_istream is(input_file, 1024 * 1024); std::string line; while (utils::getline(is, line)) if (! line.empty()) queue.push_swap(line); for (int i = 0; i != threads; ++ i) queue.push(std::string()); workers.join_all(); tree_rule_pair_unique_type tree_rules_unique; rule_pair_unique_type rules_unique; for (int i = 0; i != threads; ++ i) { if (tree_rules_unique.empty()) tree_rules_unique.swap(tasks[i].tree_rules_unique); else tree_rules_unique.insert(tasks[i].tree_rules_unique.begin(), tasks[i].tree_rules_unique.end()); tasks[i].tree_rules_unique.clear(); if (rules_unique.empty()) rules_unique.swap(tasks[i].rules_unique); else rules_unique.insert(tasks[i].rules_unique.begin(), tasks[i].rules_unique.end()); tasks[i].rules_unique.clear(); } tasks.clear(); typedef std::ostream_iterator<char> oiter_type; features_generator<oiter_type> generate_features; attributes_generator<oiter_type> generate_attributes; if (! 
output_tree_file.empty()) { namespace karma = boost::spirit::karma; namespace standard = boost::spirit::standard; utils::compress_ostream os(output_tree_file, 1024 * 1024); tree_rule_pair_unique_type::const_iterator iter_end = tree_rules_unique.end(); for (tree_rule_pair_unique_type::const_iterator iter = tree_rules_unique.begin(); iter != iter_end; ++ iter) { os << iter->source.decode() << " ||| " << iter->target.decode(); if (! iter->features.empty()) { feature_set_type features(iter->features.begin(), iter->features.end()); karma::generate(oiter_type(os), generate_features, features); } if (! iter->attributes.empty()) karma::generate(oiter_type(os), generate_attributes, iter->attributes); os << '\n'; } } if (! output_rule_file.empty()) { namespace karma = boost::spirit::karma; namespace standard = boost::spirit::standard; utils::compress_ostream os(output_rule_file, 1024 * 1024); rule_pair_unique_type::const_iterator iter_end = rules_unique.end(); for (rule_pair_unique_type::const_iterator iter = rules_unique.begin(); iter != iter_end; ++ iter) { karma::generate(oiter_type(os), standard::string << " ||| " << -(standard::string % ' ') << " ||| " << -(standard::string % ' '), iter->lhs, symbol_set_type(iter->source.begin(), iter->source.end()), symbol_set_type(iter->target.begin(), iter->target.end())); if (! iter->features.empty()) { feature_set_type features(iter->features.begin(), iter->features.end()); karma::generate(oiter_type(os), generate_features, features); } if (! iter->attributes.empty()) karma::generate(oiter_type(os), generate_attributes, iter->attributes); os << '\n'; } } } catch (const std::exception& err) { std::cerr << "error: " << err.what() << std::endl; return 1; } return 0; }