void platform_abstraction::font_resource(bool try_add, const path_type& ttf) { #ifdef NANA_WINDOWS if (try_add) ::AddFontResourceEx(ttf.wstring().c_str(), FR_PRIVATE, nullptr); else ::RemoveFontResourceEx(ttf.wstring().c_str(), FR_PRIVATE, nullptr); #else auto & fc = platform_storage().fontconfig_counts; if(try_add) { if(1 == ++(fc[ttf.string()])) { ::FcConfigAppFontAddFile(nullptr, reinterpret_cast<const FcChar8*>(ttf.string().c_str())); } } else { auto i = fc.find(ttf.string()); if(i != fc.end()) { if(0 == --(i->second)) fc.erase(i); if(0 == fc.size()) ::FcConfigAppFontClear(nullptr); } } #endif }
bool slice_reader::open_file(const path_type & file) { log_info << "Opening \"" << color::cyan << file.string() << color::reset << '"'; ifs.close(); ifs.clear(); ifs.open(file, std::ios_base::in | std::ios_base::binary | std::ios_base::ate); if(ifs.fail()) { return false; } std::streampos file_size = ifs.tellg(); ifs.seekg(0); char magic[8]; if(ifs.read(magic, 8).fail()) { ifs.close(); throw slice_error("could not read slice magic number in \"" + file.string() + "\""); } bool found = false; for(size_t i = 0; boost::size(slice_ids); i++) { if(!std::memcmp(magic, slice_ids[i], 8)) { found = true; break; } } if(!found) { ifs.close(); throw slice_error("bad slice magic number in \"" + file.string() + "\""); } slice_size = util::load<boost::uint32_t>(ifs); if(ifs.fail()) { ifs.close(); throw slice_error("could not read slice size in \"" + file.string() + "\""); } else if(std::streampos(slice_size) > file_size) { ifs.close(); std::ostringstream oss; oss << "bad slice size in " << file << ": " << slice_size << " > " << file_size; throw slice_error(oss.str()); } else if(std::streampos(slice_size) < ifs.tellg()) { ifs.close(); std::ostringstream oss; oss << "bad slice size in " << file << ": " << slice_size << " < " << ifs.tellg(); throw slice_error(oss.str()); } slice_file = file; last_dir = file.parent_path(); return true; }
// Prints summary statistics of an n-gram language model.
//
// After option parsing the model named by ngram_file is loaded, and the model
// order, the number of n-grams per order and the storage statistics of each
// component are written to output_file.
//
// Returns 0 on success and 1 if option parsing fails or an exception is caught.
int main(int argc, char** argv)
{
  try {
    const bool options_ok = (getoptions(argc, argv) == 0);
    if (! options_ok)
      return 1;

    if (! temporary_dir.empty())
      ::setenv("TMPDIR_SPEC", temporary_dir.string().data(), 1);

    expgram::NGram ngram(ngram_file, shards, debug);

    utils::compress_ostream os(output_file);

    os << "ngram order: " << ngram.index.order() << '\n';

    // one line per n-gram order, starting with unigrams
    int n = 1;
    while (n <= ngram.index.order()) {
      os << n << "-gram: " << std::setw(16) << ngram.index.ngram_size(n) << '\n';
      ++ n;
    }

    // storage statistics, one component per line
    dump(os, "index",    ngram.stat_index())    << '\n';
    dump(os, "pointer",  ngram.stat_pointer())  << '\n';
    dump(os, "vocab",    ngram.stat_vocab())    << '\n';
    dump(os, "logprob",  ngram.stat_logprob())  << '\n';
    dump(os, "backoff",  ngram.stat_backoff())  << '\n';
    dump(os, "logbound", ngram.stat_logbound()) << '\n';
  }
  catch (std::exception& err) {
    std::cerr << "error: " << err.what() << std::endl;
    return 1;
  }

  return 0;
}
void NGram::open(const path_type& path) { typedef utils::repository repository_type; clear(); if (path.empty()) throw std::runtime_error("no ngram?"); else if (! boost::filesystem::exists(path)) throw std::runtime_error("no ngram? " + path.string()); repository_type rep(path, repository_type::read); index.open(rep.path("index")); if (boost::filesystem::exists(rep.path("logprob"))) open_shards(rep.path("logprob"), logprobs); if (boost::filesystem::exists(rep.path("backoff"))) open_shards(rep.path("backoff"), backoffs); if (boost::filesystem::exists(rep.path("logbound"))) open_shards(rep.path("logbound"), logbounds); repository_type::const_iterator siter = rep.find("smooth"); if (siter == rep.end()) throw std::runtime_error("no smoothing parameter...?"); smooth = utils::lexical_cast<logprob_type>(siter->second); if (debug) std::cerr << "ngram: " << path << " # of shards: " << index.size() << " smooth: " << smooth << std::endl; }
void options(int argc, char** argv) { namespace po = boost::program_options; po::options_description opts_config("configuration options"); opts_config.add_options() ("input", po::value<path_type>(&input_file)->default_value(input_file), "input file") ("output", po::value<path_type>(&output_file)->default_value(output_file), "output file") ("input-sentence", po::bool_switch(&input_sentence_mode), "sentence input") ("input-lattice", po::bool_switch(&input_lattice_mode), "lattice input") ("treebank", po::bool_switch(&treebank_mode), "assume treebank style grammar") ("pos", po::bool_switch(&pos_mode), "POS annotated input") // grammar ("grammar", po::value<grammar_file_set_type >(&grammar_files)->composing(), "grammar specification(s)") ("grammar-list", po::bool_switch(&grammar_list), "list of available grammar specifications") ; po::options_description opts_command("command line options"); opts_command.add_options() ("config", po::value<path_type>(), "configuration file") ("threads", po::value<int>(&threads), "# of threads (highly experimental)") ("debug", po::value<int>(&debug)->implicit_value(1), "debug level") ("help", "help message"); po::options_description desc_config; po::options_description desc_command; po::options_description desc_visible; desc_config.add(opts_config); desc_command.add(opts_config).add(opts_command); desc_visible.add(opts_config).add(opts_command); po::variables_map variables; po::store(po::parse_command_line(argc, argv, desc_command, po::command_line_style::unix_style & (~po::command_line_style::allow_guessing)), variables); if (variables.count("config")) { const path_type path_config = variables["config"].as<path_type>(); if (! 
boost::filesystem::exists(path_config)) throw std::runtime_error("no config file: " + path_config.string()); utils::compress_istream is(path_config); po::store(po::parse_config_file(is, desc_config), variables); } po::notify(variables); if (variables.count("help")) { std::cout << argv[0] << " [options]\n" << desc_visible << std::endl; exit(0); } }
// Creates a server node: the last component of `path` is split at the first
// host/port delimiter character, a server is established for that host and
// port, and the resulting descriptor is stored in detail::fdtable under the
// full path string.
//
// Always returns 0; failures of std::stoi surface as exceptions.
//
// NOTE(review): when the name contains no delimiter, find_first_of() yields
// npos and `npos + 1` wraps around to 0, so the whole name is used both as
// host and as the text parsed for the port - confirm whether that is intended.
int server_type::mknod(shared_ptr<fs_entry> file_ent, path_type const& path, mode_t, dev_t)
{
    auto const node_name    = path.filename().string();
    auto const delimiter_at = node_name.find_first_of(host_port_delimiter);

    auto const host = node_name.substr(0, delimiter_at);
    int const  port = std::stoi(node_name.substr(delimiter_at + 1));

    BOOST_LOG_TRIVIAL(info) << "server_type::mknod: establishing server host=" << host << " port=" << port;

    int const serverfd = establish_server(host, port);

    BOOST_LOG_TRIVIAL(info) << "server_type::mknod: established server " << host << ":" << port << " fd=" << serverfd;

    detail::fdtable.insert(path.string(), serverfd);

    return 0;
}
int main(int argc, char** argv) { utils::mpi_world mpi_world(argc, argv); const int mpi_rank = MPI::COMM_WORLD.Get_rank(); const int mpi_size = MPI::COMM_WORLD.Get_size(); try { if (getoptions(argc, argv) != 0) return 1; if (! temporary_dir.empty()) ::setenv("TMPDIR_SPEC", temporary_dir.string().data(), 1); if (ngram_file.empty() || ! boost::filesystem::exists(ngram_file)) throw std::runtime_error("no ngram file?"); if (output_file.empty()) throw std::runtime_error("no output file?"); if (ngram_file == output_file) throw std::runtime_error("dump to the same directory?"); ngram_type ngram(debug); ngram.open_shard(ngram_file, mpi_rank); if (static_cast<int>(ngram.index.size()) != mpi_size) throw std::runtime_error("MPI universe size do not match with ngram shard size"); utils::resource start; ngram_quantize(ngram); utils::resource end; if (debug && mpi_rank == 0) std::cerr << "quantize language model" << " cpu time: " << end.cpu_time() - start.cpu_time() << " user time: " << end.user_time() - start.user_time() << std::endl; if (mpi_rank == 0) ngram.write_prepare(output_file); MPI::COMM_WORLD.Barrier(); ngram.write_shard(output_file, mpi_rank); } catch (std::exception& err) { std::cerr << "error: " << err.what() << std::endl; MPI::COMM_WORLD.Abort(1); return 1; } return 0; }
// // server_op // int server_op::open(shared_ptr<fs_entry> file_ent, path_type const& path) { if (auto server_fd = detail::fdtable.find(path.string())) server_fd_ = *server_fd; else throw std::runtime_error("client_op: open: cannot get socket from fdtable"); sockaddr_in client; socklen_t len = sizeof (client); client_fd_ = accept(server_fd_, reinterpret_cast<sockaddr *>(&client), &len); if (client_fd_ < 0) throw_system_error(errno); return 0; }
// Entry point of a k-best based optimization tool. NOTE: only the first part
// of this function is visible here; the try block and the function itself are
// not closed within this excerpt.
//
// Visible steps, in order:
//  - parse options and copy value_lower/value_upper into the LineSearch bounds;
//  - with scorer_list set, print the available scorers and return 0;
//  - reject L1 together with L2 regularization, require C > 0 when either is
//    set, and reject L1 together with L2 weight normalization;
//  - force threads to be at least 1;
//  - read the reference set into `scorers`; in iterative mode with more than
//    one test set file the scorers are replicated once per test set file;
//  - read the k-best lists and initialize their scores;
//  - collect initial weights: either one weight set per file in
//    feature_weights_files (optionally followed by their average, then
//    de-duplicated through a std::set), or a single weight set assigning 1.0
//    to every allocated, non-empty feature;
//  - read per-feature lower bounds from bound_lower_file, on top of a default
//    of LineSearch::value_min; a missing file raises std::runtime_error.
//
// The file name "-" is exempt from the existence checks (presumably it denotes
// standard input - confirm against utils::compress_istream).
int main(int argc, char ** argv) { try { options(argc, argv); cicada::optimize::LineSearch::value_min = value_lower; cicada::optimize::LineSearch::value_max = value_upper; if (scorer_list) { std::cout << cicada::eval::Scorer::lists(); return 0; } if (regularize_l1 && regularize_l2) throw std::runtime_error("you cannot use both of L1 and L2..."); if (regularize_l1 || regularize_l2) { if (C <= 0.0) throw std::runtime_error("the scaling for L1/L2 must be positive"); } if (weight_normalize_l1 && weight_normalize_l2) throw std::runtime_error("you cannot use both of L1 and L2 for weight normalization..."); threads = utils::bithack::max(threads, 1); // read reference set scorer_document_type scorers(scorer_name); read_refset(refset_files, scorers); const size_t scorers_size = scorers.size(); if (iterative && tstset_files.size() > 1) { scorer_document_type scorers_iterative(scorer_name); scorers_iterative.resize(scorers.size() * tstset_files.size()); for (size_t i = 0; i != tstset_files.size(); ++ i) std::copy(scorers.begin(), scorers.end(), scorers_iterative.begin() + scorers.size() * i); scorers.swap(scorers_iterative); } if (debug) std::cerr << "# of references: " << scorers.size() << std::endl; if (debug) std::cerr << "reading kbests" << std::endl; hypothesis_map_type kbests(scorers.size()); read_tstset(tstset_files, kbests, scorers_size); initialize_score(kbests, scorers); // collect initial weights weight_set_collection_type weights; if (! feature_weights_files.empty()) { for (path_set_type::const_iterator fiter = feature_weights_files.begin(); fiter != feature_weights_files.end(); ++ fiter) { if (*fiter != "-" && ! boost::filesystem::exists(*fiter)) throw std::runtime_error("no file? 
" + fiter->string()); utils::compress_istream is(*fiter); weights.push_back(weight_set_type()); is >> weights.back(); } if (initial_average && weights.size() > 1) { weight_set_type weight; weight_set_collection_type::const_iterator witer_end = weights.end(); for (weight_set_collection_type::const_iterator witer = weights.begin(); witer != witer_end; ++ witer) weight += *witer; weight *= (1.0 / weights.size()); weights.push_back(weight); } std::set<weight_set_type, std::less<weight_set_type>, std::allocator<weight_set_type> > uniques; uniques.insert(weights.begin(), weights.end()); weights.clear(); weights.insert(weights.end(), uniques.begin(), uniques.end()); } else { weights.push_back(weight_set_type()); // all one weight... for (feature_type::id_type id = 0; id < feature_type::allocated(); ++ id) if (! feature_type(id).empty()) weights.back()[feature_type(id)] = 1.0; } // collect lower/upper bounds weight_set_type bound_lower; weight_set_type bound_upper; if (! bound_lower_file.empty()) { if (bound_lower_file == "-" || boost::filesystem::exists(bound_lower_file)) { typedef cicada::FeatureVector<double> feature_vector_type; feature_vector_type bounds; utils::compress_istream is(bound_lower_file); is >> bounds; bound_lower.allocate(cicada::optimize::LineSearch::value_min); for (feature_vector_type::const_iterator biter = bounds.begin(); biter != bounds.end(); ++ biter) bound_lower[biter->first] = biter->second; } else throw std::runtime_error("no lower-bound file?" + bound_lower_file.string()); }
void read_tstset(const path_set_type& files, hypergraph_set_type& graphs) { const int mpi_rank = MPI::COMM_WORLD.Get_rank(); const int mpi_size = MPI::COMM_WORLD.Get_size(); std::string line; path_set_type::const_iterator titer_end = tstset_files.end(); for (path_set_type::const_iterator titer = tstset_files.begin(); titer != titer_end; ++ titer) { if (debug) std::cerr << "file: " << *titer << std::endl; if (boost::filesystem::is_directory(*titer)) { size_t id; hypergraph_type hypergraph; for (size_t i = mpi_rank; /**/; i += mpi_size) { const path_type path = (*titer) / (utils::lexical_cast<std::string>(i) + ".gz"); if (! boost::filesystem::exists(path)) break; utils::compress_istream is(path, 1024 * 1024); if (! utils::getline(is, line)) throw std::runtime_error("no line in file-no: " + utils::lexical_cast<std::string>(i)); std::string::const_iterator iter = line.begin(); std::string::const_iterator end = line.end(); if (! parse_id(id, iter, end)) throw std::runtime_error("invalid id input: " + path.string()); if (id != i) throw std::runtime_error("id mismatch: " + path.string()); if (static_cast<int>(id % mpi_size) != mpi_rank) throw std::runtime_error("difference it?"); if (! hypergraph.assign(iter, end)) throw std::runtime_error("invalid graph format" + path.string()); if (iter != end) throw std::runtime_error("invalid id ||| graph format" + path.string()); if (graphs[id].is_valid()) graphs[id].unite(hypergraph); else graphs[id].swap(hypergraph); } } else { const path_type& path = *titer; utils::compress_istream is(path, 1024 * 1024); size_t id; hypergraph_type hypergraph; while (utils::getline(is, line)) { std::string::const_iterator iter = line.begin(); std::string::const_iterator end = line.end(); if (! parse_id(id, iter, end)) throw std::runtime_error("invalid id input: " + path.string()); if (id >= graphs.size()) throw std::runtime_error("tstset size exceeds refset size?" 
+ utils::lexical_cast<std::string>(id) + ": " + titer->string()); if (static_cast<int>(id % mpi_size) != mpi_rank) continue; if (! hypergraph.assign(iter, end)) throw std::runtime_error("invalid graph format" + path.string()); if (iter != end) throw std::runtime_error("invalid id ||| graph format" + path.string()); if (graphs[id].is_valid()) graphs[id].unite(hypergraph); else graphs[id].swap(hypergraph); } } } }
int main(int argc, char ** argv) { utils::mpi_world mpi_world(argc, argv); const int mpi_rank = MPI::COMM_WORLD.Get_rank(); const int mpi_size = MPI::COMM_WORLD.Get_size(); try { options(argc, argv); cicada::optimize::LineSearch::value_min = value_lower; cicada::optimize::LineSearch::value_max = value_upper; if (scorer_list) { std::cout << cicada::eval::Scorer::lists(); return 0; } if (int(yield_sentence) + yield_alignment + yield_span > 1) throw std::runtime_error("specify either sentence|alignment|span yield"); if (int(yield_sentence) + yield_alignment + yield_span == 0) yield_sentence = true; if (weights_file.empty() || ! boost::filesystem::exists(weights_file)) throw std::runtime_error("no weight file? " + weights_file.string()); if (direction_name.empty()) throw std::runtime_error("no direction?"); // read reference set scorer_document_type scorers(scorer_name); read_refset(refset_files, scorers); if (mpi_rank == 0 && debug) std::cerr << "# of references: " << scorers.size() << std::endl; // read test set if (mpi_rank == 0 && debug) std::cerr << "reading hypergraphs" << std::endl; hypergraph_set_type graphs(scorers.size()); read_tstset(tstset_files, graphs); weight_set_type weights; { utils::compress_istream is(weights_file, 1024 * 1024); is >> weights; } weight_set_type direction; direction[direction_name] = 1.0; segment_document_type segments(graphs.size()); compute_envelope(scorers, graphs, weights, direction, segments); if (mpi_rank == 0) { line_search_type line_search(debug); utils::compress_ostream os(output_file, 1024 * 1024); line_search(segments, value_lower, value_upper, OutputIterator(os, weights[direction_name])); } } catch (const std::exception& err) { std::cerr << "error: " << err.what() << std::endl; return 1; } return 0; }
// Removes the entry registered for `path` from detail::fdtable.
// Always returns 0.
int server_type::rmnod(shared_ptr<fs_entry> file_ent, path_type const& path)
{
    auto const table_key = path.string();

    detail::fdtable.erase(table_key);

    return 0;
}
int main(int argc, char** argv) { try { options(argc, argv); if (int(leaf_mode) + treebank_mode > 1) throw std::runtime_error("multiple output options specified: leaf/treebank(default: treebank)"); if (int(leaf_mode) + treebank_mode == 0) treebank_mode = true; typedef boost::spirit::istream_iterator iter_type; const bool flush_output = (output_file == "-" || (boost::filesystem::exists(output_file) && ! boost::filesystem::is_regular_file(output_file))); penntreebank_grammar<iter_type> grammar; treebank_type parsed; sentence_type sent; pos_set_type pos; std::string line; utils::compress_istream is(input_file, 1024 * 1024); utils::compress_ostream os(output_file, 1024 * 1024); std::auto_ptr<utils::compress_istream> ms; if (! pos_file.empty()) { if (! boost::filesystem::exists(pos_file)) throw std::runtime_error("no map file: " + pos_file.string()); ms.reset(new utils::compress_istream(pos_file, 1024 * 1024)); } is.unsetf(std::ios::skipws); iter_type iter(is); iter_type iter_end; while (iter != iter_end) { parsed.clear(); if (! boost::spirit::qi::phrase_parse(iter, iter_end, grammar, boost::spirit::standard::space, parsed)) { std::string buffer; for (int i = 0; i != 64 && iter != iter_end; ++ i, ++iter) buffer += *iter; throw std::runtime_error("parsing failed: " + buffer); } if (! root_symbol.empty()) parsed.cat_ = root_symbol; else if (parsed.cat_.empty()) parsed.cat_ = "ROOT"; if (validate) if (! treebank_validate(parsed)) throw std::runtime_error("invalid tree"); if (ms.get()) { typedef boost::tokenizer<utils::space_separator, utils::piece::const_iterator, utils::piece> tokenizer_type; if (! 
utils::getline(*ms, line)) throw std::runtime_error("# of lines do not match with POS"); utils::piece line_piece(line); tokenizer_type tokenizer(line_piece); pos.clear(); pos.insert(pos.end(), tokenizer.begin(), tokenizer.end()); pos_set_type::iterator iter = pos.begin(); transform_pos(parsed, iter, pos.end()); if (iter != pos.end()) throw std::runtime_error("too long POS sequence?"); } if (remove_none) transform_remove_none(parsed); if (normalize) transform_normalize(parsed); if (remove_cycle) transform_cycle(parsed); if (unescape_terminal) transform_unescape(parsed); if (leaf_mode) { sent.clear(); transform_leaf(parsed, sent); if (! sent.empty()) { std::copy(sent.begin(), sent.end() - 1, std::ostream_iterator<std::string>(os, " ")); os << sent.back(); } } else if (treebank_mode) { if (parsed.antecedents_.empty()) os << "(())"; else treebank_output(parsed, os); } os << '\n'; if (flush_output) os << std::flush; } } catch (const std::exception& err) { std::cerr << "error: " << err.what() << std::endl; return 1; } return 0; }
int main(int argc, char** argv) { try { options(argc, argv); utils::compress_istream is(input_file, 1024 * 1024); utils::compress_ostream os(output_file); if (map_file != "-" && ! boost::filesystem::exists(map_file)) throw std::runtime_error("no map file: " + map_file.string()); utils::compress_istream ms(map_file); hypergraph_type hypergraph; sentence_type sentence; span_set_type spans; phrase_type rhs; while (1) { is >> hypergraph; ms >> sentence; if (! is || ! ms) break; if (! hypergraph.is_valid()) { os << hypergraph << '\n'; continue; } // map terminals... // we will first compute spans, then, perform terminal mapping.. spans.clear(); spans.resize(hypergraph.edges.size()); cicada::span_edge(hypergraph, spans); for (size_t edge_id = 0; edge_id != hypergraph.edges.size(); ++ edge_id) { hypergraph_type::edge_type& edge = hypergraph.edges[edge_id]; const span_type& span = spans[edge_id]; const hypergraph_type::symbol_type lhs = edge.rule->lhs; rhs.clear(); int pos = 0; int span_pos = span.first; hypergraph_type::rule_type::symbol_set_type::const_iterator riter_end = edge.rule->rhs.end(); for (hypergraph_type::rule_type::symbol_set_type::const_iterator riter = edge.rule->rhs.begin(); riter != riter_end; ++ riter) { if (riter->is_non_terminal()) { const int __non_terminal_index = riter->non_terminal_index(); const int non_terminal_pos = utils::bithack::branch(__non_terminal_index <= 0, pos, __non_terminal_index - 1); ++ pos; // compute span_pos from antecedent node... 
rhs.push_back(*riter); span_pos = spans[hypergraph.nodes[edge.tails[non_terminal_pos]].edges.front()].second; } else if (*riter != vocab_type::EPSILON) { rhs.push_back(sentence[span_pos]); ++ span_pos; } } edge.rule = hypergraph_type::rule_type::create(hypergraph_type::rule_type(lhs, rhs.begin(), rhs.end())); } os << hypergraph << '\n'; } if (is || ms) throw std::runtime_error("# of hypergraphs and # of sentences do not match"); } catch (std::exception& err) { std::cerr << "error: " << err.what() << std::endl; return 1; } return 0; }
int main(int argc, char ** argv) { try { options(argc, argv); feature_list_type features_confidence; feature_list_type features_count; if (! confidence_feature_file.empty()) { if (confidence_feature_file != "-" && ! boost::filesystem::exists(confidence_feature_file)) throw std::runtime_error("no confidence feature file? " + confidence_feature_file.string()); utils::compress_istream is(confidence_feature_file); std::string feature; while (is >> feature) features_confidence.push_back(feature); } if (! count_feature_file.empty()) { if (count_feature_file != "-" && ! boost::filesystem::exists(count_feature_file)) throw std::runtime_error("no count feature file? " + count_feature_file.string()); utils::compress_istream is(count_feature_file); std::string feature; while (is >> feature) features_count.push_back(feature); } const bool flush_output = (output_file == "-" || (boost::filesystem::exists(output_file) && ! boost::filesystem::is_regular_file(output_file))); hypergraph_type merged; hypergraph_type hypergraph; cicada::Feature feature_confidence(confidence); cicada::Feature feature_count(count); if (input_files.empty()) input_files.push_back("-"); if (multiple_mode) { namespace qi = boost::spirit::qi; namespace standard = boost::spirit::standard; // forest ||| forest ||| forest utils::compress_ostream os(output_file, 1024 * 1024 * (! flush_output)); for (path_set_type::const_iterator iter = input_files.begin(); iter != input_files.end(); ++ iter) { utils::compress_istream is(input_files.front(), 1024 * 1024); std::string line; while (utils::getline(is, line)) { int rank = 1; int id = 0; merged.clear(); hypergraph.clear(); std::string::const_iterator iter = line.begin(); std::string::const_iterator end = line.end(); for (/**/; iter != end; ++ id, ++ rank) { if (id != 0) if (! qi::phrase_parse(iter, end, "|||", standard::space)) break; if (! hypergraph.assign(iter, end)) throw std::runtime_error("invalid hypergraph format"); if (! 
hypergraph.is_valid()) continue; const double conf = 1.0 / (1.0 + rank); feature_set_type features; if (! features_confidence.empty()) { if (id >= static_cast<int>(features_confidence.size())) throw std::runtime_error("# of confidence features do not match"); features[features_confidence[id]] = conf; } if (! features_count.empty()) { if (id >= static_cast<int>(features_count.size())) throw std::runtime_error("# of count features do not match"); features[features_count[id]] = count_weight; } if (! feature_confidence.empty()) features[feature_confidence] = conf; if (! feature_count.empty()) features[feature_count] = count_weight; if (! features.empty()) { hypergraph_type::edge_set_type::iterator eiter_end = hypergraph.edges.end(); for (hypergraph_type::edge_set_type::iterator eiter = hypergraph.edges.begin(); eiter != eiter_end; ++ eiter) eiter->features += features; } merged.unite(hypergraph); } os << merged << '\n'; } } } else if (input_files.size() == 1) { utils::compress_istream is(input_files.front(), 1024 * 1024); std::string line; int rank = 1; int id = 0; for (/**/; utils::getline(is, line); ++ id, ++ rank) { std::string::const_iterator iter = line.begin(); std::string::const_iterator end = line.end(); if (! hypergraph.assign(iter, end)) throw std::runtime_error("invalid hypergraph format"); if (! hypergraph.is_valid()) continue; const double conf = 1.0 / (1.0 + rank); feature_set_type features; if (! features_confidence.empty()) { if (id >= static_cast<int>(features_confidence.size())) throw std::runtime_error("# of confidence features do not match"); features[features_confidence[id]] = conf; } if (! features_count.empty()) { if (id >= static_cast<int>(features_count.size())) throw std::runtime_error("# of count features do not match"); features[features_count[id]] = count_weight; } if (! feature_confidence.empty()) features[feature_confidence] = conf; if (! feature_count.empty()) features[feature_count] = count_weight; if (! 
features.empty()) { hypergraph_type::edge_set_type::iterator eiter_end = hypergraph.edges.end(); for (hypergraph_type::edge_set_type::iterator eiter = hypergraph.edges.begin(); eiter != eiter_end; ++ eiter) eiter->features += features; } merged.unite(hypergraph); } utils::compress_ostream os(output_file, 1024 * 1024 * (! flush_output)); os << merged << '\n'; } else { // we will handle multiple files! if (! features_confidence.empty()) if (input_files.size() != features_confidence.size()) throw std::runtime_error("input file do not match with # of confidence feature"); if (! features_count.empty()) if (input_files.size() != features_count.size()) throw std::runtime_error("input file do not match with # of count feature"); typedef std::vector<std::istream*, std::allocator<std::istream*> > istream_set_type; istream_set_type istreams(input_files.size()); for (size_t i = 0; i != input_files.size(); ++ i) istreams[i] = new utils::compress_istream(input_files[i], 1024 * 1024); utils::compress_ostream os(output_file, 1024 * 1024 * (! flush_output)); std::string line; for (;;) { int rank = 1; merged.clear(); hypergraph.clear(); size_t num_failed = 0; for (size_t id = 0; id != istreams.size(); ++ id, ++ rank) { if (utils::getline(*istreams[id], line)) { std::string::const_iterator iter = line.begin(); std::string::const_iterator end = line.end(); if (! hypergraph.assign(iter, end)) throw std::runtime_error("invalid hypergraph format"); if (! hypergraph.is_valid()) continue; const double conf = 1.0 / (1.0 + rank); feature_set_type features; if (! features_confidence.empty()) features[features_confidence[id]] = conf; if (! features_count.empty()) features[features_count[id]] = count_weight; if (! feature_confidence.empty()) features[feature_confidence] = conf; if (! feature_count.empty()) features[feature_count] = count_weight; if (! 
features.empty()) { hypergraph_type::edge_set_type::iterator eiter_end = hypergraph.edges.end(); for (hypergraph_type::edge_set_type::iterator eiter = hypergraph.edges.begin(); eiter != eiter_end; ++ eiter) eiter->features += features; } merged.unite(hypergraph); } else ++ num_failed; } if (num_failed) { if (num_failed != istreams.size()) throw std::runtime_error("# of lines do not match"); break; } os << merged << '\n'; } for (size_t i = 0; i != istreams.size(); ++ i) delete istreams[i]; } } catch (const std::exception& err) { std::cerr << "error: " << err.what() << std::endl; return 1; } return 0; }