Exemplo n.º 1
0
int main(int argc, char** argv)
{
  try {
    if (getoptions(argc, argv) != 0) 
      return 1;

    if (! temporary_dir.empty())
      ::setenv("TMPDIR_SPEC", temporary_dir.string().data(), 1);
    
    expgram::NGram ngram(ngram_file, shards, debug);
    
    utils::compress_ostream os(output_file);
    os << "ngram order: " << ngram.index.order() << '\n';
    for (int order = 1; order <= ngram.index.order(); ++ order)
      os << order << "-gram: " << std::setw(16) << ngram.index.ngram_size(order) << '\n';
    
    dump(os, "index",    ngram.stat_index()) << '\n';
    dump(os, "pointer",  ngram.stat_pointer()) << '\n';
    dump(os, "vocab",    ngram.stat_vocab()) << '\n';
    dump(os, "logprob",  ngram.stat_logprob()) << '\n';
    dump(os, "backoff",  ngram.stat_backoff()) << '\n';
    dump(os, "logbound", ngram.stat_logbound()) << '\n';
  }
  catch (std::exception& err) {
    std::cerr << "error: " << err.what() << std::endl;
    return 1;
  }
  return 0;
}
 // Rewinds `geom` to position 0 and returns its first vertex: vertex() fills
 // tuple elements 1 and 2 through the pointers it is given, and its return
 // value is stored in element 0.
 result_type operator() (path_type const& geom) const
 {
     typename path_type::value_type first_vertex;

     geom.rewind(0);

     auto* second_slot = &std::get<1>(first_vertex);
     auto* third_slot  = &std::get<2>(first_vertex);
     std::get<0>(first_vertex) = geom.vertex(second_slot, third_slot);

     return first_vertex;
 }
Exemplo n.º 3
0
	// Registers (try_add == true) or unregisters (try_add == false) the font
	// file `ttf` with the platform font system.
	//
	// On Windows this maps directly to Add/RemoveFontResourceEx. Elsewhere a
	// per-file counter in platform_storage().fontconfig_counts is maintained:
	// the file is handed to fontconfig when its counter first becomes 1, and
	// fontconfig's application fonts are cleared once the counter table is empty.
	void platform_abstraction::font_resource(bool try_add, const path_type& ttf)
	{
#ifdef NANA_WINDOWS
		const auto font_file = ttf.wstring();
		if (try_add)
			::AddFontResourceEx(font_file.c_str(), FR_PRIVATE, nullptr);
		else
			::RemoveFontResourceEx(font_file.c_str(), FR_PRIVATE, nullptr);
#else
		auto & counts = platform_storage().fontconfig_counts;
		const auto font_file = ttf.string();

		if(try_add)
		{
			// Only the first registration of a file reaches fontconfig.
			if(1 == ++(counts[font_file]))
				::FcConfigAppFontAddFile(nullptr, reinterpret_cast<const FcChar8*>(font_file.c_str()));
			return;
		}

		auto entry = counts.find(font_file);
		if(entry == counts.end())
			return;	// never registered: nothing to undo

		if(0 == --(entry->second))
			counts.erase(entry);

		if(counts.empty())
			::FcConfigAppFontClear(nullptr);
#endif
	}
Exemplo n.º 4
0
  void NGram::open(const path_type& path)
  {
    typedef utils::repository repository_type;
    
    clear();

    if (path.empty())
      throw std::runtime_error("no ngram?");
    else if (! boost::filesystem::exists(path))
      throw std::runtime_error("no ngram? " + path.string());
    
    repository_type rep(path, repository_type::read);
    
    index.open(rep.path("index"));
    
    if (boost::filesystem::exists(rep.path("logprob")))
      open_shards(rep.path("logprob"), logprobs);
    
    if (boost::filesystem::exists(rep.path("backoff")))
      open_shards(rep.path("backoff"), backoffs);
    
    if (boost::filesystem::exists(rep.path("logbound")))
      open_shards(rep.path("logbound"), logbounds);
    
    repository_type::const_iterator siter = rep.find("smooth");
    if (siter == rep.end())
      throw std::runtime_error("no smoothing parameter...?");
    smooth = utils::lexical_cast<logprob_type>(siter->second);
    
    if (debug)
      std::cerr << "ngram: " << path
		<< " # of shards: " << index.size()
		<< " smooth: " << smooth
		<< std::endl;
  }
Exemplo n.º 5
0
// Parses program options from the command line and, when --config is given,
// from a configuration file, storing the results in the option variables
// referenced below (input_file, output_file, grammar_files, threads, debug, ...).
//
// Throws std::runtime_error when the file named by --config does not exist.
// When --help is given, prints the usage text to stdout and calls exit(0).
void options(int argc, char** argv)
{
  namespace po = boost::program_options;
  
  // Options accepted both on the command line and in the configuration file.
  po::options_description opts_config("configuration options");
  opts_config.add_options()
    ("input",  po::value<path_type>(&input_file)->default_value(input_file),   "input file")
    ("output", po::value<path_type>(&output_file)->default_value(output_file), "output file")
    
    ("input-sentence",   po::bool_switch(&input_sentence_mode),   "sentence input")
    ("input-lattice",    po::bool_switch(&input_lattice_mode),    "lattice input")
    
    ("treebank", po::bool_switch(&treebank_mode), "assume treebank style grammar")
    ("pos",      po::bool_switch(&pos_mode),      "POS annotated input")
    
    // grammar
    ("grammar",           po::value<grammar_file_set_type >(&grammar_files)->composing(),      "grammar specification(s)")
    ("grammar-list",      po::bool_switch(&grammar_list),                                      "list of available grammar specifications")
    ;
  
  // Options accepted on the command line only.
  po::options_description opts_command("command line options");
  opts_command.add_options()
    ("config",  po::value<path_type>(),                    "configuration file")
    ("threads", po::value<int>(&threads),                  "# of threads (highly experimental)")
    ("debug",   po::value<int>(&debug)->implicit_value(1), "debug level")
    ("help", "help message");
  
  // desc_config  : what the configuration file may contain
  // desc_command : what the command line may contain
  // desc_visible : what the --help text shows
  po::options_description desc_config;
  po::options_description desc_command;
  po::options_description desc_visible;
  
  desc_config.add(opts_config);
  desc_command.add(opts_config).add(opts_command);
  desc_visible.add(opts_config).add(opts_command);
  
  po::variables_map variables;

  // Parse the command line with unix-style syntax but with the allow_guessing
  // style bit cleared. The command line is stored before the configuration file.
  po::store(po::parse_command_line(argc, argv, desc_command, po::command_line_style::unix_style & (~po::command_line_style::allow_guessing)), variables);
  if (variables.count("config")) {
    const path_type path_config = variables["config"].as<path_type>();
    if (! boost::filesystem::exists(path_config))
      throw std::runtime_error("no config file: " + path_config.string());
    
    utils::compress_istream is(path_config);
    po::store(po::parse_config_file(is, desc_config), variables);
  }
  
  // Finalize parsing now that both sources have been stored.
  po::notify(variables);

  if (variables.count("help")) {
    
    std::cout << argv[0] << " [options]\n"
	      << desc_visible << std::endl;
    exit(0);
  }
}
Exemplo n.º 6
0
// Opens `file` as the current slice and validates its header.
//
// The header is an 8-byte magic number, which must equal one of the entries
// of slice_ids, followed by a 32-bit slice size. The slice size must not be
// larger than the file and must not be smaller than the header just read.
//
// Returns false when the file cannot be opened and true on success; on
// success slice_size, slice_file and last_dir are updated.
// Throws slice_error (after closing the stream) when the header cannot be
// read or fails validation.
bool slice_reader::open_file(const path_type & file) {
	
	log_info << "Opening \"" << color::cyan << file.string() << color::reset << '"';
	
	// Drop any previously opened slice and reset the stream's error flags.
	ifs.close();
	ifs.clear();
	
	// Open positioned at the end so that tellg() yields the file size.
	ifs.open(file, std::ios_base::in | std::ios_base::binary | std::ios_base::ate);
	if(ifs.fail()) {
		return false;
	}
	
	std::streampos file_size = ifs.tellg();
	ifs.seekg(0);
	
	char magic[8];
	if(ifs.read(magic, 8).fail()) {
		ifs.close();
		throw slice_error("could not read slice magic number in \"" + file.string() + "\"");
	}
	
	// Compare against every known magic number. The loop must be bounded by
	// the number of entries: without the `i <` comparison the condition is
	// always true and an unknown magic reads past the end of slice_ids.
	bool found = false;
	for(size_t i = 0; i < static_cast<size_t>(boost::size(slice_ids)); i++) {
		if(!std::memcmp(magic, slice_ids[i], 8)) {
			found = true;
			break;
		}
	}
	if(!found) {
		ifs.close();
		throw slice_error("bad slice magic number in \"" + file.string() + "\"");
	}
	
	slice_size = util::load<boost::uint32_t>(ifs);
	if(ifs.fail()) {
		ifs.close();
		throw slice_error("could not read slice size in \"" + file.string() + "\"");
	} else if(std::streampos(slice_size) > file_size) {
		// The declared size cannot exceed what is actually on disk.
		ifs.close();
		std::ostringstream oss;
		oss << "bad slice size in " << file << ": " << slice_size << " > " << file_size;
		throw slice_error(oss.str());
	} else if(std::streampos(slice_size) < ifs.tellg()) {
		// The declared size must at least cover the header already consumed.
		ifs.close();
		std::ostringstream oss;
		oss << "bad slice size in " << file << ": " << slice_size << " < " << ifs.tellg();
		throw slice_error(oss.str());
	}
	
	slice_file = file;
	
	last_dir = file.parent_path();
	
	return true;
}
Exemplo n.º 7
0
 // Creates a server for the node at `path`. The node's file name is split at
 // the first host_port_delimiter character: the part before it is the host,
 // the part after it is parsed as the port number. The resulting descriptor is
 // recorded in detail::fdtable under the full path string. Always returns 0.
 int server_type::mknod(shared_ptr<fs_entry> file_ent, path_type const& path, mode_t, dev_t)
 {
     auto const node_name = path.filename().string();
     auto const split_at  = node_name.find_first_of(host_port_delimiter);

     auto host = node_name.substr(0, split_at);
     int  port = std::stoi(node_name.substr(split_at + 1));

     BOOST_LOG_TRIVIAL(info) << "server_type::mknod: establishing server host=" << host << " port=" << port;
     int listen_fd = establish_server(host, port);
     BOOST_LOG_TRIVIAL(info) << "server_type::mknod: established server " << host << ":" << port << " fd=" << listen_fd;

     detail::fdtable.insert(path.string(), listen_fd);
     return 0;
 }
Exemplo n.º 8
0
int main(int argc, char** argv)
{
  utils::mpi_world  mpi_world(argc, argv);
  
  const int mpi_rank = MPI::COMM_WORLD.Get_rank();
  const int mpi_size = MPI::COMM_WORLD.Get_size();  
  
  try {
    if (getoptions(argc, argv) != 0) 
      return 1;

    if (! temporary_dir.empty())
      ::setenv("TMPDIR_SPEC", temporary_dir.string().data(), 1);
    
    if (ngram_file.empty() || ! boost::filesystem::exists(ngram_file))
      throw std::runtime_error("no ngram file?");
    if (output_file.empty())
      throw std::runtime_error("no output file?");
    if (ngram_file == output_file)
      throw std::runtime_error("dump to the same directory?");
    
    ngram_type ngram(debug);
    ngram.open_shard(ngram_file, mpi_rank);

    if (static_cast<int>(ngram.index.size()) != mpi_size)
      throw std::runtime_error("MPI universe size do not match with ngram shard size");
    
    utils::resource start;

    ngram_quantize(ngram);

    utils::resource end;
    
    if (debug && mpi_rank == 0)
      std::cerr << "quantize language model"
		<< " cpu time:  " << end.cpu_time() - start.cpu_time() 
		<< " user time: " << end.user_time() - start.user_time()
		<< std::endl;
    
    if (mpi_rank == 0)
      ngram.write_prepare(output_file);
    
    MPI::COMM_WORLD.Barrier();
    ngram.write_shard(output_file, mpi_rank);
  }
  catch (std::exception& err) {
    std::cerr << "error: "  << err.what() << std::endl;
    MPI::COMM_WORLD.Abort(1);
    return 1;
  }
  return 0;
}
Exemplo n.º 9
0
 //
 // server_op
 //
 // Looks up the server socket registered in detail::fdtable for `path`,
 // stores it in server_fd_, and accepts one connection on it, storing the
 // connected socket in client_fd_. Returns 0 on success.
 //
 // Throws std::runtime_error when no socket is registered for `path`, and
 // reports the errno of a failed accept() through throw_system_error().
 int server_op::open(shared_ptr<fs_entry> file_ent, path_type const& path)
 {
     if (auto server_fd = detail::fdtable.find(path.string()))
         server_fd_ = *server_fd;
     else
         // The message names this class (it previously said "client_op",
         // which pointed at the wrong operation when diagnosing failures).
         throw std::runtime_error("server_op: open: cannot get socket from fdtable");

     sockaddr_in client;
     socklen_t len = sizeof (client);
     client_fd_ = accept(server_fd_, reinterpret_cast<sockaddr *>(&client), &len);
     if (client_fd_ < 0)
         throw_system_error(errno);
     return 0;
 }
Exemplo n.º 10
0
int main(int argc, char ** argv)
{
  utils::mpi_world mpi_world(argc, argv);
  
  const int mpi_rank = MPI::COMM_WORLD.Get_rank();
  const int mpi_size = MPI::COMM_WORLD.Get_size();
  
  try {
    options(argc, argv);

    cicada::optimize::LineSearch::value_min = value_lower;
    cicada::optimize::LineSearch::value_max = value_upper;
    
    if (scorer_list) {
      std::cout << cicada::eval::Scorer::lists();
      return 0;
    }
    
    if (int(yield_sentence) + yield_alignment + yield_span > 1)
      throw std::runtime_error("specify either sentence|alignment|span yield");
    if (int(yield_sentence) + yield_alignment + yield_span == 0)
      yield_sentence = true;

    if (weights_file.empty() || ! boost::filesystem::exists(weights_file))
      throw std::runtime_error("no weight file? " + weights_file.string());
    if (direction_name.empty())
      throw std::runtime_error("no direction?");
    
    // read reference set
    scorer_document_type scorers(scorer_name);
    
    read_refset(refset_files, scorers);
    
    if (mpi_rank == 0 && debug)
      std::cerr << "# of references: " << scorers.size() << std::endl;

    // read test set
    
    if (mpi_rank == 0 && debug)
      std::cerr << "reading hypergraphs" << std::endl;

    hypergraph_set_type graphs(scorers.size());
    
    read_tstset(tstset_files, graphs);
    
    weight_set_type weights;
    {
      utils::compress_istream is(weights_file, 1024 * 1024);
      is >> weights;
    }
    
    weight_set_type direction;
    direction[direction_name] = 1.0;
    
    segment_document_type segments(graphs.size());
    
    compute_envelope(scorers, graphs, weights, direction, segments);

    if (mpi_rank == 0) {
      line_search_type line_search(debug);
      
      utils::compress_ostream os(output_file, 1024 * 1024);
      
      line_search(segments, value_lower, value_upper, OutputIterator(os, weights[direction_name]));
    }
  }
  catch (const std::exception& err) {
    std::cerr << "error: " << err.what() << std::endl;
    return 1;
  }
  return 0;
}
Exemplo n.º 11
0
 // Returns true only for a regular file located directly under "/tcp" whose
 // file name starts with '*'.
 bool server_type::match_path(path_type const& path, file_type type)
 {
     if (type != file_type::regular_file)
         return false;

     if (!(path.parent_path() == "/tcp"))
         return false;

     return path.filename().string()[0] == '*';
 }
Exemplo n.º 12
0
 // Removes the detail::fdtable entry keyed by the string form of `path`.
 // Always returns 0.
 int server_type::rmnod(shared_ptr<fs_entry> file_ent, path_type const& path)
 {
     auto const& table_key = path.string();
     detail::fdtable.erase(table_key);
     return 0;
 }
Exemplo n.º 13
0
// Entry point: loads the grammars, feeds every non-empty input line to a pool
// of worker threads, merges the rule sets collected by the workers, and
// writes the merged tree rules and plain rules to their output files (each
// only when its output path is set).
// Returns 0 on success (including after listing grammars), 1 on any exception.
int main(int argc, char** argv)
{
  try {
    options(argc, argv);
    
    // Listing requests are stand-alone actions: print and exit.
    if (grammar_list) {
      std::cout << grammar_type::lists();
      return 0;
    }
    
    if (tree_grammar_list) {
      std::cout << tree_grammar_type::lists();
      return 0;
    }
    
    // Always run with at least one worker.
    threads = utils::bithack::max(1, threads);

    // read grammars...
    grammar_type grammar(grammar_files.begin(), grammar_files.end());
    if (debug)
      std::cerr << "grammar: " << grammar.size() << std::endl;
    
    tree_grammar_type tree_grammar(tree_grammar_files.begin(), tree_grammar_files.end());
    if (debug)
      std::cerr << "tree grammar: " << tree_grammar.size() << std::endl;

    typedef Task task_type;
    typedef std::vector<task_type, std::allocator<task_type> > task_set_type;
    
    // One task object per thread, all sharing the same queue. Threads receive
    // a reference to their task so results remain readable in `tasks`.
    task_type::queue_type queue(threads);
    task_set_type tasks(threads, task_type(queue, tree_grammar, grammar));
    
    boost::thread_group workers;
    for (int i = 0; i != threads; ++ i)
      workers.add_thread(new boost::thread(boost::ref(tasks[i])));
    
    utils::compress_istream is(input_file, 1024 * 1024);
    
    // Hand every non-empty input line to the workers.
    std::string line;
    while (utils::getline(is, line))
      if (! line.empty())
	queue.push_swap(line);
    
    // One empty string per worker; presumably the termination signal
    // (the Task implementation is not visible here -- TODO confirm).
    for (int i = 0; i != threads; ++ i)
      queue.push(std::string());
    
    workers.join_all();

    tree_rule_pair_unique_type tree_rules_unique;
    rule_pair_unique_type      rules_unique;
    
    // Merge per-task results: swap in the first non-empty set, insert the
    // others, and clear each task's set once it has been consumed.
    for (int i = 0; i != threads; ++ i) {
      if (tree_rules_unique.empty())
	tree_rules_unique.swap(tasks[i].tree_rules_unique);
      else
	tree_rules_unique.insert(tasks[i].tree_rules_unique.begin(), tasks[i].tree_rules_unique.end());
      
      tasks[i].tree_rules_unique.clear();
      
      if (rules_unique.empty())
	rules_unique.swap(tasks[i].rules_unique);
      else
	rules_unique.insert(tasks[i].rules_unique.begin(), tasks[i].rules_unique.end());
      
      tasks[i].rules_unique.clear();
    }
    tasks.clear();

    typedef std::ostream_iterator<char> oiter_type;

    features_generator<oiter_type>   generate_features;
    attributes_generator<oiter_type> generate_attributes;
    
    // Tree rules: "source ||| target", then features and attributes when present.
    if (! output_tree_file.empty()) {
      namespace karma = boost::spirit::karma;
      namespace standard = boost::spirit::standard;
      
      utils::compress_ostream os(output_tree_file, 1024 * 1024);
      
      tree_rule_pair_unique_type::const_iterator iter_end = tree_rules_unique.end();
      for (tree_rule_pair_unique_type::const_iterator iter = tree_rules_unique.begin(); iter != iter_end; ++ iter) {
	os << iter->source.decode() << " ||| " << iter->target.decode();
	
	if (! iter->features.empty()) {
	  feature_set_type features(iter->features.begin(), iter->features.end());
	  karma::generate(oiter_type(os), generate_features, features);
	}
	
	if (! iter->attributes.empty())
	  karma::generate(oiter_type(os), generate_attributes, iter->attributes);
	os << '\n';
      }
    }
    
    // Plain rules: "lhs ||| source symbols ||| target symbols", symbols separated
    // by single spaces, then features and attributes when present.
    if (! output_rule_file.empty()) {
      namespace karma = boost::spirit::karma;
      namespace standard = boost::spirit::standard;
      
      utils::compress_ostream os(output_rule_file, 1024 * 1024);
      
      rule_pair_unique_type::const_iterator iter_end = rules_unique.end();
      for (rule_pair_unique_type::const_iterator iter = rules_unique.begin(); iter != iter_end; ++ iter) {
	karma::generate(oiter_type(os),
			standard::string << " ||| " << -(standard::string % ' ') << " ||| " << -(standard::string % ' '),
			iter->lhs,
			symbol_set_type(iter->source.begin(), iter->source.end()),
			symbol_set_type(iter->target.begin(), iter->target.end()));
	
	if (! iter->features.empty()) {
	  feature_set_type features(iter->features.begin(), iter->features.end());
	  karma::generate(oiter_type(os), generate_features, features);
	}
	
	if (! iter->attributes.empty())
	  karma::generate(oiter_type(os), generate_attributes, iter->attributes);
	os << '\n';
      }
    }
  }
  catch (const std::exception& err) {
    std::cerr << "error: " << err.what() << std::endl;
    return 1;
  }
  return 0;
}
Exemplo n.º 14
0
// Entry point: reads hypergraphs from the input file and sentences from the
// map file in lock step, replaces the terminal symbols of every edge's rule
// with the words of the paired sentence, and writes one hypergraph per line.
// Invalid hypergraphs are written back unchanged.
// Returns 0 on success, 1 on any exception (including a mismatch between the
// number of hypergraphs and the number of sentences).
int main(int argc, char** argv)
{
    try {
        options(argc, argv);

        utils::compress_istream is(input_file, 1024 * 1024);
        utils::compress_ostream os(output_file);

        // "-" is exempt from the existence check.
        if (map_file != "-" && ! boost::filesystem::exists(map_file))
            throw std::runtime_error("no map file: " + map_file.string());

        utils::compress_istream ms(map_file);

        hypergraph_type hypergraph;
        sentence_type   sentence;
        span_set_type   spans;
        phrase_type     rhs;

        while (1) {
            is >> hypergraph;
            ms >> sentence;

            // Stop as soon as either stream fails; the check after the loop
            // detects the case where only one of them did.
            if (! is || ! ms) break;

            if (! hypergraph.is_valid()) {
                os << hypergraph << '\n';
                continue;
            }

            // map terminals...
            // we will first compute spans, then, perform terminal mapping..

            spans.clear();
            spans.resize(hypergraph.edges.size());

            cicada::span_edge(hypergraph, spans);

            for (size_t edge_id = 0; edge_id != hypergraph.edges.size(); ++ edge_id) {
                hypergraph_type::edge_type& edge = hypergraph.edges[edge_id];
                const span_type& span = spans[edge_id];
                const hypergraph_type::symbol_type lhs = edge.rule->lhs;

                rhs.clear();

                // pos      : count of non-terminals seen so far in this rule
                // span_pos : index into `sentence` of the next terminal to emit
                int pos = 0;
                int span_pos = span.first;
                hypergraph_type::rule_type::symbol_set_type::const_iterator riter_end = edge.rule->rhs.end();
                for (hypergraph_type::rule_type::symbol_set_type::const_iterator riter = edge.rule->rhs.begin(); riter != riter_end; ++ riter) {
                    if (riter->is_non_terminal()) {
                        // A non-positive index selects the tail by order of
                        // appearance; otherwise the index is used (1-based).
                        const int __non_terminal_index = riter->non_terminal_index();
                        const int non_terminal_pos = utils::bithack::branch(__non_terminal_index <= 0, pos, __non_terminal_index - 1);
                        ++ pos;

                        // compute span_pos from antecedent node...

                        // Non-terminals are kept as-is; continue after the
                        // span of the first edge of the antecedent node.
                        rhs.push_back(*riter);
                        span_pos = spans[hypergraph.nodes[edge.tails[non_terminal_pos]].edges.front()].second;
                    } else if (*riter != vocab_type::EPSILON) {
                        // Terminals other than EPSILON are replaced by the
                        // sentence word at the current position; EPSILON is dropped.
                        rhs.push_back(sentence[span_pos]);
                        ++ span_pos;
                    }
                }

                edge.rule = hypergraph_type::rule_type::create(hypergraph_type::rule_type(lhs, rhs.begin(), rhs.end()));
            }

            os << hypergraph << '\n';
        }

        // One stream still being good means the other one ended early.
        if (is || ms)
            throw std::runtime_error("# of hypergraphs and # of sentences do not match");
    }
    catch (std::exception& err) {
        std::cerr << "error: " << err.what() << std::endl;
        return 1;
    }
    return 0;
}
Exemplo n.º 15
0
int main(int argc, char ** argv)
{
  try {
    options(argc, argv);
    
    feature_list_type features_confidence;
    feature_list_type features_count;

    if (! confidence_feature_file.empty()) {
      if (confidence_feature_file != "-" && ! boost::filesystem::exists(confidence_feature_file))
	throw std::runtime_error("no confidence feature file? " + confidence_feature_file.string());
      
      utils::compress_istream is(confidence_feature_file);
      std::string feature;
      while (is >> feature)
	features_confidence.push_back(feature);
    }
    
    if (! count_feature_file.empty()) {
      if (count_feature_file != "-" && ! boost::filesystem::exists(count_feature_file))
	throw std::runtime_error("no count feature file? " + count_feature_file.string());
      
      utils::compress_istream is(count_feature_file);
      std::string feature;
      while (is >> feature)
	features_count.push_back(feature);
    }

    const bool flush_output = (output_file == "-"
                               || (boost::filesystem::exists(output_file)
                                   && ! boost::filesystem::is_regular_file(output_file)));
    
    hypergraph_type merged;
    hypergraph_type hypergraph;
    
    cicada::Feature feature_confidence(confidence);
    cicada::Feature feature_count(count);
    
    if (input_files.empty())
      input_files.push_back("-");
    
    if (multiple_mode) {
      namespace qi = boost::spirit::qi;
      namespace standard = boost::spirit::standard;
      
      // forest ||| forest ||| forest

      utils::compress_ostream os(output_file, 1024 * 1024 * (! flush_output));
      
      for (path_set_type::const_iterator iter = input_files.begin(); iter != input_files.end(); ++ iter) {
	utils::compress_istream is(input_files.front(), 1024 * 1024);
	std::string line;
	
	while (utils::getline(is, line)) {
	  int rank = 1;
	  int id = 0;
	  
	  merged.clear();
	  hypergraph.clear();
	  
	  std::string::const_iterator iter = line.begin();
	  std::string::const_iterator end = line.end();
	  
	  for (/**/; iter != end; ++ id, ++ rank) {
	    if (id != 0)
	      if (! qi::phrase_parse(iter, end, "|||", standard::space))
		break;
	    
	    if (! hypergraph.assign(iter, end))
	      throw std::runtime_error("invalid hypergraph format");
	    if (! hypergraph.is_valid()) continue;
	    
	    const double conf = 1.0 / (1.0 + rank);
	      
	    feature_set_type features;
	      
	    if (! features_confidence.empty()) {
	      if (id >= static_cast<int>(features_confidence.size()))
		throw std::runtime_error("# of confidence features do not match");
	      features[features_confidence[id]] = conf;
	    }
	    if (! features_count.empty()) {
	      if (id >= static_cast<int>(features_count.size()))
		throw std::runtime_error("# of count features do not match");
	      features[features_count[id]] = count_weight;
	    }
	    if (! feature_confidence.empty())
	      features[feature_confidence] = conf;
	    if (! feature_count.empty())
	      features[feature_count] = count_weight;
	      
	    if (! features.empty()) {
	      hypergraph_type::edge_set_type::iterator eiter_end = hypergraph.edges.end();
	      for (hypergraph_type::edge_set_type::iterator eiter = hypergraph.edges.begin(); eiter != eiter_end; ++ eiter)
		eiter->features += features;
	    }
	    
	    merged.unite(hypergraph);
	  }
	  
	  os << merged << '\n';
	}
      }
    } else if (input_files.size() == 1) {
      utils::compress_istream is(input_files.front(), 1024 * 1024);
      std::string line;
      
      int rank = 1;
      int id = 0;
      for (/**/; utils::getline(is, line); ++ id, ++ rank) {
	std::string::const_iterator iter = line.begin();
	std::string::const_iterator end = line.end();
	
	if (! hypergraph.assign(iter, end))
	  throw std::runtime_error("invalid hypergraph format");
	if (! hypergraph.is_valid()) continue;
	
	const double conf = 1.0 / (1.0 + rank);
	
	feature_set_type features;
	if (! features_confidence.empty()) {
	  if (id >= static_cast<int>(features_confidence.size()))
	    throw std::runtime_error("# of confidence features do not match");
	  features[features_confidence[id]] = conf;
	}
	if (! features_count.empty()) {
	  if (id >= static_cast<int>(features_count.size()))
	    throw std::runtime_error("# of count features do not match");
	  features[features_count[id]] = count_weight;
	}
	if (! feature_confidence.empty())
	  features[feature_confidence] = conf;
	if (! feature_count.empty())
	  features[feature_count] = count_weight;
	
	if (! features.empty()) {
	  hypergraph_type::edge_set_type::iterator eiter_end = hypergraph.edges.end();
	  for (hypergraph_type::edge_set_type::iterator eiter = hypergraph.edges.begin(); eiter != eiter_end; ++ eiter)
	    eiter->features += features;
	} 
	
	merged.unite(hypergraph);
      }
      
      utils::compress_ostream os(output_file, 1024 * 1024 * (! flush_output));
      
      os << merged << '\n';
      
    } else {
      // we will handle multiple files!
      
      if (! features_confidence.empty())
	if (input_files.size() != features_confidence.size())
	  throw std::runtime_error("input file do not match with # of confidence feature");

      if (! features_count.empty())
	if (input_files.size() != features_count.size())
	  throw std::runtime_error("input file do not match with # of count feature");
      
      typedef std::vector<std::istream*, std::allocator<std::istream*> > istream_set_type;
      
      istream_set_type istreams(input_files.size());
      for (size_t i = 0; i != input_files.size(); ++ i)
	istreams[i] = new utils::compress_istream(input_files[i], 1024 * 1024);
      
      utils::compress_ostream os(output_file, 1024 * 1024 * (! flush_output));
      
      std::string line;
      
      for (;;) {
	int rank = 1;
	
	merged.clear();
	hypergraph.clear();
	
	size_t num_failed = 0;
	for (size_t id = 0; id != istreams.size(); ++ id, ++ rank) {
	  if (utils::getline(*istreams[id], line)) {
	    std::string::const_iterator iter = line.begin();
	    std::string::const_iterator end = line.end();
	    
	    if (! hypergraph.assign(iter, end))
	      throw std::runtime_error("invalid hypergraph format");
	    if (! hypergraph.is_valid()) continue;
	    
	    const double conf = 1.0 / (1.0 + rank);
	    
	    feature_set_type features;
	    if (! features_confidence.empty())
	      features[features_confidence[id]] = conf;
	    if (! features_count.empty())
	      features[features_count[id]] = count_weight;
	    if (! feature_confidence.empty())
	      features[feature_confidence] = conf;
	    if (! feature_count.empty())
	      features[feature_count] = count_weight;
	    
	    if (! features.empty()) {
	      hypergraph_type::edge_set_type::iterator eiter_end = hypergraph.edges.end();
	      for (hypergraph_type::edge_set_type::iterator eiter = hypergraph.edges.begin(); eiter != eiter_end; ++ eiter)
		eiter->features += features;
	    } 
	    
	    merged.unite(hypergraph);
	  } else
	    ++ num_failed;
	}
	
	if (num_failed) {
	  if (num_failed != istreams.size())
	    throw std::runtime_error("# of lines do not match");
	  break;
	}
	
	os << merged << '\n';
      }
      
      for (size_t i = 0; i != istreams.size(); ++ i)
	delete istreams[i];
    }
  }
  catch (const std::exception& err) {
    std::cerr << "error: " << err.what() << std::endl;
    return 1;
  }
  return 0;
}
Exemplo n.º 16
0
// Reads test-set hypergraphs from `files` into `graphs`, keeping only the
// entries owned by this MPI rank (id % mpi_size == mpi_rank).
//
// Each entry of `files` is either
//   * a directory containing "<i>.gz" files, one "id ||| graph" line each,
//     visited for i = mpi_rank, mpi_rank + mpi_size, ... until a file is
//     missing; or
//   * a single file with one "id ||| graph" line per hypergraph.
//
// A hypergraph whose slot in `graphs` is already valid is united with the
// existing one; otherwise it is swapped into the slot.
// Throws std::runtime_error on malformed input, on an id that does not match
// the file name, and on an id outside the range of `graphs`.
void read_tstset(const path_set_type& files, hypergraph_set_type& graphs)
{
  const int mpi_rank = MPI::COMM_WORLD.Get_rank();
  const int mpi_size = MPI::COMM_WORLD.Get_size();

  std::string line;

  // Iterate over the argument, not over the global list of test-set files,
  // so the function honours whatever the caller passes in.
  path_set_type::const_iterator titer_end = files.end();
  for (path_set_type::const_iterator titer = files.begin(); titer != titer_end; ++ titer) {
    
    if (debug)
      std::cerr << "file: " << *titer << std::endl;
      
    if (boost::filesystem::is_directory(*titer)) {
      size_t id;
      hypergraph_type hypergraph;
      
      for (size_t i = mpi_rank; /**/; i += mpi_size) {
	const path_type path = (*titer) / (utils::lexical_cast<std::string>(i) + ".gz");
	
	// The first missing file ends the sequence for this rank.
	if (! boost::filesystem::exists(path)) break;
	
	utils::compress_istream is(path, 1024 * 1024);
		
	if (! utils::getline(is, line))
	  throw std::runtime_error("no line in file-no: " + utils::lexical_cast<std::string>(i));
	
	std::string::const_iterator iter = line.begin();
	std::string::const_iterator end  = line.end();
	
	if (! parse_id(id, iter, end))
	  throw std::runtime_error("invalid id input: " + path.string());
	if (id != i)
	  throw std::runtime_error("id mismatch: "  + path.string());
	// Same range check as the single-file branch: without it graphs[id]
	// below would be indexed out of range.
	if (id >= graphs.size())
	  throw std::runtime_error("tstset size exceeds refset size?" + utils::lexical_cast<std::string>(id) + ": " + path.string());
	if (static_cast<int>(id % mpi_size) != mpi_rank)
	  throw std::runtime_error("difference it?");
	
	if (! hypergraph.assign(iter, end))
	  throw std::runtime_error("invalid graph format" + path.string());
	if (iter != end)
	  throw std::runtime_error("invalid id ||| graph format" + path.string());
	
	if (graphs[id].is_valid())
	  graphs[id].unite(hypergraph);
	else
	  graphs[id].swap(hypergraph);
      }
    } else {
      const path_type& path = *titer;
	
      utils::compress_istream is(path, 1024 * 1024);
      
      size_t id;
      hypergraph_type hypergraph;
      
      while (utils::getline(is, line)) {
	std::string::const_iterator iter = line.begin();
	std::string::const_iterator end  = line.end();
	
	if (! parse_id(id, iter, end))
	  throw std::runtime_error("invalid id input: " + path.string());
	if (id >= graphs.size())
	  throw std::runtime_error("tstset size exceeds refset size?" + utils::lexical_cast<std::string>(id) + ": " + titer->string());
	
	// Lines belonging to other ranks are skipped before parsing the graph.
	if (static_cast<int>(id % mpi_size) != mpi_rank) continue;
	
	if (! hypergraph.assign(iter, end))
	  throw std::runtime_error("invalid graph format" + path.string());
	if (iter != end)
	  throw std::runtime_error("invalid id ||| graph format" + path.string());
	
	if (graphs[id].is_valid())
	  graphs[id].unite(hypergraph);
	else
	  graphs[id].swap(hypergraph);
      }
    }
  }
}
Exemplo n.º 17
0
int main(int argc, char ** argv)
{
  try {
    options(argc, argv);

    cicada::optimize::LineSearch::value_min = value_lower;
    cicada::optimize::LineSearch::value_max = value_upper;

    if (scorer_list) {
      std::cout << cicada::eval::Scorer::lists();
      return 0;
    }    
    
    if (regularize_l1 && regularize_l2)
      throw std::runtime_error("you cannot use both of L1 and L2...");
    
    if (regularize_l1 || regularize_l2) {
      if (C <= 0.0)
	throw std::runtime_error("the scaling for L1/L2 must be positive");
    }
    
    if (weight_normalize_l1 && weight_normalize_l2)
      throw std::runtime_error("you cannot use both of L1 and L2 for weight normalization...");


    threads = utils::bithack::max(threads, 1);
    
    // read reference set
    scorer_document_type scorers(scorer_name);
    read_refset(refset_files, scorers);

    const size_t scorers_size = scorers.size();
    
    if (iterative && tstset_files.size() > 1) {
      scorer_document_type scorers_iterative(scorer_name);
      scorers_iterative.resize(scorers.size() * tstset_files.size());
      
      for (size_t i = 0; i != tstset_files.size(); ++ i)
	std::copy(scorers.begin(), scorers.end(), scorers_iterative.begin() + scorers.size() * i);
      
      scorers.swap(scorers_iterative);
    }

    if (debug)
      std::cerr << "# of references: " << scorers.size() << std::endl;


    if (debug)
      std::cerr << "reading kbests" << std::endl;

    hypothesis_map_type kbests(scorers.size());
    
    read_tstset(tstset_files, kbests, scorers_size);

    initialize_score(kbests, scorers);

    // collect initial weights
    weight_set_collection_type weights;
    
    if (! feature_weights_files.empty()) {

      for (path_set_type::const_iterator fiter = feature_weights_files.begin(); fiter != feature_weights_files.end(); ++ fiter) {
	if (*fiter != "-" && ! boost::filesystem::exists(*fiter))
	  throw std::runtime_error("no file? " + fiter->string());
	
	utils::compress_istream is(*fiter);
	
	weights.push_back(weight_set_type());
	is >> weights.back();
      }
      
      if (initial_average && weights.size() > 1) {
	weight_set_type weight;
	
	weight_set_collection_type::const_iterator witer_end = weights.end();
	for (weight_set_collection_type::const_iterator witer = weights.begin(); witer != witer_end; ++ witer)
	  weight += *witer;
	
	weight *= (1.0 / weights.size());
	
	weights.push_back(weight);
      }
      
      std::set<weight_set_type, std::less<weight_set_type>, std::allocator<weight_set_type> > uniques;
      uniques.insert(weights.begin(), weights.end());
      weights.clear();
      weights.insert(weights.end(), uniques.begin(), uniques.end());
    } else {
      weights.push_back(weight_set_type());
      
      // all one weight...
      for (feature_type::id_type id = 0; id < feature_type::allocated(); ++ id)
	if (! feature_type(id).empty())
	  weights.back()[feature_type(id)] = 1.0;
    }
    
    // collect lower/upper bounds
    weight_set_type bound_lower;
    weight_set_type bound_upper;
    
    if (! bound_lower_file.empty()) {
      if (bound_lower_file == "-" || boost::filesystem::exists(bound_lower_file)) {
	typedef cicada::FeatureVector<double> feature_vector_type;
	
	feature_vector_type bounds;
	  
	utils::compress_istream is(bound_lower_file);
	is >> bounds;
	
	bound_lower.allocate(cicada::optimize::LineSearch::value_min);
	for (feature_vector_type::const_iterator biter = bounds.begin(); biter != bounds.end(); ++ biter)
	  bound_lower[biter->first] = biter->second;
      } else
	throw std::runtime_error("no lower-bound file?" + bound_lower_file.string());
    }
Exemplo n.º 18
0
// Entry point: parses trees from the input with penntreebank_grammar, applies
// the enabled transformations to each tree, and writes one line per tree,
// either its leaves (leaf_mode) or the tree itself (treebank_mode, which is
// selected when neither mode was requested).
// Returns 0 on success, 1 on any exception.
int main(int argc, char** argv)
{
  try {
    options(argc, argv);

    // Exactly one output mode; treebank is selected when none was requested.
    if (int(leaf_mode) + treebank_mode > 1)
      throw std::runtime_error("multiple output options specified: leaf/treebank(default: treebank)");
    if (int(leaf_mode) + treebank_mode == 0)
      treebank_mode = true;
    
    typedef boost::spirit::istream_iterator iter_type;

    // Flush after every tree when writing to "-" or to an existing path that
    // is not a regular file.
    const bool flush_output = (output_file == "-"
			       || (boost::filesystem::exists(output_file)
				   && ! boost::filesystem::is_regular_file(output_file)));
    
    penntreebank_grammar<iter_type> grammar;
    
    treebank_type parsed;
    sentence_type sent;
    pos_set_type  pos;
    std::string   line;
    
    utils::compress_istream is(input_file, 1024 * 1024);
    utils::compress_ostream os(output_file, 1024 * 1024);

    // Optional POS stream; stays null when no POS file was given.
    std::auto_ptr<utils::compress_istream> ms;
    
    if (! pos_file.empty()) {
      if (! boost::filesystem::exists(pos_file))
	throw std::runtime_error("no map file: " + pos_file.string());
      
      ms.reset(new utils::compress_istream(pos_file, 1024 * 1024));
    }
    
    // Whitespace skipping is turned off on the stream; the parser call below
    // is given boost::spirit::standard::space as its skipper instead.
    is.unsetf(std::ios::skipws);
    iter_type iter(is);
    iter_type iter_end;
        
    while (iter != iter_end) {
      parsed.clear();
      
      if (! boost::spirit::qi::phrase_parse(iter, iter_end, grammar, boost::spirit::standard::space, parsed)) {
	// Include up to 64 characters of the remaining input in the error message.
	std::string buffer;
	for (int i = 0; i != 64 && iter != iter_end; ++ i, ++iter)
	  buffer += *iter;
	
	throw std::runtime_error("parsing failed: " + buffer);
      }

      // Root label: a configured root_symbol overrides the parsed label;
      // otherwise an empty label becomes "ROOT".
      if (! root_symbol.empty())
	parsed.cat_ = root_symbol;
      else if (parsed.cat_.empty())
	parsed.cat_ = "ROOT";

      if (validate)
	if (! treebank_validate(parsed))
	  throw std::runtime_error("invalid tree");

      if (ms.get()) {
	typedef boost::tokenizer<utils::space_separator, utils::piece::const_iterator, utils::piece> tokenizer_type;

	// One POS line is consumed per parsed tree.
	if (! utils::getline(*ms, line))
	  throw std::runtime_error("# of lines do not match with POS");
	
	utils::piece line_piece(line);
	tokenizer_type tokenizer(line_piece);
	
	pos.clear();
	pos.insert(pos.end(), tokenizer.begin(), tokenizer.end());

	// NOTE: this `iter` shadows the parser iterator declared above.
	pos_set_type::iterator iter = pos.begin();
	
	transform_pos(parsed, iter, pos.end());

	// POS tags left over after the transformation are an error.
	if (iter != pos.end())
	  throw std::runtime_error("too long POS sequence?");
      }
      
      if (remove_none)
	transform_remove_none(parsed);
      
      if (normalize)
	transform_normalize(parsed);
      
      if (remove_cycle)
	transform_cycle(parsed);

      if (unescape_terminal)
	transform_unescape(parsed);
      
      if (leaf_mode) {
	sent.clear();
	
	transform_leaf(parsed, sent);
	
	// Space-separated leaves with no trailing space.
	if (! sent.empty()) {
	  std::copy(sent.begin(), sent.end() - 1, std::ostream_iterator<std::string>(os, " "));
	  os << sent.back();
	}
      } else if (treebank_mode) {
	// A tree without antecedents is written as the placeholder "(())".
	if (parsed.antecedents_.empty())
	  os << "(())";
	else
	  treebank_output(parsed, os);
      } 
      
      os << '\n';
      if (flush_output)
	os << std::flush;
    }
  }
  catch (const std::exception& err) {
    std::cerr << "error: " << err.what() << std::endl;
    return 1;
  }
  return 0;
}