/// Creates the scorer for this weight over the given reader.
///
/// One sub-scorer is requested from every entry in `weights` (always with
/// in-order scoring and not as a top scorer; the `scoreDocsInOrder` and
/// `topScorer` arguments of this method are not consulted). Each sub-scorer
/// is advanced once with nextDoc() and kept only if that call does not
/// report NO_MORE_DOCS.
///
/// @param reader            index reader handed to every sub-weight.
/// @param scoreDocsInOrder  unused here.
/// @param topScorer         unused here.
/// @return a DisjunctionMaxScorer over the retained sub-scorers, or an empty
///         ScorerPtr when no sub-scorer was retained.
ScorerPtr DisjunctionMaxWeight::scorer(IndexReaderPtr reader, bool scoreDocsInOrder, bool topScorer) {
    // Sized for the worst case; only the first `liveCount` slots get filled.
    Collection<ScorerPtr> liveScorers(Collection<ScorerPtr>::newInstance(weights.size()));
    int32_t liveCount = 0;

    for (Collection<WeightPtr>::iterator current = weights.begin(); current != weights.end(); ++current) {
        ScorerPtr candidate((*current)->scorer(reader, true, false));
        if (!candidate) {
            continue;
        }
        // Position the sub-scorer on its first document; drop it if there is none.
        if (candidate->nextDoc() == DocIdSetIterator::NO_MORE_DOCS) {
            continue;
        }
        liveScorers[liveCount] = candidate;
        ++liveCount;
    }

    if (liveCount == 0) {
        // No sub-scorer produced a document.
        return ScorerPtr();
    }

    DisjunctionMaxScorerPtr combined(newLucene<DisjunctionMaxScorer>(query->tieBreakerMultiplier, similarity, liveScorers, liveCount));
    return combined;
}
int main(int argc, char ** argv) { try { options(argc, argv); cicada::optimize::LineSearch::value_min = value_lower; cicada::optimize::LineSearch::value_max = value_upper; if (scorer_list) { std::cout << cicada::eval::Scorer::lists(); return 0; } if (regularize_l1 && regularize_l2) throw std::runtime_error("you cannot use both of L1 and L2..."); if (regularize_l1 || regularize_l2) { if (C <= 0.0) throw std::runtime_error("the scaling for L1/L2 must be positive"); } if (weight_normalize_l1 && weight_normalize_l2) throw std::runtime_error("you cannot use both of L1 and L2 for weight normalization..."); threads = utils::bithack::max(threads, 1); // read reference set scorer_document_type scorers(scorer_name); read_refset(refset_files, scorers); const size_t scorers_size = scorers.size(); if (iterative && tstset_files.size() > 1) { scorer_document_type scorers_iterative(scorer_name); scorers_iterative.resize(scorers.size() * tstset_files.size()); for (size_t i = 0; i != tstset_files.size(); ++ i) std::copy(scorers.begin(), scorers.end(), scorers_iterative.begin() + scorers.size() * i); scorers.swap(scorers_iterative); } if (debug) std::cerr << "# of references: " << scorers.size() << std::endl; if (debug) std::cerr << "reading kbests" << std::endl; hypothesis_map_type kbests(scorers.size()); read_tstset(tstset_files, kbests, scorers_size); initialize_score(kbests, scorers); // collect initial weights weight_set_collection_type weights; if (! feature_weights_files.empty()) { for (path_set_type::const_iterator fiter = feature_weights_files.begin(); fiter != feature_weights_files.end(); ++ fiter) { if (*fiter != "-" && ! boost::filesystem::exists(*fiter)) throw std::runtime_error("no file? 
" + fiter->string()); utils::compress_istream is(*fiter); weights.push_back(weight_set_type()); is >> weights.back(); } if (initial_average && weights.size() > 1) { weight_set_type weight; weight_set_collection_type::const_iterator witer_end = weights.end(); for (weight_set_collection_type::const_iterator witer = weights.begin(); witer != witer_end; ++ witer) weight += *witer; weight *= (1.0 / weights.size()); weights.push_back(weight); } std::set<weight_set_type, std::less<weight_set_type>, std::allocator<weight_set_type> > uniques; uniques.insert(weights.begin(), weights.end()); weights.clear(); weights.insert(weights.end(), uniques.begin(), uniques.end()); } else { weights.push_back(weight_set_type()); // all one weight... for (feature_type::id_type id = 0; id < feature_type::allocated(); ++ id) if (! feature_type(id).empty()) weights.back()[feature_type(id)] = 1.0; } // collect lower/upper bounds weight_set_type bound_lower; weight_set_type bound_upper; if (! bound_lower_file.empty()) { if (bound_lower_file == "-" || boost::filesystem::exists(bound_lower_file)) { typedef cicada::FeatureVector<double> feature_vector_type; feature_vector_type bounds; utils::compress_istream is(bound_lower_file); is >> bounds; bound_lower.allocate(cicada::optimize::LineSearch::value_min); for (feature_vector_type::const_iterator biter = bounds.begin(); biter != bounds.end(); ++ biter) bound_lower[biter->first] = biter->second; } else throw std::runtime_error("no lower-bound file?" + bound_lower_file.string()); }
int main(int argc, char ** argv) { utils::mpi_world mpi_world(argc, argv); const int mpi_rank = MPI::COMM_WORLD.Get_rank(); const int mpi_size = MPI::COMM_WORLD.Get_size(); try { options(argc, argv); cicada::optimize::LineSearch::value_min = value_lower; cicada::optimize::LineSearch::value_max = value_upper; if (scorer_list) { std::cout << cicada::eval::Scorer::lists(); return 0; } if (int(yield_sentence) + yield_alignment + yield_span > 1) throw std::runtime_error("specify either sentence|alignment|span yield"); if (int(yield_sentence) + yield_alignment + yield_span == 0) yield_sentence = true; if (weights_file.empty() || ! boost::filesystem::exists(weights_file)) throw std::runtime_error("no weight file? " + weights_file.string()); if (direction_name.empty()) throw std::runtime_error("no direction?"); // read reference set scorer_document_type scorers(scorer_name); read_refset(refset_files, scorers); if (mpi_rank == 0 && debug) std::cerr << "# of references: " << scorers.size() << std::endl; // read test set if (mpi_rank == 0 && debug) std::cerr << "reading hypergraphs" << std::endl; hypergraph_set_type graphs(scorers.size()); read_tstset(tstset_files, graphs); weight_set_type weights; { utils::compress_istream is(weights_file, 1024 * 1024); is >> weights; } weight_set_type direction; direction[direction_name] = 1.0; segment_document_type segments(graphs.size()); compute_envelope(scorers, graphs, weights, direction, segments); if (mpi_rank == 0) { line_search_type line_search(debug); utils::compress_ostream os(output_file, 1024 * 1024); line_search(segments, value_lower, value_upper, OutputIterator(os, weights[direction_name])); } } catch (const std::exception& err) { std::cerr << "error: " << err.what() << std::endl; return 1; } return 0; }