int main(int argc, char** argv) { // handle parameters po::variables_map cfg; if (!init_params(argc,argv, &cfg)) exit(1); // something is wrong // set number of jobs #ifdef _OPENMP if (cfg.count("jobs")) { omp_set_num_threads(cfg["jobs"].as<int>()); cerr << "CLIR::Using " << cfg["jobs"].as<int>() << " threads.\n"; } #else #endif const bool no_qtf = cfg.count("no-qtf"); string run_id = cfg["run-id"].as<string>(); const int K = cfg["K"].as<int>(); // load queries vector<Query> queries; CLIR::loadQueries(cfg["queries"].as<string>(), queries, cfg.count("psq"), cfg.count("text-input")); size_t queries_size = queries.size(); size_t done = queries_size / 10.0; cerr.precision(1); // setup result vector vector<vector<Score> > results(queries_size); if (cfg.count("index")) { Index::Index idx; Index::LoadIndex(cfg["index"].as<string>(), idx); IndexScorer* ranker = CLIR::setupIndexScorer(cfg["model"].as<string>(), &idx); cerr << "CLIR::Retrieving from in-memory inverted index ..."; # pragma omp parallel for for ( int i = 0 ; i < queries_size ; ++ i ) { vector<double> scores(idx.NumberOfDocuments(), .0); ranker->score(queries[i], scores, no_qtf); Scores kbest(K); for (int d=0;d<idx.NumberOfDocuments();++d) { if (scores[d]>0) { kbest.update( Score(idx.GetDocID(d), prob_t(scores[d]) ) ); } } results[i] = kbest.k_largest(); i%done == 0 ? cerr << done/queries_size << "%.." : cerr ; } delete ranker; cerr << "ok.\n"; } else if (cfg.count("documents")) { // load DfTable DfTable dft(cfg["dftable"].as<string>()); cerr << "CLIR::DF table loaded (" << dft.size() << " entries)." << endl; // load document collection vector<CLIR::Document> documents; double avgDocLen = CLIR::loadDocuments(cfg["documents"].as<string>(), documents); size_t N = documents.size(); if (N < dft.mMaxDf) { cerr << "CLIR::maxDf=" << dft.mMaxDf << " > N=" << N << ", setting N to maxDf.\n"; N = dft.mMaxDf; } Scorer* ranker = CLIR::setupScorer(cfg["model"].as<string>(), N, avgDocLen, &dft); cerr << "CLIR::Retrieving from in-memory documents ..."; # pragma omp parallel for for ( int i = 0 ; i < queries_size ; ++ i ) { const Query& query = queries[i]; Scores scores(K); for ( int d = 0 ; d < N ; ++d ) { Document& doc = documents[d]; scores.update( Score(doc.id_, ranker->score(query, doc, no_qtf) ) ); } results[i] = scores.k_largest(); i%done == 0 ? cerr << done/queries_size << "%.." : cerr ; } delete ranker; cerr << "ok.\n"; } else { // load DfTable DfTable dft(cfg["dftable"].as<string>()); cerr << "CLIR::DF table loaded (" << dft.size() << " entries)." << endl; size_t N = cfg["N"].as<int>(); if (N < dft.mMaxDf) { cerr << "CLIR::maxDf=" << dft.mMaxDf << " > N=" << N << ", setting N to maxDf.\n"; N = dft.mMaxDf; } double avgDocLen = cfg["avg_len"].as<double>(); cerr << "CLIR::Retrieving from STDIN documents ..."; Scorer* ranker = CLIR::setupScorer(cfg["model"].as<string>(), N, avgDocLen, &dft); vector<Scores> scores (queries_size, Scores(K)); // score vectors for each query string docid, raw; int len, c = 0; cerr << "reporter:status:scanned="<< c << "\n"; TermVector doc; while (cin >> docid) { cin.ignore(1,'\t'); cin >> len; cin.ignore(1,'\t'); getline(cin, raw); if (docid.size() == 0 || len <= 0 || raw.size() == 0) continue; Document doc(vutils::read_vector(raw), docid, len); // for each query compute score between current document and query # pragma omp parallel for for ( size_t i = 0 ; i < queries_size ; ++i ) { scores[i].update( Score(doc.id_, ranker->score(queries[i], doc, no_qtf) ) ); } c++; c%10==0 ? cerr << "reporter:status:scanned="<< c << "\n" : cerr ; } delete ranker; // create kbest lists for ( size_t i = 0; i < scores.size(); ++i ) { results[i] = scores[i].k_largest(); } } // retrieval done; output results WriteFile out(cfg["output"].as<string>()); if (cfg.count("qrels")) { cerr << "CLIR::Evaluating results ..."; CLIR::IRScorer irs(queries_size, cfg["qrels"].as<string>()); # pragma omp parallel for for ( int i = 0 ; i < queries_size ; ++ i ) { irs.evaluateIthSegment(i, queries[i].id(), results[i]); } *out << "num_q\t" << irs.N() << "\tnum_rel_ret\t" << irs.NUMRELRET() << "\tMAP\t" << irs.MAP() << "\tNDCG\t" << irs.NDCG() << endl; } else { cerr << "CLIR::Writing results ..."; for ( int i = 0 ; i < queries_size ; ++ i ) { CLIR::writeResult(*out, queries[i], results[i], run_id); } } cerr << "ok.\nCLIR::done.\n"; }