Example #1
0
/**
 * Writes a coarse progress marker ("<percent>%..") to stderr whenever `i` is a
 * multiple of `step`.
 *
 * @param i     zero-based index of the item currently being reported
 * @param step  reporting interval; must be >= 1
 * @param total total number of items; must be > 0
 */
static void reportQueryProgress(size_t i, size_t step, size_t total) {
	if (i % step != 0) return;
	// Called from inside OpenMP parallel loops: serialize the write so that
	// markers emitted by different threads are not interleaved.
	#pragma omp critical(clir_progress)
	{
		std::cerr << (100 * i) / total << "%..";
	}
}

/**
 * Entry point of the retrieval tool.
 *
 * Loads the queries, scores them against a document collection taken from one
 * of three sources (selected by the command-line options):
 *   - "index":     an in-memory inverted index,
 *   - "documents": a document file loaded fully into memory,
 *   - otherwise:   documents streamed from STDIN, one record per line
 *                  (a docid token, an integer length, then the rest of the
 *                  line as the serialized vector, separated by single
 *                  characters that are skipped).
 * For every query the K best scoring documents are kept. The results are then
 * either evaluated against relevance judgements ("qrels") or written out.
 *
 * @return 1 if parameter handling fails, 0 otherwise.
 */
int main(int argc, char** argv) {

	// handle parameters
	po::variables_map cfg;
	if (!init_params(argc,argv, &cfg)) return 1; // something is wrong

	// set number of jobs
	#ifdef _OPENMP
	if (cfg.count("jobs")) {
		omp_set_num_threads(cfg["jobs"].as<int>());
		cerr << "CLIR::Using " << cfg["jobs"].as<int>() << " threads.\n";
	}
	#endif

	const bool no_qtf = cfg.count("no-qtf") > 0;
	string run_id = cfg["run-id"].as<string>();
	const int K = cfg["K"].as<int>();

	// load queries
	vector<Query> queries;
	CLIR::loadQueries(cfg["queries"].as<string>(), queries, cfg.count("psq"), cfg.count("text-input"));
	size_t queries_size = queries.size();
	// signed copy for the OpenMP loops, avoiding signed/unsigned comparisons
	const int num_queries = static_cast<int>(queries_size);
	// progress is reported roughly every 10% of the queries; the interval must
	// never be 0, otherwise `i % done` would divide by zero for < 10 queries
	const size_t done = queries_size >= 10 ? queries_size / 10 : 1;
	cerr.precision(1);

	// setup result vector
	vector<vector<Score> > results(queries_size);

	if (cfg.count("index")) {

		Index::Index idx;
		Index::LoadIndex(cfg["index"].as<string>(), idx);
		IndexScorer* ranker = CLIR::setupIndexScorer(cfg["model"].as<string>(), &idx);
		cerr << "CLIR::Retrieving from in-memory inverted index ...";
		// loop-invariant: queried once instead of once per document per query
		const int num_docs = static_cast<int>(idx.NumberOfDocuments());
		# pragma omp parallel for
		for ( int i = 0 ; i < num_queries ; ++ i ) {
			// one score slot per indexed document, filled by the ranker
			vector<double> scores(num_docs, .0);
			ranker->score(queries[i], scores, no_qtf);
			Scores kbest(K);
			for ( int d = 0 ; d < num_docs ; ++d ) {
				// only documents with a positive score are retrieval candidates
				if (scores[d]>0) {
					kbest.update( Score(idx.GetDocID(d), prob_t(scores[d]) ) );
				}
			}
			results[i] = kbest.k_largest();
			reportQueryProgress(i, done, queries_size);
		}
		delete ranker;
		cerr << "ok.\n";

	} else if (cfg.count("documents")) {

		// load DfTable
		DfTable dft(cfg["dftable"].as<string>());
		cerr << "CLIR::DF table loaded (" << dft.size() << " entries)." << endl;
		// load document collection
		vector<CLIR::Document> documents;
		double avgDocLen = CLIR::loadDocuments(cfg["documents"].as<string>(), documents);
		// number of documents actually loaded; this bounds the scoring loop
		const int num_docs = static_cast<int>(documents.size());
		// collection size handed to the scorer; may be raised to maxDf below
		size_t N = documents.size();
		if (N < dft.mMaxDf) {
			cerr << "CLIR::maxDf=" << dft.mMaxDf << " > N=" << N << ", setting N to maxDf.\n";
			N = dft.mMaxDf;
		}

		Scorer* ranker = CLIR::setupScorer(cfg["model"].as<string>(), N, avgDocLen, &dft);
		cerr << "CLIR::Retrieving from in-memory documents ...";
		# pragma omp parallel for
		for ( int i = 0 ; i < num_queries ; ++ i ) {
			const Query& query = queries[i];
			Scores scores(K);
			// iterate over the loaded documents only: N may exceed
			// documents.size() after the maxDf adjustment above
			for ( int d = 0 ; d < num_docs ; ++d ) {
				Document& doc = documents[d];
				scores.update( Score(doc.id_, ranker->score(query, doc, no_qtf) ) );
			}
			results[i] = scores.k_largest();
			reportQueryProgress(i, done, queries_size);
		}
		delete ranker;
		cerr << "ok.\n";

	} else {

		// load DfTable
		DfTable dft(cfg["dftable"].as<string>());
		cerr << "CLIR::DF table loaded (" << dft.size() << " entries)." << endl;
		size_t N = cfg["N"].as<int>();
		if (N < dft.mMaxDf) {
			cerr << "CLIR::maxDf=" << dft.mMaxDf << " > N=" << N << ", setting N to maxDf.\n";
			N = dft.mMaxDf;
		}
		double avgDocLen = cfg["avg_len"].as<double>();
		cerr << "CLIR::Retrieving from STDIN documents ...";
		
		Scorer* ranker = CLIR::setupScorer(cfg["model"].as<string>(), N, avgDocLen, &dft);
		vector<Scores> scores (queries_size, Scores(K)); // score vectors for each query
		string docid, raw;
		int len = 0;
		int c = 0;
		// status lines in "reporter:status:" format
		// NOTE(review): presumably consumed by a Hadoop streaming job tracker — confirm
		cerr << "reporter:status:scanned="<< c << "\n";
		while (cin >> docid) {
			cin.ignore(1,'\t');
			cin >> len;
			cin.ignore(1,'\t');
			getline(cin, raw);

			// a failed extraction leaves len/raw unreliable (possibly holding
			// the previous record's values); never score such a record
			if (cin.fail())
				break;

			if (docid.size() == 0 || len <= 0 || raw.size() == 0)
				continue;

			Document doc(vutils::read_vector(raw), docid, len);

			// for each query compute score between current document and query;
			// every iteration touches its own scores[i], so no locking is needed
			# pragma omp parallel for
			for ( int i = 0 ; i < num_queries ; ++i ) {
				scores[i].update( Score(doc.id_, ranker->score(queries[i], doc, no_qtf) ) );
			}

			c++;
			if (c%10 == 0) {
				cerr << "reporter:status:scanned="<< c << "\n";
			}

		}

		// reading stopped before end of input: a record could not be parsed
		if (!cin.eof()) {
			cerr << "CLIR::warning: malformed document record after " << c
			     << " scored documents; remaining STDIN input ignored.\n";
		}

		delete ranker;

		// create kbest lists
		for ( size_t i = 0; i < scores.size(); ++i ) {
			results[i] = scores[i].k_largest();
		}
	}

	// retrieval done; output results
	WriteFile out(cfg["output"].as<string>());
	if (cfg.count("qrels")) {
		cerr << "CLIR::Evaluating results ...";
		CLIR::IRScorer irs(queries_size, cfg["qrels"].as<string>());
		# pragma omp parallel for
		for ( int i = 0 ; i < num_queries ; ++ i ) {
			irs.evaluateIthSegment(i, queries[i].id(), results[i]);
		}
		*out << "num_q\t" << irs.N() << "\tnum_rel_ret\t" << irs.NUMRELRET() << "\tMAP\t" << irs.MAP() << "\tNDCG\t" << irs.NDCG() << endl;
	} else {
		cerr << "CLIR::Writing results ...";
		for ( int i = 0 ; i < num_queries ; ++ i ) {
			CLIR::writeResult(*out, queries[i], results[i], run_id);
		}
	}

	cerr << "ok.\nCLIR::done.\n";

	return 0;
}