Example #1
0
/**
 * Reads term vectors from stdin, computes a weighted vector for each one
 * using the configured scorer and DF table, and writes the result to stdout.
 *
 * Input:  one record per line: <docid> TAB <len> TAB <vector>
 * Output: one record per line: <docid> TAB <len> TAB <weighted vector>
 * Progress, warnings and a final summary are written to stderr.
 *
 * Records with len <= 0 are dropped silently. Records whose length field is
 * missing or not a number are reported on stderr and skipped; reading then
 * continues with the next line.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments, forwarded to init_params()
 * @return 0 on success, 1 if the parameters could not be initialised or no
 *         scorer could be set up
 */
int main(int argc, char** argv) {

	// handle parameters
	po::variables_map cfg;
	if (!init_params(argc, argv, &cfg))
		return 1; // something is wrong

	DfTable dft (cfg["dftable"].as<string>());
	cerr << "DF table loaded (" << dft.size() << " entries)." << endl;

	std::string metric;
	bool normalize = false;
	Scorer* scorer = setup_metric(cfg, metric, normalize);
	// NOTE(review): setup_metric() is defined elsewhere; guard against a null
	// result here because the pointer is dereferenced right below.
	if (!scorer) {
		cerr << "ERROR: could not set up term weighting metric." << endl;
		return 1;
	}

	// Document count: explicit "N" option if given, otherwise the DF table's maximum df.
	if (cfg.count("N"))
		scorer->setDocCount(cfg["N"].as<double>());
	else
		scorer->setDocCount(dft.mMaxDf);

	cerr << "Term weighting metric: " << metric << " " << scorer->print() << " [normalization=" << normalize << "]" << endl;

	const bool verbose = cfg.count("verbose") > 0;

	string docid;
	int len = -1;
	string v_str;
	TermVector v;

	int c = 0;             // records processed and written
	int skipped = 0;       // accumulated return values of calc_wvec()
	int empty_input = 0;   // records whose input vector was empty
	int empty_output = 0;  // records whose weighted vector came out empty
	int malformed = 0;     // records without a parsable length field

	while (cin >> docid) {
		// Skip the blanks between docid and length, but never step over the
		// end of the line: operator>> would otherwise skip the newline and
		// take the next record's docid as this record's length.
		while (cin.peek() == '\t' || cin.peek() == ' ')
			cin.ignore(1);

		if (cin.peek() == '\n' || !(cin >> len)) {
			cerr << "WARNING: malformed input line (id=" << docid << "), skipped." << endl;
			// A failed extraction leaves failbit set, which would end the
			// loop; clear it and drop the remainder of the offending line.
			cin.clear();
			getline(cin, v_str);
			malformed++;
			continue;
		}

		// Consume the single separator after the length unless the line ends
		// right here; eating the newline would make getline() return the next
		// record as this record's vector.
		v_str.clear(); // make sure no stale vector survives a failed read at EOF
		if (cin.peek() != '\n')
			cin.ignore(1);
		getline(cin, v_str);

		if (len <= 0)
			continue;

		// '\n' instead of endl on the data lines: no forced flush per record.
		if (v_str.empty()) {
			cerr << "WARNING: input vector (id=" << docid << ",len=" << len << ") empty!" << endl;
			cout << docid << "\t" << len << "\t" << '\n';
			empty_input++;
			continue;
		}

		v = vutils::read_vector(v_str);

		Document d(v, scorer, docid, len);
		skipped += d.calc_wvec(dft, normalize, verbose);

		if (d.wvec_.size() == 0) {
			if (verbose)
				cerr << "WARNING! " << d.docid_ << " output vector is empty!" << endl;
			empty_output++;
		}

		// write to output
		cout << d.docid_ << "\t" << d.len_ << "\t";
		vutils::write_vector(d.wvec_, cout);
		cout << '\n';

		// progress marker every 1000 records
		c++;
		if (c % 1000 == 0)
			cerr << "." << c << ".";
	}

	delete scorer;

	cerr << "done." << endl << endl
		 << c << "\tlines read/written." << endl
		 << empty_input << "\tempty input vectors." << endl
		 << empty_output << "\tempty output vectors." << endl
		 << skipped << "\tterms skipped due missing df value." << endl
		 << malformed << "\tmalformed input lines skipped." << endl;

	return 0;
}