void computeNormalizedTfIdf() {
    cerr << "# Computing document frequencies" << endl;
    int terms = dico.size();
    double nd = trainid.size();
    vector<double> nt(terms + 1, 0);   // nt[t] = number of training docs containing term t
    for (int i = 0; i < (int)trainid.size(); i++) {
        int id = trainid[i];
        SVector s = train[id];
        for (const SVector::Pair *p = s; p->i >= 0; p++)
            if (p->v > 0)
                nt[p->i] += 1;
    }
    cerr << "# Computing TF/IDF for training set" << endl;
    for (int i = 0; i < (int)trainid.size(); i++) {
        int id = trainid[i];
        SVector s = train[id];
        SVector v;
        // log-TF/IDF weighting: (1 + log tf) * log(nd / df)
        for (const SVector::Pair *p = s; p->i >= 0; p++)
            if (nt[p->i] > 0)
                v.set(p->i, (1.0 + log(p->v)) * log(nd / nt[p->i]));
        double norm = dot(v, v);
        if (norm > 0)                   // guard against all-zero vectors
            v.scale(1.0 / sqrt(norm));  // L2-normalize
        train[id] = v;
    }
    cerr << "# Computing TF/IDF for testing set" << endl;
    for (int i = 0; i < (int)testid.size(); i++) {
        int id = testid[i];
        SVector s = test[id];
        SVector v;
        // terms unseen during training (nt == 0) get weight zero and are dropped
        for (const SVector::Pair *p = s; p->i >= 0; p++)
            if (nt[p->i] > 0)
                v.set(p->i, (1.0 + log(p->v)) * log(nd / nt[p->i]));
        double norm = dot(v, v);
        if (norm > 0)
            v.scale(1.0 / sqrt(norm));
        test[id] = v;
    }
    cerr << "# Done." << endl;
}
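
// The weighting above is the classic log-TF/IDF scheme followed by L2
// normalization. As a minimal self-contained sketch of the same arithmetic
// on dense count vectors (tfidfNormalize, tf, df, and nDocs are
// illustrative names, not part of this program):
static void tfidfNormalize(vector<double> &tf, const vector<double> &df, double nDocs) {
    double norm = 0;
    for (int t = 0; t < (int)tf.size(); t++) {
        // (1 + log tf) * log(N / df); terms absent from training get weight 0
        tf[t] = (tf[t] > 0 && df[t] > 0)
              ? (1.0 + log(tf[t])) * log(nDocs / df[t])
              : 0.0;
        norm += tf[t] * tf[t];
    }
    if (norm > 0)
        for (int t = 0; t < (int)tf.size(); t++)
            tf[t] /= sqrt(norm);
}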
void LinearLaRank::eval(Sample& sample, Result& result) {
    // Evaluate the sample
    if (m_sampleCount) {
        // Convert the Sample to LaRank's sparse vector form
        SVector laX;
        for (int nFeat = 0; nFeat < sample.x.rows(); nFeat++) {
            laX.set(nFeat, sample.x(nFeat));
        }
        m_svm->predict_with_scores(laX, result);

        // Convert the raw scores to probabilities (softmax)
        double totalProb = 0.0;
        for (int nClass = 0; nClass < *m_numClasses; nClass++) {
            result.confidence[nClass] = exp(result.confidence[nClass]);
            totalProb += result.confidence[nClass];
        }
        for (int nClass = 0; nClass < *m_numClasses; nClass++) {
            result.confidence[nClass] /= (totalProb + 1e-16);
        }
    } else {
        // No training samples seen yet: fall back to a uniform distribution
        for (int nClass = 0; nClass < *m_numClasses; nClass++) {
            result.confidence[nClass] = 1.0 / *m_numClasses;
        }
        result.prediction = 0;
    }
}
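
// Note that exp() above can overflow for large raw scores. Softmax is
// invariant to shifting all scores by a constant, so a numerically safer
// variant subtracts the maximum score before exponentiating. A sketch
// (softmaxInPlace is an illustrative name, not part of this class):
static void softmaxInPlace(vector<double> &scores) {
    double maxScore = scores[0];
    for (int c = 1; c < (int)scores.size(); c++)
        if (scores[c] > maxScore)
            maxScore = scores[c];
    double total = 0.0;
    for (int c = 0; c < (int)scores.size(); c++) {
        scores[c] = exp(scores[c] - maxScore);   // shift keeps exp() in range
        total += scores[c];
    }
    for (int c = 0; c < (int)scores.size(); c++)
        scores[c] /= (total + 1e-16);
}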
void LinearLaRank::update(Sample& sample) {
    // Convert the Sample to LaRank's sparse vector form
    SVector laX;
    for (int nFeat = 0; nFeat < sample.x.rows(); nFeat++) {
        laX.set(nFeat, sample.x(nFeat));
    }
    // Add the sample to the svm; the running count is passed as the sample index
    m_sampleCount++;
    m_svm->add(laX, sample.y, m_sampleCount, sample.w);
}
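
// Typical online usage interleaves eval() and update() in a test-then-train
// (prequential) loop. A sketch, assuming Result is default-constructible and
// that Sample carries its label in y (prequentialErrors and samples are
// illustrative names; check the actual Sample/Result definitions):
static int prequentialErrors(LinearLaRank &model, vector<Sample> &samples) {
    int errors = 0;
    for (int i = 0; i < (int)samples.size(); i++) {
        Result result;
        model.eval(samples[i], result);    // predict before seeing the label
        if (result.prediction != samples[i].y)
            errors++;
        model.update(samples[i]);          // then train on the same sample
    }
    return errors;
}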
void readDocs(const char *fname, docs_t &docs, bool freezedico = false) {
    cerr << "# Reading " << fname << endl;
    igzstream f;
    f.open(fname);
    if (! f.good()) {
        cerr << "ERROR: cannot open file " << fname << endl;
        ::exit(10);
    }
    string token;
    f >> token;
    if (token != ".I") {
        cerr << "ERROR: Cannot read initial .I in " << fname << endl;
        ::exit(10);
    }
    int id = 0;
    int count = 0;
    while (f.good()) {
        f >> id >> token;
        count += 1;
        if (! f.good() || token != ".W") {
            cerr << "ERROR (" << id << "): Cannot read \"<id> .W\"." << endl;
            ::exit(10);
        }
        int wid = -1;
        string otoken;
        SVector s;
        for (;;) {
            f >> token;
            if (! f.good() || token == ".I")
                break;
            if (token != otoken) {
                // Look up the token, extending the dictionary unless frozen
                dico_t::iterator it = dico.find(token);
                if (it != dico.end())
                    wid = it->second;
                else if (freezedico)
                    continue;   // unknown token in frozen mode: skip it
                else {
                    wid = dico.size() + 1;
                    dico[token] = wid;
                }
                otoken = token;
            }
            s.set(wid, s.get(wid) + 1.0);   // accumulate the term count
        }
        if (s.npairs() <= 0) {
            cerr << "ERROR (" << id << "): Empty vector " << id << "?" << endl;
            ::exit(10);
        }
        docs[id] = s;
    }
    if (! f.eof()) {
        cerr << "ERROR (" << id << "): Failed reading words" << endl;
        ::exit(10);
    }
    cerr << "# Done reading " << count << " documents." << endl;
}
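
// readDocs() parses a SMART-style format: each document is ".I <id>", then
// ".W", then whitespace-separated tokens up to the next ".I" or end of
// file. An illustrative input (content is made up):
//
//   .I 1
//   .W
//   the quick brown fox
//   .I 2
//   .W
//   lazy dog dog
//
// Repeated tokens accumulate counts (here "dog" gets count 2), and with
// freezedico=true, tokens absent from dico are silently dropped.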