Пример #1
  // Re-weight every training and test document with TF/IDF and scale it to
  // unit Euclidean length.
  //
  //   weight(t, d) = (1 + log(tf(t, d))) * log(nd / nt[t])
  //
  // where nd is the number of training documents and nt[t] is the number of
  // training documents in which term t has a positive value.  Document
  // frequencies come from the training set only; the test set reuses them.
  cerr << "# Computing document frequencies" << endl;

  int terms = dico.size();
  // One slot more than the dictionary size, all zero-initialised.
  // NOTE(review): presumably term ids are 1-based (hence terms+1) -- confirm
  // against the code that assigns them.
  vector<double> nt(terms+1, 0.0);
  double nd = trainid.size();
  for(int i=0; i<(int)trainid.size(); i++)
    {
      int id = trainid[i];
      SVector s = train[id];
      // The pair array is walked until the entry with a negative index.
      for (const SVector::Pair *p = s; p->i >= 0; p++)
        if (p->v > 0)
          nt[p->i] += 1;
    }
  cerr << "# Computing TF/IDF for training set" << endl;
  for(int i=0; i<(int)trainid.size(); i++)
    {
      int id = trainid[i];
      SVector s = train[id];
      SVector v;
      // Terms never seen in training (nt == 0) are dropped.  Non-positive
      // values are skipped as well: log() is undefined for them and they
      // were not counted in the document frequencies either.
      for (const SVector::Pair *p = s; p->i >= 0; p++)
        if (p->v > 0 && nt[p->i] > 0)
          v.set(p->i, (1.0 + log(p->v)) * log(nd/nt[p->i]));
      double norm = dot(v,v);
      // A zero norm would scale by 1/0 and can turn zero weights into NaN.
      if (norm > 0)
        v.scale(1.0 / sqrt(norm));
      train[id] = v;
    }
  cerr << "# Computing TF/IDF for testing set" << endl;
  for(int i=0; i<(int)testid.size(); i++)
    {
      int id = testid[i];
      SVector s = test[id];
      SVector v;
      // Same weighting as for the training set, using training-set counts.
      for (const SVector::Pair *p = s; p->i >= 0; p++)
        if (p->v > 0 && nt[p->i] > 0)
          v.set(p->i, (1.0 + log(p->v)) * log(nd/nt[p->i]));
      double norm = dot(v,v);
      if (norm > 0)
        v.scale(1.0 / sqrt(norm));
      test[id] = v;
    }
  cerr << "# Done." << endl;
Пример #2
// Fills `result` with per-class confidences for `sample`.
//
// When m_sampleCount is non-zero the sample is scored by the underlying SVM
// and the scores are turned into probabilities with a softmax.  Otherwise
// every class gets the same confidence 1 / *m_numClasses and the prediction
// is set to class 0.
//
// @param sample  input whose feature column `sample.x` is evaluated
// @param result  output; `result.confidence` (and, in the fallback branch,
//                `result.prediction`) are overwritten
void LinearLaRank::eval(Sample& sample, Result& result) {
    // Evaluate the sample
    if (m_sampleCount) {
        // Convert the Sample to LaRank form
        SVector laX;
        for (int nFeat = 0; nFeat < sample.x.rows(); nFeat++) {
            laX.set(nFeat, sample.x(nFeat));
        }

        // NOTE(review): presumably this also sets result.prediction -- it is
        // not assigned anywhere else in this branch; confirm.
        m_svm->predict_with_scores(laX, result);

        // Convert the scores to probabilities.  The largest score is
        // subtracted before exponentiating so that exp() cannot overflow;
        // the normalised result is mathematically unchanged.
        const int numClasses = *m_numClasses;
        double maxScore = 0.0;
        for (int nClass = 0; nClass < numClasses; nClass++) {
            if (nClass == 0 || result.confidence[nClass] > maxScore) {
                maxScore = result.confidence[nClass];
            }
        }

        double totalProb = 0.0;
        for (int nClass = 0; nClass < numClasses; nClass++) {
            result.confidence[nClass] = exp(result.confidence[nClass] - maxScore);
            totalProb += result.confidence[nClass];
        }
        for (int nClass = 0; nClass < numClasses; nClass++) {
            // The small epsilon keeps the division defined if the sum is 0.
            result.confidence[nClass] /= (totalProb + 1e-16);
        }
    } else {
        // Nothing to score with yet: fall back to a uniform distribution.
        for (int nClass = 0; nClass < *m_numClasses; nClass++) {
            result.confidence[nClass] = 1.0 / *m_numClasses;
        }
        result.prediction = 0;
    }
}
Пример #3
// Feeds one labelled sample to the underlying SVM.
//
// The feature column `sample.x` is copied entry by entry into an SVector,
// which is then passed to m_svm->add() together with the label `sample.y`,
// the current m_sampleCount and the weight `sample.w`.
//
// @param sample  training example to add
void LinearLaRank::update(Sample& sample) {
    // Convert the Sample to LaRank form
    SVector laX;
    for (int nFeat = 0; nFeat < sample.x.rows(); nFeat++) {
        laX.set(nFeat, sample.x(nFeat));
    }

    // Add the sample to svm
    // NOTE(review): m_sampleCount is not modified in this function --
    // confirm it is advanced elsewhere.
    m_svm->add(laX, sample.y, m_sampleCount, sample.w);
}
Пример #4
// Reads documents from `fname` into `docs`, one sparse vector per document
// id, logging progress and errors to cerr.
//
// Input layout, as consumed by the visible statements: a leading ".I" token,
// then for each document "<id> .W" followed by word tokens; after each word
// read the code tests for a read failure or for the token ".I".
//
// Each word is mapped to an integer id through `dico` (declared outside this
// block) and the document vector stores, per word id, how many times the
// word occurred.
//
// @param fname       path of the file to read
// @param docs        output container, indexed by document id
// @param freezedico  tested when a word is not found in `dico`
//                    NOTE(review): presumably unknown words are ignored when
//                    this is true and added to `dico` otherwise -- the branch
//                    bodies are not visible here; confirm.
//
// NOTE(review): this excerpt looks truncated -- the return type, the function
// braces, the loops implied by the indentation, and the bodies of several
// `if` branches are not visible.  Comments below describe only the
// statements that are present.
readDocs(const char *fname, docs_t &docs, bool freezedico=false)
  cerr << "# Reading " << fname << endl;

  igzstream f;
  // NOTE(review): no statement opening `fname` on `f` is visible before this
  // check -- confirm.
  if (! f.good()) {
    cerr << "ERROR: cannot open file " << fname << endl;
  string token;
  // The stream must start with the ".I" marker.
  f >> token;
  if (token != ".I")
      cerr << "ERROR: Cannot read initial .I in " << fname << endl;
  int id = 0;
  int count = 0;
      // Document header: numeric id followed by the ".W" marker.
      f >> id >> token;
      count += 1;
      if (! f.good() || token != ".W")
          cerr << "ERROR (" << id << "): "
               << "Cannot read \"<id> .W\"." << endl;
      // wid holds the id of the most recently looked-up word; otoken holds
      // that word so a repeated consecutive token skips the lookup.
      int wid = -1;
      string otoken;
      SVector s;
          f >> token;
          // A read failure or the next ".I" marker is tested here.
          if (!f.good() || token == ".I")
          if (token != otoken)
              dico_t::iterator it = dico.find(token);
              if (it != dico.end())
                wid = it->second;
              else if (freezedico)
                  // A new word gets id dico.size() + 1, so the first id
                  // handed out for an empty dictionary is 1.
                  wid = dico.size() + 1;
                  dico[token] = wid;
              otoken = token;
          // Increment the occurrence count of this word in the document.
          s.set(wid, s.get(wid)+1.0);
      // A document with no stored pairs is reported.
      if (s.npairs() <= 0)
          cerr << "ERROR (" << id << "): "
               << "Empty vector " << id << "?" << endl;
      docs[id] = s;
  // The stream is expected to be at end-of-file at this point.
  if (!f.eof())
      cerr << "ERROR (" << id << "): "
           << "Failed reading words" << endl;

  cerr << "# Done reading " << count << " documents." << endl;